In [1]:
# imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import show

import sklearn.preprocessing as preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

%matplotlib inline
# Seaborn scaling preset for figures shown inside a notebook.
sns.set_context('notebook')
# Switches off pandas' chained-assignment check, so SettingWithCopyWarning is
# no longer emitted anywhere in this notebook.
pd.options.mode.chained_assignment = None  # default='warn'
# Display up to 500 columns when a DataFrame is rendered.
pd.set_option('display.max_columns', 500) # to see all columns

In [2]:
# Load the pre-processed table; the path is relative to the working directory.
data = pd.read_csv('CrowdstormingDataJuly1st_preprocessed.csv')
# Independent copy of the freshly loaded frame.
data_total = data.copy()
# Note: this prints the (rows, columns) shape of the table, not a single count.
print('Number of dayads', data.shape)
data.head()

Number of dayads (124468, 30)


Unnamed: 0.1,Unnamed: 0,playerShort,player,club,leagueCountry,birthday,height,weight,position,games,victories,ties,defeats,goals,yellowCards,yellowReds,redCards,photoID,rater1,rater2,refNum,refCountry,Alpha_3,meanIAT,nIAT,seIAT,meanExp,nExp,seExp,color_rating
0,0,901,1046,70,3,1382,177.0,72.0,0,1,0,0,1,0,0,0,0,1532,0.25,0.5,1,1,59,0.326391,712.0,0.000564,0.396,750.0,0.002696,1
1,1,739,919,51,1,320,179.0,82.0,12,1,0,0,1,0,1,0,0,497,0.75,0.75,2,2,153,0.203375,40.0,0.010875,-0.204082,49.0,0.061504,3
2,5,0,392,34,0,360,182.0,71.0,1,1,0,0,1,0,0,0,0,1081,0.25,0.0,4,4,87,0.325185,127.0,0.003297,0.538462,130.0,0.013752,0
3,6,45,425,48,0,446,187.0,80.0,7,1,1,0,0,0,0,0,0,1175,0.0,0.25,4,4,87,0.325185,127.0,0.003297,0.538462,130.0,0.013752,0
4,7,64,440,54,0,158,180.0,68.0,4,1,0,0,1,0,0,0,0,803,1.0,1.0,4,4,87,0.325185,127.0,0.003297,0.538462,130.0,0.013752,4


# Aggregate the data

One solution is to group the data by player name. We then need to decide how to correctly aggregate each of the remaining features:
    - club: we have to check whether a player appears in 2 different clubs (in case of a transfer during the winter transfer window, the "mercato") or whether transfers are not taken into account. (-> one-hot (or multi-hot) encoding, or keep the club with the majority of dyads)
    - leagueCountry: same as club
    - position: test whether a player has several different positions -> keep the position with the majority of games?
    - photoID: drop that information; the ID is unique -> not relevant for our classification problem
    - refNum: replace with the total of unique refs
    - refCountry: same as refNum
    - Alpha_3: remove; redundant information since it corresponds to refCountry
    - meanIAT: make new features
        - take mean 
        - take weighted mean (weight with nIAT)
        - take weighted mean (weight with number of games)
    - meanExp: same as IAT
    - seIAT: TODO (aggregation still to be decided)
    - seExp: TODO (aggregation still to be decided)

In [3]:
# Column labels of the loaded table.
data.columns

Index(['Unnamed: 0', 'playerShort', 'player', 'club', 'leagueCountry',
       'birthday', 'height', 'weight', 'position', 'games', 'victories',
       'ties', 'defeats', 'goals', 'yellowCards', 'yellowReds', 'redCards',
       'photoID', 'rater1', 'rater2', 'refNum', 'refCountry', 'Alpha_3',
       'meanIAT', 'nIAT', 'seIAT', 'meanExp', 'nExp', 'seExp', 'color_rating'],
      dtype='object')

In [4]:
# Module-level flags; checkFunction switches one to False as soon as a player
# with more than one distinct value for that attribute is found.
clubUnique = True
leagueUnique = True
positionUnique = True
def checkFunction(player):
    """Check that club, league and position are constant within one player.

    Parameters
    ----------
    player : pandas.DataFrame
        The rows of a single player; must provide the columns `player`,
        `club`, `leagueCountry` and `position`.

    Side effects
    ------------
    For every attribute with more than one distinct value a message is
    printed and the matching module-level flag (`clubUnique`,
    `leagueUnique`, `positionUnique`) is set to False. Nothing is returned.
    """
    # Without this declaration the assignments below would only create
    # function-local names and the module-level flags would always stay True.
    global clubUnique, leagueUnique, positionUnique

    #check if the club is unique for one player
    clubs = player.club.unique()
    if len(clubs) > 1:
        clubUnique = False
        print(player.player, 'plays for more than one team: ', clubs)
    #check if the leagueCountry is unique
    leagues = player.leagueCountry.unique()
    if len(leagues) > 1:
        leagueUnique = False
        print(player.player, 'plays for more than one league: ', leagues)
    #check if the position is unique
    positions = player.position.unique()
    if len(positions) > 1:
        positionUnique = False
        print(player.player, 'plays for more than one position: ', positions)
           
# Run the check once per player. The module-level `pd.groupby(...)` function
# was deprecated and later removed from pandas; the DataFrame method is the
# supported spelling and groups on the same column.
data_group = data.groupby('playerShort').apply(checkFunction)
print("Is the club for a player unique? ", clubUnique)
print("Is the league for a player unique? ", leagueUnique)
print("Is the position for a player unique? ", positionUnique)


Is the club for a player unique?  True
Is the league for a player unique?  True
Is the position for a player unique?  True


List of the columns

In [5]:
# Column labels of `data` (same listing as shown earlier in the notebook).
data.columns

Index(['Unnamed: 0', 'playerShort', 'player', 'club', 'leagueCountry',
       'birthday', 'height', 'weight', 'position', 'games', 'victories',
       'ties', 'defeats', 'goals', 'yellowCards', 'yellowReds', 'redCards',
       'photoID', 'rater1', 'rater2', 'refNum', 'refCountry', 'Alpha_3',
       'meanIAT', 'nIAT', 'seIAT', 'meanExp', 'nExp', 'seExp', 'color_rating'],
      dtype='object')

In [6]:
def _weighted_mean(values, weights):
    """Weighted mean of `values`, skipping rows with a missing value or weight.

    Returns NaN when no usable row is left or the usable weights sum to zero.
    """
    usable = values.notnull() & weights.notnull()
    total_weight = weights[usable].sum()
    if total_weight == 0:
        return np.nan
    return (values[usable] * weights[usable]).sum() / total_weight


def aggreagtion(df):
    """Collapse all rows (dyads) of one player into a single-row DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        The rows belonging to one player. Must contain the columns read
        below (game/card counters, `photoID`, `Alpha_3`, `refNum`,
        `refCountry`, `meanIAT`, `nIAT`, `meanExp`, `nExp`).

    Returns
    -------
    pandas.DataFrame
        A one-row frame built from a copy of the first row of `df` where

        * the game, goal and card counters are summed over all rows,
        * `photoID`, `Alpha_3`, `nIAT` and `nExp` are removed,
        * `refNum` / `refCountry` become `refCount` / `refCountryCount`,
          the number of distinct values seen for the player,
        * `meanIAT` / `meanExp` hold the plain mean over the rows,
        * `meanIAT_nIAT`, `meanExp_nExp`, `meanIAT_GameNbr` and
          `meanExp_GameNbr` hold means weighted by sample size and by
          number of games respectively.

        Every other column keeps the value found in the first row.
    """
    # new aggregation entry, based on the first row of the player
    new_entry = df.head(1).copy()

    #sum of the info about the games
    summed_columns = ['games', 'victories', 'ties', 'defeats', 'goals',
                      'yellowCards', 'yellowReds', 'redCards']
    for column in summed_columns:
        new_entry[column] = df[column].sum()

    #drop photoID and alpha_3
    new_entry = new_entry.drop(['photoID', 'Alpha_3'], axis=1)

    #refNum / refCountry: replaced by the number of unique values
    new_entry = new_entry.rename(columns={'refNum': 'refCount',
                                          'refCountry': 'refCountryCount'})
    new_entry['refCount'] = len(df['refNum'].unique())
    new_entry['refCountryCount'] = len(df['refCountry'].unique())

    #== Mean of the test results ==
    # plain mean
    new_entry['meanIAT'] = df['meanIAT'].mean()
    new_entry['meanExp'] = df['meanExp'].mean()

    # mean weighted by the sample size behind each score; rows with a missing
    # score are left out of BOTH numerator and denominator, otherwise their
    # weight would pull the result towards zero
    new_entry['meanIAT_nIAT'] = _weighted_mean(df['meanIAT'], df['nIAT'])
    new_entry['meanExp_nExp'] = _weighted_mean(df['meanExp'], df['nExp'])

    # mean weighted by the number of games of each row
    new_entry['meanIAT_GameNbr'] = _weighted_mean(df['meanIAT'], df['games'])
    new_entry['meanExp_GameNbr'] = _weighted_mean(df['meanExp'], df['games'])

    # TODO: open question whether nIAT / nExp should be kept as features.
    new_entry = new_entry.drop(['nIAT', 'nExp'], axis=1)

    # NOTE(review): seIAT / seExp are NOT aggregated; the returned row still
    # carries the values of the player's first row. An earlier attempt pooled
    # them through the variance (se * sqrt(n)) ** 2 -- decide whether to
    # aggregate or drop these columns.
    return new_entry
 
# One aggregated row per player. The module-level `pd.groupby(...)` function
# was deprecated and later removed from pandas, so the DataFrame method is
# used instead; it groups on the same column.
data_agr = data.groupby('playerShort').apply(aggreagtion)
# Preview only the first rows instead of rendering the whole frame.
data_agr.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 0,playerShort,player,club,leagueCountry,birthday,height,weight,position,games,victories,ties,defeats,goals,yellowCards,yellowReds,redCards,rater1,rater2,refCount,refCountryCount,meanIAT,seIAT,meanExp,seExp,color_rating,meanIAT_nIAT,meanExp_nExp,meanIAT_GameNbr,meanExp_GameNbr
playerShort,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
0,2,5,0,392,34,0,360,182.0,71.0,1,654,247,179,228,9,19,0,0,0.25,0.00,166,37,0.346459,0.003297,0.494575,0.013752,0,0.328409,0.367721,0.333195,0.400637
1,483,745,1,393,91,2,176,183.0,73.0,0,336,141,73,122,62,42,0,1,0.00,0.25,99,25,0.348818,0.003297,0.449220,0.013752,1,0.329945,0.441615,0.341438,0.380811
2,484,746,2,394,83,0,719,165.0,63.0,11,412,200,97,115,31,11,0,0,0.25,0.25,101,28,0.345893,0.003297,0.491482,0.013752,1,0.328230,0.365628,0.332389,0.399459
3,485,747,3,395,6,0,1199,178.0,76.0,3,260,150,42,68,39,31,0,1,0.00,0.00,104,37,0.346821,0.003297,0.514693,0.013752,0,0.327775,0.412859,0.336638,0.433294
4,3890,4418,4,396,51,1,758,180.0,73.0,1,124,41,40,43,1,8,4,2,0.25,0.25,37,11,0.331600,0.000151,0.335587,0.000586,1,0.338847,0.379497,0.331882,0.328895
5,1900,2305,5,397,36,1,751,180.0,74.0,11,97,41,23,33,3,11,1,0,0.75,0.75,42,13,0.320079,0.000229,0.296562,0.001002,3,0.332117,0.366031,0.327985,0.317247
6,36,47,6,398,80,1,1341,189.0,80.0,5,24,8,8,8,0,0,0,0,0.75,1.00,20,5,0.341625,0.000151,0.400818,0.000586,4,0.337572,0.361394,0.343556,0.428271
7,3893,4423,7,399,36,1,872,188.0,83.0,5,3,0,1,2,0,0,0,0,0.75,1.00,2,2,0.355406,0.000151,0.417225,0.000586,4,0.341567,0.363284,0.348498,0.390184
8,3894,4424,8,400,78,1,897,180.0,68.0,4,121,34,25,62,3,21,0,2,1.00,1.00,56,3,0.348178,0.000151,0.429630,0.000586,4,0.344426,0.404517,0.344130,0.402314
9,13937,16617,9,401,77,2,82,179.0,70.0,7,50,17,8,25,0,3,0,1,0.75,1.00,27,7,0.342072,0.000055,0.361068,0.000225,4,0.337478,0.339232,0.339733,0.341395


## KMeans

In [7]:
from sklearn import metrics
from sklearn.cluster import KMeans

In [8]:
# Seed NumPy's global random number generator.
np.random.seed(1)
# Number of clusters requested from KMeans below.
nbr_class = 2

In [10]:
# random_state pins the centroid initialisation inside the estimator, so the
# result no longer depends on how much of NumPy's global random stream was
# consumed by earlier (or re-run) cells. It uses the same seed value as the
# np.random.seed call above.
# NOTE(review): data_agr is clustered as-is, i.e. unscaled and including
# identifier-like columns (e.g. 'Unnamed: 0', 'player') as well as
# 'color_rating'. The Euclidean silhouette is then presumably dominated by the
# columns with the largest numeric range -- confirm the intended feature set
# and consider scaling before interpreting the score.
kmeans = KMeans(init='k-means++', n_clusters=nbr_class, n_init=1, random_state=1)
kmeans.fit(data_agr)
metrics.silhouette_score(data_agr, kmeans.labels_, metric='euclidean')

0.96468210271189347

# 3. Machine learning

In [None]:
# encoders
# The module is imported at the top of the notebook as `preprocessing`; the
# alias `pp` used here before was never defined and raised a NameError.
le = preprocessing.LabelEncoder() # for playernames etc.
ohe = preprocessing.OneHotEncoder() # for categories with 'few' categories

In [None]:
# Work on a copy so that data_agr itself is left unchanged.
rf_input_data = data_agr.copy()

# Columns that are re-coded as integer labels. The single encoder `le` is
# refitted on each column in turn, so afterwards it only remembers the classes
# of the last column.
encoded_columns = ['player', 'club', 'leagueCountry', 'birthday', 'position', 'color_rating']
for column in encoded_columns:
    rf_input_data[column] = le.fit_transform(data_agr[column])

rf_input_data.head(1)

In [None]:
# Target vector: the label-encoded `color_rating` column.
target_column = 'color_rating'
player_colors = rf_input_data[target_column]

# Remove the target, the two rater columns and `playerShort` from the features.
# NOTE(review): this cell is not re-runnable on its own -- a second run fails
# because the columns are already gone from rf_input_data.
dropped_columns = ['rater1', 'rater2', 'color_rating', 'playerShort']
rf_input_data = rf_input_data.drop(dropped_columns, axis=1)

rf_input_data.dtypes

In [None]:
# Random forest predicting player_colors from the remaining player features.
# random_state pins the forest's internal randomness so that the
# cross-validation scores are repeatable from run to run.
rf = RandomForestClassifier(random_state=1)
X = rf_input_data
print(rf_input_data.columns)
y = player_colors
# 10-fold cross-validation, evaluated with 3 parallel jobs.
result = cross_val_score(rf, X, y, cv=10, n_jobs=3, pre_dispatch='n_jobs+1', verbose=3)

result

In [None]:
# feature_importances_ is only available on a fitted estimator, so fit on the
# full data first.
rf.fit(X, y)

importances = rf.feature_importances_
# Spread of every feature's importance across the individual trees.
per_tree_importances = np.array([tree.feature_importances_ for tree in rf.estimators_])
std = per_tree_importances.std(axis=0)
# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]

cols = rf_input_data.columns
n_features = X.shape[1]
bar_positions = range(n_features)

# Print the feature ranking
print("Feature ranking:")
for rank, feature_idx in enumerate(indices, start=1):
    print("%d. feature n° %d %s (%f)" % (rank, feature_idx, cols[feature_idx], importances[feature_idx]))

# Plot the feature importances of the forest; the error bars show the
# per-tree standard deviation and the tick labels are the feature positions.
plt.figure()
plt.title("Feature importances")
plt.bar(bar_positions, importances[indices],
        color="r", yerr=std[indices], align="center")
plt.xticks(bar_positions, indices)
plt.xlim([-1, n_features])
plt.show()