# Imports 

In [4]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
from dateutil.relativedelta import relativedelta
import pickle

import seaborn as sns
from functools import reduce
from sklearn.cross_validation import train_test_split
from sklearn import ensemble

In [7]:
try:
    df = pickle.load(df, open( "data.pickle", "wb" ))
except:
    df = pd.read_csv('CrowdstormingDataJuly1st.csv', parse_dates = ['birthday'])
    pickle.dump(df, open( "data.pickle", "wb" ))

# Cleaning

In [8]:
# We first drop players that don't have a skin rating (caused by the absence of photos)
print ('we drop ' + str(round(100*df[df.photoID.isnull()].count()[1]/ df.count()[1], 2)) + '% of observation because they don\'t have a picture')

df = df[df.photoID.notnull()]
print(reduce(lambda x,y: x or y, df['rater1'].isnull()))
print(reduce(lambda x,y: x or y, df['rater2'].isnull())) # all ratings exist

we drop 14.66% of observation because they don't have a picture
False
False


In [42]:
# abs(df['rater1'] - df['rater2'])
c = sns.color_palette()
jitter_x = np.random.normal(0, 0.04, size=len(df.rater1))
jitter_y = np.random.normal(0, 0.04, size=len(df.rater2))
# For the time being it runs too long when we rerun everything :P.
#sns.jointplot(df.rater1 + jitter_x, df.rater2 + jitter_y, kind='kde')


As can be observed in the jointplot above, the only real difference in rating is between the 2 lightests colors). Since we cannot determine the skin color of a player where the 2 raters are different, we drop those observations.

In [10]:
# drop all players (so rows) that don't have the same skin color rating
print ('we drop ' + str(round(100*df[df['rater1'] != df['rater2']].count()[1] / df.count()[1], 2)) + '% of observation because they don\'t have the same rating')
df = df[df['rater1'] == df['rater2']]

we drop 23.2% of observation because they don't have the same rating


In [11]:
df.apply(lambda x: x.isnull(), axis=0).sum()

playerShort         0
player              0
club                0
leagueCountry       0
birthday            0
height             46
weight            701
position         6707
games               0
victories           0
ties                0
defeats             0
goals               0
yellowCards         0
yellowReds          0
redCards            0
photoID             0
rater1              0
rater2              0
refNum              0
refCountry          0
Alpha_3             1
meanIAT           110
nIAT              110
seIAT             110
meanExp           110
nExp              110
seExp             110
dtype: int64

We see that there are a few missing values. We can simply drop the players that don't have an height or weight (not many).
Since there are a lot of players without a position we create a category for them.

In [12]:
# drop players without reported height
df = df[df['height'].notnull()]
# drop players without reported weight
df = df[df['weight'].notnull()]

# create a noPosition category for players without a position
df.loc[df['position'].isnull(), 'position'] = 'noPosition'


In [15]:
df.groupby('playerShort').agg(lambda x: len(set(x)) == 1)

Unnamed: 0_level_0,player,club,leagueCountry,birthday,height,weight,position,games,victories,ties,...,rater2,refNum,refCountry,Alpha_3,meanIAT,nIAT,seIAT,meanExp,nExp,seExp
playerShort,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
aaron-lennon,True,True,True,True,1.0,1.0,True,False,False,False,...,1.0,False,False,False,0.0,0.0,0.0,0.0,0.0,0.0
aaron-ramsey,True,True,True,True,1.0,1.0,True,False,False,False,...,1.0,False,False,False,0.0,0.0,0.0,0.0,0.0,0.0
abdelhamid-el-kaoutari,True,True,True,True,1.0,1.0,True,False,False,False,...,1.0,False,False,False,0.0,0.0,0.0,0.0,0.0,0.0
abdou-traore_2,True,True,True,True,1.0,1.0,True,False,False,False,...,1.0,False,False,False,0.0,0.0,0.0,0.0,0.0,0.0
abdoulwhaid-sissoko,True,True,True,True,1.0,1.0,True,False,False,False,...,1.0,False,False,False,0.0,0.0,0.0,0.0,0.0,0.0
abdul-razak,True,True,True,True,1.0,1.0,True,False,False,False,...,1.0,False,False,False,0.0,0.0,0.0,0.0,0.0,0.0
abelaziz-barrada,True,True,True,True,1.0,1.0,True,False,False,False,...,1.0,False,False,False,0.0,0.0,0.0,0.0,0.0,0.0
abou-diaby,True,True,True,True,1.0,1.0,True,False,False,False,...,1.0,False,False,False,0.0,0.0,0.0,0.0,0.0,0.0
adam-bodzek,True,True,True,True,1.0,1.0,True,False,False,False,...,1.0,False,True,True,1.0,1.0,1.0,1.0,1.0,1.0
adam-federici,True,True,True,True,1.0,1.0,True,False,False,False,...,1.0,False,False,False,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
df.groupby('playerShort').agg(lambda x: len(set(x)) == 1).apply(lambda col: reduce(lambda x, y: x and y, col), axis=0)

player            True
club              True
leagueCountry     True
birthday          True
height               1
weight               1
position          True
games            False
victories        False
ties             False
defeats          False
goals            False
yellowCards      False
yellowReds       False
redCards         False
photoID           True
rater1               1
rater2               1
refNum           False
refCountry       False
Alpha_3          False
meanIAT              0
nIAT                 0
seIAT                0
meanExp              0
nExp                 0
seExp                0
dtype: object

As we can see, the club, league country and position stay constant so we can aggregate them.

We just check if it's also correct for height and weight:

In [17]:
df[['playerShort', 'height', 'weight']].groupby('playerShort').agg(lambda x: len(set(x)) == 1).mean()

height    1.0
weight    1.0
dtype: float64

# Feature generation

We aggregate the information on the player level (making the assumption that there is always only one referee so that games are not duplicated in the aggregation)

In [18]:
# OISFOGFJDGHJKDFHGOEWOIGIWEOIGHWEOIHGOHFDHGLJDFGJDFJG DONT LOOK AT IT
print(df.keys())
print(df.position.value_counts())

Index(['playerShort', 'player', 'club', 'leagueCountry', 'birthday', 'height',
       'weight', 'position', 'games', 'victories', 'ties', 'defeats', 'goals',
       'yellowCards', 'yellowReds', 'redCards', 'photoID', 'rater1', 'rater2',
       'refNum', 'refCountry', 'Alpha_3', 'meanIAT', 'nIAT', 'seIAT',
       'meanExp', 'nExp', 'seExp'],
      dtype='object')
Center Back             16393
Center Forward          12860
Defensive Midfielder    10834
Goalkeeper               8111
Attacking Midfielder     7722
Left Fullback            6670
noPosition               6542
Right Fullback           6352
Left Midfielder          4318
Center Midfielder        4031
Right Midfielder         3954
Left Winger              3866
Right Winger             3360
Name: position, dtype: int64


In [19]:
positionArray = list(df['position'].unique())
df['positionNum'] = list(map(positionArray.index, df['position']))
type(df['positionNum'])

pandas.core.series.Series

In [38]:
def genFeatures(player):
    sample = player.iloc(0)[0] # used to get general infos
    
    age = relativedelta(datetime.datetime.now(), sample['birthday']).years
    
    bmi = sample['weight'] / pow(sample['height']/100, 2)
    games = player['games'].sum()
    winRate = player['victories'].sum() / games
    tiesRate = player['ties'].sum() / games
    loseRate = player['defeats'].sum() / games
    yellowRedsRate = player['yellowReds'].sum() / games
    yellowRate = player['yellowCards'].sum() / games
    redRate = player['redCards'].sum() / games
    racism1 = (player['meanExp'] * player['games']).mean()
    racism2 = (player['meanIAT'] * player['games']).mean()
    position = sample['positionNum']
    
    
    
    
    # TODO something about yellow / red rate with racially biased refs vs non-biased
    
    goalRatio = player['goals'].sum() / games
    # TODO some feature with the goal ratio AND offensive position
    
    return pd.Series({
            'player': sample['player'], # constant
            'club': sample['club'],
            'leagueCountry': sample['leagueCountry'],
            'age': age,
            'height': sample['height'],
            'weight': sample['weight'], 
            'bmi': bmi,
 #           'games': games,
            'winRate': winRate,
            'tiesRate': tiesRate,
            'loseRate': loseRate,
            'yellowRate': yellowRate,
            'yellowRedsRate': yellowRedsRate,
            'yellowOverRed': redRate/(yellowRate+1),
            'redRate': redRate,
            'skinColor': sample['rater1'], # we only keep players where rater1 == rater2
            'goalRatio': goalRatio,
            'racism1': racism1,
            'racism2': racism2,
            'position': position,
            })

players = df.groupby('playerShort').apply(genFeatures)
players


Unnamed: 0_level_0,age,bmi,club,goalRatio,height,leagueCountry,loseRate,player,position,racism1,racism2,redRate,skinColor,tiesRate,weight,winRate,yellowOverRed,yellowRate,yellowRedsRate
playerShort,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
aaron-lennon,29,23.140496,Tottenham Hotspur,0.075243,165.0,England,0.279126,Aaron Lennon,10,1.629476,1.355886,0.000000,0.25,0.235437,63.0,0.485437,0.000000,0.026699,0.000000
aaron-ramsey,25,23.986870,Arsenal FC,0.150000,178.0,England,0.261538,Aaron Ramsey,9,1.083235,0.841595,0.003846,0.00,0.161538,76.0,0.576923,0.003436,0.119231,0.000000
abdelhamid-el-kaoutari,26,22.530864,Montpellier HSC,0.008065,180.0,France,0.346774,Abdelhamid El-Kaoutari,5,1.102242,1.112254,0.016129,0.25,0.322581,73.0,0.330645,0.015152,0.064516,0.032258
abdou-traore_2,28,22.839506,Girondins Bordeaux,0.030928,180.0,France,0.340206,Abdou Traoré,10,0.732690,0.757490,0.000000,0.75,0.237113,74.0,0.422680,0.000000,0.113402,0.010309
abdoulwhaid-sissoko,26,20.987654,Stade Brest,0.024793,180.0,France,0.512397,Abdoulwhaid Sissoko,1,0.869286,0.743567,0.016529,1.00,0.206612,68.0,0.280992,0.014085,0.173554,0.000000
abdul-razak,24,24.074074,Manchester City,0.000000,180.0,England,0.333333,Abdul Razak,9,0.568812,0.384265,0.000000,1.00,0.222222,78.0,0.444444,0.000000,0.055556,0.000000
abelaziz-barrada,27,22.790358,Getafe CF,0.164706,185.0,Spain,0.376471,Abelaziz Barrada,11,1.093126,0.777826,0.011765,0.00,0.294118,78.0,0.329412,0.010417,0.129412,0.011765
abou-diaby,30,21.220009,Arsenal FC,0.100962,188.0,England,0.230769,Abou Diaby,9,1.248933,1.060750,0.009615,0.75,0.245192,75.0,0.524038,0.008621,0.115385,0.000000
adam-bodzek,31,23.334121,Fortuna Düsseldorf,0.042654,184.0,Germany,0.284360,Adam Bodzek,1,1.265875,1.268367,0.000000,0.25,0.298578,79.0,0.417062,0.000000,0.312796,0.004739
adam-federici,31,25.464011,Reading FC,0.004854,188.0,England,0.334951,Adam Federici,2,0.930228,0.834401,0.000000,0.00,0.262136,90.0,0.402913,0.000000,0.019417,0.000000


In [39]:
y = players['skinColor']
features = [
#    'games',
    'goalRatio',
    'age',
    'bmi',
    'height',
    'weight',
    'yellowRate',
    'yellowRedsRate',
    'redRate',
    'yellowOverRed', 
    'winRate',
    'loseRate',
    'tiesRate',
    'racism1',
    'racism2',
    'position',
]
X = players[features]

In [40]:
# generate the final test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=4)

In [41]:
def genLabels(s):
    return np.asarray(s, dtype="|S6")

rf = ensemble.RandomForestClassifier(max_depth = None, min_samples_split = 15, random_state = 42)
rf.fit(X_train, genLabels(y_train))
res = rf.predict(X_test) == genLabels(y_test)
len(list(filter(lambda x: x, res)))/len(res)


0.45454545454545453

In [361]:
list(df['birthday'][[1]])[0]

Timestamp('1982-08-01 00:00:00')