# Imports 

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
from dateutil.relativedelta import relativedelta
import pickle

import seaborn as sns
from functools import reduce
from sklearn.cross_validation import train_test_split
from sklearn import ensemble
from sklearn import metrics

import math

from sklearn import cross_validation

In [2]:
try:
    df = pickle.load(df, open( "data.pickle", "wb" ))
except:
    df = pd.read_csv('CrowdstormingDataJuly1st.csv', parse_dates = ['birthday'])
    pickle.dump(df, open( "data.pickle", "wb" ))

# Cleaning

## Drop referees who are present in less than 22 diyads

In [4]:
# ref_count = df.refNum.value_counts()
# refs = ref_count[ref_count>21]
# df=df[df['refNum'].isin(refs.index.values)]

In [3]:
# We first drop players that don't have a skin rating (caused by the absence of photos)
print ('we drop ' + str(round(100*df[df.photoID.isnull()].count()[1]/ df.count()[1], 2)) + '% of observation because they don\'t have a picture')

df = df[df.photoID.notnull()]
print(reduce(lambda x,y: x or y, df['rater1'].isnull()))
print(reduce(lambda x,y: x or y, df['rater2'].isnull())) # all ratings exist

we drop 14.66% of observation because they don't have a picture
False
False


In [6]:
# abs(df['rater1'] - df['rater2'])
# c = sns.color_palette()
# jitter_x = np.random.normal(0, 0.04, size=len(df.rater1))
# jitter_y = np.random.normal(0, 0.04, size=len(df.rater2))
# For the time being it runs too long when we rerun everything :P.
#sns.jointplot(df.rater1 + jitter_x, df.rater2 + jitter_y, kind='kde')


As can be observed in the jointplot above, the only real difference in rating is between the 2 lightests colors). Since we cannot determine the skin color of a player where the 2 raters are different, we drop those observations.

In [7]:
# drop all players (so rows) that don't have the same skin color rating
# print ('we drop ' + str(round(100*df[df['rater1'] != df['rater2']].count()[1] / df.count()[1], 2)) + '% of observation because they don\'t have the same rating')
# df = df[df['rater1'] == df['rater2']]

In [4]:
df.apply(lambda x: x.isnull(), axis=0).sum()

playerShort         0
player              0
club                0
leagueCountry       0
birthday            0
height             46
weight            753
position         8461
games               0
victories           0
ties                0
defeats             0
goals               0
yellowCards         0
yellowReds          0
redCards            0
photoID             0
rater1              0
rater2              0
refNum              0
refCountry          0
Alpha_3             1
meanIAT           153
nIAT              153
seIAT             153
meanExp           153
nExp              153
seExp             153
dtype: int64

We see that there are a few missing values. We can simply drop the players that don't have an height or weight (not many).
Since there are a lot of players without a position we create a category for them.

In [5]:
# drop players without reported height
df = df[df['height'].notnull()]
# drop players without reported weight
df = df[df['weight'].notnull()]

# create a noPosition category for players without a position
df.loc[df['position'].isnull(), 'position'] = 'noPosition'


In [10]:
df.groupby('playerShort').agg(lambda x: len(set(x)) == 1)

Unnamed: 0_level_0,player,club,leagueCountry,birthday,height,weight,position,games,victories,ties,...,rater2,refNum,refCountry,Alpha_3,meanIAT,nIAT,seIAT,meanExp,nExp,seExp
playerShort,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
aaron-hughes,True,True,True,True,1.0,1.0,True,False,False,False,...,1.0,False,False,False,0.0,0.0,0.0,0.0,0.0,0.0
aaron-hunt,True,True,True,True,1.0,1.0,True,False,False,False,...,1.0,False,False,False,0.0,0.0,0.0,0.0,0.0,0.0
aaron-lennon,True,True,True,True,1.0,1.0,True,False,False,False,...,1.0,False,False,False,0.0,0.0,0.0,0.0,0.0,0.0
aaron-ramsey,True,True,True,True,1.0,1.0,True,False,False,False,...,1.0,False,False,False,0.0,0.0,0.0,0.0,0.0,0.0
abdelhamid-el-kaoutari,True,True,True,True,1.0,1.0,True,False,False,False,...,1.0,False,False,False,0.0,0.0,0.0,0.0,0.0,0.0
abdou-traore_2,True,True,True,True,1.0,1.0,True,False,False,False,...,1.0,False,False,False,0.0,0.0,0.0,0.0,0.0,0.0
abdoulaye-diallo_2,True,True,True,True,1.0,1.0,True,False,False,False,...,1.0,False,False,False,0.0,0.0,0.0,0.0,0.0,0.0
abdoulaye-keita_2,True,True,True,True,1.0,1.0,True,False,True,False,...,1.0,False,False,False,0.0,0.0,0.0,0.0,0.0,0.0
abdoulwhaid-sissoko,True,True,True,True,1.0,1.0,True,False,False,False,...,1.0,False,False,False,0.0,0.0,0.0,0.0,0.0,0.0
abdul-rahman-baba,True,True,True,True,1.0,1.0,True,False,False,False,...,1.0,False,False,False,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
df.groupby('playerShort').agg(lambda x: len(set(x)) == 1).apply(lambda col: reduce(lambda x, y: x and y, col), axis=0)

player            True
club              True
leagueCountry     True
birthday          True
height               1
weight               1
position          True
games            False
victories        False
ties             False
defeats          False
goals            False
yellowCards      False
yellowReds       False
redCards         False
photoID           True
rater1               1
rater2               1
refNum           False
refCountry       False
Alpha_3          False
meanIAT              0
nIAT                 0
seIAT                0
meanExp              0
nExp                 0
seExp                0
dtype: object

As we can see, the club, league country and position stay constant so we can aggregate them.

We just check if it's also correct for height and weight:

In [6]:
df[['playerShort', 'height', 'weight']].groupby('playerShort').agg(lambda x: len(set(x)) == 1).mean()

height    1.0
weight    1.0
dtype: float64

# Feature generation

We aggregate the information on the player level (making the assumption that there is always only one referee so that games are not duplicated in the aggregation)

In [13]:
# OISFOGFJDGHJKDFHGOEWOIGIWEOIGHWEOIHGOHFDHGLJDFGJDFJG DONT LOOK AT IT
print(df.keys())
print(df.position.value_counts())

Index(['playerShort', 'player', 'club', 'leagueCountry', 'birthday', 'height',
       'weight', 'position', 'games', 'victories', 'ties', 'defeats', 'goals',
       'yellowCards', 'yellowReds', 'redCards', 'photoID', 'rater1', 'rater2',
       'refNum', 'refCountry', 'Alpha_3', 'meanIAT', 'nIAT', 'seIAT',
       'meanExp', 'nExp', 'seExp'],
      dtype='object')
Center Back             20534
Center Forward          16612
Defensive Midfielder    13924
Goalkeeper              11015
Attacking Midfielder    10779
Left Fullback            8980
Right Fullback           8426
noPosition               8265
Left Midfielder          6214
Right Winger             5146
Center Midfielder        4835
Right Midfielder         4637
Left Winger              4501
Name: position, dtype: int64


In [7]:
positionArray = list(df['position'].unique())
df['positionNum'] = list(map(positionArray.index, df['position']))
# type(df['positionNum'])

countryArray = list(df['leagueCountry'].unique())
df['countryNum'] = list(map(countryArray.index, df['leagueCountry']))

In [8]:
def genFeatures(player):
    sample = player.iloc(0)[0] # used to get general infos
    
    age = relativedelta(datetime.datetime.now(), sample['birthday']).years
    
    bmi = sample['weight'] / pow(sample['height']/100, 2)
    games = player['games'].sum()
    winRate = player['victories'].sum() / games
    tiesRate = player['ties'].sum() / games
    loseRate = player['defeats'].sum() / games
    yellowRedsRate = player['yellowReds'].sum() / games
    yellowRate = player['yellowCards'].sum() / games
    redRate = player['redCards'].sum() / games
    IAT = (player['meanExp'] * player['games']).sum()/games
    Exp = (player['meanIAT'] * player['games']).sum()/games
    IAT2 = player['meanExp'].mean()
    Exp2 = player['meanIAT'].mean()
    seIAT = math.sqrt((player['nIAT'] * pow(player['seIAT'],2)).sum()) / (player['nIAT'].sum())
    seExp = math.sqrt((player['nExp'] * pow(player['seExp'],2)).sum()) / (player['nExp'].sum())

    yellowRateRacistIAT = (player['yellowCards'] * (player['meanIAT'] > 0.35)).sum() / (1e-3+(player['games'] * (player['meanIAT'] > 0.35)).sum())
    yellowRateNonRacistIAT = (player['yellowCards'] * (player['meanIAT'] < 0.35)).sum() / (1e-3+(player['games'] * (player['meanIAT'] < 0.35)).sum())
    racismIAT = yellowRateRacistIAT / (1e-3+yellowRateNonRacistIAT)
    yellowRateRacistExp = (player['yellowCards'] * (player['meanExp'] > 0.5)).sum() / (1e-3+(player['games'] * (player['meanExp'] > 0.5)).sum())
    yellowRateNonRacistExp = (player['yellowCards'] * (player['meanExp'] < 0.5)).sum() / (1e-3+(player['games'] * (player['meanExp'] < 0.5)).sum())
    racismExp = yellowRateRacistExp / (1e-3+yellowRateNonRacistExp)
    
    position = sample['positionNum']
    
    
        
    goalRatio = player['goals'].sum() / games
    
    # TODO some feature with the goal ratio AND offensive position
    
    return pd.Series({
            'player': sample['player'], # constant
            'age': age,
            'height': sample['height'],
            'weight': sample['weight'], 
            'bmi': bmi,
            'country': sample['countryNum'],
            'games': games,
            'winRate': winRate,
            'tiesRate': tiesRate,
            'loseRate': loseRate,
            'yellowRate': yellowRate,
            'yellowRedsRate': yellowRedsRate,
            'yellowOverRed': redRate/(yellowRate+1e-3),
            'redRate': redRate,
            'skinColor': (sample['rater1'] + sample['rater2'])/2, # we only keep players where rater1 == rater2
            'goalRatio': goalRatio,
            'IAT': IAT,
            'Exp': Exp,
            'IAT2': IAT2,
            'Exp2': Exp2,
            'seIAT': seIAT,
            'seExp': seExp,
            'racismIAT': racismIAT,
            'racismExp': racismExp,
            'position': position,
            })

players = df.groupby('playerShort').apply(genFeatures)
players


Unnamed: 0_level_0,Exp,Exp2,IAT,IAT2,age,bmi,country,games,goalRatio,height,...,redRate,seExp,seIAT,skinColor,tiesRate,weight,winRate,yellowOverRed,yellowRate,yellowRedsRate
playerShort,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
aaron-hughes,0.333195,0.346459,0.400637,0.494575,37,21.434609,2,654,0.013761,182.0,...,0.000000,4.155690e-07,6.680331e-08,0.125,0.273700,71.0,0.377676,0.000000,0.029052,0.000000
aaron-hunt,0.341438,0.348818,0.380811,0.449220,30,21.798202,3,336,0.184524,183.0,...,0.002976,1.630653e-07,3.620245e-08,0.125,0.217262,73.0,0.419643,0.023621,0.125000,0.000000
aaron-lennon,0.332389,0.345893,0.399459,0.491482,29,23.140496,2,412,0.075243,165.0,...,0.000000,3.239259e-07,5.878861e-08,0.250,0.235437,63.0,0.485437,0.000000,0.026699,0.000000
aaron-ramsey,0.336638,0.346821,0.433294,0.514693,25,23.986870,2,260,0.150000,178.0,...,0.003846,4.289817e-07,1.092870e-07,0.000,0.161538,76.0,0.576923,0.031990,0.119231,0.000000
abdelhamid-el-kaoutari,0.331882,0.331600,0.328895,0.335587,26,22.530864,1,124,0.008065,180.0,...,0.016129,5.330254e-06,1.476342e-06,0.250,0.322581,73.0,0.330645,0.246184,0.064516,0.032258
abdou-traore_2,0.327985,0.320079,0.317247,0.296562,28,22.839506,1,97,0.030928,180.0,...,0.000000,3.769416e-06,1.281884e-06,0.750,0.237113,74.0,0.422680,0.000000,0.113402,0.010309
abdoulaye-diallo_2,0.343556,0.341625,0.428271,0.400818,24,22.395790,1,24,0.000000,189.0,...,0.000000,3.439109e-06,8.437459e-07,0.875,0.333333,80.0,0.333333,0.000000,0.000000,0.000000
abdoulaye-keita_2,0.348498,0.355406,0.390184,0.417225,26,23.483477,1,3,0.000000,188.0,...,0.000000,2.207109e-05,5.478587e-06,0.875,0.333333,83.0,0.000000,0.000000,0.000000,0.000000
abdoulwhaid-sissoko,0.344130,0.348178,0.402314,0.429630,26,20.987654,1,121,0.024793,180.0,...,0.016529,1.904677e-06,4.724571e-07,1.000,0.206612,68.0,0.280992,0.094692,0.173554,0.000000
abdul-rahman-baba,0.339733,0.342072,0.341395,0.361068,22,21.847009,3,50,0.000000,179.0,...,0.020000,1.349087e-06,2.550173e-07,0.875,0.160000,70.0,0.340000,0.327869,0.060000,0.000000


In [9]:
y = players['skinColor'] >= 0.5 # 2 classes, {0, 0.25}-> white (no racial bias) and {0.5, 0.75, 1} -> black (racial bias)
features = [
    'games',
    'goalRatio',
    'country',
    'age',
    'bmi',
    'height',
    'weight',
    'yellowRate',
    'yellowRedsRate',
    'redRate',
    'yellowOverRed', 
    'winRate',
    'loseRate',
    'tiesRate',
    'IAT',
    'Exp',
    'IAT2',
    'Exp2',
    'seIAT',
    'seExp',
    'racismIAT',
    'racismExp',
    'position',
]
X = players[features]

In [10]:
# generate the final test set
# we stratify so that we have the same class proportion in both train and test set
X_w, X_val, y_w, y_val = train_test_split(X, y, test_size=0.25, random_state=0, stratify = y)
X_w = np.asanyarray(X_w)

In [11]:
# first element of the tuple is the accuracy, second is F1
def scorer(estimator, X, y):
    pred = estimator.predict(X)
    A = metrics.accuracy_score(y, pred)
#     print(metrics.classification_report(y, pred))
    F1 = metrics.f1_score(y, pred)
    return (A, F1)

def CVstep(model, X, y, train_index, test_index, scorer):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    estimator = model.fit(X_train, y_train)
    return scorer(estimator, X_test, y_test)

# takes an array of models and return their cross-validation results for each split
# uses the same splits for each model so they can be directly compared
def CV(modelData, scorer, folds):
    kf = cross_validation.KFold(len(X_w), n_folds = folds)
    return [[CVstep(model, X, y, train_index, test_index, scorer) for model, X, y in modelData] for train_index, test_index in kf]


In [12]:
# balanced so that we do not neglect black values and only predict white

# gini vs entropy comparison
rf = ensemble.RandomForestClassifier(n_estimators = 100, max_depth = None, min_samples_split = 20, criterion = 'gini', class_weight = 'balanced')
rf2 = ensemble.RandomForestClassifier(n_estimators = 100, max_depth = None, min_samples_split = 20, criterion = 'entropy', class_weight = 'balanced')


# cross-validation (average over 5 different 10-fold splits to smooth the result)
scores = np.concatenate(np.asarray([CV([[rf, X_w, y_w], [rf2, X_w, y_w]], scorer, 10) for i in range(0, 5)]))



#     cross_validation.cross_val_score (rf, X_train, y_train, scoring = scorer, cv = 10)
#     score.mean()

SyntaxError: invalid syntax (<ipython-input-12-1310f0b17f85>, line 9)

In [151]:
print(scores[:, 0, 0].mean()) # acc of first model
print(scores[:, 1, 0].mean()) # acc of second model

print(scores[:, 0, 1].mean()) # f1 of first model
print(scores[:, 1, 1].mean()) # f1 of second model

0.770876430537
0.773930175286
0.470876058895
0.463977547874


We don't see a real difference between the 2

In [146]:
rf.fit(X_w, y_w)
pred = rf.predict(X_val)
A = metrics.accuracy_score(y_val, pred)
print('A = ' + str(A))
F1 = metrics.f1_score(y_val, pred)
print('F1 = ' + str(F1))

print(metrics.classification_report(y_val, pred))



# rf.feature_importances_

A = 0.789743589744
F1 = 0.534090909091
             precision    recall  f1-score   support

      False       0.85      0.88      0.86       296
       True       0.57      0.50      0.53        94

avg / total       0.78      0.79      0.78       390

