# Imports 

In [2]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
from dateutil.relativedelta import relativedelta
import pickle

import seaborn as sns
from functools import reduce
from sklearn.cross_validation import train_test_split
from sklearn import ensemble

import math

In [3]:
try:
    df = pickle.load(df, open( "data.pickle", "wb" ))
except:
    df = pd.read_csv('CrowdstormingDataJuly1st.csv', parse_dates = ['birthday'])
    pickle.dump(df, open( "data.pickle", "wb" ))

# Cleaning

## Drop referees who are present in less than 22 diyads

In [4]:
ref_count = df.refNum.value_counts()
refs = ref_count[ref_count>21]
df=df[df['refNum'].isin(refs.index.values)]

In [5]:
# We first drop players that don't have a skin rating (caused by the absence of photos)
print ('we drop ' + str(round(100*df[df.photoID.isnull()].count()[1]/ df.count()[1], 2)) + '% of observation because they don\'t have a picture')

df = df[df.photoID.notnull()]
print(reduce(lambda x,y: x or y, df['rater1'].isnull()))
print(reduce(lambda x,y: x or y, df['rater2'].isnull())) # all ratings exist

we drop 14.71% of observation because they don't have a picture
False
False


In [6]:
# abs(df['rater1'] - df['rater2'])
c = sns.color_palette()
jitter_x = np.random.normal(0, 0.04, size=len(df.rater1))
jitter_y = np.random.normal(0, 0.04, size=len(df.rater2))
# For the time being it runs too long when we rerun everything :P.
#sns.jointplot(df.rater1 + jitter_x, df.rater2 + jitter_y, kind='kde')


As can be observed in the jointplot above, the only real difference in rating is between the 2 lightests colors). Since we cannot determine the skin color of a player where the 2 raters are different, we drop those observations.

In [7]:
# drop all players (so rows) that don't have the same skin color rating
print ('we drop ' + str(round(100*df[df['rater1'] != df['rater2']].count()[1] / df.count()[1], 2)) + '% of observation because they don\'t have the same rating')
df = df[df['rater1'] == df['rater2']]

we drop 23.2% of observation because they don't have the same rating


In [8]:
df.apply(lambda x: x.isnull(), axis=0).sum()

playerShort         0
player              0
club                0
leagueCountry       0
birthday            0
height             23
weight            591
position         6148
games               0
victories           0
ties                0
defeats             0
goals               0
yellowCards         0
yellowReds          0
redCards            0
photoID             0
rater1              0
rater2              0
refNum              0
refCountry          0
Alpha_3             0
meanIAT            83
nIAT               83
seIAT              83
meanExp            83
nExp               83
seExp              83
dtype: int64

We see that there are a few missing values. We can simply drop the players that don't have an height or weight (not many).
Since there are a lot of players without a position we create a category for them.

In [9]:
# drop players without reported height
df = df[df['height'].notnull()]
# drop players without reported weight
df = df[df['weight'].notnull()]

# create a noPosition category for players without a position
df.loc[df['position'].isnull(), 'position'] = 'noPosition'


In [10]:
df.groupby('playerShort').agg(lambda x: len(set(x)) == 1)

Unnamed: 0_level_0,player,club,leagueCountry,birthday,height,weight,position,games,victories,ties,...,rater2,refNum,refCountry,Alpha_3,meanIAT,nIAT,seIAT,meanExp,nExp,seExp
playerShort,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
aaron-lennon,True,True,True,True,1.0,1.0,True,False,False,False,...,1.0,False,False,False,0.0,0.0,0.0,0.0,0.0,0.0
aaron-ramsey,True,True,True,True,1.0,1.0,True,False,False,False,...,1.0,False,False,False,0.0,0.0,0.0,0.0,0.0,0.0
abdelhamid-el-kaoutari,True,True,True,True,1.0,1.0,True,False,False,False,...,1.0,False,False,False,0.0,0.0,0.0,0.0,0.0,0.0
abdou-traore_2,True,True,True,True,1.0,1.0,True,False,False,False,...,1.0,False,False,False,0.0,0.0,0.0,0.0,0.0,0.0
abdoulwhaid-sissoko,True,True,True,True,1.0,1.0,True,False,False,False,...,1.0,False,False,False,0.0,0.0,0.0,0.0,0.0,0.0
abdul-razak,True,True,True,True,1.0,1.0,True,False,False,False,...,1.0,False,False,False,0.0,0.0,0.0,0.0,0.0,0.0
abelaziz-barrada,True,True,True,True,1.0,1.0,True,False,False,False,...,1.0,False,False,False,0.0,0.0,0.0,0.0,0.0,0.0
abou-diaby,True,True,True,True,1.0,1.0,True,False,False,False,...,1.0,False,False,False,0.0,0.0,0.0,0.0,0.0,0.0
adam-bodzek,True,True,True,True,1.0,1.0,True,False,False,False,...,1.0,False,True,True,1.0,1.0,1.0,1.0,1.0,1.0
adam-federici,True,True,True,True,1.0,1.0,True,False,False,False,...,1.0,False,False,False,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
df.groupby('playerShort').agg(lambda x: len(set(x)) == 1).apply(lambda col: reduce(lambda x, y: x and y, col), axis=0)

player            True
club              True
leagueCountry     True
birthday          True
height               1
weight               1
position          True
games            False
victories        False
ties             False
defeats          False
goals            False
yellowCards      False
yellowReds       False
redCards         False
photoID           True
rater1               1
rater2               1
refNum           False
refCountry       False
Alpha_3          False
meanIAT              0
nIAT                 0
seIAT                0
meanExp              0
nExp                 0
seExp                0
dtype: object

As we can see, the club, league country and position stay constant so we can aggregate them.

We just check if it's also correct for height and weight:

In [12]:
df[['playerShort', 'height', 'weight']].groupby('playerShort').agg(lambda x: len(set(x)) == 1).mean()

height    1.0
weight    1.0
dtype: float64

# Feature generation

We aggregate the information on the player level (making the assumption that there is always only one referee so that games are not duplicated in the aggregation)

In [13]:
# OISFOGFJDGHJKDFHGOEWOIGIWEOIGHWEOIHGOHFDHGLJDFGJDFJG DONT LOOK AT IT
print(df.keys())
print(df.position.value_counts())

Index(['playerShort', 'player', 'club', 'leagueCountry', 'birthday', 'height',
       'weight', 'position', 'games', 'victories', 'ties', 'defeats', 'goals',
       'yellowCards', 'yellowReds', 'redCards', 'photoID', 'rater1', 'rater2',
       'refNum', 'refCountry', 'Alpha_3', 'meanIAT', 'nIAT', 'seIAT',
       'meanExp', 'nExp', 'seExp'],
      dtype='object')
Center Back             14905
Center Forward          11577
Defensive Midfielder     9992
Goalkeeper               7314
Attacking Midfielder     7004
Left Fullback            6137
noPosition               5997
Right Fullback           5864
Left Midfielder          4003
Center Midfielder        3728
Right Midfielder         3665
Left Winger              3599
Right Winger             3068
Name: position, dtype: int64


In [14]:
positionArray = list(df['position'].unique())
df['positionNum'] = list(map(positionArray.index, df['position']))
# type(df['positionNum'])

countryArray = list(df['leagueCountry'].unique())
df['countryNum'] = list(map(countryArray.index, df['leagueCountry']))

In [15]:
def genFeatures(player):
    sample = player.iloc(0)[0] # used to get general infos
    
    age = relativedelta(datetime.datetime.now(), sample['birthday']).years
    
    bmi = sample['weight'] / pow(sample['height']/100, 2)
    games = player['games'].sum()
    winRate = player['victories'].sum() / games
    tiesRate = player['ties'].sum() / games
    loseRate = player['defeats'].sum() / games
    yellowRedsRate = player['yellowReds'].sum() / games
    yellowRate = player['yellowCards'].sum() / games
    redRate = player['redCards'].sum() / games
    IAT = (player['meanExp'] * player['games']).sum()/games
    Exp = (player['meanIAT'] * player['games']).sum()/games
    IAT2 = player['meanExp'].mean()
    Exp2 = player['meanIAT'].mean()
    seIAT = math.sqrt((player['nIAT'] * pow(player['seIAT'],2)).sum()) / (player['nIAT'].sum())
    seExp = math.sqrt((player['nExp'] * pow(player['seExp'],2)).sum()) / (player['nExp'].sum())

    yellowRateRacistIAT = (player['yellowCards'] * (player['meanIAT'] > 0.35)).sum() / (1e-3+(player['games'] * (player['meanIAT'] > 0.35)).sum())
    yellowRateNonRacistIAT = (player['yellowCards'] * (player['meanIAT'] < 0.35)).sum() / (1e-3+(player['games'] * (player['meanIAT'] < 0.35)).sum())
    racismIAT = yellowRateRacistIAT / (1e-3+yellowRateNonRacistIAT)
    yellowRateRacistExp = (player['yellowCards'] * (player['meanExp'] > 0.5)).sum() / (1e-3+(player['games'] * (player['meanExp'] > 0.5)).sum())
    yellowRateNonRacistExp = (player['yellowCards'] * (player['meanExp'] < 0.5)).sum() / (1e-3+(player['games'] * (player['meanExp'] < 0.5)).sum())
    racismExp = yellowRateRacistExp / (1e-3+yellowRateNonRacistExp)
    
    position = sample['positionNum']
    
    
        
    goalRatio = player['goals'].sum() / games
    
    # TODO some feature with the goal ratio AND offensive position
    
    return pd.Series({
            'player': sample['player'], # constant
            'age': age,
            'height': sample['height'],
            'weight': sample['weight'], 
            'bmi': bmi,
            'country': sample['countryNum'],
            'games': games,
            'winRate': winRate,
            'tiesRate': tiesRate,
            'loseRate': loseRate,
            'yellowRate': yellowRate,
            'yellowRedsRate': yellowRedsRate,
            'yellowOverRed': redRate/(yellowRate+1e-3),
            'redRate': redRate,
            'skinColor': sample['rater1'], # we only keep players where rater1 == rater2
            'goalRatio': goalRatio,
            'IAT': IAT,
            'Exp': Exp,
            'IAT2': IAT2,
            'Exp2': Exp2,
            'seIAT': seIAT,
            'seExp': seExp,
            'racismIAT': racismIAT,
            'racismExp': racismExp,
            'position': position,
            })

players = df.groupby('playerShort').apply(genFeatures)
players


Unnamed: 0_level_0,Exp,Exp2,IAT,IAT2,age,bmi,country,games,goalRatio,height,...,redRate,seExp,seIAT,skinColor,tiesRate,weight,winRate,yellowOverRed,yellowRate,yellowRedsRate
playerShort,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
aaron-lennon,0.332389,0.345893,0.399459,0.491482,29,23.140496,0,412,0.075243,165.0,...,0.000000,3.239259e-07,5.878861e-08,0.25,0.235437,63.0,0.485437,0.000000,0.026699,0.000000
aaron-ramsey,0.336386,0.346790,0.430197,0.511650,25,23.986870,0,254,0.153543,178.0,...,0.003937,2.068484e-07,3.716598e-08,0.00,0.165354,76.0,0.570866,0.031996,0.122047,0.000000
abdelhamid-el-kaoutari,0.331882,0.331600,0.328895,0.335587,26,22.530864,3,124,0.008065,180.0,...,0.016129,5.330254e-06,1.476342e-06,0.25,0.322581,73.0,0.330645,0.246184,0.064516,0.032258
abdou-traore_2,0.327985,0.320079,0.317247,0.296562,28,22.839506,3,97,0.030928,180.0,...,0.000000,3.769416e-06,1.281884e-06,0.75,0.237113,74.0,0.422680,0.000000,0.113402,0.010309
abdoulwhaid-sissoko,0.344130,0.348178,0.402314,0.429630,26,20.987654,3,121,0.024793,180.0,...,0.016529,1.904677e-06,4.724571e-07,1.00,0.206612,68.0,0.280992,0.094692,0.173554,0.000000
abdul-razak,0.346396,0.348507,0.445206,0.454716,24,24.074074,0,31,0.000000,180.0,...,0.000000,9.158161e-07,1.481080e-07,1.00,0.193548,78.0,0.483871,0.000000,0.032258,0.000000
abelaziz-barrada,0.359727,0.346365,0.516785,0.422798,27,22.790358,2,81,0.135802,185.0,...,0.012346,1.737975e-05,3.451025e-06,0.00,0.296296,78.0,0.320988,0.090245,0.135802,0.012346
abou-diaby,0.331986,0.338614,0.392714,0.431211,30,21.220009,0,207,0.101449,188.0,...,0.009662,2.724043e-07,6.199085e-08,0.75,0.241546,75.0,0.526570,0.082621,0.115942,0.000000
adam-bodzek,0.336628,0.336628,0.335967,0.335967,31,23.334121,1,207,0.038647,184.0,...,0.000000,3.495117e-07,8.591968e-08,0.25,0.304348,79.0,0.410628,0.000000,0.318841,0.004831
adam-federici,0.327947,0.329913,0.362800,0.372730,31,25.464011,0,205,0.004878,188.0,...,0.000000,2.236042e-08,5.218539e-09,0.00,0.263415,90.0,0.400000,0.000000,0.019512,0.000000


In [16]:
y = players['skinColor']
features = [
    'games',
    'goalRatio',
    'country',
    'age',
    'bmi',
    'height',
    'weight',
    'yellowRate',
    'yellowRedsRate',
    'redRate',
    'yellowOverRed', 
    'winRate',
    'loseRate',
    'tiesRate',
    'IAT',
    'Exp',
    'IAT2',
    'Exp2',
    'seIAT',
    'seExp',
    'racismIAT',
    'racismExp',
    'position',
]
X = players[features]

In [17]:
# generate the final test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

In [18]:
def genLabels(s):
    return np.asarray(s, dtype="|S6")

rf = ensemble.RandomForestClassifier(n_estimators = 500, max_depth = None, min_samples_split = 50) #, random_state = 0)
rf.fit(X_train, genLabels(y_train))
res = rf.predict(X_test) == genLabels(y_test)
print(len(list(filter(lambda x: x, res)))/len(res))

rf.feature_importances_

0.4107744107744108


array([ 0.04906736,  0.05564179,  0.05909796,  0.04553132,  0.05031732,
        0.04150692,  0.03292075,  0.04870048,  0.02252045,  0.02895788,
        0.02748917,  0.03656548,  0.03813995,  0.04892688,  0.06344124,
        0.07680152,  0.06534222,  0.07478335,  0.0110711 ,  0.04291628,
        0.03145496,  0.02995993,  0.0188457 ])