# Exercise 4a
## 3 Red Cards Study

In [121]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd

### 3.1  Loading and Cleaning the Data

In [122]:
#load data with pandas
dataDyad=pd.read_csv('data/CrowdstormingDataJuly1st.csv')

In [123]:
#example
dataDyad.ix[[1200]].T

Unnamed: 0,1200
playerShort,toni-kroos
player,Toni Kroos
club,Bayern München
leagueCountry,Germany
birthday,04.01.1990
height,182
weight,78
position,Attacking Midfielder
games,1
victories,1


All features for Toni Kroos - ref 66 - dyad. The column <tt>games</tt> stands for the number of games in the player-referee dyad

We don't need all the features for our purposes. We can drop the features like <tt>player</tt>, <tt>club</tt>, <tt>height</tt>, <tt>yellowCards</tt> and <tt>photoID</tt>. <tt>yellowReds</tt> also gets dropped since it is not the same as a red card.

From <tt>Alpha_3</tt> to <tt>seExp</tt> the features are referee only. Since in the next step we will restructure the data set in a way that it is not dyad based anymore but playerbased. This step can be made because the question was to answer if "players with dark skin [are more likely to get red cards] than [...] players with light skin". Also in the next part we want to compute the fraction of games where one player gets a red card.

In [124]:
dataDyad.drop(dataDyad.columns[[1,2,5,6,7,9,10,11,12,13,14,16,19,20,21,22,23,24,25,26,27]],1,inplace=True)

In [125]:
#now the data looks likt this:
dataDyad.ix[[1200]].T

Unnamed: 0,1200
playerShort,toni-kroos
leagueCountry,Germany
birthday,04.01.1990
games,1
redCards,0
rater1,0
rater2,0


In [126]:
#combine all instances of one player into one entry. Add all cards and games in the process
players=np.unique(dataDyad['playerShort']) #all the player names
data = dataDyad.groupby(dataDyad['playerShort']).agg({'playerShort':'first','leagueCountry':'first', 'birthday':'first',
                                                      'games': 'sum', 'redCards': 'sum','rater1':'first','rater2':'first'})

In [127]:
#transform birthday to age:
#season: 2012-2013, so at the end it was 2013
data['age'] = data['birthday'].apply(lambda x:2013- int(str(x)[-4:]))
#we can now drop the birthday column
data.drop('birthday',1,inplace=True)

In [128]:
data.ix[:19,['rater1','rater2']].T

playerShort,aaron-hughes,aaron-hunt,aaron-lennon,aaron-ramsey,abdelhamid-el-kaoutari,abdon-prats,abdou-dampha,abdou-traore_2,abdoul-camara,abdoulaye-diallo_2,abdoulaye-diallo_3,abdoulaye-keita_2,abdoulaye-sane,abdoulwhaid-sissoko,abdul-rahman-baba,abdul-razak,abel-aguilar,abel-khaled,abelaziz-barrada
rater1,0.25,0.0,0.25,0.0,0.25,,,0.75,,0.75,,0.75,,1.0,0.75,1.0,0.5,,0.0
rater2,0.0,0.25,0.25,0.0,0.25,,,0.75,,1.0,,1.0,,1.0,1.0,1.0,0.25,,0.0


In [129]:
np.mean(abs(data.ix[:,'rater1']-data.ix[:,'rater2'])),np.var(abs(data.ix[:,'rater1']-data.ix[:,'rater2']))

(0.06009463722397476, 0.011570022589537095)

I've looked at a small cut of the data set and the two raters do disagree occasionally by $0.25$. Bhe overall disagreement is relativley low with $0.060\pm0.012$

In [130]:
np.count_nonzero(np.isnan(data.ix[:,'rater1']))/len(data.ix[:,'rater1'])

0.22795908426692646

$\Rightarrow$ So around $22\%$ of the instances don't have a picture attached to them. All those instances don't help our case so we will drop them.

In [131]:
#drop dyads with no skin color rating
data.drop(data.index[np.where(np.isnan(data['rater1']))],inplace=True)

In [132]:
leagues=np.unique(data.ix[:,'leagueCountry'])
print(leagues)

['England' 'France' 'Germany' 'Spain']


There are only the four leagues above in the data set. So a One-Hot could be the following:


|                     | England | France | Germany | Spain |
|---------------------|---------|--------|---------|-------|
| player from England | 1       | 0      | 0       | 0     |
| player from France  | 0       | 1      | 0       | 0     |
| player from Germany | 0       | 0      | 1       | 0     |
| player from Spain   | 0       | 0      | 0       | 1     |

We add to each player these columns and fill it with respect to his league Country

In [133]:
#create column for each country, set 1 if player is playing in that league and 0 if not
for country in leagues:
    data[country]=data['leagueCountry'].apply(lambda x:int(country==x))
#we can now drop the leagueCountry column
data.drop('leagueCountry',1,inplace=True)

In [134]:
#calculate labels...
labels=data['redCards']/data['games']
#...and drop all card related columns
data.drop(data.columns[:3],1,inplace=True)

In [135]:
#centralization of the data:
data=data-np.mean(data)
labels=labels-np.mean(labels)

In [136]:
#that's how the data looks now
data.head()

Unnamed: 0_level_0,rater1,rater2,age,England,France,Germany,Spain
playerShort,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
aaron-hughes,-0.018612,-0.31041,6.899685,0.756467,-0.177918,-0.302208,-0.276341
aaron-hunt,-0.268612,-0.06041,-0.100315,-0.243533,-0.177918,0.697792,-0.276341
aaron-lennon,-0.018612,-0.06041,-1.100315,0.756467,-0.177918,-0.302208,-0.276341
aaron-ramsey,-0.268612,-0.31041,-4.100315,0.756467,-0.177918,-0.302208,-0.276341
abdelhamid-el-kaoutari,-0.018612,-0.06041,-4.100315,-0.243533,0.822082,-0.302208,-0.276341


### 3.2 Model Creation

#### Linear regression
$$\sum_{j=1}^DX_{ij}\beta_j=y_i,\ (\mathbb N \ni i\leq N)$$
$$\Rightarrow\mathbf X\mathbf \beta=\mathbf y,\ (\text{matrices})$$
$$\hat{\mathbf {\beta}}= \left(\mathbf{X^TX}\right)^{-1}\mathbf{X^T y}$$

In [137]:
class linearReg:       
    def train(slef,x,y):
        '''
        Takes centralized data and finds the best beta (Dx1) vector
        x: (NxD) matrix with data
        y: (Nx1) vector with labels
        '''
        #centralize data:
        x=x-np.mean(x)
        y=y-np.mean(y)
        #calculate the moore penrose pseudo inverse:
        xPlus=np.linalg.pinv(x)
        self.beta=xPlus@y
        
    def predict(self,x):
        return self.beta@x

#### Regression tree

In [138]:
class Node:
    pass

class Tree:
    def __init__(self):
        self.root = Node()
    
    def find_leaf(self, x):
        node = self.root
        while hasattr(node, "feature"):
            j = node.feature
            if x[j] <= node.threshold:
                node = node.left
            else:
                node = node.right
        return node

In [139]:
#since our solution to the 4th exercise was not working we took the solution from moodle and adjusted it 
class RegressionTree(Tree):
    def __init__(self):
        super(RegressionTree, self).__init__()
        
    def train(self, data, labels, n_min=20):
        '''
        data: the feature matrix for all digits
        labels: the corresponding ground-truth responses
        n_min: termination criterion (don't split if a node contains fewer instances)
        '''
        N, D = data.shape
        D_try = int(np.sqrt(D)) # how many features to consider 

        # initialize the root node
        self.root.data = data
        self.root.labels = labels
        
        #put root in stack
        stack = [self.root]
        while len(stack):
            node = stack.pop()
            n = node.data.shape[0] # number of instances in present node
            if n >= n_min:
                # Call 'make_regression_split_node()' with 'D_try' randomly selected 
                # feature indices. This turns 'node' into a split node
                # and returns the two children, which must be placed on the 'stack'.
                
                perm = np.random.permutation(D)   # permute D indices
                left, right = make_regression_split_node(node, perm[:D_try]) #select :D_try of permuted indices
                                                       #for 'make_regression_split_node()'
                # put children in stack
                stack.append(left)
                stack.append(right)
            else:
                # Call 'make_regression_leaf_node()' to turn 'node' into a leaf node.
                make_regression_leaf_node(node)
                
    def predict(self, x):
        leaf = self.find_leaf(x)
        # compute p(y | x)
        return leaf.response 

In [140]:
def make_regression_split_node(node, feature_indices):
    '''
    node: the node to be split
    feature_indices: a numpy array of length 'D_try', containing the feature 
                     indices to be considered in the present split
    '''
    n, D = node.data.shape

    # find best feature j (among 'feature_indices') and best threshold t for the split
    e_min = 1e100
    j_min, t_min = 0, 0
    for j in feature_indices:
        # remove duplicate features
        dj = np.sort(np.unique(node.data[:,j]))
        # compute candidate thresholds in the middle between consecutive feature values
        tj = 0.5 * (dj[1:] + dj[:-1]) 
        # each candidate threshold we need to compute squared error of the resulting children node
        for t in tj:
            left_indices = node.data[:,j] <= t
            nl = np.sum(left_indices)
            ll = node.labels[left_indices]
            el = np.sum(np.square(ll-np.mean(ll)))/nl
            nr = n - nl
            lr = node.labels[node.data[:,j] > t]
            er = np.sum(np.square(lr-np.mean(lr)))/nl
            # choose the the best threshold that minimizes sum of the squared error.
            if el + er < e_min:
                e_min = el + er
                j_min = j
                t_min = t


    # create children
    left = Node()
    right = Node()
    
    # initialize 'left' and 'right' with the data subsets and labels
    # according to the optimal split found above
    left.data = node.data[node.data[:,j_min] <= t_min, :]
    left.labels = node.labels[node.data[:,j_min] <= t_min]
    right.data = node.data[node.data[:,j_min] > t_min, :]
    right.labels = node.labels[node.data[:,j_min] > t_min]

    # turn the current 'node' into a split node
    # (store children and split condition)
    node.left = left
    node.right = right
    node.feature = j_min
    node.threshold = t_min

    # return the children (to be placed on the stack)
    return left, right    

In [145]:
def make_regression_leaf_node(node):
    '''
    node: the node to become a leaf
    '''
    node.N = node.data.shape[0]
    if node.N ==0:
        node.response=0
    else:
        node.response = np.sum(node.labels) / node.N

In [146]:
#forest
class Forest:
    def __init__(self,n=10):
        # create n instances of Regression tree 
        self.trees = [RegressionTree() for i in range(n)]
    
    def train(self, data, target, n_min=20):
        # train all trees
        for i, tree in enumerate(self.trees):
            tree.train(data, target, n_min)
            
    def predict(self, x):
        # return the digit for the DensityTree that maximizes p(x | y) * p(y)
        return np.mean([tree.predict(x) for tree in self.trees])

In [147]:
#crossvalidation
X=np.array(data)
from sklearn.model_selection import train_test_split, cross_val_score
def cross_validation(methode,num_sample=10):
    """
    Measure the correct accuracy with cross validation
    methode: class Forest or linearReg or similar
    """
    mean_rate = np.zeros(num_sample)
    for i in range(num_sample):
        x_train, x_test, y_train, y_test = train_test_split(X,labels, test_size=0.33, random_state=None)
        cl=None
        f=methode()
        f.train(x_train,y_train)
        predicted_labels = f.predict(x_test)
        mean_rate[i] = np.mean(predicted_labels == y_test)
    
    print(methode.__name__,"Mean Accuracy Cross Validation: %f +/- %f"%(np.mean(mean_rate), np.std(mean_rate)))



In [148]:
#test models:
cross_validation(Forest)

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

# TODO: 
1. ~~get x and y from the data set~~
2. test the code
3. do we need centralized data for the forest?
4. determine the squared test errors by means of cross-validation
5. ~~how to handle missing data?~~

### 3.3 Answering the Research Question

In [None]:
#permutation test
def shuffleSkinColors(data):
    newData=data.copy()
    seed=np.random.randint(2e9)
    np.random.seed(seed)
    np.random.shuffle(newData['rater1'].values)
    np.random.seed(seed)
    np.random.shuffle(newData['rater2'].values)
    return newData