# Exercise 4a
## 3 Red Cards Study

In [40]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd

### 3.1  Loading and Cleaning the Data

In [41]:
# read the crowdstorming table (one row per player-referee dyad) into a DataFrame
data = pd.read_csv('data/CrowdstormingDataJuly1st.csv')

In [42]:
# show every feature of one example dyad (row label 1200);
# .loc replaces .ix, which was removed in pandas 1.0
data.loc[1200, :]

playerShort                toni-kroos
player                     Toni Kroos
club                   Bayern München
leagueCountry                 Germany
birthday                   04.01.1990
height                            182
weight                             78
position         Attacking Midfielder
games                               1
victories                           1
ties                                0
defeats                             0
goals                               0
yellowCards                         0
yellowReds                          0
redCards                            0
photoID                     84724.jpg
rater1                              0
rater2                              0
refNum                             66
refCountry                          4
Alpha_3                           LUX
meanIAT                      0.325185
nIAT                              127
seIAT                      0.00329681
meanExp                      0.538462
nExp        

All features for Toni Kroos - ref 66 - dyad. The column $\texttt{games}$ stands for the number of games in the player-referee dyad

We don't need all the features for our purposes. We can drop the features like $\texttt{playerShort}$, $\texttt{club}$, $\texttt{height}$, $\texttt{photoID}$.

In [43]:
# drop the features that are not needed for the analysis; the names are the columns
# at positions 0,1,2,5-13,16 of the raw table (see the example dyad printed above).
# Using names instead of positions makes an accidental re-run fail loudly instead of
# silently dropping different columns, and axis is passed by keyword because the
# positional form was removed in pandas 2.0.
unused_columns = ['playerShort', 'player', 'club', 'height', 'weight', 'position',
                  'games', 'victories', 'ties', 'defeats', 'goals', 'yellowCards',
                  'photoID']
data = data.drop(unused_columns, axis=1)

In [44]:
# transform birthday to age:
# the season is 2012-2013, so the age is taken at the end of it (2013);
# the birth year is the last four characters of the 'dd.mm.yyyy' string
data['age'] = 2013 - data['birthday'].astype(str).str[-4:].astype(int)
# we can now drop the birthday column
# (axis passed by keyword: the positional form was removed in pandas 2.0)
data = data.drop('birthday', axis=1)

In [45]:
# skin-tone ratings of the first 20 dyads (label slice, end inclusive), transposed for
# a compact view; .loc replaces .ix, which was removed in pandas 1.0
data.loc[:19, ['rater1', 'rater2']].T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
rater1,0.25,0.75,,,,0.25,0.0,1.0,0.25,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0
rater2,0.5,0.75,,,,0.0,0.25,1.0,0.25,0.0,0.0,0.5,0.0,0.0,0.25,0.0,0.5,0.0,0.0,0.25


In [46]:
# mean and population variance (ddof=0, as np.var uses) of the absolute rating
# difference; dyads without ratings (NaN) are skipped by pandas
rater_diff = (data['rater1'] - data['rater2']).abs()
rater_diff.mean(), rater_diff.var(ddof=0)

(0.05831882267033646, 0.011343119350169325)

I've looked at a small cut of the data set and the two raters do disagree occasionally by $0.25$. The overall disagreement is relatively low: the mean absolute difference is $0.058$ with a variance of $0.011$ (standard deviation $\approx 0.107$).

In [47]:
# fraction of dyads without a rating from rater1 (the mean of a boolean mask is the share of True)
data['rater1'].isnull().mean()

0.14659517352836443

$\Rightarrow$ So around $15\%$ of the instances don't have a picture attached to them. Note that the instances are player-referee dyads, so this need not be the fraction of individual players without a picture.

In [48]:
# distinct league countries (np.unique returns them sorted);
# plain column access replaces .ix, which was removed in pandas 1.0
leagues = np.unique(data['leagueCountry'])
print(leagues)

['England' 'France' 'Germany' 'Spain']


There are only the four leagues above in the data set, so a one-hot encoding could look as follows:

In [49]:
# pair every league with its one-hot code; the size of the identity matrix follows
# the number of leagues instead of being hard-coded
pd.DataFrame([leagues, np.identity(len(leagues), dtype=int)]).T

Unnamed: 0,0,1
0,England,"[1, 0, 0, 0]"
1,France,"[0, 1, 0, 0]"
2,Germany,"[0, 0, 1, 0]"
3,Spain,"[0, 0, 0, 1]"


In [50]:
# create one indicator column per country: 1 if the player is playing in that league, 0 if not
for country in leagues:
    data[country] = (data['leagueCountry'] == country).astype(int)
# we can now drop the leagueCountry column
# (axis passed by keyword: the positional form was removed in pandas 2.0)
data = data.drop('leagueCountry', axis=1)

### 3.2 Model Creation

#### Linear regression
$$\sum_{j=1}^DX_{ij}\beta_j=y_i,\ (\mathbb N \ni i\leq N)$$
$$\Rightarrow\mathbf X\mathbf \beta=\mathbf y,\ (\text{matrices})$$
$$\hat{\mathbf {\beta}}= \left(\mathbf{X^TX}\right)^{-1}\mathbf{X^T y}$$

In [75]:
def linearReg(x,y):
    '''
    Least-squares estimate of the coefficient vector beta of a linear model.

    The data are centred inside this function: every feature column of x has its
    own mean subtracted and y has its mean subtracted, so no intercept is fitted
    and the returned coefficients are the slopes of the centred problem.

    x: (NxD) matrix with data
    y: (Nx1) vector with labels (a flat vector of length N works as well)
    returns: beta with D entries, shaped (Dx1) if y is (Nx1) and (D,) if y is flat
    '''
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    #centralize data; axis=0 centres each feature separately
    #(np.mean without axis would subtract one global mean from all columns)
    x = x - np.mean(x, axis=0)
    y = y - np.mean(y, axis=0)
    #calculate the moore penrose pseudo inverse:
    xPlus = np.linalg.pinv(x)
    return xPlus @ y

#### Regression tree

In [79]:
class Node:
    '''Empty attribute container for one tree node; all fields are attached from outside.'''

class Tree:
    '''Base class holding the root node and the descent from the root to a leaf.'''
    def __init__(self):
        self.root = Node()
    
    def find_leaf(self, x):
        '''
        Walk down from the root and return the leaf that instance x falls into.
        A node counts as a split node as long as it carries a "feature" attribute.
        '''
        current = self.root
        while True:
            if not hasattr(current, "feature"):
                # no split stored here, so this is a leaf
                return current
            goes_left = x[current.feature] <= current.threshold
            current = current.left if goes_left else current.right

In [82]:
#since our solution to the 4th exercise was not working we took the solution from moodle and adjusted it 
class RegressionTree(Tree):
    '''
    Regression tree: the prediction for an instance is the response stored in the
    leaf the instance falls into.
    '''
    def __init__(self):
        # zero-argument super(): the former super(DecisionTree, self) named a class
        # that does not exist in this notebook and raised a NameError
        super().__init__()
        
    def train(self, data, labels, n_min=20):
        '''
        Grow the tree on the given training set.

        data: the (N x D) feature matrix of the training instances
        labels: the corresponding ground-truth responses (length N)
        n_min: termination criterion (don't split if a node contains fewer instances)
        '''
        # accept anything array-like; the node helpers index with data[:, j]
        data = np.asarray(data)
        labels = np.asarray(labels)
        N, D = data.shape
        D_try = int(np.sqrt(D)) # how many features to consider for each split decision

        # initialize the root node
        self.root.data = data
        self.root.labels = labels
        
        #put root in stack
        stack = [self.root]
        while len(stack):
            node = stack.pop()
            n = node.data.shape[0] # number of instances in present node
            if n < n_min:
                # too few instances left: turn 'node' into a leaf node
                make_regression_leaf_node(node, N)
                continue

            # try a split on 'D_try' randomly selected feature indices
            perm = np.random.permutation(D)   # permute D indices
            left, right = make_regression_split_node(node, N, perm[:D_try])
            if len(left.labels) == 0 or len(right.labels) == 0:
                # an empty child means none of the sampled features offered a
                # threshold (all constant in this node): retry with every feature
                left, right = make_regression_split_node(node, N, np.arange(D))

            if len(left.labels) == 0 or len(right.labels) == 0:
                # the node cannot be split at all; remove the split attributes again
                # (find_leaf treats a node with 'feature' as a split node) and make
                # it a leaf, which avoids empty leaves and an endless re-split loop
                for attr in ('left', 'right', 'feature', 'threshold'):
                    delattr(node, attr)
                make_regression_leaf_node(node, N)
            else:
                # put children in stack
                stack.append(left)
                stack.append(right)
                
    def predict(self, x):
        '''Return the response of the leaf that instance x falls into.'''
        leaf = self.find_leaf(x)
        return leaf.response 

In [80]:
def make_regression_split_node(node, N, feature_indices):
    '''
    Turn 'node' into a split node, using the feature/threshold pair whose two
    children have the smallest summed squared error around their own mean label.

    node: the node to be split; it must carry 'data' (n x D array) and 'labels' (length n)
    N: not used by this function; kept so that existing calls stay valid
    feature_indices: a numpy array of length 'D_try', containing the feature 
                     indices to be considered in the present split
    returns: the children (left, right). If none of the given features has two
             distinct values, no threshold is tested and the default split
             (feature 0, threshold 0) is stored, so one child may be empty.
    '''
    # find best feature j (among 'feature_indices') and best threshold t for the split
    e_min = np.inf
    j_min, t_min = 0, 0
    for j in feature_indices:
        column = node.data[:, j]
        # distinct feature values; np.unique already returns them sorted
        dj = np.unique(column)
        # compute candidate thresholds in the middle between consecutive feature values
        tj = 0.5 * (dj[1:] + dj[:-1])
        # for each candidate threshold compute the squared error of the resulting children
        for t in tj:
            ll = node.labels[column <= t]
            lr = node.labels[column > t]
            # summed squared error of both children; the former version divided
            # both terms by the size of the LEFT child, which favoured large left children
            e = np.sum(np.square(ll - np.mean(ll))) + np.sum(np.square(lr - np.mean(lr)))
            # choose the threshold that minimizes the sum of the squared errors
            if e < e_min:
                e_min = e
                j_min = j
                t_min = t

    # create children
    left = Node()
    right = Node()
    
    # initialize 'left' and 'right' with the data subsets and labels
    # according to the optimal split found above
    goes_left = node.data[:, j_min] <= t_min
    goes_right = node.data[:, j_min] > t_min
    left.data = node.data[goes_left, :]
    left.labels = node.labels[goes_left]
    right.data = node.data[goes_right, :]
    right.labels = node.labels[goes_right]

    # turn the current 'node' into a split node
    # (store children and split condition)
    node.left = left
    node.right = right
    node.feature = j_min
    node.threshold = t_min

    # return the children (to be placed on the stack)
    return left, right    

In [81]:
def make_regression_leaf_node(node, N):
    '''
    Turn 'node' into a leaf: store its instance count and its mean label.

    node: the node to become a leaf (must carry 'labels')
    N: not used by this function
    '''
    count = node.labels.shape[0]
    node.N = count
    # the leaf response is the average label of the instances in this node
    node.response = np.sum(node.labels) / count

In [83]:
#forest
class Forest:
    '''Ensemble of regression trees; the prediction is the mean of the tree predictions.'''
    def __init__(self,n):
        # create n untrained instances of RegressionTree
        self.trees = []
        for _ in range(n):
            self.trees.append(RegressionTree())
    
    def train(self, data, target, n_min=20):
        # train all trees, each on the full data set that was passed in
        for member in self.trees:
            member.train(data, target, n_min)
            
    def predict(self, x):
        # average the responses of all trees for instance x
        responses = [member.predict(x) for member in self.trees]
        return np.mean(responses)

# TODO: 
1. get x and y from the data set
2. test the code
3. do we need centralized data for the forest?
4. determine the squared test errors by means of cross-validation
5. how to handle missing data?

### 3.3 Answering the Research Question