In [1]:
%matplotlib inline
%config InlineBackend.figure_format='retina'

from __future__ import absolute_import, division, print_function
import matplotlib as mpl
from matplotlib import pyplot as plt
from matplotlib.pyplot import GridSpec
import seaborn as sns
import numpy as np
import pandas as pd
import os, sys
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')
sns.set_context("poster", font_scale=1.3)

import missingno as msno
import pandas_profiling

from sklearn.datasets import make_blobs
import time

In [2]:
def save_subgroup(dataframe, g_index, subgroup_name, prefix='raw_'):
    """Write ``dataframe`` to ``<prefix><subgroup_name>.csv.gz`` and check the round trip.

    The frame is written as a gzip-compressed, UTF-8 encoded CSV (path relative
    to the current working directory). The file is then read straight back with
    ``g_index`` as ``index_col`` and compared to the input with
    ``DataFrame.equals``. The outcome is only printed: nothing is returned and
    a mismatch does not raise.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data to persist.
    g_index : value accepted by ``pandas.read_csv(index_col=...)``
        Column(s) to restore as the index when the file is re-read.
    subgroup_name : str
        Middle part of the output file name.
    prefix : str, default 'raw_'
        Leading part of the output file name.
    """
    target_path = prefix + subgroup_name + ".csv.gz"
    dataframe.to_csv(target_path, compression='gzip', encoding='UTF-8')

    reloaded = pd.read_csv(target_path, compression='gzip',
                           index_col=g_index, encoding='UTF-8')

    # Report whether the file on disk reproduces the frame we were given.
    if not dataframe.equals(reloaded):
        print("Warning -- equivalence test!!! Double-check.")
        return
    print("Test-passed: we recover the equivalent subgroup dataframe.")

In [3]:
def load_subgroup(filename, index_col=[0]):
    """Read a gzip-compressed CSV file into a DataFrame.

    Parameters
    ----------
    filename : str
        Path of the ``.csv.gz`` file to read.
    index_col : value accepted by ``pandas.read_csv(index_col=...)``, default [0]
        Column(s) to use as the index. The default list is only read here,
        never modified.

    Returns
    -------
    pandas.DataFrame
    """
    read_options = dict(compression='gzip', index_col=index_col)
    return pd.read_csv(filename, **read_options)

# Tidy Dyads and Starting Joins

In [4]:
# Player-level tables, indexed by their first column.
clean_players = load_subgroup("cleaned_players.csv.gz")
players = load_subgroup("raw_players.csv.gz")
# Referee and country tables, indexed by their first column.
referees = load_subgroup("raw_referees.csv.gz")
countries = load_subgroup("raw_countries.csv.gz")
# The dyads are keyed by their first two columns, hence the two-level index.
agg_dyads = load_subgroup("raw_dyads.csv.gz", index_col=[0, 1])

In [5]:
# Preview the first ten aggregated dyads; the frame was loaded with a
# two-level index built from its first two columns.
agg_dyads.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,redCards,victories,defeats,goals,games,yellowCards,ties,yellowReds
refNum,playerShort,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,lucas-wilchez,0,0,1,0,1,0,0,0
2,john-utaka,0,0,1,0,1,1,0,0
3,abdon-prats,0,0,0,0,1,1,1,0
3,pablo-mari,0,1,0,0,1,0,0,0
3,ruben-pena,0,1,0,0,1,0,0,0
4,aaron-hughes,0,0,1,0,1,0,0,0
4,aleksandar-kolarov,0,1,0,0,1,0,0,0
4,alexander-tettey,0,0,1,0,1,0,0,0
4,anders-lindegaard,0,0,0,0,1,0,1,0
4,andreas-beck,0,1,0,0,1,0,0,0


In [6]:
# Test if the number of games is equal to the victories + ties + defeats in the dataset

In [7]:
# Vectorised check that, for every dyad, victories + ties + defeats equals games.
outcome_total = agg_dyads['victories'] + agg_dyads['ties'] + agg_dyads['defeats']
bool((agg_dyads['games'] == outcome_total).all())

True

In [8]:
# Sanity check passes

In [9]:
# Number of dyad rows once the frame is re-indexed by playerShort alone.
len(agg_dyads.reset_index().set_index('playerShort'))

146028

In [10]:
# Rename the raw 'redCards' column to 'strictRedCards' and add 'totalRedCards',
# the sum of the yellowReds and strictRedCards counts.
# The rename is guarded so this cell can be re-run safely: once 'redCards' has
# been renamed, a second run would otherwise fail with a KeyError.
if 'redCards' in agg_dyads.columns:
    agg_dyads = agg_dyads.rename(columns={'redCards': 'strictRedCards'})
agg_dyads['totalRedCards'] = agg_dyads['yellowReds'] + agg_dyads['strictRedCards']

In [11]:
# Confirm the strictRedCards rename and the new totalRedCards column.
agg_dyads.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,strictRedCards,victories,defeats,goals,games,yellowCards,ties,yellowReds,totalRedCards
refNum,playerShort,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,lucas-wilchez,0,0,1,0,1,0,0,0,0
2,john-utaka,0,0,1,0,1,1,0,0,0
3,abdon-prats,0,0,0,0,1,1,1,0,0
3,pablo-mari,0,1,0,0,1,0,0,0,0
3,ruben-pena,0,1,0,0,1,0,0,0,0


## Remove records that come from players who don't have a skintone rating

There are a couple of ways to do this; two are demonstrated below: a join (merge) and a set-membership filter.

In [12]:
# Player-level table that the dyads are matched against below.
clean_players.head()

Unnamed: 0_level_0,height,weight,skintone,position_agg,weightclass,heightclass,skintoneclass,age_years
playerShort,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
aaron-hughes,182.0,71.0,0.125,Defense,low_weight,mid_height,"(-0.001, 0.125]",33.149897
aaron-hunt,183.0,73.0,0.125,Forward,low_weight,mid_height,"(-0.001, 0.125]",26.327173
aaron-lennon,165.0,63.0,0.25,Midfield,vlow_weight,vlow_height,"(0.125, 0.25]",25.713895
aaron-ramsey,178.0,76.0,0.0,Midfield,mid_weight,low_height,"(-0.001, 0.125]",22.017796
abdelhamid-el-kaoutari,180.0,73.0,0.25,Defense,low_weight,low_height,"(0.125, 0.25]",22.795346


In [13]:
# The dyad table again, for comparison with clean_players: here playerShort
# is the second index level rather than the whole index.
agg_dyads.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,strictRedCards,victories,defeats,goals,games,yellowCards,ties,yellowReds,totalRedCards
refNum,playerShort,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,lucas-wilchez,0,0,1,0,1,0,0,0,0
2,john-utaka,0,0,1,0,1,1,0,0,0
3,abdon-prats,0,0,0,0,1,1,1,0,0
3,pablo-mari,0,1,0,0,1,0,0,0,0
3,ruben-pena,0,1,0,0,1,0,0,0,0


In [14]:
# Step 1 towards the join: reset_index() turns both index levels
# (refNum, playerShort) into ordinary columns.
agg_dyads.reset_index().head()

Unnamed: 0,refNum,playerShort,strictRedCards,victories,defeats,goals,games,yellowCards,ties,yellowReds,totalRedCards
0,1,lucas-wilchez,0,0,1,0,1,0,0,0,0
1,2,john-utaka,0,0,1,0,1,1,0,0,0
2,3,abdon-prats,0,0,0,0,1,1,1,0,0
3,3,pablo-mari,0,1,0,0,1,0,0,0,0
4,3,ruben-pena,0,1,0,0,1,0,0,0,0


In [15]:
# Step 2: index on playerShort only, so the dyads can be merged with
# clean_players index-to-index.
agg_dyads.reset_index().set_index('playerShort').head()

Unnamed: 0_level_0,refNum,strictRedCards,victories,defeats,goals,games,yellowCards,ties,yellowReds,totalRedCards
playerShort,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
lucas-wilchez,1,0,0,1,0,1,0,0,0,0
john-utaka,2,0,0,1,0,1,1,0,0,0
abdon-prats,3,0,0,0,0,1,1,1,0,0
pablo-mari,3,0,1,0,0,1,0,0,0,0
ruben-pena,3,0,1,0,0,1,0,0,0,0


In [16]:
# Approach 1 (join): move refNum out of the index so the dyads are indexed by
# playerShort alone, then merge index-to-index with clean_players. merge()
# defaults to an inner join, so dyads of players absent from clean_players drop out.
dyads_by_player = agg_dyads.reset_index(level='refNum')
player_dyad = pd.merge(clean_players, dyads_by_player,
                       left_index=True, right_index=True)

In [17]:
# Each dyad row now also carries the columns of clean_players.
player_dyad.head()

Unnamed: 0_level_0,height,weight,skintone,position_agg,weightclass,heightclass,skintoneclass,age_years,refNum,strictRedCards,victories,defeats,goals,games,yellowCards,ties,yellowReds,totalRedCards
playerShort,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
aaron-hughes,182.0,71.0,0.125,Defense,low_weight,mid_height,"(-0.001, 0.125]",33.149897,4,0,0,1,0,1,0,0,0,0
aaron-hughes,182.0,71.0,0.125,Defense,low_weight,mid_height,"(-0.001, 0.125]",33.149897,66,0,1,0,0,1,0,0,0,0
aaron-hughes,182.0,71.0,0.125,Defense,low_weight,mid_height,"(-0.001, 0.125]",33.149897,77,0,13,5,0,26,0,8,0,0
aaron-hughes,182.0,71.0,0.125,Defense,low_weight,mid_height,"(-0.001, 0.125]",33.149897,163,0,1,0,0,2,0,1,0,0
aaron-hughes,182.0,71.0,0.125,Defense,low_weight,mid_height,"(-0.001, 0.125]",33.149897,194,0,3,8,0,16,2,5,0,0


In [18]:
# Approach 2 (set membership): keep only the dyads whose playerShort appears in
# the index of clean_players, then restore the (refNum, playerShort) index.
clean_dyads = (agg_dyads
               .reset_index()
               .loc[lambda dyad_rows: dyad_rows.playerShort.isin(set(clean_players.index))]
               .set_index(['refNum', 'playerShort']))

In [19]:
# Filtered dyads: same columns as agg_dyads, restricted to players in clean_players.
clean_dyads.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,strictRedCards,victories,defeats,goals,games,yellowCards,ties,yellowReds,totalRedCards
refNum,playerShort,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,lucas-wilchez,0,0,1,0,1,0,0,0,0
2,john-utaka,0,0,1,0,1,1,0,0,0
4,aaron-hughes,0,0,1,0,1,0,0,0,0
4,aleksandar-kolarov,0,1,0,0,1,0,0,0,0
4,alexander-tettey,0,0,1,0,1,0,0,0,0


In [20]:
# Compare row counts: the set filter (clean_dyads) and the merge (player_dyad)
# should keep the same number of dyads; player_dyad is wider because it also
# holds the clean_players columns.
clean_dyads.shape, agg_dyads.shape, player_dyad.shape

((124621, 9), (146028, 9), (124621, 18))

## Disaggregate

The dyads are currently aggregated: each row summarizes all the games in which a particular referee-player pair was matched. To properly handle the data, we have to disaggregate it into a tidy/long format, in which each row is a single game.

In [21]:
# inspired by https://github.com/mathewzilla/redcard/blob/master/Crowdstorming_visualisation.ipynb
#
# Disaggregate: expand every (refNum, playerShort) dyad into one row per game.
# Within a dyad the first `totalRedCards` games get redcard=1 and the remaining
# games get redcard=0, so a dyad never yields more red cards than it has games.
# This uses array operations instead of a Python-level loop over iterrows();
# the rows come out in the same order as clean_dyads, games of a dyad together.
colnames = ['games', 'totalRedCards']  # not used in this cell; left defined in case other cells read it

dyad_table = clean_dyads.reset_index()
games_per_dyad = dyad_table['games'].values
reds_per_dyad = dyad_table['totalRedCards'].values

# 0-based position of each game inside its own dyad: 0, 1, ..., games - 1.
first_row_of_dyad = np.cumsum(games_per_dyad) - games_per_dyad
game_position = (np.arange(games_per_dyad.sum())
                 - np.repeat(first_row_of_dyad, games_per_dyad))

tidy_dyads = pd.DataFrame(
    {
        'refNum': np.repeat(dyad_table['refNum'].values, games_per_dyad),
        'playerShort': np.repeat(dyad_table['playerShort'].values, games_per_dyad),
        # A game is flagged while its position is below the dyad's red-card count.
        'redcard': (game_position
                    < np.repeat(reds_per_dyad, games_per_dyad)).astype('int64'),
    },
    columns=['refNum', 'playerShort', 'redcard'],
).set_index(['refNum', 'playerShort'])

# Every game must have produced exactly one row.
assert len(tidy_dyads) == games_per_dyad.sum()

In [22]:
# Total number of game rows flagged with a red card after disaggregation
# (3092 when this notebook was last run).
tidy_dyads.redcard.sum()

3092

In [23]:
# Total number of games across all dyads. This is larger than the number of
# dyad rows and is the number of rows the tidy table should have.
clean_dyads.games.sum()

373067

In [24]:
# One row per game: the row count should equal clean_dyads.games.sum().
tidy_dyads.shape

(373067, 1)

In [25]:
# Keep only the referees that appear in tidy_dyads. Read it in three steps:
#   1. collect the distinct refNum values present in the tidy dyads,
#   2. turn the referees index into a column and keep the matching rows,
#   3. put refNum back as the index.
active_ref_nums = tidy_dyads.reset_index().refNum.unique()
clean_referees = (referees
                  .reset_index()
                  .loc[lambda ref_rows: ref_rows.refNum.isin(active_ref_nums)]
                  .set_index('refNum'))

In [26]:
# Referees remaining after the filter versus all referees loaded.
clean_referees.shape, referees.shape

((2978, 1), (3147, 1))

In [27]:
# Same pattern for countries: keep only the refCountry values that still occur
# among the remaining referees, then restore refCountry as the index.
active_ref_countries = clean_referees.refCountry.unique()
clean_countries = (countries
                   .reset_index()
                   .loc[lambda country_rows: country_rows.refCountry.isin(active_ref_countries)]
                   .set_index('refCountry'))

In [28]:
# Countries remaining after the filter versus all countries loaded.
clean_countries.shape, countries.shape

((160, 7), (161, 7))

In [None]:
# Final look at the tidy table before saving: one redcard flag per row,
# indexed by (refNum, playerShort).
tidy_dyads.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,redcard
refNum,playerShort,Unnamed: 2_level_1
1,lucas-wilchez,0
2,john-utaka,0
4,aaron-hughes,0
4,aleksandar-kolarov,0
4,alexander-tettey,0


In [None]:
# Persist the tidy (one row per game) dyads. The encoding is stated explicitly,
# as save_subgroup does, so the file does not depend on the interpreter default.
tidy_dyads.to_csv("cleaned_dyads.csv.gz", compression='gzip', encoding='UTF-8')

In [None]:
# Shape of the table that was written to cleaned_dyads.csv.gz.
tidy_dyads.shape