# Pipeline

1. Preprocessing
2. **Data Extraction**

This notebook performs data extraction. It derives several features from the preprocessed match results and saves a .pkl file containing a dataframe with those features.

# Imports

In [1]:
import pandas as pd
import numpy as np
import datetime
from collections import defaultdict
from sklearn.model_selection import train_test_split

# Data extraction into new Dataframe

In [2]:
# Load the preprocessed match results and start the feature frame from the
# team IDs and full-time result columns.
# HomeID and AwayID are not needed as features; they are kept for exploration only.
results = pd.read_pickle('preprocessed_results.pkl')
df = results.loc[:, ['HomeID', 'AwayID', 'FTHG', 'FTAG', 'FTR']].copy()
df

Unnamed: 0,HomeID,AwayID,FTHG,FTAG,FTR
0,0,15,3,0,1
1,1,12,1,0,1
2,2,16,0,0,0
3,3,14,6,0,1
4,4,11,2,2,0
...,...,...,...,...,...
13015,97,105,2,2,0
13016,111,106,2,0,1
13017,120,109,0,2,2
13018,122,107,2,2,0


In [3]:
# Optional shortcut (temporary): reuse a previously saved feature frame so that
# not everything has to be re-executed every time. Comment this cell out if that
# is not wanted.
# NOTE: unpickling can execute arbitrary code; only load pickle files you created yourself.
try:
    df = pd.read_pickle('feature_frame.pkl')
except FileNotFoundError:
    # No saved frame yet (e.g. first run on a fresh checkout): keep the df that
    # was built from the preprocessed results.
    pass

In [4]:
# Peek at three randomly chosen rows of the full results table
# (no random_state is set, so the rows differ between runs).
results.sample(3)

Unnamed: 0,Div,Date,HomeID,AwayID,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR
10916,SP1,2014-05-01,111,110,3,0,1,1,0,H,...,7,1,13,14,9,2,2,2,0,0
6334,I1,2010-12-18,73,72,1,0,1,1,0,H,...,3,0,17,19,4,6,2,1,0,0
10947,SP1,2014-01-26,102,106,1,5,2,1,2,A,...,1,9,24,13,1,8,3,4,1,0


In [5]:
#slc = df['AwayID'] == 105
#df[slc][['FTR']].value_counts() / sum(slc)

# Implementing non-relational Features

__For the moment, the features are closely aligned with those implemented by Hubáček et al. That means that most features are implemented twice, once for the home team and once for the away team. The naming convention for this approach is FEATURE_NAME_home/away.__

Example: <br>
A_WIN_PCT_home would be the historical away winning percentage of the team that plays the current game at home. <br>
H_WIN_PCT_home would be the historical home winning percentage of that same team.

That means the word after the last underscore indicates if the value of this feature represents the home team or the away team of the current(!) game.

The current approach prefers readability over performance, which means that adding the features to the dataframe can take a long time.

In [43]:
# TODO: possibly wrap the feature construction in functions so that individual features can be toggled on and off
# All non-italic features from the Hubáček paper have to be implemented twice, see above.

## Historical strength

In [6]:
# Returns all games that were played by a specific team between date minus two years and date.
def get_historical_games(team_id, date, games=None, window_days=730):
    """Return the games a team played in the window right before ``date``.

    Parameters
    ----------
    team_id
        Value matched against the 'HomeID' and 'AwayID' columns.
    date
        Exclusive upper bound of the window. It must be comparable with the
        values of the 'Date' column and support subtracting a
        ``datetime.timedelta``.
    games : DataFrame, optional
        Table to search; needs the columns 'HomeID', 'AwayID' and 'Date'.
        Defaults to the notebook-level ``results`` frame.
    window_days : int, optional
        Length of the look-back window in days (default 730, i.e. two years).

    Returns
    -------
    DataFrame
        The rows of ``games`` in which ``team_id`` is the home or away team and
        whose date lies strictly between ``date - window_days`` and ``date``.
        Both bounds are exclusive, so the game played on ``date`` itself is
        never part of its own history.
    """
    if games is None:
        games = results

    involves_team = (games['HomeID'] == team_id) | (games['AwayID'] == team_id)
    window_start = date - datetime.timedelta(days=window_days)
    in_window = (games['Date'] < date) & (games['Date'] > window_start)
    return games[involves_team & in_window]

In [8]:
# Sanity check: show the first three games involving team 8 from the two years before 2016-05-20.
get_historical_games(8, datetime.date(2016, 5, 20)).head(3)

Unnamed: 0,Div,Date,HomeID,AwayID,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR
1348,E1,2014-12-01,13,8,3,5,2,2,2,D,...,4,9,10,8,11,2,1,1,0,0
1383,E1,2014-08-02,8,10,5,1,1,4,0,H,...,12,6,14,14,6,6,1,2,0,0
1395,E1,2014-12-02,16,8,2,3,2,1,1,D,...,3,7,7,9,7,5,4,3,0,0


### Winning and drawing percentages

In [17]:
# "Constants"
# Maps each team-ID column of a game to the suffix appended to that team's feature names.
suffixes = dict(HomeID='_home', AwayID='_away')

# ID columns to iterate over, in the same order as the suffix mapping (home first, then away).
id_names = list(suffixes)

In [7]:
# Add percentages to dataframe
#
# For both teams of every game, compute from that team's games of the preceding
# two years (see get_historical_games):
#   H_WIN_PCT / H_DRAW_PCT : share of its home games it won / drew
#   A_WIN_PCT / A_DRAW_PCT : share of its away games it won / drew
# The suffix (_home / _away) says which team of the current game the value belongs to.
# FTR is compared against 1 for a home win, 2 for an away win and 0 for a draw.

# Create the columns as floats: the loop stores fractions, and writing floats
# into integer columns forces a dtype upcast that recent pandas versions warn about.
for suffix in suffixes.values():
    for column in ('H_WIN_PCT', 'H_DRAW_PCT', 'A_WIN_PCT', 'A_DRAW_PCT'):
        df[column + suffix] = 0.0

for row, game in results.iterrows():
    game_date = game['Date']

    for id_name in id_names:
        team_id = game[id_name]
        suffix = suffixes[id_name]

        hist_games = get_historical_games(team_id, game_date)

        # Build each mask once instead of re-filtering the frame for every count.
        played_home = hist_games['HomeID'] == team_id
        played_away = hist_games['AwayID'] == team_id
        drawn = hist_games['FTR'] == 0

        # total counts of home and away games in the last two years
        home_game_count = int(played_home.sum())
        away_game_count = int(played_away.sum())

        home_games_won = int((played_home & (hist_games['FTR'] == 1)).sum())
        away_games_won = int((played_away & (hist_games['FTR'] == 2)).sum())

        home_games_drawn = int((played_home & drawn).sum())
        away_games_drawn = int((played_away & drawn).sum())

        if home_game_count > 0:         # TODO should the threshold be higher to only use significant enough data?
            df.loc[row, 'H_WIN_PCT' + suffix] = home_games_won / home_game_count
            df.loc[row, 'H_DRAW_PCT' + suffix] = home_games_drawn / home_game_count
        else:
            # No home games in the window: fall back to 0.
            df.loc[row, 'H_WIN_PCT' + suffix] = 0.0        # TODO maybe choose some standard value like 0.25 instead of 0?
            df.loc[row, 'H_DRAW_PCT' + suffix] = 0.0       # Overall Median/Mean could also work.

        if away_game_count > 0:
            df.loc[row, 'A_WIN_PCT' + suffix] = away_games_won / away_game_count
            df.loc[row, 'A_DRAW_PCT' + suffix] = away_games_drawn / away_game_count
        else:
            # No away games in the window: same fallback as for home games.
            df.loc[row, 'A_WIN_PCT' + suffix] = 0.0
            df.loc[row, 'A_DRAW_PCT' + suffix] = 0.0

### Goal averages and standard deviations

In [10]:
# Add goal averages and deviations to dataframe
#
# For both teams of every game, compute from that team's games of the preceding
# two years (see get_historical_games):
#   H_GS_* / H_GC_* : goals scored / conceded in its home games
#   A_GS_* / A_GC_* : goals scored / conceded in its away games
# *_AVG is the mean and *_STD the pandas standard deviation (Series.std) of those goals.
# NOTE: unlike the percentage features there is no fallback value here. The mean
# is NaN when the team has no matching games, and the standard deviation is NaN
# with fewer than two of them.

# Create the columns as floats: the loop stores floats (possibly NaN), and writing
# those into integer columns forces a dtype upcast that recent pandas versions warn about.
for stat in ('AVG', 'STD'):
    for suffix in suffixes.values():
        for prefix in ('H_GS_', 'H_GC_', 'A_GS_', 'A_GC_'):
            df[prefix + stat + suffix] = 0.0

for row, game in results.iterrows():
    game_date = game['Date']

    for id_name in id_names:
        team_id = game[id_name]
        suffix = suffixes[id_name]

        hist_games = get_historical_games(team_id, game_date)

        # Split the history once; every statistic below reuses these two frames.
        home_games = hist_games[hist_games['HomeID'] == team_id]
        away_games = hist_games[hist_games['AwayID'] == team_id]

        # At home the team scores FTHG and concedes FTAG; away it is the other way round.
        # calculate averages
        df.loc[row, 'H_GS_AVG' + suffix] = home_games['FTHG'].mean()
        df.loc[row, 'H_GC_AVG' + suffix] = home_games['FTAG'].mean()
        df.loc[row, 'A_GS_AVG' + suffix] = away_games['FTAG'].mean()
        df.loc[row, 'A_GC_AVG' + suffix] = away_games['FTHG'].mean()

        # calculate standard deviations
        df.loc[row, 'H_GS_STD' + suffix] = home_games['FTHG'].std()
        df.loc[row, 'H_GC_STD' + suffix] = home_games['FTAG'].std()
        df.loc[row, 'A_GS_STD' + suffix] = away_games['FTAG'].std()
        df.loc[row, 'A_GC_STD' + suffix] = away_games['FTHG'].std()

In [5]:
# Show five randomly chosen rows of the feature frame (rows differ between runs).
df.sample(5)

Unnamed: 0,HomeID,AwayID,FTHG,FTAG,FTR,H_WIN_PCT_home,H_DRAW_PCT_home,A_WIN_PCT_home,A_DRAW_PCT_home,H_WIN_PCT_away,...,A_GS_AVG_away,A_GC_AVG_away,H_GS_STD_home,H_GC_STD_home,A_GS_STD_home,A_GC_STD_home,H_GS_STD_away,H_GC_STD_away,A_GS_STD_away,A_GC_STD_away
10784,118,107,0,4,2,0.447368,0.105263,0.297297,0.108108,0.921053,...,2.358974,1.102564,1.389864,1.534107,1.157947,1.833947,1.529464,0.794719,1.477679,0.967767
7053,69,83,0,0,0,0.472222,0.305556,0.243902,0.268293,0.37037,...,1.041667,1.5,1.049565,0.954521,0.961452,1.364623,1.318291,1.14105,0.907896,1.14208
8936,78,95,4,1,1,0.902439,0.073171,0.657895,0.131579,0.166667,...,0.5,1.666667,1.183731,0.536474,1.325159,0.898869,1.21106,0.752773,0.547723,1.032796
3159,8,16,2,0,1,0.631579,0.315789,0.538462,0.282051,0.166667,...,0.75,2.75,1.443088,0.87846,1.328602,1.209499,1.47196,1.505545,0.886405,1.035098
5737,36,53,1,1,0,0.393939,0.242424,0.176471,0.382353,0.277778,...,1.057143,1.485714,1.294657,1.17985,1.264136,1.159707,0.956183,1.157447,0.968409,1.314432


## Current form (data from the last five games)

In [16]:
def get_last_five_games(team_id, date, games=None, n=5):
    """Return the most recent games a team played before ``date``.

    Parameters
    ----------
    team_id
        Value matched against the 'HomeID' and 'AwayID' columns.
    date
        Exclusive upper bound; it must be comparable with the values of the
        'Date' column.
    games : DataFrame, optional
        Table to search; needs the columns 'HomeID', 'AwayID' and 'Date'.
        Defaults to the notebook-level ``results`` frame.
    n : int, optional
        Maximum number of games to return (default 5).

    Returns
    -------
    DataFrame
        Up to ``n`` rows of ``games`` in which ``team_id`` is the home or away
        team and whose date is strictly before ``date``, sorted from newest to
        oldest. Fewer rows (possibly none) are returned when the team has not
        played ``n`` games before ``date``.
    """
    if games is None:
        games = results

    involves_team = (games['HomeID'] == team_id) | (games['AwayID'] == team_id)
    earlier_games = games[involves_team & (games['Date'] < date)]
    return earlier_games.sort_values(by='Date', ascending=False).head(n)

### Winning and drawing percentages

In [None]:
# Current form: share of each team's most recent games (at most five, see
# get_last_five_games) that it won / drew, regardless of venue.
# FTR is compared against 1 for a home win, 2 for an away win and 0 for a draw.

# Create the columns as floats: the loop stores fractions, and writing floats
# into integer columns forces a dtype upcast that recent pandas versions warn about.
for column in ('WIN_PCT_home', 'WIN_PCT_away', 'DRAW_PCT_home', 'DRAW_PCT_away'):
    df[column] = 0.0

for row, game in results.iterrows():
    game_date = game['Date']

    for id_name in id_names:
        team_id = game[id_name]
        suffix = suffixes[id_name]

        recent = get_last_five_games(team_id, game_date)
        # game_count could be less than five if not enough data is available
        game_count = len(recent)

        # A win is a home win while playing at home or an away win while playing away.
        won_at_home = (recent['HomeID'] == team_id) & (recent['FTR'] == 1)
        won_away = (recent['AwayID'] == team_id) & (recent['FTR'] == 2)
        games_won = int((won_at_home | won_away).sum())

        # Every game in `recent` involves the team, so any draw counts for it.
        games_drawn = int((recent['FTR'] == 0).sum())

        if game_count > 0:
            df.loc[row, 'WIN_PCT' + suffix] = games_won / game_count
            df.loc[row, 'DRAW_PCT' + suffix] = games_drawn / game_count
        else:
            # No earlier games for this team: fall back to 0.
            df.loc[row, 'WIN_PCT' + suffix] = 0.0
            df.loc[row, 'DRAW_PCT' + suffix] = 0.0
            

### Goal averages and standard deviations

In [66]:
# Current form: mean and standard deviation of the goals each team scored (GS)
# and conceded (GC) in its most recent games (at most five, see
# get_last_five_games), regardless of venue.
# NOTE: np.std is the population standard deviation (ddof=0), whereas the
# two-year features use pandas' Series.std (ddof=1).

# Create the columns as floats: the loop stores floats, and writing floats into
# integer columns forces a dtype upcast that recent pandas versions warn about.
for stat in ('AVG', 'STD'):
    for suffix in suffixes.values():
        for prefix in ('GS_', 'GC_'):
            df[prefix + stat + suffix] = 0.0

for row, game in results.iterrows():
    game_date = game['Date']

    for id_name in id_names:
        team_id = game[id_name]
        suffix = suffixes[id_name]

        recent = get_last_five_games(team_id, game_date)
        game_count = len(recent)

        if game_count > 0:    # TODO check whether a higher threshold should be used, assigning the mean below it
            # Every game in `recent` has the team as home or away side, so one
            # selection per game yields its own and its opponent's goals.
            at_home = recent['HomeID'] == team_id
            goals_scored = np.where(at_home, recent['FTHG'], recent['FTAG'])
            goals_conceded = np.where(at_home, recent['FTAG'], recent['FTHG'])

            df.loc[row, 'GS_AVG' + suffix] = goals_scored.sum() / game_count
            df.loc[row, 'GC_AVG' + suffix] = goals_conceded.sum() / game_count

            df.loc[row, 'GS_STD' + suffix] = np.std(goals_scored)
            df.loc[row, 'GC_STD' + suffix] = np.std(goals_conceded)
        else:
            # No earlier games for this team: fall back to 0 for all four features.
            df.loc[row, 'GS_AVG' + suffix] = 0.0
            df.loc[row, 'GC_AVG' + suffix] = 0.0
            df.loc[row, 'GS_STD' + suffix] = 0.0
            df.loc[row, 'GC_STD' + suffix] = 0.0

### Rest days

In [128]:
# Rest days: number of days between a team's previous game and the current one.
#
# The earlier version read `len(recent)` here, but `recent` was never assigned in
# this cell: it was whatever the previous cell's last iteration left behind, so
# the check had nothing to do with the current team. Teams without an earlier
# game were in fact handled by a bare `except:` and ended up with 7 rest days.
# That outcome is kept, but is now decided explicitly per team.
DEFAULT_REST_DAYS = 7    # used when there is no earlier game or the gap is an outlier
MAX_REST_DAYS = 21       # longer gaps are treated as outliers

df['REST_home'] = -1
df['REST_away'] = -1

for row, game in results.iterrows():
    game_date = game['Date']

    for id_name in id_names:
        team_id = game[id_name]
        suffix = suffixes[id_name]

        # Newest game first, so the first row is the team's previous game.
        previous_games = get_last_five_games(team_id, game_date)

        if previous_games.empty:
            rest_days = DEFAULT_REST_DAYS
        else:
            # Select the date by column name rather than by column position.
            rest_days = (game_date - previous_games['Date'].iloc[0]).days
            if rest_days < 0 or rest_days > MAX_REST_DAYS:    # manipulate outliers
                rest_days = DEFAULT_REST_DAYS

        df.loc[row, 'REST' + suffix] = rest_days

In [138]:
# Show ten randomly chosen rows of the finished feature frame (rows differ between runs).
df.sample(10)

Unnamed: 0,HomeID,AwayID,FTHG,FTAG,FTR,H_WIN_PCT_home,H_DRAW_PCT_home,A_WIN_PCT_home,A_DRAW_PCT_home,H_WIN_PCT_away,...,GS_AVG_home,GC_AVG_home,GS_AVG_away,GC_AVG_away,GS_STD_home,GC_STD_home,GS_STD_away,GC_STD_away,REST_home,REST_away
10911,98,105,0,1,2,0.5,0.225,0.257143,0.314286,0.775,...,1.0,1.0,1.2,0.8,0.894427,0.632456,0.748331,1.16619,1,3
1969,31,25,0,1,2,0.333333,0.666667,0.2,0.4,0.363636,...,0.8,1.0,1.2,1.4,0.979796,0.894427,0.748331,0.489898,8,7
5619,45,52,2,1,1,0.4,0.314286,0.28125,0.125,0.5,...,0.8,1.0,0.4,0.8,0.4,0.894427,0.489898,0.979796,7,7
4931,36,47,2,0,1,0.2,0.55,0.380952,0.190476,0.628571,...,1.0,1.0,1.2,0.8,0.632456,0.632456,0.748331,1.16619,10,9
6971,74,83,0,2,2,0.694444,0.138889,0.461538,0.153846,0.36,...,2.0,0.2,1.2,1.6,0.632456,0.4,0.748331,1.019804,7,7
4780,35,37,2,0,1,0.939394,0.030303,0.787879,0.181818,0.34375,...,1.8,0.6,1.0,1.6,1.83303,1.2,0.894427,1.496663,3,3
9878,113,112,1,1,0,0.75,0.1875,0.333333,0.25,0.285714,...,1.0,1.0,0.2,0.8,0.894427,0.632456,0.4,1.16619,1,1
4786,40,51,0,1,2,0.454545,0.30303,0.212121,0.30303,0.588235,...,1.8,2.0,2.6,1.2,1.469694,1.414214,1.496663,0.979796,5,3
6346,74,81,3,1,1,0.625,0.1875,0.411765,0.176471,0.647059,...,1.6,1.2,1.2,1.4,1.019804,0.4,0.4,0.8,10,10
12211,112,99,1,1,0,0.322581,0.193548,0.064516,0.322581,0.470588,...,1.8,2.2,0.2,1.2,0.748331,1.16619,0.4,0.748331,3,3


# Save dataframe as .pkl

In [139]:
# Write the feature frame to 'feature_frame.pkl' in the current working directory.
df.to_pickle('feature_frame.pkl')