# Pipeline

1. Preprocessing
2. **Data Extraction**
3. Data Exploration
4. Model

This file performs data extraction. It generates several features and returns a .pkl file containing a dataframe with said features.

# Imports

In [4]:
import pandas as pd
import numpy as np
import datetime
from collections import defaultdict

# Data extraction into new Dataframe

In [14]:
# We don't need HomeID and AwayID here, but I put it in just for exploration
results = pd.read_pickle('preprocessed_results.pkl')
df = results[['HomeID', 'AwayID', 'FTHG', 'FTAG', 'FTR']].copy()
df

Unnamed: 0,HomeID,AwayID,FTHG,FTAG,FTR
0,0,15,3,0,1
1,1,12,1,0,1
2,2,16,0,0,0
3,3,14,6,0,1
4,4,11,2,2,0
...,...,...,...,...,...
13007,97,105,2,2,0
13008,111,106,2,0,1
13009,120,109,0,2,2
13010,122,107,2,2,0


In [6]:
df = pd.read_pickle('feature_frame.pkl')

In [15]:
# Only run this if the above cell hasn't been run!!
df['Date'] = results['Date']

In [16]:
results.sample(3)

Unnamed: 0,Div,Date,HomeID,AwayID,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR
10375,SP1,2012-09-01,114,98,0,1,2,0,0,D,...,4,3,23,14,10,10,1,4,0,0
12264,SP1,2017-08-26,126,107,0,2,2,0,0,D,...,2,11,12,10,1,14,2,3,0,0
7416,I1,2013-11-03,79,77,0,2,2,0,0,D,...,4,3,16,18,5,4,2,1,0,0


In [17]:
# "Constants"
suffixes = {'HomeID':'_home', 'AwayID':'_away'}    # suffix dict for the feature names

id_names = ['HomeID', 'AwayID']                    # list of ID columns for iteration purposes

# suffix that's appended to all league-wide features
league_suffix = '_league'

In [5]:
#slc = df['AwayID'] == 105
#df[slc][['FTR']].value_counts() / sum(slc)

# Implementing non-relational Features from the Hubáček et al. paper

__For the moment, the features are closely aligned to those implemented by Hubáček et al. That means that most features also get implemented twice, once for the home team and once for that away team. The naming convention for this approach is FEATURE_NAME_home/away.__

Example: <br>
A_WIN_PCT_home would be the historical away winning percentage of the team that plays the current game at home. <br>
H_WIN_PCT_home would be the historical home winning percentage of that same team.

That means the word after the last underscore indicates if the value of this feature represents the home team or the away team of the current(!) game.

The current approach is to prefer readability over performance which means that it can take long to add the features to the dataframe.

In [43]:
# TODO: evtl. die feature-erstellungen in Funktionen packen um sie an und aus togglen zu können
# Alle nicht-kursiven Features aus dem Hubacek Paper müssen doppelt implementiert werden, siehe oben.

## Historical strength

In [18]:
# Returns all games that were played by a specific team between date minus two years and date.
def get_historical_games(team_id, date):
    games_by_team = results[((results['HomeID'] == team_id) | (results['AwayID'] == team_id))
                           & (results['Date'] < date) & (results['Date'] > date - datetime.timedelta(730))]
    return games_by_team

In [19]:
get_historical_games(8, datetime.date(2016, 5, 20)).head(6)

Unnamed: 0,Div,Date,HomeID,AwayID,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR
1527,E1,2014-08-17,8,24,2,1,1,1,0,H,...,5,6,8,11,2,6,1,2,0,0
1539,E1,2014-08-25,18,8,3,1,1,1,0,H,...,4,3,13,7,6,7,1,1,0,0
1549,E1,2014-08-31,5,8,0,3,2,0,1,A,...,1,8,14,11,7,3,0,4,0,0
1553,E1,2014-09-13,8,0,0,1,2,0,1,A,...,1,1,9,10,7,6,2,1,0,0
1565,E1,2014-09-20,15,8,3,1,1,2,1,H,...,7,5,16,11,7,5,4,1,0,0
1574,E1,2014-09-27,8,12,1,1,0,0,0,D,...,8,5,8,9,8,2,2,1,0,0


### Winning and drawing percentages

In [20]:
# Add percentages to dataframe
df['H_WIN_PCT_home'] = 0
df['H_DRAW_PCT_home'] = 0
df['A_WIN_PCT_home'] = 0
df['A_DRAW_PCT_home'] = 0
df['H_WIN_PCT_away'] = 0
df['H_DRAW_PCT_away'] = 0
df['A_WIN_PCT_away'] = 0
df['A_DRAW_PCT_away'] = 0

for row, game in results.iterrows():
    for id_name in id_names:
        team_id = game[id_name]
        game_date = game['Date']

        suffix = suffixes[id_name]
        
        hist_games = get_historical_games(team_id, game_date)
        game_count = len(hist_games)

        # total counts of home and away games in the last two years
        home_game_count = len(hist_games[hist_games['HomeID'] == team_id])
        away_game_count = len(hist_games[hist_games['AwayID'] == team_id])

        home_games_won = len(hist_games[(hist_games['HomeID'] == team_id) & (hist_games['FTR'] == 1)])
        away_games_won = len(hist_games[(hist_games['AwayID'] == team_id) & (hist_games['FTR'] == 2)])

        home_games_drawn = len(hist_games[(hist_games['HomeID'] == team_id) & (hist_games['FTR'] == 0)])
        away_games_drawn = len(hist_games[(hist_games['AwayID'] == team_id) & (hist_games['FTR'] == 0)])

        if home_game_count > 0:         # TODO should the threshold be higher to only use significant enough data?
            df.loc[row, 'H_WIN_PCT' + suffix] = home_games_won / home_game_count
            df.loc[row, 'H_DRAW_PCT' + suffix] = home_games_drawn / home_game_count
        else:
            df.loc[row, 'H_WIN_PCT' + suffix] = 0          # TODO maybe choose some standard value like 0.25 instead of 0?
            df.loc[row, 'H_DRAW_PCT' + suffix] = 0         # Overall Median/Mean could also work.

        if away_game_count > 0:
            df.loc[row, 'A_WIN_PCT' + suffix] = away_games_won / away_game_count
            df.loc[row, 'A_DRAW_PCT' + suffix] = away_games_drawn / away_game_count
        else:
            df.loc[row, 'A_WIN_PCT' + suffix] = 0          
            df.loc[row, 'A_DRAW_PCT' + suffix] = 0 

### Goal averages and standard deviations

In [23]:
# Add goal averages and deviations to dataframe
df['H_GS_AVG_home'] = 0
df['H_GC_AVG_home'] = 0
df['A_GS_AVG_home'] = 0
df['A_GC_AVG_home'] = 0
df['H_GS_AVG_away'] = 0
df['H_GC_AVG_away'] = 0
df['A_GS_AVG_away'] = 0
df['A_GC_AVG_away'] = 0

df['H_GS_STD_home'] = 0
df['H_GC_STD_home'] = 0
df['A_GS_STD_home'] = 0
df['A_GC_STD_home'] = 0
df['H_GS_STD_away'] = 0
df['H_GC_STD_away'] = 0
df['A_GS_STD_away'] = 0
df['A_GC_STD_away'] = 0

for row, game in results.iterrows():
    for id_name in id_names:
        team_id = game[id_name]
        game_date = game['Date']

        suffix = suffixes[id_name]
        
        hist_games = get_historical_games(team_id, game_date)
        game_count = len(hist_games)
        
        # calculate averages
        df.loc[row, 'H_GS_AVG' + suffix] = hist_games[hist_games['HomeID'] == team_id]['FTHG'].mean()
        df.loc[row, 'H_GC_AVG' + suffix] = hist_games[hist_games['HomeID'] == team_id]['FTAG'].mean()
        df.loc[row, 'A_GS_AVG' + suffix] = hist_games[hist_games['AwayID'] == team_id]['FTAG'].mean()
        df.loc[row, 'A_GC_AVG' + suffix] = hist_games[hist_games['AwayID'] == team_id]['FTHG'].mean()
        
        # calculate standard deviations
        df.loc[row, 'H_GS_STD' + suffix] = hist_games[hist_games['HomeID'] == team_id]['FTHG'].std()
        df.loc[row, 'H_GC_STD' + suffix] = hist_games[hist_games['HomeID'] == team_id]['FTAG'].std()
        df.loc[row, 'A_GS_STD' + suffix] = hist_games[hist_games['AwayID'] == team_id]['FTAG'].std()
        df.loc[row, 'A_GC_STD' + suffix] = hist_games[hist_games['AwayID'] == team_id]['FTHG'].std()
        
# During this process, many NaNs occur in the beginning due to insufficient data.
# We replace them with 0.
df.fillna(0, inplace=True)

In [31]:
df.sample(5)

Unnamed: 0,HomeID,AwayID,FTHG,FTAG,FTR,Date,H_WIN_PCT_home,H_DRAW_PCT_home,A_WIN_PCT_home,A_DRAW_PCT_home,...,A_GS_AVG_away,A_GC_AVG_away,H_GS_STD_home,H_GC_STD_home,A_GS_STD_home,A_GC_STD_home,H_GS_STD_away,H_GC_STD_away,A_GS_STD_away,A_GC_STD_away
6101,51,49,1,3,2,2019-03-17,0.454545,0.272727,0.416667,0.222222,...,1.5,1.558824,1.136515,1.314978,1.769898,1.133543,1.231246,1.379319,1.212311,1.049998
7430,78,81,3,0,1,2013-11-10,0.763158,0.184211,0.625,0.275,...,1.552632,1.157895,1.109893,0.724004,1.347124,0.921259,1.413987,1.025008,1.288145,1.000711
6634,76,81,2,1,1,2011-10-29,0.565217,0.26087,0.043478,0.391304,...,1.391304,1.130435,0.934622,1.083473,0.886883,1.029217,1.222322,1.042572,1.117592,1.057628
7268,68,74,1,0,1,2013-04-28,0.405405,0.27027,0.078947,0.368421,...,1.324324,1.405405,1.437904,1.168917,1.010964,1.148937,1.411445,1.172225,1.055516,1.300843
12964,120,127,1,0,1,2019-04-23,0.1875,0.3125,0.235294,0.352941,...,1.228571,1.628571,0.885061,1.264911,0.951006,1.169464,1.222475,1.17108,0.972738,1.628527


## Current form (data from the last five games)

In [32]:
# df_mode indicates whether to return the df dataframe or the results dataframe
def get_last_n_games(team_id, date, n=5, df_mode=False):
    if df_mode == False:
        games_by_team = results[((results['HomeID'] == team_id) | (results['AwayID'] == team_id))
                           & (results['Date'] < date)].sort_values(by='Date', ascending=False).head(n)
    else:
        games_by_team = df[((df['HomeID'] == team_id) | (df['AwayID'] == team_id))
                           & (df['Date'] < date)].sort_values(by='Date', ascending=False).head(n)
        
    return games_by_team

In [33]:
get_last_n_games(19, datetime.date(2011, 6, 1))

Unnamed: 0,Div,Date,HomeID,AwayID,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR
374,E1,2011-05-22,9,19,4,2,1,1,1,D,...,11,7,8,5,8,8,2,1,0,0
361,E1,2011-05-14,19,2,4,3,1,3,2,H,...,8,9,16,12,5,6,2,4,0,0
353,E1,2011-05-07,5,19,1,1,0,0,0,D,...,11,7,5,13,7,6,3,2,0,0
340,E1,2011-04-30,19,13,0,0,0,0,0,D,...,6,6,10,11,4,8,0,0,0,0
328,E1,2011-04-23,19,17,1,1,0,1,1,D,...,7,6,17,8,4,4,1,1,0,0


### Winning and drawing percentages

In [34]:
df['WIN_PCT_home'] = 0
df['WIN_PCT_away'] = 0
df['DRAW_PCT_home'] = 0
df['DRAW_PCT_away'] = 0

for row, game in results.iterrows():
    for id_name in id_names:
        team_id = game[id_name]
        game_date = game['Date']

        recent = get_last_n_games(team_id, game_date)
        #print(len(recent))
        # game_count could be less than five if not enough data is available; that's the only reason for this line
        game_count = len(recent)
        
        games_won = len(recent[(recent['HomeID'] == team_id) & (recent['FTR'] == 1)
                    | (recent['AwayID'] == team_id) & (recent['FTR'] == 2)])
        
        games_drawn = len(recent[recent['FTR'] == 0])
        
        suffix = suffixes[id_name]
        
        if game_count > 0:
            df.loc[row, 'WIN_PCT' + suffix] = games_won / game_count
            df.loc[row, 'DRAW_PCT' + suffix] = games_drawn / game_count
        else:
            df.loc[row, 'WIN_PCT' + suffix] = 0
            df.loc[row, 'DRAW_PCT' + suffix] = 0
            

### Goal averages and standard deviations

In [35]:
df['GS_AVG_home'] = 0
df['GC_AVG_home'] = 0
df['GS_AVG_away'] = 0
df['GC_AVG_away'] = 0

df['GS_STD_home'] = 0
df['GC_STD_home'] = 0
df['GS_STD_away'] = 0
df['GC_STD_away'] = 0

for row, game in results.iterrows():
    for id_name in id_names:
        team_id = game[id_name]
        game_date = game['Date']

        suffix = suffixes[id_name]
        
        recent = get_last_n_games(team_id, game_date)
        game_count = len(recent)
        
        if game_count > 0:    # TODO gucken ob man vielleicht höheren threshold nehmen will und dann einfach mean assigned
            df.loc[row, 'GS_AVG' + suffix] = ((recent[recent['HomeID'] == team_id]['FTHG'].sum())
                                                + (recent[recent['AwayID'] == team_id]['FTAG'].sum())) / game_count

            df.loc[row, 'GC_AVG' + suffix] = ((recent[recent['HomeID'] == team_id]['FTAG'].sum())
                                                + (recent[recent['AwayID'] == team_id]['FTHG'].sum())) / game_count
            
            df.loc[row, 'GS_STD' + suffix] = np.std(np.append(recent[recent['HomeID'] == team_id]['FTHG'].tolist(),
                                                recent[recent['AwayID'] == team_id]['FTAG'].tolist()))
        
            df.loc[row, 'GC_STD' + suffix] = np.std(np.append(recent[recent['HomeID'] == team_id]['FTAG'].tolist(),
                                                recent[recent['AwayID'] == team_id]['FTHG'].tolist()))
            
        else:
            df.loc[row, 'GS_AVG' + suffix] = 0 
            df.loc[row, 'GC_AVG' + suffix] = 0
            df.loc[row, 'GS_STD' + suffix] = 0 
            df.loc[row, 'GC_STD' + suffix] = 0

### Rest days

In [53]:
df['REST_home'] = -1
df['REST_away'] = -1

for row, game in results.iterrows():
    for id_name in id_names:
        team_id = game[id_name]
        game_date = game['Date']

        suffix = suffixes[id_name]

        recent = get_last_n_games(team_id, game_date)
        game_count = len(recent)
        
        # overall not the nicest solution, but works
        if game_count > 0:
            last_game = get_last_n_games(team_id, game_date).head(1)     
            try:
                rest_days = (game_date - last_game.iloc[0, 1]).days      # 0 is row, 1 is column (date)
            except:
                rest_days = -1
            
            if rest_days < 0 or rest_days > 21:             # manipulate outliers
                rest_days = 7
            
            df.loc[row, 'REST' + suffix] = rest_days
        else:
            df.loc[row, 'REST' + suffix] = 7    # if now data is available, just one week of rest

## League averages

In [54]:
def get_league_games(league, date):
    games = results[(results['Div'] == league) & (results['Date'] < date) & (results['Date'] > date - datetime.timedelta(730))]
    return games

In [55]:
df['H_GS_AVG' + league_suffix] = 0
df['A_GS_AVG' + league_suffix] = 0
df['H_GS_STD' + league_suffix] = 0
df['A_GS_STD' + league_suffix] = 0
df['H_WIN_PCT' + league_suffix] = 0
df['DRAW_PCT' + league_suffix] = 0

for row, game in results.iterrows():
    league_name = game['Div']
    game_date = game['Date']

    league_games = get_league_games(league_name, game_date)
    game_count = len(league_games)

    # goals scored means
    df.loc[row, 'H_GS_AVG' + league_suffix] = league_games[league_games['Div'] == league_name]['FTHG'].mean()
    df.loc[row, 'A_GS_AVG' + league_suffix] = league_games[league_games['Div'] == league_name]['FTAG'].mean()
    
    # goals scored standard deviations
    df.loc[row, 'H_GS_STD' + league_suffix] = league_games[league_games['Div'] == league_name]['FTHG'].std()
    df.loc[row, 'A_GS_STD' + league_suffix] = league_games[league_games['Div'] == league_name]['FTAG'].std()
   
    home_games_won = len(league_games[(league_games['Div'] == league_name) & (league_games['FTR'] == 1)])
    games_drawn = len(league_games[(league_games['Div'] == league_name) & (league_games['FTR'] == 0)])
   
    if game_count > 0:
        df.loc[row, 'H_WIN_PCT' + league_suffix] = home_games_won / game_count
        df.loc[row, 'DRAW_PCT' + league_suffix] = games_drawn / game_count
    else:
        df.loc[row, 'H_WIN_PCT' + league_suffix] = 0
        df.loc[row, 'DRAW_PCT' + league_suffix] = 0

### Other league features

In [60]:
df['TEAM_CNT' + league_suffix] = 0
df['GD_STD' + league_suffix] = 0
df['RND_CNT' + league_suffix] = 0
for row, game in results.iterrows():
    league_name = game['Div']
    game_date = game['Date']
    
    # all leagues consist of 20 teams, except the Bundesliga (D1)
    if league_name == 'D1':
        df.loc[row, 'TEAM_CNT' + league_suffix] = 18
        df.loc[row, 'RND_CNT' + league_suffix] = 34
    else:
        df.loc[row, 'TEAM_CNT' + league_suffix] = 20
        df.loc[row, 'RND_CNT' + league_suffix] = 38
        
    games = get_league_games(league_name, game_date)
    
    # goal difference standard deviation
    df.loc[row, 'GD_STD' + league_suffix] = abs(games['FTHG'] - games['FTAG']).std() # muss hier wirklich abs() hin?

## Pi-ratings

### Functions

In [61]:
# Constant and error function (called psi) from the pi-ratings paper by Constantinou and Fenton
c = 3
def psi(error):
    return c * np.log10(1 + error)

def psi_h(error, exp_gd, actual_gd):
    if (exp_gd - actual_gd) < 0:
        return psi(error)
    else:
        return -1 * psi(error)

def psi_a(error, exp_gd, actual_gd):
    if (exp_gd - actual_gd) > 0:
        return psi(error)
    else:
        return -1 * psi(error)
    

# Constant and goal difference prediction function
b = 10
def predict_gd(rating):
    if rating >= 0:
        return np.power(b, np.abs(rating) / c) - 1
    else:
        return -(np.power(b, np.abs(rating) / c) - 1)

'''
Returns a dict with keys H_RTG_home, H_RTG_away, A_RTG_away, A_RTG_away
Used for one game!

cur_ratings: input in the same form as the output (dict with 4 elems); current ratings
'''
def update_pi_ratings(cur_ratings, actual_gd):
    H_RTG_home, H_RTG_away, A_RTG_home, A_RTG_away = cur_ratings['H_RTG_home'], cur_ratings['H_RTG_away'], cur_ratings['A_RTG_home'], cur_ratings['A_RTG_away']
    
    exp_gd = predict_gd(H_RTG_home) - predict_gd(A_RTG_away)    # Calculate expected goal difference
    
    error = np.abs(actual_gd - exp_gd)
    
    # The two learning rates from the pi-ratings paper (may have to be adjusted)
    lr_lambda = 0.1
    lr_gamma = 0.3
    
    # Update the pi-ratings
    new_H_RTG_home = H_RTG_home + psi_h(error, exp_gd, actual_gd) * lr_lambda 
    new_H_RTG_away = H_RTG_away + (new_H_RTG_home - H_RTG_home) * lr_gamma
    
    new_A_RTG_away = A_RTG_away + psi_a(error, exp_gd, actual_gd) * lr_lambda
    new_A_RTG_home = A_RTG_home + (new_A_RTG_away - A_RTG_away) * lr_gamma
    
    new_ratings = {'H_RTG_home': new_H_RTG_home, 'H_RTG_away': new_H_RTG_away, 'A_RTG_away': new_A_RTG_away, 'A_RTG_home': new_A_RTG_home}
    
    return new_ratings

In [62]:
update_pi_ratings({'H_RTG_home': 1.6, 'H_RTG_away': 0.4, 'A_RTG_home': 0.3, 'A_RTG_away': -1.2}, 3)

{'H_RTG_home': 1.5145736713398419,
 'H_RTG_away': 0.37437210140195254,
 'A_RTG_away': -1.1145736713398415,
 'A_RTG_home': 0.3256278985980475}

In [63]:
predict_gd(-1.5)

-2.1622776601683795

### Implementation into dataframe

In [64]:
# Returns the ID of the next game of a given team.
def get_next_game_id(team_id, date):
    games = results[((results['HomeID'] == team_id) | (results['AwayID'] == team_id))
                           & (results['Date'] > date)].sort_values(by='Date')
    
    if len(games) > 0:
        return games.index[0]    # [0] refers to the first element of the array returned by .index here
    else:
        print("LAST GAME:", date, team_id)
        return None

In [66]:
get_next_game_id(31, datetime.date(2017, 4, 2))

2576

In [None]:
df['H_RTG_home'] = 0
df['A_RTG_home'] = 0
df['H_RTG_away'] = 0
df['A_RTG_away'] = 0

for row, game in results.iterrows():
    league_name = game['Div']
    game_date = game['Date']    
    
    # Get current pi ratings from dataframe
    cur_ratings = {
        'H_RTG_home': df.loc[row, 'H_RTG_home'],
        'H_RTG_away': df.loc[row, 'H_RTG_away'],
        'A_RTG_home': df.loc[row, 'A_RTG_home'],
        'A_RTG_away': df.loc[row, 'A_RTG_away']
    }
    
    actual_gd = game['FTHG'] - game['FTAG']
    
    new_ratings = update_pi_ratings(cur_ratings, actual_gd)
    
    # Update all pi ratings for the next game (if it wasn't a teams last game)

    next_game_id_H = get_next_game_id(game['HomeID'], game_date)
    next_game_id_A = get_next_game_id(game['AwayID'], game_date)
    
    if next_game_id_H != None:
        df.loc[next_game_id_H, 'H_RTG_home'] = new_ratings['H_RTG_home']
        df.loc[next_game_id_H, 'H_RTG_away'] = new_ratings['H_RTG_away']
    
    if next_game_id_A != None:
        df.loc[next_game_id_A, 'A_RTG_home'] = new_ratings['A_RTG_home']
        df.loc[next_game_id_A, 'A_RTG_away'] = new_ratings['A_RTG_away']
    
    # TODO: Gucken ob die Indizes passen oder ob da kleine errors drin sind die alles durcheinander wirbeln
    

### Expected goal difference for the match

In [70]:
for row, game in results.iterrows():
    exp_gd_match = predict_gd(df.loc[row, 'H_RTG_home']) - predict_gd(df.loc[row, 'A_RTG_away'])
    df.loc[row, 'EGD'] = exp_gd_match

## PageRank

To save computation time, we don't calculate an entire matrix every time. Instead, we just calculate the average points of the two given teams against each other from the last two years (like described in the Hubacek paper). 

In [20]:
# We initialize it with 1 point per game as the default value.
df['EPTS_PR_home'] = 1
df['EPTS_PR_away'] = 1

for row, game in results.iterrows():
    home_id = game['HomeID']
    away_id = game['AwayID']
    
    game_date = game['Date']

    hist_games = get_historical_games(home_id, game_date)  # Could also use away_id here, it doesn't matter

    # Previous matches of the two current teams
    previous_matches = hist_games[((home_id == hist_games['HomeID']) | (home_id == hist_games['AwayID']))
              & ((away_id == hist_games['HomeID']) | (away_id == hist_games['AwayID']))]
    
    game_count = len(previous_matches)
    
    if game_count == 0:
        continue              # Avoid division by zero by just letting the PR for both teams be 1
    
    # Number of wins of the current(!) home team in previous_matches
    win_count_H = len(previous_matches[((previous_matches['HomeID'] == home_id) & previous_matches['FTR'] == 1)
                                      |((previous_matches['AwayID'] == home_id) & previous_matches['FTR'] == 2)])
    
    # Number of draws in previous_matches
    draw_count = len(previous_matches[previous_matches['FTR'] == 0])
    
    # Number of wins of the current(!) away team in previous_matches
    win_count_A = len(previous_matches[((previous_matches['HomeID'] == away_id) & previous_matches['FTR'] == 1)
                                      |((previous_matches['AwayID'] == away_id) & previous_matches['FTR'] == 2)])
    
    df.loc[row, 'EPTS_PR_home'] = (3 * win_count_H + draw_count) / game_count
    df.loc[row, 'EPTS_PR_away'] = (3 * win_count_A + draw_count) / game_count

# Implementing own features

**In the following section, some of my own feature ideas are implemented into the dataframe.**

These can be based on historical data, current form data, league data or whatever else.

## Red Cards

Here, we accumulate the red cards each team collected in the last three games. Missing a player due to a red card can be a potentially important factor.

In [71]:
df['RED_home'] = 0
df['RED_away'] = 0

for row, game in results.iterrows():
    for id_name in id_names:
        team_id = game[id_name]
        game_date = game['Date']
        
        recent = get_last_n_games(team_id, game_date, n=3)
        
        suffix = suffixes[id_name]
        
        df.loc[row, 'RED' + suffix] = recent['HR'].sum()

## Difference between own shots and shots conceded (historical)

In [72]:
df['H_ST_home'] = 0
df['A_ST_home'] = 0
df['H_ST_away'] = 0
df['A_ST_away'] = 0

for row, game in results.iterrows():
    for id_name in id_names:
        team_id = game[id_name]
        game_date = game['Date']

        hist_games = get_historical_games(team_id, game_date)
        
        home_shots_mean = hist_games[hist_games['HomeID'] == team_id]['HS'].mean()
        away_shots_mean = hist_games[hist_games['AwayID'] == team_id]['AS'].mean()
        
        if np.isnan(home_shots_mean):
            home_shots_mean = 0       # TODO: hier besseren filler value suchen
            
        if np.isnan(away_shots_mean):
            away_shots_mean = 0       # TODO: hier besseren filler value suchen
            
        df.loc[row, 'H_ST' + suffixes[id_name]] = home_shots_mean
        df.loc[row, 'A_ST' + suffixes[id_name]] = away_shots_mean

## Match importance (similar to Hubacek)

### Season

In [73]:
# Returns the season corresponding to a certain date.
def get_season_from_date(date):
    for i in range(9):
        if date < datetime.date(2010 + i+1, 7, 15):
            return i+1

In [74]:
# To build up tables, there needs to be a cut where one season ends and the next begins.
# This is the 15th of July every year here.
df['season'] = 0

for row, game in results.iterrows():
    game_date = game['Date']
    #print(get_season_from_date(game_date))
    df.loc[row, 'season'] = get_season_from_date(game_date)

### Round (1-34 / 1-38)

In [102]:
df['RND'] = 0

prev_season = 1
cur_matchday = 1

modulo_add = 1

df = df.sort_index()
    
if excepted:
    df.index = range(len(df.index))

for i in range(len(df.index)):
    try:
        game_season = df.loc[i, 'season']
        
        if game_season != prev_season:       # check if new season has begun
            cur_matchday = 1

        matches_per_md = int(np.round(df.loc[i, 'TEAM_CNT_league'] / 2))
    
        df.loc[i, 'RND'] = cur_matchday

        # Change modulo_add for the switch from Germany (9 games per matchday) to Italy (10 games per matchday).
        # Used to avoid getting bugs in current matchday.
        if i == 6174:
            modulo_add = -3
            
        if (i + modulo_add) % matches_per_md == 0:      # check if last game of a matchday has been played
            cur_matchday += 1

        prev_season = game_season
        
    except:
        df.loc[6968, 'season'] = 3
        df.loc[6968, 'RND'] = 4
        
        df.loc[8471, 'season'] = 7
        df.loc[8471, 'RND'] = 2
        continue

### Current standings

In [None]:
# Gets all games of a team that were played before a specific date in the current season
def get_team_season_games(team_id, date, season):
    games_by_team = results[((results['HomeID'] == team_id) | (results['AwayID'] == team_id))
                           & (results['Date'] < date) & (results[])]
    return games_by_team

In [32]:
get_last_n_games(8, datetime.date(2010,8,22), n=1, df_mode=True)

Unnamed: 0,HomeID,AwayID,FTHG,FTAG,FTR,H_WIN_PCT_home,H_DRAW_PCT_home,A_WIN_PCT_home,A_DRAW_PCT_home,H_WIN_PCT_away,...,RED_away,H_ST_home,H_ST_away,A_ST_home,A_ST_away,season,RND,Date,PTS_AFTER_GAME_home,PTS_AFTER_GAME_away
8,8.0,10.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,2010-08-15,1,1


In [95]:
# Points after the current game
df['PTS_AFTER_GAME_home'] = 0
df['PTS_AFTER_GAME_away'] = 0

for row, game in df.iterrows():
    #print(row)
    # Game result
    game_res = game['FTR']
    game_date = game['Date']
    
    # Get previous points of both teams (0 if this game is part of the first matchday)
    if game['RND'] == 1:
        prev_points_home = 0
        prev_points_away = 0
    else:
        # Last game played by the home team
        last_game_home = get_last_n_games(game['HomeID'], game_date, n=1, df_mode=True)
        # Last game played by the away team
        last_game_away = get_last_n_games(game['AwayID'], game_date, n=1, df_mode=True)
        
        #print(last_game_home.iloc[0])
        print(row)
        
        # Get points from after the teams' last games
        if game['HomeID'] == last_game_home.iloc[0]['HomeID']:
            prev_points_home = last_game_home.iloc[0]['PTS_AFTER_GAME_home']
        elif game['HomeID'] == last_game_home.iloc[0]['AwayID']:
            prev_points_home = last_game_home.iloc[0]['PTS_AFTER_GAME_away']
        
        if game['AwayID'] == last_game_away.iloc[0]['HomeID']:
            prev_points_away = last_game_away.iloc[0]['PTS_AFTER_GAME_home']
        elif game['AwayID'] == last_game_away.iloc[0]['AwayID']:
            prev_points_away = last_game_away.iloc[0]['PTS_AFTER_GAME_away']
        
    if game_res == 1:      # Home win
        df.loc[row, 'PTS_AFTER_GAME_home'] = prev_points_home + 3
    elif game_res == 0:    # Draw
        df.loc[row, 'PTS_AFTER_GAME_home'] = prev_points_home + 1
        df.loc[row, 'PTS_AFTER_GAME_away'] = prev_points_away + 1
    elif game_res == 2:    # Away win
        df.loc[row, 'PTS_AFTER_GAME_away'] = prev_points_away + 3

10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
28

1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127


3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697
3698
3699
3700
3701
3702
3703
3704
3705
3706
3707
3708
3709
3710
3711
3712
3713
3714
3715
3716
3717
3718
3719
3720
3721
3722
3723
3724
3725
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
3751
3752
3753
3754
3755
3756
3757
3758
3759
3760
3761
3762
3763
3764
3765
3766
3767
3768
3769
3770
3771
3772
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
3799
3800
3801
3802
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829


5318
5319
5320
5321
5322
5323
5324
5325
5326
5327
5328
5329
5330
5331
5332
5333
5334
5335
5336
5337
5338
5339
5340
5341
5342
5343
5344
5345
5346
5347
5348
5349
5350
5351
5352
5353
5354
5355
5356
5357
5358
5359
5360
5361
5362
5363
5364
5365
5366
5367
5368
5369
5370
5371
5372
5373
5374
5375
5376
5377
5378
5379
5380
5381
5382
5383
5384
5385
5386
5387
5388
5389
5390
5391
5392
5393
5394
5395
5396
5397
5398
5399
5400
5401
5402
5403
5404
5405
5406
5407
5408
5409
5410
5411
5412
5413
5414
5415
5416
5417
5418
5419
5420
5421
5422
5423
5424
5425
5426
5427
5428
5429
5430
5431
5432
5433
5434
5435
5436
5437
5438
5439
5440
5441
5442
5443
5444
5445
5446
5447
5448
5449
5450
5451
5452
5453
5454
5455
5456
5457
5458
5459
5460
5461
5462
5463
5464
5465
5466
5467
5468
5469
5470
5471
5472
5473
5474
5475
5476
5477
5478
5479
5480
5481
5482
5483
5484
5485
5486
5487
5488
5489
5490
5491
5492
5493
5494
5495
5496
5497
5498
5499
5500
5501
5502
5503
5504
5505
5506
5507
5508
5509
5510
5511
5512
5513
5514
5515
5516
5517


IndexError: single positional indexer is out-of-bounds

In [84]:
df[8460:8485]

Unnamed: 0,HomeID,AwayID,FTHG,FTAG,FTR,Date,H_WIN_PCT_home,H_DRAW_PCT_home,A_WIN_PCT_home,A_DRAW_PCT_home,...,H_RTG_away,A_RTG_away,EGD,RED_home,RED_away,H_ST_home,A_ST_home,H_ST_away,A_ST_away,season
8460,67,86,3,2,1,2016-08-21,0.473684,0.289474,0.263158,0.368421,...,0.049481,-0.486491,0.587613,1,1,15.131579,11.157895,14.657895,12.0,7
8461,68,89,0,1,2,2016-08-21,0.368421,0.263158,0.210526,0.315789,...,0.092982,0.0,0.268565,1,0,12.947368,10.5,13.631579,10.921053,7
8462,85,81,2,2,0,2016-08-21,0.0,0.0,0.0,0.0,...,0.116622,0.0,0.34766,0,1,0.0,0.0,18.921053,14.710526,7
8463,79,78,0,1,2,2016-08-27,0.578947,0.105263,0.384615,0.282051,...,0.049571,-0.031592,0.159762,0,0,15.289474,12.410256,17.717949,13.947368,7
8464,81,67,4,2,1,2016-08-27,0.710526,0.210526,0.410256,0.230769,...,0.062976,0.038874,0.14454,1,2,18.921053,14.897436,15.025641,11.157895,7
8465,72,62,2,2,0,2016-08-28,0.210526,0.210526,0.2,0.3,...,0.123009,-0.150919,0.492673,0,0,13.684211,12.0,15.897436,13.5,7
8466,93,77,1,3,2,2016-08-28,0.0,0.0,0.0,0.0,...,-0.02096,-0.68439,0.635858,0,0,0.0,5.0,13.692308,11.684211,7
8467,66,65,1,0,1,2016-08-28,0.526316,0.289474,0.410256,0.230769,...,-0.016575,0.099228,-0.122453,0,0,18.052632,13.025641,12.871795,10.289474,7
8468,74,68,1,1,0,2016-08-28,0.526316,0.236842,0.358974,0.282051,...,0.060964,-0.522686,0.662373,0,2,16.394737,13.435897,12.794872,10.5,7
8469,70,82,2,1,1,2016-08-28,0.394737,0.368421,0.230769,0.333333,...,-0.022544,-0.362112,0.261022,1,0,12.973684,11.564103,13.948718,10.315789,7


In [83]:
df[6960:6980]

Unnamed: 0,HomeID,AwayID,FTHG,FTAG,FTR,Date,H_WIN_PCT_home,H_DRAW_PCT_home,A_WIN_PCT_home,A_DRAW_PCT_home,...,H_RTG_away,A_RTG_away,EGD,RED_home,RED_away,H_ST_home,A_ST_home,H_ST_away,A_ST_away,season
6960,85,70,2,3,2,2012-09-16,0.0,0.0,0.0,0.0,...,0.01819,-0.427004,0.435463,0,0,13.0,1.0,12.789474,11.105263,3
6961,62,71,2,3,2,2012-09-16,0.552632,0.263158,0.368421,0.184211,...,-0.042505,0.358029,-0.431143,0,0,17.605263,14.315789,12.763158,10.289474,3
6962,83,63,2,2,0,2012-09-16,0.4,0.25,0.15,0.35,...,0.103898,-0.155489,0.431252,0,1,10.65,9.35,14.736842,13.105263,3
6963,86,74,0,2,2,2012-09-16,1.0,0.0,0.0,1.0,...,0.051833,0.0,0.141804,0,0,16.0,8.0,18.684211,14.236842,3
6964,78,65,2,0,1,2012-09-22,0.594595,0.297297,0.473684,0.421053,...,0.107771,0.161844,0.18522,1,0,18.27027,15.578947,12.868421,11.513514,3
6965,69,66,1,1,0,2012-09-22,0.459459,0.297297,0.236842,0.289474,...,0.145547,-0.13831,0.563169,0,0,14.702703,11.052632,15.473684,12.0,3
6966,82,68,1,0,1,2012-09-23,0.45,0.3,0.238095,0.380952,...,0.088646,0.239299,0.052956,0,0,11.95,9.714286,16.0,12.675676,3
6967,71,85,1,1,0,2012-09-23,0.378378,0.27027,0.263158,0.315789,...,-0.016602,0.416704,-0.420287,0,0,12.756757,10.131579,14.0,1.0,3
6968,76,81,0,0,0,2012-09-23,0.513514,0.243243,0.078947,0.421053,...,0.081966,-0.137137,0.344308,0,0,15.27027,12.868421,14.473684,12.324324,3
6969,74,83,0,2,2,2012-09-23,0.621622,0.189189,0.421053,0.157895,...,0.089884,0.149154,0.137262,0,0,18.702703,13.894737,10.809524,9.35,3


In [82]:
df.sample(10)

Unnamed: 0,HomeID,AwayID,FTHG,FTAG,FTR,Date,H_WIN_PCT_home,H_DRAW_PCT_home,A_WIN_PCT_home,A_DRAW_PCT_home,...,H_RTG_away,A_RTG_away,EGD,RED_home,RED_away,H_ST_home,A_ST_home,H_ST_away,A_ST_away,season
2043,24,0,1,1,0,2015-12-05,0.473684,0.236842,0.368421,0.236842,...,-0.050279,-0.149211,-0.015935,0,0,15.5,12.105263,11.5,10.0,6
222,0,18,1,0,1,2011-01-22,0.363636,0.363636,0.090909,0.272727,...,0.103653,0.028488,0.28157,1,1,13.0,11.0,13.833333,10.545455,1
6717,71,76,2,0,1,2012-01-08,0.333333,0.296296,0.222222,0.259259,...,-0.042716,0.05365,-0.157519,1,0,12.62963,10.555556,14.777778,13.148148,2
567,4,18,1,0,1,2012-01-01,0.321429,0.321429,0.25,0.285714,...,0.036509,-0.206681,0.269816,0,0,13.035714,11.714286,15.142857,13.0,2
5064,50,49,6,0,1,2015-11-21,0.705882,0.235294,0.382353,0.294118,...,0.0,0.078507,-0.062109,0,0,16.911765,13.352941,13.514286,10.242424,6
9896,96,99,0,0,0,2011-04-10,0.428571,0.142857,0.125,0.1875,...,-0.036112,-0.196598,0.066081,0,0,11.357143,8.25,12.733333,12.533333,1
3290,12,18,0,2,2,2019-02-06,0.564103,0.179487,0.157895,0.315789,...,0.154286,0.0,0.483985,0,0,12.871795,9.578947,19.205128,15.684211,9
3215,34,24,1,3,2,2018-12-22,0.25,0.25,0.148148,0.259259,...,0.032865,0.0,0.087718,0,0,10.785714,9.296296,14.131579,11.868421,9
1295,17,24,1,1,0,2013-12-14,0.526316,0.157895,0.289474,0.184211,...,-0.034117,-0.293094,0.161061,0,0,14.894737,10.631579,14.222222,11.269231,4
12491,125,108,1,3,2,2018-02-21,0.344828,0.275862,0.1875,0.25,...,0.0,-0.096194,0.076626,1,0,11.482759,9.75,19.108108,16.702703,8


In [22]:
df.columns

Index(['HomeID', 'AwayID', 'FTHG', 'FTAG', 'FTR', 'Date', 'H_WIN_PCT_home',
       'H_DRAW_PCT_home', 'A_WIN_PCT_home', 'A_DRAW_PCT_home',
       'H_WIN_PCT_away', 'H_DRAW_PCT_away', 'A_WIN_PCT_away',
       'A_DRAW_PCT_away'],
      dtype='object')

# Save dataframe as .pkl

In [85]:
df.to_pickle('feature_frame.pkl')