In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
# default_exp feature_engineering

from pathlib import Path
import pandas as pd

# Features
> All things feature engineering

In [None]:
# Load the German D1 2014/15 season file and parse its day-first dates.
df = pd.read_csv('../data/football_data_uk/germany/D1_1415.csv')
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%y')
# sort_values returns a new frame, so the result must be re-assigned to take
# effect. A stable sort keeps the file order of matches played on the same
# date, which the position-based gameday assignment further down relies on.
df = df.sort_values('Date', kind='mergesort')
df.head()

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,BbAv<2.5,BbAH,BbAHh,BbMxAHH,BbAvAHH,BbMxAHA,BbAvAHA,PSCH,PSCD,PSCA
0,D1,2014-08-22,Bayern Munich,Wolfsburg,2,1,H,1,0,H,...,3.0,22,-2.0,2.35,2.24,1.73,1.68,1.29,6.67,10.58
1,D1,2014-08-23,Dortmund,Leverkusen,0,2,A,0,1,A,...,2.39,26,-1.0,2.08,2.02,1.87,1.84,1.75,4.18,4.77
2,D1,2014-08-23,Ein Frankfurt,Freiburg,1,0,H,1,0,H,...,2.01,22,-0.5,2.05,2.01,1.9,1.86,2.01,3.74,3.92
3,D1,2014-08-23,FC Koln,Hamburg,0,0,D,0,0,D,...,2.0,21,0.0,1.5,1.47,2.92,2.7,2.06,3.62,3.86
4,D1,2014-08-23,Hannover,Schalke 04,2,1,H,0,0,D,...,2.15,22,0.25,1.89,1.84,2.08,2.04,3.1,3.6,2.37


### Extract teams

In [None]:
# export
def extract_teams(df, home_team='HomeTeam', away_team='AwayTeam'):
    """Return the teams found in `df` after sanity-checking the fixture list.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per match.
    home_team, away_team : str
        Names of the columns holding the home and away team of each match.

    Returns
    -------
    list
        Team names in order of their first appearance in the home column.

    Raises
    ------
    AssertionError
        If the sets of home and away teams differ, or if the number of rows
        is not ``n_teams * (n_teams - 1)`` (the size of a full home-and-away
        round robin).
    """
    home_teams = df[home_team].unique()
    home_set = set(home_teams)
    away_set = set(df[away_team].unique())
    assert home_set==away_set, f"home/away team list is different:{home_set^away_set}"

    n_games = len(df)
    n_teams = len(home_teams)
    expected_n_games = n_teams*(n_teams-1)
    assert n_games==expected_n_games, (
        f"Number of games ({n_games}) does not match number of teams "
        f"({n_teams}, expected {expected_n_games} games)"
    )

    return list(home_teams)

In [None]:
# Validate the season's fixture list and collect the participating teams.
teams = extract_teams(df)
teams

['Bayern Munich',
 'Dortmund',
 'Ein Frankfurt',
 'FC Koln',
 'Hannover',
 'Hertha',
 'Hoffenheim',
 "M'gladbach",
 'Paderborn',
 'Augsburg',
 'Hamburg',
 'Leverkusen',
 'Schalke 04',
 'Stuttgart',
 'Werder Bremen',
 'Wolfsburg',
 'Freiburg',
 'Mainz']

### Game day

There should be n_teams/2 games per gameday.

In [None]:
# export
def add_gamedays(df, home_team='HomeTeam', away_team='AwayTeam'):
    """Add a 0-based ``gameday`` column to `df` in place.

    Gamedays are assigned purely by row position: the first ``n_teams // 2``
    rows become gameday 0, the next ``n_teams // 2`` rows gameday 1, and so
    on. The rows therefore have to be in playing order already.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per match; modified in place.
    home_team, away_team : str
        Names of the columns holding the home and away team of each match.

    Returns
    -------
    None

    Raises
    ------
    AssertionError
        If `extract_teams` rejects the fixture list, if the generated gameday
        labels do not cover every row, or if some team does not play in one
        of the gamedays. In the last case the ``gameday`` column has already
        been written to `df`.
    """
    teams = extract_teams(df, home_team=home_team, away_team=away_team)

    n_teams = len(teams)
    n_gamedays = len(df)*2//n_teams
    games_per_day = n_teams//2

    # One label per match: each gameday index repeated `games_per_day` times.
    gamedays = [day for day in range(n_gamedays) for _ in range(games_per_day)]
    assert len(gamedays)==len(df)

    df['gameday'] = gamedays

    all_teams = set(teams)
    for gameday, group in df.groupby('gameday'):
        # Use the configured column names rather than hard-coded ones so the
        # check also works for frames with non-default column names.
        playing = set(group[home_team].unique()) | set(group[away_team].unique())
        assert playing==all_teams, "Each team should play once during each gameday"

In [None]:
# Adds the `gameday` column to df in place; the function returns nothing.
add_gamedays(df)

## Points

- 3 points for the winning team
- 0 points for the losing team
- 1 point for each team when they draw

- Extract all playing teams (sanity check)
- current_points: start with 0 points for each team
- home/away_points: empty list
- Go through the matches in order of date (=time of kickoff)
    - note current_points for each team
    - update current points according to match outcome
- add columns for home/away points

### Points accumulation

In [None]:
# export
def add_points(df, home_team='HomeTeam', away_team='AwayTeam', date='Date', result='FTR'):
    """Add each team's accumulated points *before* every match.

    The matches are processed in order of `date`. For every match the points
    both teams hold at that moment are recorded, and only then are the
    standings updated with the outcome of the match (3 points for a win,
    1 each for a draw, 0 for a loss).

    Parameters
    ----------
    df : pandas.DataFrame
        One row per match. The frame passed in is not modified.
    home_team, away_team : str
        Names of the columns holding the home and away team of each match.
    date : str
        Name of the column the matches are ordered by.
    result : str
        Name of the column holding the result code: 'H' (home win),
        'D' (draw) or 'A' (away win).

    Returns
    -------
    pandas.DataFrame
        A copy of `df` sorted by `date` with two extra columns,
        ``home_points`` and ``away_points``.

    Raises
    ------
    ValueError
        If a row carries a result code other than 'H', 'D' or 'A'.
    """
    # (home, away) points awarded for each result code.
    points_by_result = {'H': (3, 0), 'D': (1, 1), 'A': (0, 3)}

    df = df.sort_values(date)

    # Every team starts on zero points. The teams are taken from the frame
    # itself so the function does not depend on a notebook-global variable.
    current_points = {team: 0 for team in set(df[home_team]) | set(df[away_team])}
    total_home_points = []
    total_away_points = []

    for home, away, outcome in zip(df[home_team], df[away_team], df[result]):
        if outcome not in points_by_result:
            # Fail loudly instead of silently re-using the previous match's points.
            raise ValueError(
                f"Unknown match result {outcome!r} for {home} vs {away}; "
                f"expected one of {sorted(points_by_result)}"
            )
        home_points, away_points = points_by_result[outcome]

        # Record the standings before the match, then apply its outcome.
        total_home_points.append(current_points[home])
        total_away_points.append(current_points[away])

        current_points[home] += home_points
        current_points[away] += away_points

    df['home_points'] = total_home_points
    df['away_points'] = total_away_points

    return df

In [None]:
# add_points returns a date-sorted copy with the new columns, so rebind df.
df = add_points(df)
df.tail()

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,BbMxAHH,BbAvAHH,BbMxAHA,BbAvAHA,PSCH,PSCD,PSCA,gameday,home_points,away_points
299,D1,2015-05-23,Ein Frankfurt,Leverkusen,2,1,H,2,1,H,...,2.04,1.97,1.93,1.87,4.15,4.27,1.84,33,40,61
298,D1,2015-05-23,Dortmund,Werder Bremen,3,2,H,3,1,H,...,1.82,1.75,2.23,2.13,1.26,7.47,10.8,33,43,43
297,D1,2015-05-23,Bayern Munich,Mainz,2,0,H,1,0,H,...,1.96,1.91,2.0,1.93,1.17,9.48,15.98,33,76,40
300,D1,2015-05-23,FC Koln,Wolfsburg,2,2,D,1,2,A,...,2.03,1.97,1.93,1.88,4.19,3.75,1.94,33,39,68
305,D1,2015-05-23,Paderborn,Stuttgart,1,2,A,1,1,D,...,2.09,2.02,1.88,1.83,3.56,4.13,2.01,33,31,33


### Positions

In [None]:
# export
def _positions_from_sorted_points(sorted_teams):
    current_position = 0
    current_points = 1000
    gameday_positions = {}

    for team,points in sorted_teams:
        if points<current_points:
            current_position += 1
        gameday_positions[team] = current_position
        current_points = points
    return gameday_positions

def add_positions(df, home_team='HomeTeam', away_team='AwayTeam'):
    """Add the table position of both teams for every match.

    For each gameday the teams are ranked by the points stored in the
    ``home_points`` / ``away_points`` columns; teams level on points share a
    position (see `_positions_from_sorted_points`).

    Parameters
    ----------
    df : pandas.DataFrame
        One row per match with ``gameday``, ``home_points`` and
        ``away_points`` columns; modified in place.
    home_team, away_team : str
        Names of the columns holding the home and away team of each match.

    Returns
    -------
    pandas.DataFrame
        The same `df`, with ``home_position`` and ``away_position`` added.

    Raises
    ------
    AssertionError
        If `extract_teams` rejects the fixture list.
    """
    # Called only for its sanity checks on the fixture list.
    extract_teams(df, home_team=home_team, away_team=away_team)

    all_positions = {}
    for gameday, group in df.groupby('gameday'):
        points = []
        for home, home_pts, away, away_pts in zip(
            group[home_team], group['home_points'],
            group[away_team], group['away_points'],
        ):
            points.append([home, home_pts])
            points.append([away, away_pts])

        sorted_teams = sorted(points, key=lambda el: el[1], reverse=True)
        all_positions[gameday] = _positions_from_sorted_points(sorted_teams)

    # Look the positions up row by row, in the frame's current row order.
    df['home_position'] = [
        all_positions[gameday][team]
        for gameday, team in zip(df['gameday'], df[home_team])
    ]
    df['away_position'] = [
        all_positions[gameday][team]
        for gameday, team in zip(df['gameday'], df[away_team])
    ]

    return df

In [None]:
# Table positions per gameday, derived from the pre-match points.
df = add_positions(df)
df.head()

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,BbMxAHA,BbAvAHA,PSCH,PSCD,PSCA,gameday,home_points,away_points,home_position,away_position
0,D1,2014-08-22,Bayern Munich,Wolfsburg,2,1,H,1,0,H,...,1.73,1.68,1.29,6.67,10.58,0,0,0,1,1
1,D1,2014-08-23,Dortmund,Leverkusen,0,2,A,0,1,A,...,1.87,1.84,1.75,4.18,4.77,0,0,0,1,1
2,D1,2014-08-23,Ein Frankfurt,Freiburg,1,0,H,1,0,H,...,1.9,1.86,2.01,3.74,3.92,0,0,0,1,1
3,D1,2014-08-23,FC Koln,Hamburg,0,0,D,0,0,D,...,2.92,2.7,2.06,3.62,3.86,0,0,0,1,1
4,D1,2014-08-23,Hannover,Schalke 04,2,1,H,0,0,D,...,2.08,2.04,3.1,3.6,2.37,0,0,0,1,1


### Simple diffs

In [None]:
# export
def add_simple_diffs(df):
    """Add home-minus-away difference columns for points and table position.

    Writes ``points_diff`` and ``position_diff`` into `df` itself and returns
    that same frame.
    """
    # Points: home team's tally minus the away team's.
    home_pts, away_pts = df.home_points, df.away_points
    df['points_diff'] = home_pts.sub(away_pts)

    # Table position: home team's position minus the away team's.
    home_pos, away_pos = df.home_position, df.away_position
    df['position_diff'] = home_pos.sub(away_pos)

    return df

In [None]:
# Home-minus-away differences of points and table position.
df = add_simple_diffs(df)
df.tail()

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,PSCH,PSCD,PSCA,gameday,home_points,away_points,home_position,away_position,points_diff,position_diff
299,D1,2015-05-23,Ein Frankfurt,Leverkusen,2,1,H,2,1,H,...,4.15,4.27,1.84,33,40,61,9,4,-21,5
298,D1,2015-05-23,Dortmund,Werder Bremen,3,2,H,3,1,H,...,1.26,7.47,10.8,33,43,43,7,7,0,0
297,D1,2015-05-23,Bayern Munich,Mainz,2,0,H,1,0,H,...,1.17,9.48,15.98,33,76,40,1,9,36,-8
300,D1,2015-05-23,FC Koln,Wolfsburg,2,2,D,1,2,A,...,4.19,3.75,1.94,33,39,68,10,2,-29,8
305,D1,2015-05-23,Paderborn,Stuttgart,1,2,A,1,1,D,...,3.56,4.13,2.01,33,31,33,15,13,-2,2
