In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
# default_exp feature_engineering

In [None]:
# export
from pathlib import Path
import pandas as pd
import torch.nn.functional as F

In [None]:
import torch
import numpy as np

# Features
> All things feature engineering

## Read the example df

In [None]:
# export
def read_football_csv(path):
    """
    Read a football results CSV and return it sorted by match date.

    The `Date` column is parsed as day/month/year. The two-digit year format
    (`%d/%m/%y`) is tried first; if pandas rejects it, the four-digit year
    format (`%d/%m/%Y`) is used instead.

    Parameters
    ----------
    path : path or file-like object accepted by `pd.read_csv`.

    Returns
    -------
    pd.DataFrame with `Date` converted to datetimes, sorted ascending by `Date`.

    Raises
    ------
    ValueError if `Date` matches neither of the two formats.
    """
    df = pd.read_csv(path, encoding='latin1')
    try:
        df.Date = pd.to_datetime(df.Date, format='%d/%m/%y')
    except ValueError:
        # Only a format mismatch should trigger the fallback; any other error
        # (e.g. a missing `Date` column) is a real problem and must surface.
        df.Date = pd.to_datetime(df.Date, format='%d/%m/%Y')
    df = df.sort_values('Date')

    return df

In [None]:
df = read_football_csv(Path('../data/football_data_uk/raw/germany/D1_1415.csv'))
df.head()

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,BbAv<2.5,BbAH,BbAHh,BbMxAHH,BbAvAHH,BbMxAHA,BbAvAHA,PSCH,PSCD,PSCA
0,D1,2014-08-22,Bayern Munich,Wolfsburg,2,1,H,1,0,H,...,3.0,22,-2.0,2.35,2.24,1.73,1.68,1.29,6.67,10.58
1,D1,2014-08-23,Dortmund,Leverkusen,0,2,A,0,1,A,...,2.39,26,-1.0,2.08,2.02,1.87,1.84,1.75,4.18,4.77
2,D1,2014-08-23,Ein Frankfurt,Freiburg,1,0,H,1,0,H,...,2.01,22,-0.5,2.05,2.01,1.9,1.86,2.01,3.74,3.92
3,D1,2014-08-23,FC Koln,Hamburg,0,0,D,0,0,D,...,2.0,21,0.0,1.5,1.47,2.92,2.7,2.06,3.62,3.86
4,D1,2014-08-23,Hannover,Schalke 04,2,1,H,0,0,D,...,2.15,22,0.25,1.89,1.84,2.08,2.04,3.1,3.6,2.37


## Extract teams

In [None]:
# export
def extract_teams(df, home_team='HomeTeam', away_team='AwayTeam'):
    """
    Return the list of teams found in `df`, after two sanity checks.

    Checks (both via `assert`):
    - the set of home teams equals the set of away teams
    - the number of rows equals n_teams*(n_teams-1), i.e. every team hosts every
      other team exactly once

    The returned list follows the order of first appearance in the `home_team` column.
    """
    hosts = df[home_team].unique()
    guests = df[away_team].unique()
    host_set, guest_set = set(hosts), set(guests)
    assert host_set==guest_set, f"home/away team list is different:{host_set^guest_set}"

    n_teams = len(hosts)
    n_games = len(df)
    expected_n_games = n_teams*(n_teams-1)
    assert n_games==expected_n_games, f"Number of games {n_games} does not match number of teams ({n_teams}, {expected_n_games}"

    return [team for team in hosts]

In [None]:
teams = extract_teams(df)
teams

['Bayern Munich',
 'Dortmund',
 'Ein Frankfurt',
 'FC Koln',
 'Hannover',
 'Hertha',
 'Hoffenheim',
 'Paderborn',
 "M'gladbach",
 'Augsburg',
 'Hamburg',
 'Leverkusen',
 'Schalke 04',
 'Stuttgart',
 'Werder Bremen',
 'Wolfsburg',
 'Freiburg',
 'Mainz']

## Game day

There should be n_teams/2 games per gameday.

In [None]:
# export
def add_gamedays(df, home_team='HomeTeam', away_team='AwayTeam'):
    """
    Add a zero-based `gameday` column to `df` (in place) and return `df`.

    Rows are numbered purely by their current order: the first n_teams//2 rows
    become gameday 0, the next n_teams//2 rows gameday 1, and so on.
    """
    n_teams = len(extract_teams(df, home_team=home_team, away_team=away_team))

    n_gamedays = len(df)*2//n_teams
    games_per_day = n_teams//2

    # One entry per row: each gameday index repeated games_per_day times.
    gamedays = [day for day in range(n_gamedays) for _ in range(games_per_day)]
    assert len(gamedays)==len(df)

    df['gameday'] = gamedays
    return df
    # NOTE: a check that each team plays once during each gameday is not enforced here.

In [None]:
df = add_gamedays(df)

## Points

- 3 points for the winning team
- 0 points for the losing team
- 1 point for each team when they draw

- Extract all playing teams (sanity check)
- current_points: start with 0 points for each team
- home/away_points: empty list
- Go through the matches in order of date (=time of kickoff)
    - note current_points for each team
    - update current points according to match outcome
- add columns for home/away points

### Points accumulation

In [None]:
# export
def add_points(df, home_team='HomeTeam', away_team='AwayTeam', date='Date', result='FTR'):
    """
    Add the points each team had *before* kickoff as `home_points` / `away_points`.

    Matches are processed in order of `date`. For every match the current totals of
    both teams are recorded first and only then updated with the match outcome
    (3 for a win, 1 each for a draw, 0 for a loss).

    Parameters
    ----------
    df : DataFrame with one row per match.
    home_team, away_team : names of the columns holding the two team names.
    date : name of the column used to order the matches.
    result : name of the column holding the outcome, encoded as
        'H' (home win), 'D' (draw) or 'A' (away win).

    Returns
    -------
    A copy of `df` sorted by `date` with the two new columns; the input is not modified.

    Raises
    ------
    ValueError if a row holds a result other than 'H', 'D' or 'A'.
    """
    points_by_result = {'H': (3, 0), 'D': (1, 1), 'A': (0, 3)}

    df = df.sort_values(date)

    # Derive the teams from the frame itself instead of relying on a notebook global.
    current_points = {team: 0 for team in set(df[home_team]) | set(df[away_team])}
    total_home_points = []
    total_away_points = []

    for home, away, outcome in zip(df[home_team], df[away_team], df[result]):
        if outcome not in points_by_result:
            raise ValueError(f"Unknown match result {outcome!r}; expected one of 'H', 'D', 'A'")
        home_points, away_points = points_by_result[outcome]

        # Record the standing before this match, then apply its outcome.
        total_home_points.append(current_points[home])
        total_away_points.append(current_points[away])

        current_points[home] += home_points
        current_points[away] += away_points

    assert len(total_home_points)==len(total_away_points)==len(df)

    df['home_points'] = total_home_points
    df['away_points'] = total_away_points

    return df

In [None]:
df = add_points(df)
df.tail()

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,BbMxAHH,BbAvAHH,BbMxAHA,BbAvAHA,PSCH,PSCD,PSCA,gameday,home_points,away_points
301,D1,2015-05-23,Hamburg,Schalke 04,2,0,H,0,0,D,...,2.11,2.04,1.85,1.8,2.09,3.89,3.54,33,32,48
303,D1,2015-05-23,Hoffenheim,Hertha,2,1,H,1,0,H,...,2.0,1.95,1.98,1.9,2.14,3.57,3.68,33,41,35
302,D1,2015-05-23,Hannover,Freiburg,2,1,H,1,0,H,...,1.97,1.92,1.98,1.92,2.02,3.54,4.11,33,34,34
304,D1,2015-05-23,M'gladbach,Augsburg,1,3,A,1,0,H,...,1.94,1.89,2.02,1.96,1.47,5.0,7.05,33,66,46
305,D1,2015-05-23,Paderborn,Stuttgart,1,2,A,1,1,D,...,2.09,2.02,1.88,1.83,3.56,4.13,2.01,33,31,33


### Positions

In [None]:
# export
def _positions_from_sorted_points(sorted_teams):
    current_position = 0
    current_points = 1000
    gameday_positions = {}

    for team,points in sorted_teams:
        if points<current_points:
            current_position += 1
        gameday_positions[team] = current_position
        current_points = points
    return gameday_positions

def add_positions(df):
    """
    Add `home_position` / `away_position` columns to `df` (in place) and return `df`.

    For every `gameday` group the teams appearing in it are ranked by the
    `home_points` / `away_points` values of their row, and each match is then
    annotated with the position of its home and away team on that gameday.
    """
    # Called for its sanity checks only; the returned team list is not needed.
    extract_teams(df)

    all_positions = {}
    for gameday,group in df.groupby('gameday'):
        standings = []
        fixtures = zip(group['HomeTeam'], group['home_points'], group['AwayTeam'], group['away_points'])
        for home, home_pts, away, away_pts in fixtures:
            standings.append([home, home_pts])
            standings.append([away, away_pts])

        # Highest points first; ties keep their order of appearance.
        standings.sort(key=lambda entry: entry[1], reverse=True)
        all_positions[gameday] = _positions_from_sorted_points(standings)

    def _lookup(team_col):
        return df.apply(lambda match: all_positions[match.gameday][match[team_col]], axis=1)

    df['home_position'] = _lookup('HomeTeam')
    df['away_position'] = _lookup('AwayTeam')

    return df

In [None]:
df = add_positions(df)
df.head()

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,BbMxAHA,BbAvAHA,PSCH,PSCD,PSCA,gameday,home_points,away_points,home_position,away_position
0,D1,2014-08-22,Bayern Munich,Wolfsburg,2,1,H,1,0,H,...,1.73,1.68,1.29,6.67,10.58,0,0,0,1,1
1,D1,2014-08-23,Dortmund,Leverkusen,0,2,A,0,1,A,...,1.87,1.84,1.75,4.18,4.77,0,0,0,1,1
2,D1,2014-08-23,Ein Frankfurt,Freiburg,1,0,H,1,0,H,...,1.9,1.86,2.01,3.74,3.92,0,0,0,1,1
3,D1,2014-08-23,FC Koln,Hamburg,0,0,D,0,0,D,...,2.92,2.7,2.06,3.62,3.86,0,0,0,1,1
4,D1,2014-08-23,Hannover,Schalke 04,2,1,H,0,0,D,...,2.08,2.04,3.1,3.6,2.37,0,0,0,1,1


### Simple diffs

In [None]:
# export
def add_simple_diffs(df):
    """
    Add home-minus-away differences for points and positions.

    Writes `points_diff` and `position_diff` into `df` (in place) and returns `df`.
    """
    home_pts, away_pts = df['home_points'], df['away_points']
    df['points_diff'] = home_pts - away_pts

    home_pos, away_pos = df['home_position'], df['away_position']
    df['position_diff'] = home_pos - away_pos

    return df

In [None]:
df = add_simple_diffs(df)
df.tail()

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,PSCH,PSCD,PSCA,gameday,home_points,away_points,home_position,away_position,points_diff,position_diff
301,D1,2015-05-23,Hamburg,Schalke 04,2,0,H,0,0,D,...,2.09,3.89,3.54,33,32,48,14,5,-16,9
303,D1,2015-05-23,Hoffenheim,Hertha,2,1,H,1,0,H,...,2.14,3.57,3.68,33,41,35,8,11,6,-3
302,D1,2015-05-23,Hannover,Freiburg,2,1,H,1,0,H,...,2.02,3.54,4.11,33,34,34,12,12,0,0
304,D1,2015-05-23,M'gladbach,Augsburg,1,3,A,1,0,H,...,1.47,5.0,7.05,33,66,46,3,6,20,-3
305,D1,2015-05-23,Paderborn,Stuttgart,1,2,A,1,1,D,...,3.56,4.13,2.01,33,31,33,15,13,-2,2


## Result

In [None]:
df.head()

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,PSCH,PSCD,PSCA,gameday,home_points,away_points,home_position,away_position,points_diff,position_diff
0,D1,2014-08-22,Bayern Munich,Wolfsburg,2,1,H,1,0,H,...,1.29,6.67,10.58,0,0,0,1,1,0,0
1,D1,2014-08-23,Dortmund,Leverkusen,0,2,A,0,1,A,...,1.75,4.18,4.77,0,0,0,1,1,0,0
2,D1,2014-08-23,Ein Frankfurt,Freiburg,1,0,H,1,0,H,...,2.01,3.74,3.92,0,0,0,1,1,0,0
3,D1,2014-08-23,FC Koln,Hamburg,0,0,D,0,0,D,...,2.06,3.62,3.86,0,0,0,1,1,0,0
4,D1,2014-08-23,Hannover,Schalke 04,2,1,H,0,0,D,...,3.1,3.6,2.37,0,0,0,1,1,0,0


In [None]:
# export
def results_from_goals(df, home_col, away_col):
    """
    Compare the goals in `home_col` with the goals in `away_col` and return a
    Series (indexed like `df`) with the result encoded as:
    -1 -> home win
    0 -> draw
    1 -> away win
    """
    home_goals, away_goals = df[home_col], df[away_col]

    # Start from "draw" everywhere, then overwrite the rows with a winner.
    draws = pd.Series(0, index=df.index)
    return draws.mask(home_goals>away_goals, -1).mask(home_goals<away_goals, 1)

In [None]:
result_df = pd.DataFrame(columns=['home', 'away'])
result_df['home'] = [1,2,3,0]
result_df['away'] = [2,1,3,0]


result_df['result'] = results_from_goals(result_df, 'home', 'away')
result_df.head()

Unnamed: 0,home,away,result
0,1,2,1
1,2,1,-1
2,3,3,0
3,0,0,0


In [None]:
assert (result_df.result.values == [1,-1,0,0]).all()

## Profit odds

In [None]:
odds_df = result_df.copy()
odds_df[['odds_home', 'odds_draw', 'odds_away']] = df.loc[:3, ['B365H', 'B365D', 'B365A']]

odds_df.head()

Unnamed: 0,home,away,result,odds_home,odds_draw,odds_away
0,1,2,1,1.25,6.0,10.0
1,2,1,-1,1.57,4.33,5.0
2,3,3,0,2.05,3.4,3.6
3,0,0,0,2.0,3.5,3.6


In [None]:
# export
def create_profit_df(df, odds_home, odds_draw, odds_away, home_profit='y_home', draw_profit='y_draw', away_profit='y_away', df_result_col='result'):
    """
    Build the profit of a unit stake on each of the three outcomes of every match.

    A bet on the outcome that occurred yields `odds - 1`; a bet on any other
    outcome loses the stake (-1).

    Parameters
    ----------
    df : DataFrame holding the odds columns and the result column.
    odds_home, odds_draw, odds_away : names of the odds columns in `df`.
    home_profit, draw_profit, away_profit : names of the columns of the returned frame.
    df_result_col : name of the result column, encoded as
        -1 (home win), 0 (draw), 1 (away win).

    Returns
    -------
    DataFrame of floats with the index of `df` and the three profit columns.
    """
    # Float from the start: adding float odds to an integer column relies on an
    # implicit dtype upcast that pandas has deprecated.
    y_df = pd.DataFrame(-1.0, index=df.index, columns=[home_profit, draw_profit, away_profit])

    outcomes = (
        (-1, home_profit, odds_home),
        (0, draw_profit, odds_draw),
        (1, away_profit, odds_away),
    )
    for code, profit_col, odds_col in outcomes:
        occurred = df[df_result_col]==code
        # Winning rows get odds-1, every other row keeps the lost stake.
        y_df[profit_col] = (df[odds_col].astype(float) - 1.0).where(occurred, -1.0)

    return y_df

def add_profit_cols(df, odds_home, odds_draw, odds_away, home_profit='y_home', draw_profit='y_draw', away_profit='y_away', df_result_col='result'):
    """
    Return `df` with the three profit columns from `create_profit_df` attached.

    All arguments are forwarded to `create_profit_df`; the profit frame is then
    joined to `df` on the index. The input frame is not modified.
    """
    # Use the frame and column names that were passed in, not notebook globals.
    profit_df = create_profit_df(
        df, odds_home, odds_draw, odds_away,
        home_profit=home_profit, draw_profit=draw_profit, away_profit=away_profit,
        df_result_col=df_result_col,
    )

    return df.merge(profit_df, left_index=True, right_index=True)

In [None]:
create_profit_df(odds_df, 'odds_home', 'odds_draw', 'odds_away')

Unnamed: 0,y_home,y_draw,y_away
0,-1.0,-1.0,9.0
1,0.57,-1.0,-1.0
2,-1.0,2.4,-1.0
3,-1.0,2.5,-1.0


In [None]:
add_profit_cols(odds_df, 'odds_home', 'odds_draw', 'odds_away')

Unnamed: 0,home,away,result,odds_home,odds_draw,odds_away,y_home,y_draw,y_away
0,1,2,1,1.25,6.0,10.0,-1.0,-1.0,9.0
1,2,1,-1,1.57,4.33,5.0,0.57,-1.0,-1.0
2,3,3,0,2.05,3.4,3.6,-1.0,2.4,-1.0
3,0,0,0,2.0,3.5,3.6,-1.0,2.5,-1.0


## Normalizer

In [None]:
df = pd.DataFrame([[1,2,3], [4,5,6]], columns=['a', 'b', 'c'])
df

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6


In [None]:
# export
def normalize_by_args(x, mean, std):
    """Shift `x` by `mean`, then scale the result by `std`."""
    centered = x - mean
    return centered / std

def normalize_col(col):
    """Normalize `col` with its own `mean()` and `std()`."""
    center, spread = col.mean(), col.std()
    return (col - center) / spread

class ColumnNormalizer:
    """
    Normalizes values with a single mean/std computed once over a block of columns.

    The statistics are taken over all values of `columns` together (not per column),
    so applying the normalizer to the same block yields an overall mean of 0 and an
    overall std of 1.
    """
    def __init__(self, columns, names=None):
        """
        columns: object exposing `.values` (e.g. a DataFrame slice) to take the statistics from.
        names: optional column names, kept for display and for selecting the columns later.
        """
        # NOTE(review): `.values.std()` gives 1.58 for the notebook example, i.e. the
        # population std, while `normalize_col` uses `Series.std()` - confirm this is intended.
        self.mean = columns.values.mean()
        self.std = columns.values.std()
        self.names = names

    @classmethod
    def from_df(cls, df, col_names):
        """Create a normalizer from the columns `col_names` of `df`."""
        return cls(df.loc[:,col_names], names=col_names)

    def __str__(self):
        names = self.names
        # `names or "unknown"` raises for array-like names (e.g. a pandas Index),
        # so test for "missing or empty" explicitly.
        is_missing = names is None or (hasattr(names, '__len__') and len(names)==0)
        shown = "unknown" if is_missing else names
        return f'Mean: {self.mean} | Std: {self.std} | Names: {shown}'

    def __repr__(self): return str(self)

    def __call__(self, x):
        """Normalize `x` with the stored mean and std."""
        return normalize_by_args(x, self.mean, self.std)

In [None]:
normalize_by_args(df.a, 5, 2.)

0   -2.0
1   -0.5
Name: a, dtype: float64

In [None]:
normalize_col(df.a)

0   -0.707107
1    0.707107
Name: a, dtype: float64

In [None]:
ColumnNormalizer(df.loc[:,['a', 'b']]), ColumnNormalizer.from_df(df, ['a','b'])

(Mean: 3.0 | Std: 1.5811388300841898 | Names: unknown,
 Mean: 3.0 | Std: 1.5811388300841898 | Names: ['a', 'b'])

In [None]:
norm = ColumnNormalizer.from_df(df, ['a','b'])

df.loc[:, norm.names] = norm(df.loc[:, norm.names])
df

Unnamed: 0,a,b,c
0,-1.264911,-0.632456,3
1,0.632456,1.264911,6


In [None]:
# Compare with a tolerance: the normalized values are floats, so exact equality
# with 0. and 1. depends on rounding.
assert np.isclose(df.loc[:, ['a', 'b']].values.mean(), 0.), 'ColumnNormalizer should produce 0 mean.'
assert np.isclose(df.loc[:, ['a', 'b']].values.std(), 1.), 'ColumnNormalizer should produce 1. std.'

## Profit loss

Computes the profit of a betting allocation. The loss is the negated profit, so that minimizing the loss maximizes the profit.

In [None]:
# export
def odds_loss(actual, target):
    """
        Mean negative profit over all rows.

        `actual` is turned into per-row weights with a softmax over dim 1, the
        weights are multiplied with `target`, summed per row, averaged and negated.
    """
    weights = F.softmax(actual, dim=1)
    profit_per_row = (weights*target).sum(dim=1)
    return -profit_per_row.mean()

def odds_profit(actual, target):
    """
        Total profit over all rows.

        `actual` is turned into per-row weights with a softmax over dim 1; the
        weighted `target` values are summed over every row and column.
    """
    weights = F.softmax(actual, dim=1)
    weighted_target = weights*target
    return weighted_target.sum()

In [None]:
odds_df = pd.DataFrame([[3.2, 3.25, 2.29], [1.91, 3.3, 3.93]], columns=['home', 'draw', 'away'])
odds_df.head()

Unnamed: 0,home,draw,away
0,3.2,3.25,2.29
1,1.91,3.3,3.93


In [None]:
allocations = torch.tensor([[.5, .2, .3], [.8, .05, .15]])

odds_loss(allocations, odds_df.values), odds_profit(allocations, odds_df.values)

(tensor(-2.8457, dtype=torch.float64), tensor(5.6913, dtype=torch.float64))

## Export

In [None]:
#hide
from nbdev.export import *
notebook2script()

Converted 00_scraping.ipynb.
Converted 01_utility.ipynb.
Converted 02_data.ipynb.
Converted 03_sportde.ipynb.
Converted 04_feature_engineering.ipynb.
Converted index.ipynb.
