# Pipeline

1. Preprocessing
2. **Data Extraction**

This notebook performs the data extraction step. It generates several features and saves a .pkl file containing a dataframe with these features.

# Imports

In [67]:
import pandas as pd
import numpy as np
import datetime
from collections import defaultdict
from sklearn.model_selection import train_test_split

# Data extraction into new Dataframe

In [92]:
# Load the preprocessed match table and keep only the columns used as the base
# of the feature dataframe (team ids, full-time goals, full-time result).
# We don't need HomeID and AwayID here, but I put it in just for exploration
# NOTE(review): unpickling can execute arbitrary code — presumably this file is
# written by the preprocessing step of this pipeline; only load trusted pickles.
results = pd.read_pickle('preprocessed_results.pkl')
df = results[['HomeID', 'AwayID', 'FTHG', 'FTAG', 'FTR']].copy()
df

Unnamed: 0,HomeID,AwayID,FTHG,FTAG,FTR
0,0,15,3,0,1
1,1,12,1,0,1
2,2,16,0,0,0
3,3,14,6,0,1
4,4,11,2,2,0
...,...,...,...,...,...
13015,97,105,2,2,0
13016,111,106,2,0,1
13017,120,109,0,2,2
13018,122,107,2,2,0


In [93]:
# Peek at three randomly chosen rows of the full preprocessed table (all columns).
results.sample(3)

Unnamed: 0,Div,Date,HomeID,AwayID,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR
6473,I1,2011-03-20,70,69,0,1,2,0,0,D,...,4,2,20,28,6,4,4,4,0,1
7500,I1,2014-12-01,82,76,2,1,1,0,0,D,...,8,4,13,13,8,10,2,3,0,0
11274,SP1,2014-12-20,118,99,1,3,2,1,1,D,...,5,4,11,20,4,5,3,4,0,1


In [5]:
# Disabled scratch check: relative frequency of each full-time result (FTR)
# among the games in which team 105 was the away side.
#slc = df['AwayID'] == 105
#df[slc][['FTR']].value_counts() / sum(slc)

# Implementing Features

__For the moment, the features are closely aligned with those implemented by Hubáček et al. This means that most features are implemented twice: once for the home team and once for the away team. The naming convention for this approach is FEATURE_NAME_home/away.__

Example: <br>
A_WIN_PCT_home would be the historical away winning percentage of the team that plays the current game at home. <br>
H_WIN_PCT_home would be the historical home winning percentage of that same team.

That means the word after the last underscore indicates if the value of this feature represents the home team or the away team of the current(!) game.

In [43]:
# TODO: possibly wrap the feature computations in functions so they can be toggled on and off
# All non-italic features from the Hubáček paper have to be implemented twice, see above.

## Historical strength

In [70]:
# Peek at three randomly chosen rows of the feature dataframe.
df.sample(3)

Unnamed: 0,HomeID,AwayID,FTHG,FTAG,FTR
3627,41,47,2,1,1
8991,67,71,2,1,1
10036,117,97,0,1,2


In [71]:
def get_historical_games(team_id, date, games=None, window_days=730):
    """Return all games a team played in the look-back window before `date`.

    Parameters
    ----------
    team_id
        Team identifier, matched against the ``HomeID`` and ``AwayID`` columns.
    date : datetime.date, datetime.datetime, pd.Timestamp or str
        Exclusive upper bound of the window. It is converted to a
        ``pd.Timestamp`` so that plain ``datetime.date`` objects can be
        compared with the ``Date`` column.
    games : pd.DataFrame, optional
        Table to search; must have ``HomeID``, ``AwayID`` and ``Date`` columns.
        Defaults to the notebook-level ``results`` dataframe.
    window_days : int, default 730
        Length of the look-back window in days (730 days, i.e. two years).

    Returns
    -------
    pd.DataFrame
        The rows of `games` in which `team_id` is the home or away team and
        ``date - window_days < Date < date`` (both bounds exclusive).
    """
    if games is None:
        games = results

    window_end = pd.Timestamp(date)
    window_start = window_end - pd.Timedelta(days=window_days)

    involves_team = (games['HomeID'] == team_id) | (games['AwayID'] == team_id)
    in_window = (games['Date'] > window_start) & (games['Date'] < window_end)
    return games[involves_team & in_window]

In [72]:
# Example: first three rows of the games team 8 played before 2016-05-20.
# The date is passed as a pd.Timestamp because it is compared directly with the
# datetime64 'Date' column; a plain datetime.date is not comparable to that
# column on recent pandas versions.
get_historical_games(8, pd.Timestamp('2016-05-20')).head(3)

Unnamed: 0,Div,Date,HomeID,AwayID,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR
1348,E1,2014-12-01,13,8,3,5,2,2,2,D,...,4,9,10,8,11,2,1,1,0,0
1383,E1,2014-08-02,8,10,5,1,1,4,0,H,...,12,6,14,14,6,6,1,2,0,0
1395,E1,2014-12-02,16,8,2,3,2,1,1,D,...,3,7,7,9,7,5,4,3,0,0


### Home team percentages

In [94]:
# Historical strength: win and draw rates of both teams of every game, computed
# from the games returned by get_historical_games for the game's date.
# Column names follow <H|A>_<WIN|DRAW>_PCT_<home|away>: the H/A prefix is the
# venue the rate was earned at, the suffix is the side the team plays on in the
# current game.
suffixes = {'HomeID':'_home', 'AwayID':'_away'}    # suffix dict for the feature names

# Full-time result codes used in results['FTR'].
FTR_DRAW = 0
FTR_HOME_WIN = 1
FTR_AWAY_WIN = 2

# Value used when a team has no home (or away) games in the look-back window.
# TODO maybe choose some standard value like 0.25 instead of 0?
#      Overall median/mean could also work.
NO_HISTORY_PCT = 0.0

stat_names = ('H_WIN_PCT', 'H_DRAW_PCT', 'A_WIN_PCT', 'A_DRAW_PCT')
feature_columns = [stat + suffix for suffix in suffixes.values() for stat in stat_names]


def _result_share(games, ftr_code):
    """Return the fraction of `games` whose FTR equals `ftr_code`.

    Returns NO_HISTORY_PCT when `games` is empty.
    TODO should the threshold be higher than one game to only use
    significant enough data?
    """
    if len(games) == 0:
        return NO_HISTORY_PCT
    return int((games['FTR'] == ftr_code).sum()) / len(games)


def _historical_pcts(team_id, game_date):
    """Return the home/away win and draw rates of `team_id` before `game_date`."""
    hist_games = get_historical_games(team_id, game_date)
    home_games = hist_games[hist_games['HomeID'] == team_id]
    away_games = hist_games[hist_games['AwayID'] == team_id]
    return {
        'H_WIN_PCT': _result_share(home_games, FTR_HOME_WIN),
        'H_DRAW_PCT': _result_share(home_games, FTR_DRAW),
        'A_WIN_PCT': _result_share(away_games, FTR_AWAY_WIN),
        'A_DRAW_PCT': _result_share(away_games, FTR_DRAW),
    }


# Collect one record per game and build the feature columns in one go instead
# of writing single cells with df.loc inside the loop.
feature_rows = []
for _, game in results.iterrows():
    feature_row = {}
    for id_name, suffix in suffixes.items():
        team_pcts = _historical_pcts(game[id_name], game['Date'])
        feature_row.update({stat + suffix: value for stat, value in team_pcts.items()})
    feature_rows.append(feature_row)

# Float dtype from the start: the columns hold ratios, and writing floats into
# integer-initialised columns triggers pandas' incompatible-dtype warning.
hist_features = pd.DataFrame(feature_rows, index=results.index,
                             columns=feature_columns, dtype=float)
for column in feature_columns:
    df[column] = hist_features[column]      # aligned on the shared index

### Away team percentages

In [95]:
# Spot-check six randomly chosen rows of the feature dataframe.
df.sample(6)

Unnamed: 0,HomeID,AwayID,FTHG,FTAG,FTR,H_WIN_PCT_home,H_DRAW_PCT_home,A_WIN_PCT_home,A_DRAW_PCT_home,H_WIN_PCT_away,H_DRAW_PCT_away,A_WIN_PCT_away,A_DRAW_PCT_away
12566,108,105,1,1,0,0.74359,0.153846,0.666667,0.194444,0.710526,0.184211,0.487179,0.307692
472,22,1,3,3,0,0.75,0.25,0.142857,0.285714,0.36,0.24,0.181818,0.181818
6741,68,77,5,3,1,0.653846,0.115385,0.21875,0.1875,0.517241,0.172414,0.214286,0.25
4221,37,56,1,0,1,0.3125,0.375,0.25,0.25,0.3125,0.375,0.125,0.1875
11733,113,98,1,0,1,0.638889,0.138889,0.292683,0.414634,0.425,0.225,0.289474,0.157895
10609,97,102,0,2,2,0.567568,0.189189,0.236842,0.263158,0.473684,0.263158,0.184211,0.289474


In [58]:
# Inspect the dtype of every column of the feature dataframe.
df.dtypes

HomeID               int64
AwayID               int64
FTHG                 int32
FTAG                 int32
FTR                  int64
H_WIN_PCT_home     float64
H_DRAW_PCT_home    float64
A_WIN_PCT_home     float64
A_DRAW_PCT_home    float64
dtype: object

## Current form (data from the last five games)

In [10]:
def get_last_five_games(team_id, date, games=None, n_games=5):
    """Return the most recent games a team played strictly before `date`.

    Parameters
    ----------
    team_id
        Team identifier, matched against the ``HomeID`` and ``AwayID`` columns.
    date : datetime.date, datetime.datetime, pd.Timestamp or str
        Exclusive upper bound; converted to ``pd.Timestamp`` before it is
        compared with the ``Date`` column.
    games : pd.DataFrame, optional
        Table to search; must have ``HomeID``, ``AwayID`` and ``Date`` columns.
        Defaults to the notebook-level ``results`` dataframe.
    n_games : int, default 5
        Maximum number of games to return.

    Returns
    -------
    pd.DataFrame
        Up to `n_games` rows of `games`, ordered by ``Date`` ascending (the
        last row is the most recent game). Fewer rows are returned when the
        team has fewer earlier games; the frame is empty when it has none.
    """
    if games is None:
        games = results

    cutoff = pd.Timestamp(date)
    involves_team = (games['HomeID'] == team_id) | (games['AwayID'] == team_id)
    earlier_games = games[involves_team & (games['Date'] < cutoff)]
    # The table is not assumed to be in date order, so sort before taking the
    # tail. A stable sort keeps the original row order for games on the same day.
    return earlier_games.sort_values('Date', kind='mergesort').tail(n_games)

In [7]:
# The columns H5GR and A5GR contain the points collected by the home
# and away teams over their last 5 games before the current game.
# (Win = 3 points, Draw = 1 point, Loss = 0 points)
# If a team has fewer than 5 earlier games, every missing game counts as
# 1.333333 points (TODO: or some mean maybe).
FORM_GAMES = 5
WIN_POINTS = 3
DRAW_POINTS = 1
MISSING_GAME_POINTS = 4 / 3


def _recent_points(team_id, game_date, games, n_games=FORM_GAMES):
    """Return the points `team_id` collected in its last `n_games` games before `game_date`.

    `games` must have ``Date``, ``HomeID``, ``AwayID`` and ``FTR`` columns, with
    FTR coded as 0 = draw, 1 = home win, 2 = away win. Games missing from the
    window are padded with MISSING_GAME_POINTS each, so the result is a float.
    """
    involves_team = (games['HomeID'] == team_id) | (games['AwayID'] == team_id)
    earlier_games = games[involves_team & (games['Date'] < game_date)]
    # The table is not assumed to be in date order, so sort before taking the tail.
    recent_games = earlier_games.sort_values('Date', kind='mergesort').tail(n_games)

    drawn = recent_games['FTR'] == 0
    won = (((recent_games['FTR'] == 1) & (recent_games['HomeID'] == team_id))
           | ((recent_games['FTR'] == 2) & (recent_games['AwayID'] == team_id)))

    points = WIN_POINTS * int(won.sum()) + DRAW_POINTS * int(drawn.sum())
    return points + (n_games - len(recent_games)) * MISSING_GAME_POINTS


# One value per game, aligned with df through the index shared with results.
df['H5GR'] = pd.Series(
    [_recent_points(team, played_on, results)
     for team, played_on in zip(results['HomeID'], results['Date'])],
    index=results.index, dtype=float)
df['A5GR'] = pd.Series(
    [_recent_points(team, played_on, results)
     for team, played_on in zip(results['AwayID'], results['Date'])],
    index=results.index, dtype=float)


(0, Div                        E1
Date      2010-08-14 00:00:00
HomeID                      0
AwayID                     15
FTHG                        3
FTAG                        0
FTR                         H
HTHG                        2
HTAG                        0
HTR                         H
HS                         23
AS                         12
HST                        11
AST                         2
HF                         15
AF                         15
HC                         16
AC                          7
HY                          1
AY                          2
HR                          0
AR                          0
Name: 0, dtype: object)


In [None]:
# Peek at three randomly chosen rows of the feature dataframe.
df.sample(3)

# Save dataframe as .pkl