# Pipeline

1. Preprocessing
2. **Data Extraction**

This notebook performs the data extraction step. It derives several features from the preprocessed match results and stores a dataframe containing these features in a .pkl file.

# Imports

In [88]:
import pandas as pd
import numpy as np
import datetime
from collections import defaultdict
from sklearn.model_selection import train_test_split

# Data extraction into a new DataFrame

In [151]:
# Load the preprocessed match results and copy out the columns used below.
# HomeID and AwayID are not needed as features; they are kept for exploration only.
results = pd.read_pickle('preprocessed_results.pkl')
df = results.loc[:, ['HomeID', 'AwayID', 'FTHG', 'FTAG', 'FTR']].copy()
df

Unnamed: 0,HomeID,AwayID,FTHG,FTAG,FTR
0,0,15,3,0,H
1,1,12,1,0,H
2,2,16,0,0,D
3,3,14,6,0,H
4,4,11,2,2,D
...,...,...,...,...,...
13015,97,105,2,2,D
13016,111,106,2,0,H
13017,120,109,0,2,A
13018,122,107,2,2,D


In [152]:
results.sample(3)

Unnamed: 0,Div,Date,HomeID,AwayID,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR
1470,E1,2014-12-04,4,12,0,1,A,0,0,D,...,6,3,17,15,6,6,3,1,0,0
9594,I1,2019-05-26,66,77,0,0,D,0,0,D,...,5,1,12,9,4,1,1,2,0,0
11880,SP1,2016-08-19,100,122,2,1,H,0,0,D,...,6,6,14,17,1,8,3,3,0,0


In [153]:
# Encode the full-time result as the numeric target variable:
# home win (H) -> 1, draw (D) -> 0, away win (A) -> 2.
result_dict = {'H': 1, 'D': 0, 'A': 2}

# Assign the encoded column explicitly instead of calling
# df['FTR'].replace(..., inplace=True): the in-place call operates on a
# temporary Series and does not update df when pandas Copy-on-Write is active.
# Mapping from the untouched results['FTR'] (df shares its index with results)
# keeps this cell idempotent when it is re-run. Values outside result_dict
# become NaN.
df['FTR'] = results['FTR'].map(result_dict)

In [5]:
#slc = df['AwayID'] == 105
#df[slc][['FTR']].value_counts() / sum(slc)

# Implementing Features

In [6]:
# TODO: consider wrapping the feature computations in functions so they can be toggled on and off

## Historical strength

In [154]:
df.sample(3)

Unnamed: 0,HomeID,AwayID,FTHG,FTAG,FTR
6676,81,80,4,2,1
10795,116,109,0,1,2
11718,100,118,2,2,0


In [155]:
def get_historical_games(team_id, date, window_days=730, games=None):
    """Return the games a team played in the time window before ``date``.

    Parameters
    ----------
    team_id
        Team identifier, matched against the 'HomeID' and 'AwayID' columns.
    date
        End of the window (exclusive). Anything ``pd.Timestamp`` accepts,
        e.g. ``datetime.date``, ``datetime.datetime`` or ``pd.Timestamp``.
    window_days : int, default 730
        Length of the look-back window in days (730 days = two years).
    games : pandas.DataFrame, optional
        Frame with 'HomeID', 'AwayID' and 'Date' columns to search.
        Defaults to the notebook-level ``results`` frame.

    Returns
    -------
    pandas.DataFrame
        The rows of ``games`` in which the team played at home or away and
        whose 'Date' lies strictly between ``date - window_days`` and ``date``.
    """
    if games is None:
        games = results

    # A datetime64 column cannot be order-compared with a plain datetime.date,
    # so the boundary is normalised to a pandas Timestamp first.
    window_end = pd.Timestamp(date)
    window_start = window_end - pd.Timedelta(days=window_days)

    played_by_team = (games['HomeID'] == team_id) | (games['AwayID'] == team_id)
    in_window = (games['Date'] > window_start) & (games['Date'] < window_end)
    return games[played_by_team & in_window]

In [None]:
# Example: games of team 39 in the two years before 2019-05-20.
# A pandas Timestamp is passed because the datetime64 'Date' column cannot be
# order-compared with a plain datetime.date.
get_historical_games(39, pd.Timestamp(2019, 5, 20))

### Home win percentage (TODO: away)

In [157]:
# Add the home team's win percentage to the dataframe: the share of the games
# returned by get_historical_games (home and away) that the team won.
# Teams without any historical games get 0.0.
home_win_pct = []
for _, game in results.iterrows():
    team_id = game['HomeID']
    hist_games = get_historical_games(team_id, game['Date'])
    game_count = len(hist_games)

    # A game counts as won if the team scored more goals than its opponent,
    # regardless of whether it played at home or away.
    won_at_home = (hist_games['HomeID'] == team_id) & (hist_games['FTHG'] > hist_games['FTAG'])
    won_away = (hist_games['AwayID'] == team_id) & (hist_games['FTHG'] < hist_games['FTAG'])
    games_won = int((won_at_home | won_away).sum())

    home_win_pct.append(games_won / game_count if game_count > 0 else 0.0)

# Assign the whole column at once as float. Initialising it with the integer 0
# and filling it row by row makes pandas complain about setting float values
# into an int64 column, and is slower than a single assignment.
df['H_WIN_PCT'] = pd.Series(home_win_pct, index=results.index, dtype=float)

In [167]:
df.sample(6)

Unnamed: 0,HomeID,AwayID,FTHG,FTAG,FTR,H_WIN_PCT
5049,52,59,2,0,1,0.242424
2647,10,4,2,0,1,0.525641
2861,30,12,2,1,1,0.285714
5740,49,54,0,0,0,0.323944
6039,56,61,0,4,2,0.333333
8784,79,68,6,2,1,0.486842


## Current form (data from the last five games)

In [10]:
def get_last_five_games(team_id, date, games=None, n_games=5):
    """Return the most recent games a team played before ``date``.

    Parameters
    ----------
    team_id
        Team identifier, matched against the 'HomeID' and 'AwayID' columns.
    date
        Only games with a 'Date' strictly before this value are considered.
        Anything ``pd.Timestamp`` accepts, e.g. ``datetime.date``.
    games : pandas.DataFrame, optional
        Frame with 'HomeID', 'AwayID' and 'Date' columns to search.
        Defaults to the notebook-level ``results`` frame.
    n_games : int, default 5
        Maximum number of games to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``n_games`` rows, ordered by 'Date' ascending (oldest first).
        Fewer rows are returned if the team has not played that many games
        before ``date``.
    """
    if games is None:
        games = results

    # Normalise the cutoff so that datetime.date values can be compared with
    # a datetime64 'Date' column.
    cutoff = pd.Timestamp(date)
    played_by_team = (games['HomeID'] == team_id) | (games['AwayID'] == team_id)
    earlier_games = games[played_by_team & (games['Date'] < cutoff)]

    # Sort explicitly instead of relying on the row order of the input frame.
    return earlier_games.sort_values('Date', kind='stable').tail(n_games)

In [7]:
# The columns H5GR and A5GR contain the points collected by the home
# and away teams over their last 5 games before the current game.
# (Win = 3 points, Draw = 1 point, Loss = 0 points)
# If a team has fewer than 5 earlier games, every missing game counts as
# 4/3 (~1.333333) points, i.e. the mean of the three possible outcomes.
def _recent_form_points(team_id, date, n_games=5, fill_points=4 / 3):
    """Return the points a team collected in its last ``n_games`` games before ``date``.

    Reads the notebook-level ``results`` frame, whose 'FTR' column holds
    'H' (home win), 'D' (draw) or 'A' (away win). Games missing to reach
    ``n_games`` are padded with ``fill_points`` each.
    """
    played_by_team = (results['HomeID'] == team_id) | (results['AwayID'] == team_id)
    # Only games strictly before the current one may be used; otherwise the
    # feature would leak the result that is being predicted.
    earlier_games = results[played_by_team & (results['Date'] < date)]
    recent_games = earlier_games.sort_values('Date', kind='stable').tail(n_games)

    won_at_home = (recent_games['HomeID'] == team_id) & (recent_games['FTR'] == 'H')
    won_away = (recent_games['AwayID'] == team_id) & (recent_games['FTR'] == 'A')
    wins = int((won_at_home | won_away).sum())
    draws = int((recent_games['FTR'] == 'D').sum())
    missing_games = n_games - len(recent_games)

    return 3 * wins + draws + fill_points * missing_games


home_form = []
away_form = []
for _, game in results.iterrows():
    home_form.append(_recent_form_points(game['HomeID'], game['Date']))
    away_form.append(_recent_form_points(game['AwayID'], game['Date']))

# Assign both columns in one step, aligned on the index shared with results,
# instead of writing single cells through chained indexing.
df['H5GR'] = pd.Series(home_form, index=results.index, dtype=float)
df['A5GR'] = pd.Series(away_form, index=results.index, dtype=float)


(0, Div                        E1
Date      2010-08-14 00:00:00
HomeID                      0
AwayID                     15
FTHG                        3
FTAG                        0
FTR                         H
HTHG                        2
HTAG                        0
HTR                         H
HS                         23
AS                         12
HST                        11
AST                         2
HF                         15
AF                         15
HC                         16
AC                          7
HY                          1
AY                          2
HR                          0
AR                          0
Name: 0, dtype: object)


In [None]:
df.sample(3)