In [9]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
from pathlib import Path
import pandas as pd
import numpy as np
from datetime import datetime
from difflib import SequenceMatcher

from betting.data.sportde import set_dtypes

# Processing the Kaggle dataset

## Filetype

In [11]:
# Locations of the raw Kaggle odds CSV and of its cached feather copy.
data_dir = Path('../../../data/kaggle')
csv_path = data_dir.joinpath('closing_odds.csv')
target_path = csv_path.with_suffix('.feather')

In [13]:
# Convert the raw CSV into a typed feather file once; every later run only
# loads the cached feather file.
if not target_path.is_file():
    df = (
        pd.read_csv(csv_path)
        .sort_values('match_date')
        .reset_index(drop=True)
        .drop(columns=[
            'top_bookie_home_win', 'top_bookie_draw', 'top_bookie_away_win',
            'n_odds_home_win', 'n_odds_draw', 'n_odds_away_win',
        ])
    )

    # presumably one dtype per remaining column, in column order -- confirm
    # against betting.data.sportde.set_dtypes
    df = set_dtypes(df, [int, str, str, str, int, str, int, float, float, float, float, float, float])
    df['match_date'] = pd.to_datetime(df['match_date'], format='%Y-%m-%d')

    df.to_feather(target_path)

df = pd.read_feather(target_path)

In [14]:
df.columns, df.shape

(Index(['match_id', 'league', 'match_date', 'home_team', 'home_score',
        'away_team', 'away_score', 'avg_odds_home_win', 'avg_odds_draw',
        'avg_odds_away_win', 'max_odds_home_win', 'max_odds_draw',
        'max_odds_away_win'],
       dtype='object'),
 (479440, 13))

## Compare against meta

In [None]:
# Load the sport.de matchday calendar (one row per season/matchday).
# NOTE(review): the "Matchday" section further down repeats this exact load;
# one of the two cells is redundant.
meta_path = Path('../../../data/sportde/bundesliga')
matchday_df = pd.read_feather(meta_path.joinpath('matchdays.feather'))
matchday_df.head(n=10)

## Split league

In [6]:
# Keep only the Bundesliga rows and relabel the league as 'bundesliga'.
buli_str = 'Germany: Bundesliga'
buli_df = df[df['league'] == buli_str].assign(league='bundesliga')

In [7]:
buli_df.tail()

Unnamed: 0,match_id,league,match_date,home_team,home_score,away_team,away_score,avg_odds_home_win,avg_odds_draw,avg_odds_away_win,max_odds_home_win,max_odds_draw,max_odds_away_win
474730,870166,bundesliga,2015-05-23,Dortmund,3,Werder Bremen,2,1.2541,6.4234,9.949,1.3,7.47,13.0
474731,870164,bundesliga,2015-05-23,Bayern Munich,2,Mainz,0,1.17,7.9562,13.4545,1.2,9.48,17.0
474739,870165,bundesliga,2015-05-23,B. Monchengladbach,1,Augsburg,3,1.4703,4.5645,6.3879,1.53,5.0,7.16
475481,871556,bundesliga,2015-05-28,Hamburger SV,1,Karlsruher,1,2.2679,3.1768,3.2704,2.46,3.4,3.85
476386,872810,bundesliga,2015-06-01,Karlsruher,1,Hamburger SV,1,2.1682,3.3089,3.3564,2.3,3.45,3.87


In [8]:
len(buli_df)

2747

## Seasons

In [9]:
# Peek at the first Bundesliga row to see how far back the odds data goes.
row = buli_df.iloc[0]
date = row['match_date']
(date.year, date.month)

(2005, 1)

In [10]:
def season_from_date(row):
    """Return the four-character season code (e.g. '0506') for a match row.

    `row` must expose a `match_date` attribute with `.year` and `.month`.
    Matches played in January through July are assigned to the season that
    began in the previous calendar year; matches from August onwards belong
    to the season starting in the current year.
    """
    match_date = row.match_date
    start_year = match_date.year - 1 if match_date.month <= 7 else match_date.year
    # Two trailing digits of the start year followed by those of the end year.
    return str(start_year)[-2:] + str(start_year + 1)[-2:]

In [11]:
# Tag every match with its season code, computed row by row.
buli_df['season'] = buli_df.apply(season_from_date, axis='columns')

In [12]:
buli_df.tail()

Unnamed: 0,match_id,league,match_date,home_team,home_score,away_team,away_score,avg_odds_home_win,avg_odds_draw,avg_odds_away_win,max_odds_home_win,max_odds_draw,max_odds_away_win,season
474730,870166,bundesliga,2015-05-23,Dortmund,3,Werder Bremen,2,1.2541,6.4234,9.949,1.3,7.47,13.0,1415
474731,870164,bundesliga,2015-05-23,Bayern Munich,2,Mainz,0,1.17,7.9562,13.4545,1.2,9.48,17.0,1415
474739,870165,bundesliga,2015-05-23,B. Monchengladbach,1,Augsburg,3,1.4703,4.5645,6.3879,1.53,5.0,7.16,1415
475481,871556,bundesliga,2015-05-28,Hamburger SV,1,Karlsruher,1,2.2679,3.1768,3.2704,2.46,3.4,3.85,1415
476386,872810,bundesliga,2015-06-01,Karlsruher,1,Hamburger SV,1,2.1682,3.3089,3.3564,2.3,3.45,3.87,1415


In [13]:
# Sanity check: number of matches found for each season.
for season, n_matches in buli_df.groupby('season').size().items():
    print(season, n_matches)

0405 153
0506 272
0607 306
0708 306
0809 274
0910 272
1112 272
1213 308
1314 308
1415 276


## Matchday

In [14]:
# Load the sport.de matchday calendar: one row per season/matchday with the
# first and last date of that matchday.
meta_path = Path('../../../data/sportde/bundesliga')
matchday_df = pd.read_feather(meta_path.joinpath('matchdays.feather'))
matchday_df.head(n=10)

Unnamed: 0,season,matchday,start_date,end_date,league
0,506,1,2005-08-05,2005-08-07,bundesliga
1,506,2,2005-08-13,2005-08-14,bundesliga
2,506,3,2005-08-27,2005-08-28,bundesliga
3,506,4,2005-09-10,2005-09-11,bundesliga
4,506,5,2005-09-17,2005-09-18,bundesliga
5,506,6,2005-09-20,2005-09-21,bundesliga
6,506,7,2005-09-24,2005-09-25,bundesliga
7,506,8,2005-10-01,2005-10-02,bundesliga
8,506,9,2005-10-15,2005-10-16,bundesliga
9,506,10,2005-10-22,2005-10-23,bundesliga


In [15]:
# Discard matches played before the first matchday in the sport.de calendar.
first_date = matchday_df['start_date'].min()
buli = buli_df[buli_df['match_date'] >= first_date].copy()
len(buli)

2594

In [16]:
# Date of the first match that survives the filter.
row = buli.iloc[0]
date = row['match_date']
date

Timestamp('2005-08-05 00:00:00')

In [17]:
def matchday_from_date(matchday_df, date):
    """Return the matchday whose [start_date, end_date] window contains `date`.

    The first matching row of `matchday_df` wins. When no window contains
    `date`, the positional lookup on the empty selection raises IndexError.
    """
    has_started = matchday_df.start_date <= date
    not_finished = matchday_df.end_date >= date
    hits = matchday_df.loc[has_started & not_finished, 'matchday']
    return hits.iat[0]

In [18]:
# Look up the matchday of every match in the sport.de calendar.
# Dates that fall inside no [start_date, end_date] window get the sentinel -1.
matchdays = []
for match_date in buli.match_date:
    try:
        matchdays.append(matchday_from_date(matchday_df, match_date))
    except IndexError:
        # Only the "no matching window" case is tolerated (the empty `.iat[0]`
        # lookup in matchday_from_date); any other error should surface
        # instead of silently becoming -1.
        matchdays.append(-1)

buli['matchday'] = matchdays

In [19]:
buli.head()

Unnamed: 0,match_id,league,match_date,home_team,home_score,away_team,away_score,avg_odds_home_win,avg_odds_draw,avg_odds_away_win,max_odds_home_win,max_odds_draw,max_odds_away_win,season,matchday
10162,194719,bundesliga,2005-08-05,Bayern Munich,3,B. Monchengladbach,0,1.3118,4.3973,8.9845,1.35,5.0,10.0,506,1
10273,194936,bundesliga,2005-08-06,Wolfsburg,2,Dortmund,2,2.4,3.266,2.622,2.6,3.45,2.85,506,1
10338,194932,bundesliga,2005-08-06,Duisburg,1,VfB Stuttgart,1,3.0755,3.2582,2.1282,3.39,3.35,2.2,506,1
10339,194933,bundesliga,2005-08-06,Hamburger SV,3,Nurnberg,0,1.5173,3.6491,5.76,1.57,4.15,6.41,506,1
10340,194934,bundesliga,2005-08-06,Hannover,2,Hertha Berlin,2,2.6282,3.2227,2.4255,3.16,3.35,2.65,506,1


## Team names

In [20]:
# League table after every matchday, as scraped from sport.de.
standings = pd.read_feather(meta_path.joinpath('standings.feather'))
standings.head(n=5)

Unnamed: 0,season,matchday,rank,team,games_played,wins,draw,lost,total_goals_scored,total_goals_received,points,league
0,506,1,1,SV Werder,1,1,0,0,5,2,3,bundesliga
1,506,1,2,Leverkusen,1,1,0,0,4,1,3,bundesliga
2,506,1,3,FC Bayern,1,1,0,0,3,0,3,bundesliga
3,506,1,3,Hamburger SV,1,1,0,0,3,0,3,bundesliga
4,506,1,5,Schalke 04,1,1,0,0,2,1,3,bundesliga


In [21]:
# Distinct team names as spelled in the sport.de standings for this league.
league = 'bundesliga'
league_standings = standings[standings['league'] == league].copy()
target_teams = league_standings['team'].unique().tolist()
len(target_teams)

35

In [22]:
# Distinct home-team names as spelled in the Kaggle data. This reads the
# unfiltered `buli_df`, so teams that only appear before `first_date` are
# included as well.
actual_teams = buli_df['home_team'].unique().tolist()
len(actual_teams)

30

In [23]:
def similar(a, b):
    """Return the difflib similarity ratio of `a` and `b` (0.0 to 1.0)."""
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()

In [25]:
# Build the Kaggle -> sport.de team-name mapping interactively: for every
# Kaggle name the five most similar sport.de names are listed and the user
# types the index of the correct one.
# NOTE(review): the result depends on manual input, so a fresh "Run All" is
# not reproducible without re-typing the answers.
translation = {}

for actual_team in actual_teams:
    print('----')
    print(actual_team)
    print('----')

    scores = np.array([similar(actual_team, target_team) for target_team in target_teams])
    sort_idx = scores.argsort()[::-1]  # highest similarity first
    candidates = [target_teams[sort_id] for sort_id in sort_idx[:5]]

    for e, candidate in enumerate(candidates):
        print(f'[{e}] {candidate}')

    # Re-prompt until one of the listed indices is entered. A typo must not
    # abort the whole session, and an index outside the list must not pick a
    # candidate that was never shown.
    while True:
        try:
            index = int(input())
        except ValueError:
            index = -1
        if 0 <= index < len(candidates):
            break
        print(f'Please enter a number from 0 to {len(candidates) - 1}')

    translation[actual_team] = candidates[index]

----
Bayern Munich
----
[0] FC Bayern
[1] Braunschweig
[2] Paderborn
[3] K'lautern
[4] Hannover 96


 0


----
Wolfsburg
----
[0] Wolfsburg
[1] Duisburg
[2] Augsburg
[3] SC Freiburg
[4] Hamburger SV


 0


----
Schalke
----
[0] Schalke 04
[1] Aachen
[2] St. Pauli
[3] Braunschweig
[4] Bochum


 0


----
Nurnberg
----
[0] Nürnberg
[1] Hamburger SV
[2] Braunschweig
[3] Augsburg
[4] Duisburg


 0


----
Mainz
----
[0] Mainz 05
[1] Aachen
[2] FC Bayern
[3] Paderborn
[4] K'lautern


 0


----
Hansa Rostock
----
[0] Rostock
[1] Ingolstadt
[2] Hannover 96
[3] Braunschweig
[4] Mainz 05


 0


----
B. Monchengladbach
----
[0] M'gladbach
[1] Braunschweig
[2] Ingolstadt
[3] Bochum
[4] Aachen


 0


----
Bochum
----
[0] Bochum
[1] Hoffenheim
[2] Braunschweig
[3] Aachen
[4] Cottbus


 0


----
Hannover
----
[0] Hannover 96
[1] Union Berlin
[2] Hamburger SV
[3] Nürnberg
[4] FC Bayern


 0


----
Kaiserslautern
----
[0] K'lautern
[1] Paderborn
[2] Union Berlin
[3] Karlsruhe
[4] FC Bayern


 0


----
VfB Stuttgart
----
[0] VfB Stuttgart
[1] Frankfurt
[2] Augsburg
[3] Wolfsburg
[4] K'lautern


 0


----
Hamburger SV
----
[0] Hamburger SV
[1] Hertha BSC
[2] Hannover 96
[3] Augsburg
[4] Duisburg


 0


----
Freiburg
----
[0] SC Freiburg
[1] Duisburg
[2] Augsburg
[3] Nürnberg
[4] Frankfurt


 0


----
Bayer Leverkusen
----
[0] Leverkusen
[1] FC Bayern
[2] Paderborn
[3] SV Werder
[4] K'lautern


 0


----
Dortmund
----
[0] Dortmund
[1] Darmstadt
[2] Cottbus
[3] Paderborn
[4] Ingolstadt


 0


----
Arminia Bielefeld
----
[0] Bielefeld
[1] Dortmund
[2] Darmstadt
[3] Hertha BSC
[4] Mainz 05


 0


----
Hertha Berlin
----
[0] Hertha BSC
[1] Union Berlin
[2] Hoffenheim
[3] St. Pauli
[4] SV Werder


 0


----
Werder Bremen
----
[0] SV Werder
[1] Paderborn
[2] Leverkusen
[3] Hertha BSC
[4] Hamburger SV


 0


----
Duisburg
----
[0] Duisburg
[1] Augsburg
[2] Wolfsburg
[3] SC Freiburg
[4] Hamburger SV


 0


----
Eintracht Frankfurt
----
[0] Frankfurt
[1] Gr. Fürth
[2] SC Freiburg
[3] Aachen
[4] Ingolstadt


 0


----
Alemannia Aachen
----
[0] Aachen
[1] M'gladbach
[2] Braunschweig
[3] Union Berlin
[4] Hoffenheim


 0


----
Energie Cottbus
----
[0] Cottbus
[1] Ingolstadt
[2] Nürnberg
[3] Leverkusen
[4] Hertha BSC


 0


----
Karlsruher
----
[0] Karlsruhe
[1] K'lautern
[2] Aachen
[3] Braunschweig
[4] Hamburger SV


 0


----
Hoffenheim
----
[0] Hoffenheim
[1] Bochum
[2] Braunschweig
[3] Hertha BSC
[4] Hannover 96


 0


----
Augsburg
----
[0] Augsburg
[1] Duisburg
[2] Wolfsburg
[3] SC Freiburg
[4] Hamburger SV


 0


----
Greuther Furth
----
[0] Gr. Fürth
[1] K'lautern
[2] Frankfurt
[3] Karlsruhe
[4] Leverkusen


 0


----
Dusseldorf
----
[0] Düsseldorf
[1] Duisburg
[2] Dortmund
[3] Augsburg
[4] SV Werder


 0


----
Braunschweig
----
[0] Braunschweig
[1] Aachen
[2] Nürnberg
[3] Karlsruhe
[4] Hoffenheim


 0


----
Paderborn
----
[0] Paderborn
[1] FC Bayern
[2] K'lautern
[3] Aachen
[4] Dortmund


 0


----
Köln
----
[0] 1.FC Köln
[1] K'lautern
[2] Karlsruhe
[3] Union Berlin
[4] Aachen


 0


In [26]:
translation

{'Bayern Munich': 'FC Bayern',
 'Wolfsburg': 'Wolfsburg',
 'Schalke': 'Schalke 04',
 'Nurnberg': 'Nürnberg',
 'Mainz': 'Mainz 05',
 'Hansa Rostock': 'Rostock',
 'B. Monchengladbach': "M'gladbach",
 'Bochum': 'Bochum',
 'Hannover': 'Hannover 96',
 'Kaiserslautern': "K'lautern",
 'VfB Stuttgart': 'VfB Stuttgart',
 'Hamburger SV': 'Hamburger SV',
 'Freiburg': 'SC Freiburg',
 'Bayer Leverkusen': 'Leverkusen',
 'Dortmund': 'Dortmund',
 'Arminia Bielefeld': 'Bielefeld',
 'Hertha Berlin': 'Hertha BSC',
 'Werder Bremen': 'SV Werder',
 'Duisburg': 'Duisburg',
 'Eintracht Frankfurt': 'Frankfurt',
 'Alemannia Aachen': 'Aachen',
 'Energie Cottbus': 'Cottbus',
 'Karlsruher': 'Karlsruhe',
 'Hoffenheim': 'Hoffenheim',
 'Augsburg': 'Augsburg',
 'Greuther Furth': 'Gr. Fürth',
 'Dusseldorf': 'Düsseldorf',
 'Braunschweig': 'Braunschweig',
 'Paderborn': 'Paderborn',
 'Köln': '1.FC Köln'}

In [28]:
# Rename both team columns to the sport.de spelling.
# The result is assigned back instead of calling `replace(..., inplace=True)`
# on the selected column: that chained in-place form is deprecated and leaves
# `buli` unchanged once pandas Copy-on-Write is active.
buli['home_team'] = buli['home_team'].replace(translation)
buli['away_team'] = buli['away_team'].replace(translation)

In [29]:
buli.head()

Unnamed: 0,match_id,league,match_date,home_team,home_score,away_team,away_score,avg_odds_home_win,avg_odds_draw,avg_odds_away_win,max_odds_home_win,max_odds_draw,max_odds_away_win,season,matchday
10162,194719,bundesliga,2005-08-05,FC Bayern,3,M'gladbach,0,1.3118,4.3973,8.9845,1.35,5.0,10.0,506,1
10273,194936,bundesliga,2005-08-06,Wolfsburg,2,Dortmund,2,2.4,3.266,2.622,2.6,3.45,2.85,506,1
10338,194932,bundesliga,2005-08-06,Duisburg,1,VfB Stuttgart,1,3.0755,3.2582,2.1282,3.39,3.35,2.2,506,1
10339,194933,bundesliga,2005-08-06,Hamburger SV,3,Nürnberg,0,1.5173,3.6491,5.76,1.57,4.15,6.41,506,1
10340,194934,bundesliga,2005-08-06,Hannover 96,2,Hertha BSC,2,2.6282,3.2227,2.4255,3.16,3.35,2.65,506,1


In [54]:
# Shift every standings row forward by one matchday, so that a join on
# matchday N picks up the table as it stood after matchday N-1.
join_standings = standings.assign(matchday=standings['matchday'] + 1)

join_standings.head()

Unnamed: 0,season,matchday,rank,team,games_played,wins,draw,lost,total_goals_scored,total_goals_received,points,league
0,506,2,1,SV Werder,1,1,0,0,5,2,3,bundesliga
1,506,2,2,Leverkusen,1,1,0,0,4,1,3,bundesliga
2,506,2,3,FC Bayern,1,1,0,0,3,0,3,bundesliga
3,506,2,3,Hamburger SV,1,1,0,0,3,0,3,bundesliga
4,506,2,5,Schalke 04,1,1,0,0,2,1,3,bundesliga


In [55]:
# One all-zero standings row for matchday 1 per (team, season, league):
# before the first matchday nobody has played, so there is no earlier table.
# A comprehension keeps the loop variables local, so the notebook-level
# `league` set when filtering the standings is not overwritten.
start_rows = pd.DataFrame(
    [
        # Positional order must match join_standings.columns:
        # season, matchday, rank, team, games_played, wins, draw, lost,
        # total_goals_scored, total_goals_received, points, league
        [season, 1, 1, team, 0, 0, 0, 0, 0, 0, 0, league]
        for (team, season, league), _ in join_standings.groupby(['team', 'season', 'league'])
    ],
    columns=join_standings.columns,
)

In [65]:
# Add the matchday-1 start rows to the shifted standings.
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the replacement
# and, like `append`, keeps the original index labels by default.
# NOTE(review): re-running this cell adds the start rows again.
join_standings = pd.concat([join_standings, start_rows])

In [65]:
# Standings column names with a 'home_' prefix.
old_columns = join_standings.columns.to_numpy()
home_cols = [f'home_{col}' for col in old_columns]
home_cols

['home_season',
 'home_matchday',
 'home_rank',
 'home_team',
 'home_games_played',
 'home_wins',
 'home_draw',
 'home_lost',
 'home_total_goals_scored',
 'home_total_goals_received',
 'home_points',
 'home_league']

In [58]:
# Attach the home team's standings row to every match. The inner join drops
# any match that has no matching standings row.
joined = buli.merge(
    join_standings,
    how='inner',
    left_on=['home_team', 'matchday', 'season', 'league'],
    right_on=['team', 'matchday', 'season', 'league'],
)
joined.shape

(2586, 24)

In [60]:
joined.tail()

Unnamed: 0,match_id,league,match_date,home_team,home_score,away_team,away_score,avg_odds_home_win,avg_odds_draw,avg_odds_away_win,...,matchday,rank,team,games_played,wins,draw,lost,total_goals_scored,total_goals_received,points
2581,870168,bundesliga,2015-05-23,Hamburger SV,2,Schalke 04,0,2.1193,3.6983,3.1952,...,34,17,Hamburger SV,33,8,8,17,23,50,32
2582,870167,bundesliga,2015-05-23,Frankfurt,2,Leverkusen,1,4.2062,4.0383,1.7552,...,34,11,Frankfurt,33,10,10,13,54,61,40
2583,870166,bundesliga,2015-05-23,Dortmund,3,SV Werder,2,1.2541,6.4234,9.949,...,34,7,Dortmund,33,12,7,14,44,40,43
2584,870164,bundesliga,2015-05-23,FC Bayern,2,Mainz 05,0,1.17,7.9562,13.4545,...,34,1,FC Bayern,33,24,4,5,78,18,76
2585,870165,bundesliga,2015-05-23,M'gladbach,1,Augsburg,3,1.4703,4.5645,6.3879,...,34,3,M'gladbach,33,19,9,5,52,23,66


In [None]:
# Use bracket access: `joined.rank` resolves to the DataFrame.rank *method*,
# not to the 'rank' column, because attribute lookup prefers methods.
# NOTE(review): only the home team's standings are joined so far, so this is
# the home rank rather than a difference -- presumably the away rank is meant
# to be subtracted in a later step; confirm.
joined['position_diff'] = joined['rank']

In [61]:
# Boolean mask selecting matches played after matchday 5.
matchday_mask = joined['matchday'] > 5
