In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
from pathlib import Path

from betting.utility import *
from betting.data.cleanup import load_meta

# Merging datahub data with sportde meta

In [3]:
hub_dir = Path('../../../data/datahub')
meta_dir = Path('../../../data/sportde/')

In [12]:
match_meta = pd.read_feather(meta_dir/'games.feather')
standings_meta = pd.read_feather(meta_dir/'standings.feather')

In [5]:
actual = pd.read_feather(hub_dir/'concat.feather')
actual.head()

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,B365H,B365D,B365A,season,league
0,2009-08-15,Aston Villa,Wigan,0,2,1.67,3.6,5.5,2009,premier_league
1,2009-08-15,Blackburn,Man City,0,2,3.6,3.25,2.1,2009,premier_league
2,2009-08-15,Bolton,Sunderland,0,1,2.25,3.25,3.25,2009,premier_league
3,2009-08-15,Chelsea,Hull City,2,1,1.17,6.5,21.0,2009,premier_league
4,2009-08-15,Everton,Arsenal,1,6,3.2,3.25,2.3,2009,premier_league


## Leagues, Season

In [6]:
actual.league.unique()

array(['premier_league', 'league_one', 'bundesliga', 'serie_a',
       'primera_division'], dtype=object)

In [7]:
actual.season.unique()

array([2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018])

## Matchday

In [8]:
actual.dtypes

Date        datetime64[ns]
HomeTeam            object
AwayTeam            object
FTHG                 int64
FTAG                 int64
B365H              float64
B365D              float64
B365A              float64
season               int32
league              object
dtype: object

In [9]:
match_meta.head()

Unnamed: 0,season,matchday,home_team,away_team,home_goals,away_goals,league
0,2005,1,FC Bayern,M'gladbach,3,0,bundesliga
1,2005,1,Hamburger SV,Nürnberg,3,0,bundesliga
2,2005,1,1.FC Köln,Mainz 05,1,0,bundesliga
3,2005,1,Duisburg,VfB Stuttgart,1,1,bundesliga
4,2005,1,Wolfsburg,Dortmund,2,2,bundesliga


In [11]:
with_matchday = pd.merge(actual, match_meta,
                  left_on=['season', 'league', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG'],
                  right_on=['season', 'league', 'home_team', 'away_team', 'home_goals', 'away_goals']
                 )

with_matchday.drop(columns=['FTHG', 'FTAG', 'HomeTeam', 'AwayTeam'], inplace=True)
with_matchday.head()

Unnamed: 0,Date,B365H,B365D,B365A,season,league,matchday,home_team,away_team,home_goals,away_goals
0,2009-08-15,1.67,3.6,5.5,2009,premier_league,1,Aston Villa,Wigan,0,2
1,2009-08-15,3.6,3.25,2.1,2009,premier_league,1,Blackburn,Man City,0,2
2,2009-08-15,2.25,3.25,3.25,2009,premier_league,1,Bolton,Sunderland,0,1
3,2009-08-15,1.17,6.5,21.0,2009,premier_league,1,Chelsea,Hull City,2,1
4,2009-08-15,3.2,3.25,2.3,2009,premier_league,1,Everton,Arsenal,1,6


## Standings

Shift standings by one matchday so that each row only contains information available at that point.

In [15]:
standings_meta.matchday += 1

In [19]:
with_standings = merge_with_prefix(with_matchday, standings_meta, 'home_',
                  left_on=['season', 'league', 'home_team', 'matchday'],
                 right_on=['season', 'league', 'team', 'matchday'],
                                  drop_additional=True)

with_standings = merge_with_prefix(with_standings, standings_meta, 'away_',
                  left_on=['season', 'league', 'away_team', 'matchday'],
                 right_on=['season', 'league', 'team', 'matchday'],
                                  drop_additional=True)

with_standings.head()

Unnamed: 0,Date,B365H,B365D,B365A,season,league,matchday,home_team,away_team,home_goals,...,home_total_goals_received,home_points,away_rank,away_games_played,away_wins,away_draw,away_lost,away_total_goals_scored,away_total_goals_received,away_points
0,2009-08-22,1.17,7.0,19.0,2009,premier_league,3,Arsenal,Portsmouth,4,...,1,3,20,1,0,0,1,0,1,-9
1,2009-08-22,2.3,3.2,3.25,2009,premier_league,3,Birmingham,Stoke City,0,...,1,0,2,1,1,0,0,2,0,3
2,2009-08-22,2.6,3.25,2.8,2009,premier_league,3,Hull City,Bolton,1,...,2,0,13,1,0,0,1,0,1,0
3,2009-08-22,1.33,4.5,12.0,2009,premier_league,3,Man City,Wolverhampton,1,...,0,3,15,1,0,0,1,0,2,0
4,2009-08-22,2.1,3.25,3.75,2009,premier_league,3,Sunderland,Blackburn,2,...,0,3,15,1,0,0,1,0,2,0


## Column names

In [21]:
with_standings.rename(columns={'B365H': 'home_odds',
                       'B365D': 'draw_odds',
                       'B365A': 'away_odds'}, inplace=True)
with_standings.head()

Unnamed: 0,Date,home_odds,draw_odds,away_odds,season,league,matchday,home_team,away_team,home_goals,...,home_total_goals_received,home_points,away_rank,away_games_played,away_wins,away_draw,away_lost,away_total_goals_scored,away_total_goals_received,away_points
0,2009-08-22,1.17,7.0,19.0,2009,premier_league,3,Arsenal,Portsmouth,4,...,1,3,20,1,0,0,1,0,1,-9
1,2009-08-22,2.3,3.2,3.25,2009,premier_league,3,Birmingham,Stoke City,0,...,1,0,2,1,1,0,0,2,0,3
2,2009-08-22,2.6,3.25,2.8,2009,premier_league,3,Hull City,Bolton,1,...,2,0,13,1,0,0,1,0,1,0
3,2009-08-22,1.33,4.5,12.0,2009,premier_league,3,Man City,Wolverhampton,1,...,0,3,15,1,0,0,1,0,2,0
4,2009-08-22,2.1,3.25,3.75,2009,premier_league,3,Sunderland,Blackburn,2,...,0,3,15,1,0,0,1,0,2,0


In [22]:
with_standings.to_feather(hub_dir/'with_meta.feather')