In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
import pandas as pd
import numpy as np
from datetime import datetime
from difflib import SequenceMatcher

from betting.data.sportde import set_dtypes
from betting.utility import *

# Processing the Kaggle dataset

## Filetype

In [3]:
# Raw Kaggle odds export and the Feather file used as its cached conversion.
data_dir = Path('../../../data/kaggle')
csv_path = data_dir.joinpath('closing_odds.csv')
target_path = data_dir.joinpath('closing_odds.feather')

In [4]:
# Convert the CSV to Feather once; later runs only read the cached file.
if not target_path.is_file():
    unused_columns = ['top_bookie_home_win', 'top_bookie_draw', 'top_bookie_away_win',
                      'n_odds_home_win', 'n_odds_draw', 'n_odds_away_win']
    raw_odds = (
        pd.read_csv(csv_path)
        .sort_values('match_date')
        # Feather requires a default RangeIndex, hence the reset after sorting.
        .reset_index(drop=True)
        .drop(columns=unused_columns)
    )

    # One dtype per remaining column, in column order.
    raw_odds = set_dtypes(raw_odds, [int, str, str, str, int, str, int, float, float, float, float, float, float])
    raw_odds['match_date'] = pd.to_datetime(raw_odds['match_date'], format='%Y-%m-%d')

    raw_odds.to_feather(target_path)

df = pd.read_feather(target_path)

In [5]:
# Sanity check: column names and (rows, columns) of the loaded odds frame.
df.columns, df.shape

(Index(['match_id', 'league', 'match_date', 'home_team', 'home_score',
        'away_team', 'away_score', 'avg_odds_home_win', 'avg_odds_draw',
        'avg_odds_away_win', 'max_odds_home_win', 'max_odds_draw',
        'max_odds_away_win'],
       dtype='object'),
 (479440, 13))

## Compare against meta

In [6]:
# Reference ("meta") match data that the Kaggle odds are aligned against.
meta_path = Path('../../../data/sportde')
matches_df = pd.read_feather(meta_path.joinpath('games.feather'))
matches_df.head(10)

Unnamed: 0,index,season,matchday,home_team,away_team,home_goals,away_goals,league
0,0,2005,1,Alavés,Barcelona,0,0,primera_division
1,1,2005,1,Valencia,Betis Sevilla,1,0,primera_division
2,2,2005,1,Athletic,S. Sebastian,3,0,primera_division
3,3,2005,1,FC Sevilla,Racing,1,0,primera_division
4,4,2005,1,RC Celta,Málaga,2,0,primera_division
5,5,2005,1,Espanyol,Getafe,0,2,primera_division
6,6,2005,1,Mallorca,Deportivo,0,1,primera_division
7,7,2005,1,Atlético,Zaragoza,0,0,primera_division
8,8,2005,1,Osasuna,Villarreal,2,1,primera_division
9,9,2005,1,Cádiz,Real Madrid,1,2,primera_division


## Split leagues

In [7]:
# League identifiers as they appear in the reference match data.
meta_leagues = matches_df['league'].unique().tolist()
meta_leagues

['primera_division', 'bundesliga', 'premier_league', 'league_one']

In [8]:
# League labels used by the Kaggle data, kept for inspection.
actual_leagues = df['league'].unique()

# Kaggle league label -> league identifier used by the reference data.
league_translation = {
    'Spain: Primera Division': 'primera_division',
    'Germany: Bundesliga': 'bundesliga',
    'England: Premier League': 'premier_league',
    'France: Ligue 1': 'league_one',
}
league_translation

{'Spain: Primera Division': 'primera_division',
 'Germany: Bundesliga': 'bundesliga',
 'England: Premier League': 'premier_league',
 'France: Ligue 1': 'league_one'}

In [9]:
# Rename the Kaggle league labels to the reference identifiers.
# Assign the result back to the column: an in-place `replace` on `df.league`
# acts on a temporary Series, which warns on pandas 2.x and leaves `df`
# unchanged once Copy-on-Write is enabled.
df['league'] = df['league'].replace(league_translation)

# Keep only matches from leagues that also exist in the reference data.
league_df = df.loc[df['league'].isin(meta_leagues)].copy()
league_df.head()

Unnamed: 0,match_id,league,match_date,home_team,home_score,away_team,away_score,avg_odds_home_win,avg_odds_draw,avg_odds_away_win,max_odds_home_win,max_odds_draw,max_odds_away_win
0,170088,premier_league,2005-01-01,Liverpool,0,Chelsea,1,2.9944,3.1944,2.2256,3.2,3.25,2.29
40,170097,premier_league,2005-01-01,Middlesbrough,0,Manchester United,2,3.3611,3.2222,2.0489,3.75,3.25,2.1
41,170096,premier_league,2005-01-01,Tottenham,5,Everton,2,2.2156,3.1722,3.0444,2.29,3.2,3.35
43,170095,premier_league,2005-01-01,Portsmouth,1,Norwich,1,1.6344,3.3933,5.2633,1.7,3.54,6.14
44,170093,premier_league,2005-01-01,Manchester City,2,Southampton,1,1.7,3.3667,4.7344,1.73,3.5,5.0


In [10]:
# Number of odds rows left after restricting to the reference leagues.
len(league_df)

13588

## Seasons

In [11]:
def season_from_date(row):
    """Return the season, as a year, that a match row belongs to.

    Matches dated in January through July are assigned to the previous
    calendar year; matches dated from August onwards are assigned to the
    year of the match itself.

    Parameters
    ----------
    row : object with a ``match_date`` attribute exposing ``.year`` and ``.month``.

    Returns
    -------
    int
        The season year.
    """
    played_on = row.match_date
    return played_on.year - 1 if played_on.month <= 7 else played_on.year

In [12]:
# Derive each match's season from its date, one row at a time.
league_df['season'] = league_df.apply(season_from_date, axis=1)

league_df.tail()

Unnamed: 0,match_id,league,match_date,home_team,home_score,away_team,away_score,avg_odds_home_win,avg_odds_draw,avg_odds_away_win,max_odds_home_win,max_odds_draw,max_odds_away_win,season
475110,870751,premier_league,2015-05-24,Aston Villa,0,Burnley,1,2.1924,3.4138,3.3055,2.25,3.54,3.5,2014
475111,870750,premier_league,2015-05-24,Arsenal,4,West Brom,1,1.3328,5.4514,8.5714,1.37,6.0,10.0,2014
475118,870753,premier_league,2015-05-24,Crystal Palace,1,Swansea,0,1.8179,3.729,4.3548,2.0,4.1,5.0,2014
475481,871556,bundesliga,2015-05-28,Hamburger SV,1,Karlsruher,1,2.2679,3.1768,3.2704,2.46,3.4,3.85,2014
476386,872810,bundesliga,2015-06-01,Karlsruher,1,Hamburger SV,1,2.1682,3.3089,3.3564,2.3,3.45,3.87,2014


In [13]:
# Seasons present in both data sets, in the order they appear in the
# reference data. The odds seasons are collected once into a set instead of
# recomputing `league_df.season.unique()` for every candidate season.
odds_seasons = set(league_df['season'].unique())
overlapping_seasons = [season for season in matches_df['season'].unique() if season in odds_seasons]
overlapping_seasons

[2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014]

In [14]:
# Restrict both frames to the seasons they have in common.
odds_in_overlap = league_df['season'].isin(overlapping_seasons)
season_df = league_df.loc[odds_in_overlap].copy()

matches_in_overlap = matches_df['season'].isin(overlapping_seasons)
matches_df = matches_df.loc[matches_in_overlap].copy()

len(season_df), len(matches_df)

(12844, 14460)

## Team names

In [15]:
# Reference standings, limited to the shared seasons and the reference leagues.
standings = pd.read_feather(meta_path.joinpath('standings.feather'))
standings = standings[standings['season'].isin(overlapping_seasons)].copy()
league_standings = standings[standings['league'].isin(meta_leagues)].copy()
league_standings.head()

Unnamed: 0,index,season,matchday,rank,team,games_played,wins,draw,lost,total_goals_scored,total_goals_received,points,league
0,0,2005,1,1,Athletic,1,1,0,0,3,0,3,primera_division
1,1,2005,1,2,RC Celta,1,1,0,0,2,0,3,primera_division
2,2,2005,1,2,Getafe,1,1,0,0,2,0,3,primera_division
3,3,2005,1,4,Osasuna,1,1,0,0,2,1,3,primera_division
4,4,2005,1,4,Real Madrid,1,1,0,0,2,1,3,primera_division


In [16]:
# Team spellings per source: reference standings (target) vs. Kaggle home teams (actual).
target_teams = league_standings['team'].unique().tolist()
actual_teams = season_df['home_team'].unique().tolist()

len(target_teams), len(actual_teams)

(138, 134)

In [17]:
# Build a Kaggle-name -> reference-name mapping with the project helper
# `create_translation` (defined outside this notebook; presumably fuzzy string
# matching — TODO confirm). The displayed result contains wrong pairs, e.g.
# 'Nice' is mapped to 'Nàstic', so the mapping needs manual review.
team_translations = create_translation(actual_teams, target_teams)
team_translations

{'Bayern Munich': 'FC Bayern',
 'Wolfsburg': 'Wolfsburg',
 'Lens': 'RC Lens',
 'AC Ajaccio': 'AC Ajaccio',
 'Bordeaux': 'Bordeaux',
 'Metz': 'FC Metz',
 'Monaco': 'AS Monaco',
 'Toulouse': 'Toulouse',
 'Troyes': 'ESTAC Troyes',
 'Duisburg': 'Duisburg',
 'Hamburger SV': 'Hamburger SV',
 'Hannover': 'Hannover 96',
 'Werder Bremen': 'SV Werder',
 'Sochaux': 'Sochaux',
 'Eintracht Frankfurt': 'Frankfurt',
 'Lyon': 'Lyon',
 'Schalke': 'Schalke 04',
 'Rennes': 'Rennes',
 'Auxerre': 'Auxerre',
 'Kaiserslautern': "K'lautern",
 'Everton': 'Everton',
 'Aston Villa': 'Aston Villa',
 'Fulham': 'Fulham',
 'Manchester City': 'Man City',
 'Portsmouth': 'Portsmouth',
 'Sunderland': 'Sunderland',
 'West Ham': 'West Ham',
 'Middlesbrough': 'Middlesbrough',
 'Nurnberg': 'Nürnberg',
 'Paris SG': 'PSG',
 'Le Mans': 'Le Mans',
 'Lille': 'Lille',
 'Nice': 'Nàstic',
 'St Etienne': 'Saint-Étienne',
 'Arminia Bielefeld': 'Bielefeld',
 'Bayer Leverkusen': 'Leverkusen',
 'B. Monchengladbach': "M'gladbach",
 'Dort

In [18]:
# Hand-written overrides (Kaggle name -> reference name) for teams whose
# automatic translation needs correcting.
manual = {'Real Sociedad' : 'S. Sebastian',
          'Santander' : 'Racing',
          'Manchester United' : 'ManUtd',
          'Atl. Madrid' : 'Atlético',
          'Nice' : 'OGC Nizza',
          'Dep. La Coruna' : 'Deportivo',
          'Ath Bilbao' : 'Athletic',
          'Gimnastic' : 'Nàstic',
          'Le Havre' : 'HAC',
          'Gijon' : 'Sporting Gijón',
          'Evian TG' : 'Thonon Évian FC',
          'Rayo Vallecano' : 'Rayo',
          'Wolves': 'Wolverhampton',
          'Valenciennes' : 'VAFC'
         }

In [19]:
# Manual overrides take precedence over the automatically generated entries.
team_translations.update(manual)

In [20]:
# Rewrite both team columns to the reference spellings.
# Assign the result back to each column: an in-place `replace` on
# `season_df[col]` acts on a temporary Series, which warns on pandas 2.x and
# leaves `season_df` unchanged once Copy-on-Write is enabled.
for team_column in ('home_team', 'away_team'):
    season_df[team_column] = season_df[team_column].replace(team_translations)

In [21]:
# Reference teams that never appear as a home team in the translated odds data.
set(league_standings.team) - set(season_df.home_team)

{'AC Arles', 'Blackpool', 'Hércules', 'St. Pauli'}

In [22]:
# True when no two Kaggle names were translated to the same reference name.
targets = list(team_translations.values())
len(set(targets)) == len(targets)

True

In [23]:
from collections import defaultdict

# Invert the translation (reference name -> list of Kaggle names) and print
# every reference name that more than one Kaggle name maps to.
rev = defaultdict(list)
for kaggle_name, reference_name in team_translations.items():
    rev[reference_name].append(kaggle_name)

collisions = {name: sources for name, sources in rev.items() if len(sources) > 1}
for name, sources in collisions.items():
    print(name, sources)

In [24]:
# Spot-check that team names now use the reference spellings.
season_df.head()

Unnamed: 0,match_id,league,match_date,home_team,home_score,away_team,away_score,avg_odds_home_win,avg_odds_draw,avg_odds_away_win,max_odds_home_win,max_odds_draw,max_odds_away_win,season
10162,194719,bundesliga,2005-08-05,FC Bayern,3,M'gladbach,0,1.3118,4.3973,8.9845,1.35,5.0,10.0,2005
10273,194936,bundesliga,2005-08-06,Wolfsburg,2,Dortmund,2,2.4,3.266,2.622,2.6,3.45,2.85,2005
10325,194908,league_one,2005-08-06,RC Lens,2,Marseille,0,2.1783,2.935,3.205,2.45,3.0,3.85,2005
10326,194909,league_one,2005-08-06,AC Ajaccio,3,Lille,3,2.6657,2.8643,2.6014,2.85,3.0,2.75,2005
10327,194910,league_one,2005-08-06,Bordeaux,1,AS Nancy,0,1.5957,3.25,5.7443,1.65,3.39,6.5,2005


## Matchday

In [25]:
# Columns available on the reference side for the upcoming merge.
matches_df.columns

Index(['index', 'season', 'matchday', 'home_team', 'away_team', 'home_goals',
       'away_goals', 'league'],
      dtype='object')

In [26]:
# Attach the reference matchday to each odds row. Rows are matched on season,
# league, both teams and the final score; the default inner join keeps only
# rows present in both frames.
fixture_keys = ['season', 'league', 'home_team', 'away_team']
with_matchday = season_df.merge(
    matches_df,
    left_on=fixture_keys + ['home_score', 'away_score'],
    right_on=fixture_keys + ['home_goals', 'away_goals'],
)
with_matchday.head()

Unnamed: 0,match_id,league,match_date,home_team,home_score,away_team,away_score,avg_odds_home_win,avg_odds_draw,avg_odds_away_win,max_odds_home_win,max_odds_draw,max_odds_away_win,season,index,matchday,home_goals,away_goals
0,194719,bundesliga,2005-08-05,FC Bayern,3,M'gladbach,0,1.3118,4.3973,8.9845,1.35,5.0,10.0,2005,0,1,3,0
1,194936,bundesliga,2005-08-06,Wolfsburg,2,Dortmund,2,2.4,3.266,2.622,2.6,3.45,2.85,2005,4,1,2,2
2,194908,league_one,2005-08-06,RC Lens,2,Marseille,0,2.1783,2.935,3.205,2.45,3.0,3.85,2005,10,2,2,0
3,194909,league_one,2005-08-06,AC Ajaccio,3,Lille,3,2.6657,2.8643,2.6014,2.85,3.0,2.75,2005,11,2,3,3
4,194910,league_one,2005-08-06,Bordeaux,1,AS Nancy,0,1.5957,3.25,5.7443,1.65,3.39,6.5,2005,12,2,1,0


## Standings

In [27]:
# Standings columns that will be attached once per side (home and away).
league_standings.columns

Index(['index', 'season', 'matchday', 'rank', 'team', 'games_played', 'wins',
       'draw', 'lost', 'total_goals_scored', 'total_goals_received', 'points',
       'league'],
      dtype='object')

In [30]:
# Attach the standings row of the home team and then of the away team, matched
# on season, league, team and matchday. `merge_with_prefix` is a project helper
# defined outside this notebook; judging by the resulting columns it prefixes
# the standings columns with the given string — TODO confirm against its source.
with_standings = merge_with_prefix(with_matchday, league_standings, 'home_',
                  left_on=['season', 'league', 'home_team', 'matchday'],
                 right_on=['season', 'league', 'team', 'matchday'])

# Second pass: same join for the away side, on top of the home-side result.
with_standings = merge_with_prefix(with_standings, league_standings, 'away_',
                  left_on=['season', 'league', 'away_team', 'matchday'],
                 right_on=['season', 'league', 'team', 'matchday'])

# NOTE(review): the result shows `team_x` / `team_y` columns, which look like
# un-prefixed leftovers of the standings `team` key — confirm whether intended.
with_standings.head()

Unnamed: 0,match_id,league,match_date,home_team,home_score,away_team,away_score,avg_odds_home_win,avg_odds_draw,avg_odds_away_win,...,away_index,away_rank,team_y,away_games_played,away_wins,away_draw,away_lost,away_total_goals_scored,away_total_goals_received,away_points
0,194719,bundesliga,2005-08-05,FC Bayern,3,M'gladbach,0,1.3118,4.3973,8.9845,...,17,17,M'gladbach,1,0,0,1,0,3,0
1,194936,bundesliga,2005-08-06,Wolfsburg,2,Dortmund,2,2.4,3.266,2.622,...,6,7,Dortmund,1,0,1,0,2,2,1
2,194908,league_one,2005-08-06,RC Lens,2,Marseille,0,2.1783,2.935,3.205,...,38,19,Marseille,2,0,0,2,0,4,0
3,194909,league_one,2005-08-06,AC Ajaccio,3,Lille,3,2.6657,2.8643,2.6014,...,26,7,Lille,2,1,1,0,4,3,4
4,194910,league_one,2005-08-06,Bordeaux,1,AS Nancy,0,1.5957,3.25,5.7443,...,36,17,AS Nancy,2,0,0,2,0,2,0


In [32]:
# Row count after all merges; compare with len(season_df) to see how many rows were lost.
with_standings.shape

(12835, 38)

In [33]:
# Full column list of the merged frame (the head() display elides the middle columns).
with_standings.columns

Index(['match_id', 'league', 'match_date', 'home_team', 'home_score',
       'away_team', 'away_score', 'avg_odds_home_win', 'avg_odds_draw',
       'avg_odds_away_win', 'max_odds_home_win', 'max_odds_draw',
       'max_odds_away_win', 'season', 'index', 'matchday', 'home_goals',
       'away_goals', 'home_index', 'home_rank', 'team_x', 'home_games_played',
       'home_wins', 'home_draw', 'home_lost', 'home_total_goals_scored',
       'home_total_goals_received', 'home_points', 'away_index', 'away_rank',
       'team_y', 'away_games_played', 'away_wins', 'away_draw', 'away_lost',
       'away_total_goals_scored', 'away_total_goals_received', 'away_points'],
      dtype='object')

In [34]:
# Remove the row-index columns carried over from the Feather sources.
# Rebinding the name and passing errors='ignore' keeps the cell re-runnable:
# the in-place drop raised KeyError on a second run because the columns were
# already gone.
with_standings = with_standings.drop(columns=['index', 'home_index', 'away_index'], errors='ignore')
with_standings.head()

Unnamed: 0,match_id,league,match_date,home_team,home_score,away_team,away_score,avg_odds_home_win,avg_odds_draw,avg_odds_away_win,...,home_points,away_rank,team_y,away_games_played,away_wins,away_draw,away_lost,away_total_goals_scored,away_total_goals_received,away_points
0,194719,bundesliga,2005-08-05,FC Bayern,3,M'gladbach,0,1.3118,4.3973,8.9845,...,3,17,M'gladbach,1,0,0,1,0,3,0
1,194936,bundesliga,2005-08-06,Wolfsburg,2,Dortmund,2,2.4,3.266,2.622,...,1,7,Dortmund,1,0,1,0,2,2,1
2,194908,league_one,2005-08-06,RC Lens,2,Marseille,0,2.1783,2.935,3.205,...,3,19,Marseille,2,0,0,2,0,4,0
3,194909,league_one,2005-08-06,AC Ajaccio,3,Lille,3,2.6657,2.8643,2.6014,...,2,7,Lille,2,1,1,0,4,3,4
4,194910,league_one,2005-08-06,Bordeaux,1,AS Nancy,0,1.5957,3.25,5.7443,...,6,17,AS Nancy,2,0,0,2,0,2,0


In [36]:
# Persist the merged odds + matchday + standings frame.
save_path = Path('../../../data/kaggle/with_meta.feather')

# Pass the path itself so pandas opens and closes the file; the handle from
# `save_path.open('wb')` was never closed.
with_standings.to_feather(save_path)