In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
# default_exp data.cleanup

In [None]:
# export
import pandas as pd
from betting.utility import *

In [None]:
from pathlib import Path

# General utility

In [None]:
# Root folder of the raw match tables; `league_dir` below is a sub-folder of it.
hub_dir = Path('../data/datahub')
# Folder passed to load_meta, which reads games.feather and matchdays.feather from it.
meta_dir = Path('../data/sportde/')

# Folder of the league used as the running example in this notebook.
league_dir = hub_dir / 'french-ligue-1_zip'
# League label written into the `Div` column when the test data is loaded further down.
league = 'league_one'

## Load meta

In [None]:
# export
def load_meta(meta_dir):
    """Read the games and matchdays tables from `meta_dir` and join them.

    `meta_dir` must support the `/` operator (e.g. a `pathlib.Path`); the files
    `games.feather` and `matchdays.feather` are read from it.

    Returns one row per game that has a matching matchday entry (inner join on
    `matchday`, `season` and `league`). Columns present in both tables but not
    used as join keys receive the pandas default `_x` / `_y` suffixes.
    """
    join_keys = ['matchday', 'season', 'league']
    games = pd.read_feather(meta_dir/'games.feather')
    matchdays = pd.read_feather(meta_dir/'matchdays.feather')
    return games.merge(matchdays, on=join_keys)

In [None]:
# Join the games and matchdays tables found in `meta_dir` and preview the first rows.
meta = load_meta(meta_dir)
meta.head()

Unnamed: 0,index_x,season,matchday,home_team,away_team,home_goals,away_goals,league,index_y,start_date,end_date
0,0,2005,1,Alavés,Barcelona,0,0,primera_division,0,2005-08-27,2005-08-28
1,1,2005,1,Valencia,Betis Sevilla,1,0,primera_division,0,2005-08-27,2005-08-28
2,2,2005,1,Athletic,S. Sebastian,3,0,primera_division,0,2005-08-27,2005-08-28
3,3,2005,1,FC Sevilla,Racing,1,0,primera_division,0,2005-08-27,2005-08-28
4,4,2005,1,RC Celta,Málaga,2,0,primera_division,0,2005-08-27,2005-08-28


## Season from date and meta

In [None]:
# Load the example league's match table from `concat.feather` and preview the first rows.
actual_matches = pd.read_feather(league_dir / 'concat.feather')
actual_matches.head()

Unnamed: 0,index,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,B365H,B365D,B365A
0,0,F1,2009-08-08,Auxerre,Sochaux,0,1,2.0,3.0,4.33
1,1,F1,2009-08-08,Grenoble,Marseille,0,2,4.75,3.1,1.91
2,2,F1,2009-08-08,Le Mans,Lyon,2,2,3.75,3.1,2.1
3,3,F1,2009-08-08,Monaco,Toulouse,1,0,2.3,3.0,3.4
4,4,F1,2009-08-08,Montpellier,Paris SG,1,1,3.5,3.0,2.25


In [None]:
# export
def add_season_from_date(actual_df, meta_df, date_col, league=None):
    """Label each row of `actual_df` with the season whose date range contains it.

    For every season in `meta_df` the range runs from the earliest `start_date`
    to the latest `end_date` of that season, both ends inclusive. If ranges
    overlap, the season that comes later in the `groupby` order wins.

    Parameters
    ----------
    actual_df : DataFrame holding the dates to label. It is modified in place:
        a `season` column is added, or updated if it already exists.
    meta_df : DataFrame with `season`, `start_date` and `end_date` columns
        (and a `league` column when `league` is given).
    date_col : name of the date column in `actual_df`; its values must be
        comparable with the `start_date` / `end_date` values of `meta_df`.
    league : optional league label. When given, only the rows of `meta_df`
        with that `league` define the season ranges. Without it the ranges
        span every league in `meta_df`, which can mislabel rows when leagues
        have overlapping calendars.

    Returns
    -------
    The same `actual_df` object, with `season` cast to int.

    Raises
    ------
    ValueError
        If any row has no season after all ranges were applied. The `season`
        column is then left in place with NaN for those rows.
    """
    if league is not None:
        meta_df = meta_df.loc[meta_df['league'] == league]

    # Create the column up front so an empty `meta_df` (or ranges that match
    # nothing) cannot leave `actual_df` without a `season` column.
    if 'season' not in actual_df.columns:
        actual_df['season'] = float('nan')

    dates = actual_df[date_col]
    for season, season_df in meta_df.groupby('season'):
        start = season_df['start_date'].min()
        end = season_df['end_date'].max()
        # `between` is inclusive on both ends: start <= date <= end.
        actual_df.loc[dates.between(start, end), 'season'] = season

    # An int column cannot hold NaN, so report unlabelled rows explicitly
    # instead of letting the cast fail with an opaque pandas error.
    unlabelled = actual_df['season'].isna()
    if unlabelled.any():
        raise ValueError(
            f'{int(unlabelled.sum())} of {len(actual_df)} rows have a {date_col!r} value '
            f'outside every season range in meta_df; cannot assign an integer season'
        )

    actual_df['season'] = actual_df['season'].astype(int)
    return actual_df

In [None]:
# NOTE: add_season_from_date writes the `season` column into `actual_matches` itself
# and returns that same object, so `r` and `actual_matches` refer to one frame.
r = add_season_from_date(actual_matches, meta, 'Date')
r

Unnamed: 0,index,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,B365H,B365D,B365A,season
0,0,F1,2009-08-08,Auxerre,Sochaux,0,1,2.00,3.0,4.33,2009
1,1,F1,2009-08-08,Grenoble,Marseille,0,2,4.75,3.1,1.91,2009
2,2,F1,2009-08-08,Le Mans,Lyon,2,2,3.75,3.1,2.10,2009
3,3,F1,2009-08-08,Monaco,Toulouse,1,0,2.30,3.0,3.40,2009
4,4,F1,2009-08-08,Montpellier,Paris SG,1,1,3.50,3.0,2.25,2009
...,...,...,...,...,...,...,...,...,...,...,...
3794,375,F1,2019-05-24,Nantes,Strasbourg,0,1,1.65,3.8,5.50,2018
3795,376,F1,2019-05-24,Nice,Monaco,2,0,3.10,3.5,2.25,2018
3796,377,F1,2019-05-24,Nimes,Lyon,2,3,3.40,3.8,2.00,2018
3797,378,F1,2019-05-24,Reims,Paris SG,3,1,6.50,5.0,1.45,2018


## Matching team names

This step is specific to the meta data from the `sportde` source.

### Load test data

In [None]:
# Reload from disk: the earlier add_season_from_date call wrote `season` into the previous frame.
actual_matches = pd.read_feather(league_dir / 'concat.feather')

# Overwrite the division code from the file with the league label defined at the top.
actual_matches.Div = league
actual_matches = add_season_from_date(actual_matches, meta, 'Date')
actual_matches.head()

Unnamed: 0,index,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,B365H,B365D,B365A,season
0,0,league_one,2009-08-08,Auxerre,Sochaux,0,1,2.0,3.0,4.33,2009
1,1,league_one,2009-08-08,Grenoble,Marseille,0,2,4.75,3.1,1.91,2009
2,2,league_one,2009-08-08,Le Mans,Lyon,2,2,3.75,3.1,2.1,2009
3,3,league_one,2009-08-08,Monaco,Toulouse,1,0,2.3,3.0,3.4,2009
4,4,league_one,2009-08-08,Montpellier,Paris SG,1,1,3.5,3.0,2.25,2009


Goal: Match each actual team name against the corresponding team name in the targets (the meta data). Multiple actual names might be matched against the same target name.

### Match

In [None]:
# export
def match_teams_from_goals(actuals_df, meta_df, home_team, away_team, home_score, away_score, league, date_col, season_col='season'):
    """Map the team names used in `actuals_df` to the team names used in `meta_df`.

    For every matchday of `league` in `meta_df`, the fixtures of `actuals_df`
    that fall into the matchday's date window are compared by final score.
    When a score occurs exactly once in the meta matchday and exactly once in
    the actual fixtures of that window, the two rows are taken to be the same
    game and their home/away team names are paired.

    Parameters
    ----------
    actuals_df : DataFrame with the columns named by `home_team`, `away_team`,
        `home_score`, `away_score`, `date_col` and `season_col`.
    meta_df : DataFrame with `league`, `season`, `matchday`, `start_date`,
        `end_date`, `home_team`, `away_team`, `home_goals` and `away_goals`.
    home_team, away_team : names of the team columns in `actuals_df`.
    home_score, away_score : names of the goal columns in `actuals_df`.
    league : value of `meta_df.league` to match against.
    date_col : name of the date column in `actuals_df`; values must be
        comparable with `start_date` / `end_date` of `meta_df`.
    season_col : name of the season column in `actuals_df`.

    Returns
    -------
    dict mapping each actual team name to its meta team name, or to `''` when
    no unambiguous game was found for that team.

    Notes
    -----
    `unique_in_cols` and `unique_elements` come from `betting.utility` and are
    not visible here. Presumably they return the distinct values of the given
    columns and the candidate scores of a matchday; verify against that module.
    The function stays correct either way, because a pairing is only accepted
    when both sides contain exactly one row.
    """
    actual_names = unique_in_cols(actuals_df, [home_team, away_team])
    n_actuals = len(actual_names)

    translation = {}

    # Loop-invariant work: filter the meta data by league once, fetch dates once.
    league_meta = meta_df.loc[meta_df.league == league]
    dates = actuals_df[date_col]

    for season in actuals_df[season_col].unique():
        season_meta = league_meta.loc[league_meta.season == season]

        for _, matchday_df in season_meta.groupby('matchday'):
            start_date = matchday_df.start_date.unique()[0]
            end_date = matchday_df.end_date.unique()[0]

            # Actual fixtures played within this matchday (inclusive on both ends).
            window_df = actuals_df.loc[dates.between(start_date, end_date)]
            if window_df.empty:
                continue

            results = [f'{home}:{away}' for home, away in matchday_df.loc[:, ['home_goals', 'away_goals']].values]

            for result in unique_elements(results):
                home_goals, away_goals = (int(part) for part in result.split(':'))

                actual_rows = window_df.loc[(window_df[home_score] == home_goals) & (window_df[away_score] == away_goals)]
                meta_rows = matchday_df.loc[(matchday_df.home_goals == home_goals) & (matchday_df.away_goals == away_goals)]

                # Only a 1:1 pairing identifies the teams. With zero rows there is
                # nothing to pair; with several rows (e.g. a rescheduled game with
                # the same score inside the window) picking one would be a guess.
                if len(actual_rows) != 1 or len(meta_rows) != 1:
                    continue

                translation[actual_rows[home_team].iloc[0]] = meta_rows.home_team.iloc[0]
                translation[actual_rows[away_team].iloc[0]] = meta_rows.away_team.iloc[0]

                # Stop as soon as every actual name has a translation.
                if len(translation) == n_actuals:
                    return translation

    # Some actual names could not be matched: report them with an empty target.
    for actual_name in actual_names:
        translation.setdefault(actual_name, '')
    return translation

In [None]:
%%time
# Build the actual -> meta team-name mapping for the example league; the cell is timed by %%time.
match_teams_from_goals(actual_matches, meta, 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'league_one', 'Date')

Wall time: 7.58 s


{'Auxerre': 'Auxerre',
 'Sochaux': 'Sochaux',
 'Le Mans': 'Le Mans',
 'Lyon': 'Lyon',
 'Monaco': 'AS Monaco',
 'Toulouse': 'Toulouse',
 'Montpellier': 'Montpellier',
 'Paris SG': 'PSG',
 'Rennes': 'Rennes',
 'Boulogne': 'US Boulogne',
 'Valenciennes': 'VAFC',
 'Nancy': 'AS Nancy',
 'Lille': 'Lille',
 'Lorient': 'Lorient',
 'Bordeaux': 'Bordeaux',
 'Lens': 'RC Lens',
 'Grenoble': 'Grenoble',
 'Nice': 'OGC Nizza',
 'St Etienne': 'Saint-Étienne',
 'Marseille': 'Marseille',
 'Arles': 'AC Arles',
 'Brest': 'Stade Brest',
 'Caen': 'SM Caen',
 'Ajaccio': 'AC Ajaccio',
 'Dijon': 'Dijon FCO',
 'Evian Thonon Gaillard': 'Thonon Évian FC',
 'Troyes': 'ESTAC Troyes',
 'Reims': 'Stade Reims',
 'Bastia': 'Bastia',
 'Guingamp': 'Guingamp',
 'Nantes': 'FC Nantes',
 'Metz': 'FC Metz',
 'Angers': 'Angers',
 'Ajaccio GFCO': 'GFC Ajaccio',
 'Amiens': 'Amiens',
 'Strasbourg': 'Strasbourg',
 'Nimes': 'Nîmes'}

## Export

In [None]:
#hide
from nbdev.export import *
# Run nbdev's notebook2script(); the output below lists the notebooks it converted.
notebook2script()

Converted 00_scraping.ipynb.
Converted 01_utility.ipynb.
Converted 02_data_football_uk.ipynb.
Converted 03_sportde.ipynb.
Converted 04_data_cleanup.ipynb.
Converted 04_feature_engineering.ipynb.
Converted index.ipynb.
