In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
from pathlib import Path

# Concatenate the downloaded files

In [3]:
# League whose downloaded files are processed by the cells below.
league = 'league_one'
# Directory that is searched for this league's raw `*_matchdays`, `*_standings`
# and `*_games` feather files.
path = Path(f'../../data/sportde/{league}/raw')

In [4]:
# Collect the raw files of each kind. `Path.glob` yields entries in an
# arbitrary, file-system dependent order, so the results are sorted to make the
# row order of the concatenated frames reproducible.
matchday_paths = sorted(path.glob('*_matchdays.feather'))
standings_paths = sorted(path.glob('*_standings.feather'))
games_paths = sorted(path.glob('*_games.feather'))

In [5]:
def merge_dfs_from_paths(paths):
    """Read several feather files and stack their rows into one DataFrame.

    Parameters
    ----------
    paths : iterable of path-like
        Feather files to read. Rows appear in the result in the order the
        paths are given.

    Returns
    -------
    pandas.DataFrame
        The rows of all files with a fresh ``0..n-1`` index.

    Raises
    ------
    ValueError
        If ``paths`` is empty.
    AssertionError
        If the merged frame does not have as many rows as the inputs combined.
    """
    paths = list(paths)
    if not paths:
        # pd.concat would fail here too, but with a message that does not
        # point at the real cause (no matching files were found).
        raise ValueError('No feather files to merge: `paths` is empty')

    dfs = [pd.read_feather(path) for path in paths]
    expected_len = sum(len(df) for df in dfs)

    # ignore_index=True discards the per-file row labels and numbers the rows
    # consecutively, which is what concat + reset_index(drop=True) produced.
    concat = pd.concat(dfs, ignore_index=True)

    assert len(concat) == expected_len, f'Expected len: {expected_len}, got: {len(concat)}'

    return concat

In [6]:
# Matchdays: stack all raw files and store the result one level above `raw`.
matchday_df = merge_dfs_from_paths(matchday_paths)
matchday_target = path/'../matchdays.feather'
matchday_df.to_feather(matchday_target)

# Standings: same procedure.
standings_df = merge_dfs_from_paths(standings_paths)
standings_target = path/'../standings.feather'
standings_df.to_feather(standings_target)

# Games: same procedure.
games_df = merge_dfs_from_paths(games_paths)
games_target = path/'../games.feather'
games_df.to_feather(games_target)

# Sport.de data validation

In [7]:
# Per-league data directory; the validation cells read the league's
# `standings.feather` and `games.feather` from here.
data_dir = Path(f'../../data/sportde/{league}')

In [8]:
# Load the league's combined standings table and preview its first rows.
standings_file = data_dir/'standings.feather'
standings = pd.read_feather(standings_file)
standings.head()

Unnamed: 0,season,matchday,rank,team,games_played,wins,draw,lost,total_goals_scored,total_goals_received,points,league
0,506,1,1,PSG,1,1,0,0,4,1,3,league_one
1,506,1,2,FC Nantes,1,1,0,0,2,0,3,league_one
2,506,1,2,Bordeaux,1,1,0,0,2,0,3,league_one
3,506,1,4,Lyon,1,1,0,0,2,1,3,league_one
4,506,1,5,AS Monaco,1,1,0,0,1,0,3,league_one


In [9]:
# Distinct team names that occur in the standings table.
teams = standings['team'].unique()
teams

array(['PSG', 'FC Nantes', 'Bordeaux', 'Lyon', 'AS Monaco', 'Lille',
       'Toulouse', 'ESTAC Troyes', 'OGC Nizza', 'AC Ajaccio', 'Auxerre',
       'Saint-Étienne', 'Strasbourg', 'Le Mans', 'AS Nancy', 'Sochaux',
       'Rennes', 'Marseille', 'RC Lens', 'FC Metz', 'Lorient', 'VAFC',
       'CS Sedan', 'SM Caen', 'Grenoble', 'HAC', 'Montpellier',
       'US Boulogne', 'AC Arles', 'Stade Brest', 'Thonon Évian FC',
       'Dijon FCO', 'Bastia', 'Stade Reims', 'Guingamp', 'Angers',
       'GFC Ajaccio', 'Amiens', 'Nîmes'], dtype=object)

In [10]:
games = pd.read_feather(data_dir/'games.feather')

# Turn the team names of both tables into sets so they can be compared.
teams = set(teams)
home_teams = set(games.home_team.unique())
away_teams = set(games.away_team.unique())

# Both entries are True when every home / away team of the games table also
# occurs in the standings table.
(home_teams <= teams, away_teams <= teams)

(True, True)

# Merge all leagues

In [21]:
# Root directory that holds one sub-directory per league.
base_path = Path(f'../../data/sportde/')
# Names of the combined files that are looked up inside each league directory.
filetypes = ['games.feather', 'matchdays.feather', 'standings.feather']
# Show what the root directory currently contains.
list(base_path.glob('*'))

[PosixPath('../../data/sportde/primera_division'),
 PosixPath('../../data/sportde/bundesliga'),
 PosixPath('../../data/sportde/premier_league'),
 PosixPath('../../data/sportde/league_one')]

In [28]:
# Combine the per-league files into one file per file type inside `base_path`.
# League directories are visited in sorted order so that the row order of the
# combined files does not depend on the file system's listing order.
league_dirs = sorted(league_path for league_path in base_path.glob('*/') if league_path.is_dir())

for filetype in filetypes:
    files = [league_path/filetype for league_path in league_dirs]

    # Fail with the offending paths instead of a bare IndexError when a league
    # directory lacks one of the expected files.
    missing = [str(file) for file in files if not file.is_file()]
    if missing:
        raise FileNotFoundError(f'Missing {filetype} for: {missing}')

    dfs = [pd.read_feather(file) for file in files]
    # ignore_index=True numbers the rows consecutively. A plain reset_index()
    # would also keep the old per-league row labels as an extra `index` column
    # in the written file.
    df = pd.concat(dfs, ignore_index=True)

    df.to_feather(base_path/filetype)

In [29]:
 # List the root directory again; the combined files written above show up here.
 list(base_path.glob('*'))

[PosixPath('../../data/sportde/games.feather'),
 PosixPath('../../data/sportde/primera_division'),
 PosixPath('../../data/sportde/bundesliga'),
 PosixPath('../../data/sportde/standings.feather'),
 PosixPath('../../data/sportde/premier_league'),
 PosixPath('../../data/sportde/matchdays.feather'),
 PosixPath('../../data/sportde/league_one')]

### Fix seasons: convert season codes to four-digit start years

In [37]:
# Rewrite the `season` column of every combined file: each season code (a
# string whose first two characters are parsed as a two-digit year) becomes
# 2000 + that year.
# NOTE(review): codes for seasons before 2000 would be mapped into the 2000s
# as well - confirm the data contains none.
for filetype in filetypes:
    df = pd.read_feather(base_path/filetype)

    # A numeric column means the file was converted by an earlier run. Skipping
    # it makes the cell safe to re-run instead of failing on slicing an integer.
    if pd.api.types.is_numeric_dtype(df['season']):
        continue

    seasons = {season_str: 2000 + int(season_str[:2]) for season_str in df['season'].unique()}
    # Assign the mapped column back to the frame. An in-place replace on
    # `df.season` acts on a temporary Series under pandas Copy-on-Write and
    # would leave `df` unchanged.
    df['season'] = df['season'].map(seasons)

    df.to_feather(base_path/filetype)