In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
from pathlib import Path

# Concatenate downloaded files

In [3]:
# League whose raw downloads are processed; also reused further down to build data_dir.
league = 'serie_a'
# Directory that is globbed for the raw *_matchdays / *_standings / *_games feather files.
path = Path(f'../../data/sportde/{league}/raw')

In [4]:
# Path.glob yields matches in arbitrary, filesystem-dependent order; sorting
# keeps the row order of the concatenated frames reproducible across runs.
matchday_paths = sorted(path.glob('*_matchdays.feather'))
standings_paths = sorted(path.glob('*_standings.feather'))
games_paths = sorted(path.glob('*_games.feather'))

In [5]:
def merge_dfs_from_paths(paths):
    """Read several feather files and stack them into a single DataFrame.

    Parameters
    ----------
    paths : iterable of path-like
        Feather files to read; rows appear in the output in this order.

    Returns
    -------
    pandas.DataFrame
        The rows of all files stacked vertically, with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``paths`` contains no entries.
    """
    # Materialise once so the emptiness check also works for generators.
    paths = list(paths)
    if not paths:
        raise ValueError('No feather files to merge: `paths` is empty')

    dfs = [pd.read_feather(file_path) for file_path in paths]
    expected_len = sum(len(df) for df in dfs)

    # ignore_index renumbers the rows 0..n-1 instead of repeating each file's
    # own index, replacing the separate reset_index(drop=True) step.
    concat = pd.concat(dfs, ignore_index=True)

    # Sanity check: stacking must neither lose nor duplicate rows.
    assert len(concat)==expected_len, f'Expected len: {expected_len}, got: {len(concat)}'

    return concat

In [6]:
def _merge_and_save(source_paths, target):
    """Merge the feather files in ``source_paths``, write the result to ``target`` and return it."""
    merged = merge_dfs_from_paths(source_paths)
    merged.to_feather(target)
    return merged


# One merged file per kind, written one level above the raw directory.
matchday_df = _merge_and_save(matchday_paths, path/'../matchdays.feather')
standings_df = _merge_and_save(standings_paths, path/'../standings.feather')
games_df = _merge_and_save(games_paths, path/'../games.feather')

# Sport.de data validation

In [7]:
# Per-league directory; the standings and games feather files are read from here below.
data_dir = Path(f'../../data/sportde/{league}')

In [8]:
# Load the league's standings table and preview its first rows.
standings = pd.read_feather(data_dir/'standings.feather')
standings.head()

Unnamed: 0,season,matchday,rank,team,games_played,wins,draw,lost,total_goals_scored,total_goals_received,points,league
0,506,1,1,AS Rom,1,1,0,0,3,0,3,serie_a
1,506,1,1,Inter,1,1,0,0,3,0,3,serie_a
2,506,1,3,Livorno,1,1,0,0,2,1,3,serie_a
3,506,1,3,Siena,1,1,0,0,2,1,3,serie_a
4,506,1,5,Juventus,1,1,0,0,1,0,3,serie_a


In [9]:
# Distinct team names that occur in the standings table.
teams = standings.team.unique()
teams

array(['AS Rom', 'Inter', 'Livorno', 'Siena', 'Juventus', 'Udinese',
       'Ascoli', 'Palermo', 'Parma', 'Cagliari', 'Sampdoria', 'Lecce',
       'Messina', 'Chievo Verona', 'Empoli', 'Treviso', 'Reggina',
       'AC Florenz', 'Lazio Rom', 'AC Mailand', 'Atalanta', 'Catania',
       'FC Turin', 'SSC Neapel', 'FC Genua', 'Bologna', 'Bari', 'Cesena',
       'Brescia', 'Novara', 'Pescara', 'Hellas Verona', 'Sassuolo',
       'Frosinone', 'Carpi', 'Crotone', 'SPAL Ferrara', 'Benevento',
       'Spezia'], dtype=object)

In [10]:
# Check that every club named in a fixture also occurs in the standings.
games = pd.read_feather(data_dir/'games.feather')

teams = set(teams)
home_teams = set(games.home_team.unique())
away_teams = set(games.away_team.unique())

# (home teams covered, away teams covered)
home_teams <= teams, away_teams <= teams

(True, True)

# Merge all leagues

In [11]:
# File names that are looked up in each league directory and merged across leagues.
filetypes = ['games.feather', 'matchdays.feather', 'standings.feather']
# Root data directory (plain literal: the string has no placeholders, so no f-prefix).
base_path = Path('../../data/sportde/')
# Sorted so the listing does not depend on filesystem enumeration order.
sorted(base_path.glob('*'))

[WindowsPath('../../data/sportde/bundesliga'),
 WindowsPath('../../data/sportde/games.feather'),
 WindowsPath('../../data/sportde/league_one'),
 WindowsPath('../../data/sportde/matchdays.feather'),
 WindowsPath('../../data/sportde/premier_league'),
 WindowsPath('../../data/sportde/primera_division'),
 WindowsPath('../../data/sportde/serie_a'),
 WindowsPath('../../data/sportde/standings.feather')]

In [12]:
# Only sub-directories are treated as leagues; the merged feather files this
# cell writes into base_path itself are skipped. Sorted for a stable row order.
league_dirs = sorted(entry for entry in base_path.iterdir() if entry.is_dir())

for filetype in filetypes:
    files = [league_dir / filetype for league_dir in league_dirs]

    # Report the offending paths instead of failing with an opaque IndexError.
    missing = [str(file) for file in files if not file.is_file()]
    if missing:
        raise FileNotFoundError(f'Missing per-league files: {missing}')

    # Reuse the notebook's merge helper: it drops the old per-league row index,
    # so no stray `index` column ends up in the merged file.
    df = merge_dfs_from_paths(files)

    df.to_feather(base_path/filetype)

In [13]:
 # List the data directory again after the merged files have been written.
 list(base_path.glob('*'))

[WindowsPath('../../data/sportde/bundesliga'),
 WindowsPath('../../data/sportde/games.feather'),
 WindowsPath('../../data/sportde/league_one'),
 WindowsPath('../../data/sportde/matchdays.feather'),
 WindowsPath('../../data/sportde/premier_league'),
 WindowsPath('../../data/sportde/primera_division'),
 WindowsPath('../../data/sportde/serie_a'),
 WindowsPath('../../data/sportde/standings.feather')]

### Fix seasons

In [14]:
for filetype in filetypes:
    df = pd.read_feather(base_path/filetype)

    # The file is overwritten in place, so a second run would see integer
    # seasons; slicing those would raise a TypeError. Skip converted files.
    if pd.api.types.is_integer_dtype(df['season']):
        continue

    # The first two characters of the season code are read as a two-digit year
    # and offset by 2000.
    # NOTE(review): assumes codes such as '0506' and no seasons before 2000 --
    # confirm against the raw data.
    # Assigning the column (rather than calling replace(..., inplace=True) on
    # `df.season`) avoids chained in-place mutation, which is unreliable under
    # pandas Copy-on-Write.
    df['season'] = 2000 + df['season'].str[:2].astype(int)

    df.to_feather(base_path/filetype)