In [2]:
import pandas as pd
import numpy as np
from datetime import datetime

In [3]:
def transform_results(df):
    # Keep dates from year 2015 - 2022
    df['date'] = df['date'].apply(datetime.fromisoformat)
    df = df[df['date'] >= datetime(2015, 1, 1)].reset_index(drop=True)
    df['year'] = df['date'].apply(lambda x: int(x.year))
    df = df[['year', 'home_team', 'away_team', 'home_score', 'away_score']]
    # Change from results from
    # team1 | team2 | score1 | score2
    # to
    # team1 | team2 | score1
    # team2 | team1 | score2
    year = np.hstack((df['year'].values, df['year'].values))
    team1 = np.hstack((df['home_team'].values, df['away_team'].values))
    team2 = np.hstack((df['away_team'].values, df['home_team'].values))
    score = np.hstack((df['home_score'].values, df['away_score'].values))
    home_team = np.hstack((np.ones(len(df), dtype=int), np.zeros(len(df), dtype=int)))
    df = pd.DataFrame({'year': year, 'team1': team1, 'team2': team2, 'team1_score': score, 'team1_home': home_team})
    df['team1'] = df['team1'].astype('string')
    df['team2'] = df['team2'].astype('string')
    return df

def aggregate_teams(df, min_players=23):
    # Choose columns to keep.
    columns = (['year', 'overall', 'potential', 'value_eur', 'wage_eur', 'age', 'height_cm', 
        'weight_kg', 'nationality_name', 'nation_team_id'] + 
        [f for f in df.columns if f.startswith(('attacking_', 'skill_', 'movement_', 'power_', 
            'mentality_', 'defending_', 'goalkeeping_'))])
    columns.remove('mentality_composure')
    columns.remove('goalkeeping_speed')
    df = df[columns]
    # Teams with defined national team
    df_nat_team = df.dropna(subset=['nation_team_id'])
    df_nat_team = df_nat_team.drop(columns=['nation_team_id'])
    df = df.drop(columns=['nation_team_id'])
    no_nat_names = set(df['nationality_name'].unique()).difference(set(df_nat_team['nationality_name'].unique()))
    # Teams without a defined national team but with at least min_players players
    df_no_nat_team = df[df['nationality_name'].isin(no_nat_names)]\
        .groupby(['year', 'nationality_name'])\
        .filter(lambda g: g.count()['overall'] >= min_players)\
        .sort_values(['overall'], ascending=False)\
        .groupby(['year', 'nationality_name'])\
        .apply(lambda g: g[:min_players])\
        .reset_index(drop=True)
    df = pd.concat((df_nat_team, df_no_nat_team), ignore_index=True)
    # Set nan for value and wage to 0
    df = df.fillna(0)
    # For each feature, aggregate mean, min, and max per team
    df_agg = df.groupby(['year', 'nationality_name']).agg(['mean', 'min', 'max'])
    df_agg.columns = df_agg.columns.map('_'.join)
    keep_columns = [f for f in df_agg.columns if not (f.startswith('goalkeeping_') and f.endswith(('mean', 'min')))]
    df_agg = df_agg[keep_columns].reset_index()
    df_agg = df_agg.rename(columns={'nationality_name': 'team'})
    df_agg['team'] = df_agg['team'].astype('string')
    return df_agg

def merge_results_teams(res, teams):
    match_list = []
    val_cols = [c for c in teams.columns if c not in ['year', 'team']]
    col_names = ['team1_' + c for c in val_cols] + ['team2_' + c for c in val_cols]
    # Translate team names into aggregated gold_columns for that team
    for row in res.itertuples(index=False):
        year = row.year
        team1 = row.team1
        team2 = row.team2
        if not ((team1 in teams[teams['year'] == year].values) and (team2 in teams[teams['year'] == year].values)):
            continue
        idx1 = (teams['year'] == year) & (teams['team'] == team1)
        idx2 = (teams['year'] == year) & (teams['team'] == team2)
        col_vals = np.hstack((
            teams.loc[idx1, val_cols].values[0],
            teams.loc[idx2, val_cols].values[0]
        ))
        d = {col_name: col_val for (col_name, col_val) in zip(col_names, col_vals)}
        d['team1'] = team1
        d['team2'] = team2
        d['team1_score'] = row.team1_score
        d['team1_home'] = row.team1_home
        match_list.append(d)
    return pd.DataFrame(match_list)

def feature_engineering(player_filename, results_filename):
    df_teams = pd.read_csv(player_filename)
    df_teams = aggregate_teams(df_teams, min_players=1)
    df_results = pd.read_csv(results_filename)
    df_results = transform_results(df_results)
    df = merge_results_teams(df_results, df_teams)
    return df, df_teams

In [4]:
df, df_teams = feature_engineering('players_all_years.csv', 'results.csv')

  df_teams = pd.read_csv(player_filename)


In [5]:
df.to_parquet('national_games.parquet')