In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

In [93]:
def transform_results(df):
    # Keep dates from year 2015 - 2022
    df['year'] = df['Date'].apply(lambda r: int(r.split('.')[-1]))
    df = df[df['year'] >= 2015].reset_index(drop=True)
    df = df[['year', 'Home', 'Away', 'H_Score', 'A_Score']]
    df = df.rename(
        columns={
            'Home': 'home_team', 
            'Away': 'away_team',
            'H_Score': 'home_score',
            'A_Score': 'away_score'
        }
    )
    # Change from results from
    # team1 | team2 | score1 | score2
    # to
    # team1 | team2 | score1
    # team2 | team1 | score2
    year = np.hstack((df['year'].values, df['year'].values))
    team1 = np.hstack((df['home_team'].values, df['away_team'].values))
    team2 = np.hstack((df['away_team'].values, df['home_team'].values))
    score = np.hstack((df['home_score'].values, df['away_score'].values))
    home_team = np.hstack((np.ones(len(df), dtype=int), np.zeros(len(df), dtype=int)))
    df = pd.DataFrame({'year': year, 'team1': team1, 'team2': team2, 'team1_score': score, 'team1_home': home_team})
    df['team1'] = df['team1'].astype('string')
    df['team2'] = df['team2'].astype('string')
    df = df.dropna(subset='team1_score')
    df['team1_score'] = df['team1_score'].apply(int)
    return df

def aggregate_teams(df):
    # Choose columns to keep.
    columns = (['year', 'overall', 'potential', 'value_eur', 'wage_eur', 'age', 'height_cm', 
        'weight_kg', 'club_name'] + 
        [f for f in df.columns if f.startswith(('attacking_', 'skill_', 'movement_', 'power_', 
            'mentality_', 'defending_', 'goalkeeping_'))])
    columns.remove('mentality_composure')
    columns.remove('goalkeeping_speed')
    df = df[columns]
    df = df.fillna(0)
    # For each feature, aggregate mean, min, and max per team
    df_agg = df.groupby(['year', 'club_name']).agg(['mean', 'min', 'max'])
    df_agg.columns = df_agg.columns.map('_'.join)
    keep_columns = [f for f in df_agg.columns if not (f.startswith('goalkeeping_') and f.endswith(('mean', 'min')))]
    df_agg = df_agg[keep_columns].reset_index()
    df_agg = df_agg.rename(columns={'club_name': 'team'})
    df_agg['team'] = df_agg['team'].astype('string')
    return df_agg

def merge_results_teams(res, teams):
    match_list = []
    val_cols = [c for c in teams.columns if c not in ['year', 'team']]
    col_names = ['team1_' + c for c in val_cols] + ['team2_' + c for c in val_cols]
    # Translate team names into aggregated gold_columns for that team
    for row in res.itertuples(index=False):
        year = row.year
        team1 = row.team1
        team2 = row.team2
        if not ((team1 in teams[teams['year'] == year].values) and (team2 in teams[teams['year'] == year].values)):
            continue
        idx1 = (teams['year'] == year) & (teams['team'] == team1)
        idx2 = (teams['year'] == year) & (teams['team'] == team2)
        col_vals = np.hstack((
            teams.loc[idx1, val_cols].values[0],
            teams.loc[idx2, val_cols].values[0]
        ))
        d = {col_name: col_val for (col_name, col_val) in zip(col_names, col_vals)}
        d['team1'] = team1
        d['team2'] = team2
        d['team1_score'] = row.team1_score
        d['team1_home'] = row.team1_home
        match_list.append(d)
    return pd.DataFrame(match_list)

def feature_engineering(player_filename, results_filename):
    df_teams = pd.read_csv(player_filename)
    df_results = pd.read_csv(results_filename)
    teams1 = set(df_results['Home']).union(set(df_results['Away']))
    teams2 = set(df_teams['club_name'])
    teams = teams1.intersection(teams2)
    df_results = df_results[df_results['Home'].isin(teams) & df_results['Away'].isin(teams)]
    df_teams = df_teams[df_teams['club_name'].isin(teams)]
    df_teams = aggregate_teams(df_teams)
    df_results = transform_results(df_results)
    df = merge_results_teams(df_results, df_teams)
    return df, df_teams

In [94]:
df, df_teams = feature_engineering('players_all_years.csv', 'full_data.csv')

  df_teams = pd.read_csv(player_filename)


In [96]:
df.to_parquet('league_games.parquet')