In [19]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [20]:
# Date strings that are interpolated into the run folder name below.
start_date = "2025-08-19"
end_date = "2025-08-21"
# Folder that every CSV read/write in this notebook is relative to.
# NOTE(review): the doubled slashes are kept verbatim — confirm whether single
# path separators were intended.
folder_name = f"v2_data//pred_data//test_pred_v2_{start_date}_{end_date}"

Read the CSV files for `player_match_stats` and `all_stats`.

In [None]:
# Load the two input tables from the run folder.
# player_match_stats supplies the per-player match rows; all_stats supplies the
# stats that are later joined on account_id + hero_id.
player_match_stats = pd.read_csv(f"{folder_name}/player_match_stats.csv")
all_stats = pd.read_csv(f"{folder_name}/all_stats.csv")

Merge the match rows with the player stats, dropping the extra columns from the match table.

Note: the match table could be trimmed to these columns earlier in the process.

In [23]:
def merge_match_player_stats(p_m_stats,p_stats)-> pd.DataFrame:
    """
    Join player stats onto the per-player match rows.

    Only the identifying/outcome columns of ``p_m_stats`` are kept before the
    join. The join is a left join on ``account_id`` + ``hero_id``, so every
    match row survives; rows without a counterpart in ``p_stats`` get NaN in
    the stat columns.

    Parameters
    ----------
    p_m_stats : pd.DataFrame
        Match rows; must contain account_id, match_id, team, winning_team,
        win and hero_id (a missing column raises KeyError).
    p_stats : pd.DataFrame
        Stats keyed by account_id and hero_id.

    Returns
    -------
    pd.DataFrame
        The trimmed match rows with the columns of ``p_stats`` appended.
    """
    match_columns = ['account_id','match_id','team','winning_team','win','hero_id']
    join_keys = ['account_id', 'hero_id']

    trimmed_matches = p_m_stats[match_columns]
    return trimmed_matches.merge(p_stats, how='left', on=join_keys)

In [24]:
# Attach the stats to every player-match row (left join on account_id + hero_id).
p_m_stats = merge_match_player_stats(player_match_stats, all_stats)


In [None]:
# Persist the merged player-match table in the run folder (row index omitted).
p_m_stats.to_csv(f"{folder_name}/player_match_merged_stats.csv", index=False)

In [31]:
def create_std_team_stats(stats_df: pd.DataFrame) -> pd.DataFrame:
    """
    Aggregate player / player-hero stats into one row per (match_id, team).

    Every column listed in ``team_stats`` is reduced with min, max, mean and
    std, producing ``<col>_min``, ``<col>_max``, ``<col>_mean`` and
    ``<col>_std``. The ``win`` column is reduced with ``first`` and comes out
    as ``win_first``.

    Parameters
    ----------
    stats_df : pd.DataFrame
        Player-level rows. Must contain ``match_id``, ``team``, ``win`` and
        every column named in ``team_stats``. The frame is not modified.

    Returns
    -------
    pd.DataFrame
        ``match_id`` and ``team`` as regular columns followed by the flattened
        aggregate columns. When any ``team_stats`` column is missing, a
        *CRITICAL* message is printed and an empty DataFrame is returned.

    Notes
    -----
    ``std`` is the pandas groupby default (sample standard deviation), so a
    group with a single row yields NaN for every ``*_std`` column.
    """
    # Stat columns that are summarised per team.
    team_stats = [
        'p_total_matches_played', 'p_total_time_played',
        'p_win_rate', 'ph_matches_played', 'ph_time_played',
        'ph_kills_per_min', 'ph_deaths_per_min', 'ph_accuracy',
        'ph_total_kd', 'h_total_kd', 'ph_kd_ratio',
        'ph_hero_xp_ratio', 'ph_avg_match_length',
        'ph_avg_damage_per_match', 'h_damage_per_match', 'ph_damage_ratio',
        'ph_assists_ratio', 'ph_win_rate', 'h_total_win_rate',
        'ph_win_rate_ratio'
    ]

    # Bail out early (best-effort: report and return an empty frame) when the
    # input lacks any of the stat columns.
    missing_cols = [col for col in team_stats if col not in stats_df.columns]
    if missing_cols:
        print(f"*CRITICAL* Missing columns in team stats: {missing_cols}")
        return pd.DataFrame()

    # min / max / mean / std for every stat; the outcome flag is taken from
    # the first row of each group.
    agg_function = {col: ["min", "max", "mean", "std"] for col in team_stats}
    agg_function['win'] = 'first'

    # groupby/agg builds a new frame, so the input needs no defensive copy.
    phm_stats = stats_df.groupby(['match_id', 'team']).agg(agg_function)

    # Flatten the (column, statistic) MultiIndex into "<column>_<statistic>".
    phm_stats.columns = [f"{col}_{stat}" for col, stat in phm_stats.columns]

    return phm_stats.reset_index()

In [32]:
# Collapse the player rows to one row per (match_id, team) with min/max/mean/std of each stat.
team_stats = create_std_team_stats(p_m_stats)

In [None]:
# Persist the team-level aggregates in the run folder (row index omitted).
team_stats.to_csv(f"{folder_name}/team_match_stats.csv", index=False)

In [39]:
# Inspect the flattened aggregate column names.
team_stats.columns

Index(['match_id', 'team', 'p_total_matches_played_min',
       'p_total_matches_played_max', 'p_total_matches_played_mean',
       'p_total_matches_played_std', 'p_total_time_played_min',
       'p_total_time_played_max', 'p_total_time_played_mean',
       'p_total_time_played_std', 'p_win_rate_min', 'p_win_rate_max',
       'p_win_rate_mean', 'p_win_rate_std', 'ph_matches_played_min',
       'ph_matches_played_max', 'ph_matches_played_mean',
       'ph_matches_played_std', 'ph_time_played_min', 'ph_time_played_max',
       'ph_time_played_mean', 'ph_time_played_std', 'ph_kills_per_min_min',
       'ph_kills_per_min_max', 'ph_kills_per_min_mean', 'ph_kills_per_min_std',
       'ph_deaths_per_min_min', 'ph_deaths_per_min_max',
       'ph_deaths_per_min_mean', 'ph_deaths_per_min_std', 'ph_accuracy_min',
       'ph_accuracy_max', 'ph_accuracy_mean', 'ph_accuracy_std',
       'ph_total_kd_min', 'ph_total_kd_max', 'ph_total_kd_mean',
       'ph_total_kd_std', 'h_total_kd_min', 'h_total_kd_

In [40]:
def create_training_data(team_stat_base:pd.DataFrame) -> pd.DataFrame:
    """
    Reshape per-team stats into one wide row per match.

    The input is pivoted on ``team`` so every value column becomes
    ``<column>_<team>``. The ``win_first_Team0`` column is then exposed as
    ``team_0_win`` (placed last) and both ``win_first_*`` columns are removed.

    Parameters
    ----------
    team_stat_base : pd.DataFrame
        One row per (match_id, team); must include ``match_id``, ``team`` and
        ``win_first``, with team labels ``Team0`` and ``Team1``.

    Returns
    -------
    pd.DataFrame
        ``match_id`` as a regular column, the per-team stat columns, and
        ``team_0_win``.
    """
    wide = team_stat_base.pivot(index='match_id', columns='team')

    # Collapse the (stat, team) column pairs into single "stat_team" labels.
    wide.columns = [f'{stat}_{side}' for stat, side in wide.columns]

    # Keep Team0's outcome as the target and discard both raw outcome columns.
    team0_outcome = wide['win_first_Team0']
    wide = wide.drop(columns=['win_first_Team0', 'win_first_Team1'])
    wide['team_0_win'] = team0_outcome

    return wide.reset_index()

In [42]:
# Pivot the team stats to one row per match, save the result, and preview the first rows.
training_data = create_training_data(team_stats)
training_data.to_csv(f"{folder_name}/training_data.csv", index=False)
training_data.head(25)

Unnamed: 0,match_id,p_total_matches_played_min_Team0,p_total_matches_played_min_Team1,p_total_matches_played_max_Team0,p_total_matches_played_max_Team1,p_total_matches_played_mean_Team0,p_total_matches_played_mean_Team1,p_total_matches_played_std_Team0,p_total_matches_played_std_Team1,p_total_time_played_min_Team0,...,h_total_win_rate_std_Team1,ph_win_rate_ratio_min_Team0,ph_win_rate_ratio_min_Team1,ph_win_rate_ratio_max_Team0,ph_win_rate_ratio_max_Team1,ph_win_rate_ratio_mean_Team0,ph_win_rate_ratio_mean_Team1,ph_win_rate_ratio_std_Team0,ph_win_rate_ratio_std_Team1,team_0_win
0,38940475,247.0,424.0,1808.0,2763.0,1066.166667,1223.166667,687.332355,877.633504,448608.0,...,0.014128,0.85157,0.915505,1.302871,1.321095,1.085626,1.149169,0.150991,0.140791,Y
1,38940760,516.0,342.0,1507.0,1960.0,908.666667,1072.5,453.259161,570.094992,1006385.0,...,0.01016,0.818229,0.800804,1.210511,1.145595,1.064949,1.050936,0.158063,0.127707,Y
2,38941002,98.0,159.0,1764.0,2653.0,814.166667,1138.0,709.986596,980.663653,163466.0,...,0.014058,0.746536,0.957502,1.197107,1.262176,1.009235,1.145249,0.192616,0.119797,N
3,38941052,212.0,900.0,1436.0,2437.0,780.833333,1520.333333,443.938472,522.606417,395025.0,...,0.01516,0.926725,0.99208,1.476725,1.302811,1.149936,1.136395,0.215652,0.116517,N
4,38941055,599.0,311.0,3152.0,1858.0,1805.166667,1230.5,860.974661,552.85215,1080913.0,...,0.018715,0.989314,0.964458,1.493073,1.228561,1.216439,1.05419,0.184862,0.103054,Y
5,38941174,88.0,64.0,1960.0,1801.0,891.833333,670.666667,691.075804,617.632793,159576.0,...,0.011201,1.037885,1.015716,1.679707,1.591,1.211674,1.17309,0.235401,0.211542,Y
6,38941189,316.0,696.0,2008.0,1656.0,1122.0,1162.666667,691.584557,368.317074,552051.0,...,0.023174,0.929432,0.85572,1.436253,1.791687,1.112532,1.27323,0.180449,0.308682,N
7,38941435,526.0,155.0,3332.0,1784.0,1896.666667,1169.333333,1086.280013,615.239357,987610.0,...,0.015127,0.447922,0.94771,1.308971,1.991161,0.973048,1.221394,0.284295,0.387569,N
8,38941796,246.0,211.0,1355.0,3320.0,940.666667,1654.5,379.11089,1014.204467,458696.0,...,0.030764,0.716818,0.895844,1.866167,1.221523,1.185963,1.061117,0.379159,0.113953,N
9,38941925,579.0,621.0,1824.0,2339.0,1132.666667,1379.333333,498.528501,808.848729,1178541.0,...,0.026563,0.951754,0.930522,1.160833,1.679707,1.040312,1.139099,0.072793,0.27529,N


In [47]:
# Inspect the wide per-match column names.
training_data.columns

Index(['match_id', 'p_total_matches_played_min_Team0',
       'p_total_matches_played_min_Team1', 'p_total_matches_played_max_Team0',
       'p_total_matches_played_max_Team1', 'p_total_matches_played_mean_Team0',
       'p_total_matches_played_mean_Team1', 'p_total_matches_played_std_Team0',
       'p_total_matches_played_std_Team1', 'p_total_time_played_min_Team0',
       ...
       'h_total_win_rate_std_Team1', 'ph_win_rate_ratio_min_Team0',
       'ph_win_rate_ratio_min_Team1', 'ph_win_rate_ratio_max_Team0',
       'ph_win_rate_ratio_max_Team1', 'ph_win_rate_ratio_mean_Team0',
       'ph_win_rate_ratio_mean_Team1', 'ph_win_rate_ratio_std_Team0',
       'ph_win_rate_ratio_std_Team1', 'team_0_win'],
      dtype='object', length=162)

In [50]:
def create_differential_training_data(team_stat_base:pd.DataFrame) -> pd.DataFrame:
    """
    Turn paired per-team columns into Team0 - Team1 differentials.

    For every ``<name>_Team0`` column that has a matching ``<name>_Team1``
    column, a ``<name>_diff`` column holding ``Team0 - Team1`` rounded to
    3 decimals is produced. Columns without a ``_Team0`` / ``_Team1`` suffix
    are carried over unchanged and come first; all suffixed columns are left
    out of the result. The returned frame has a fresh 0..n-1 index.
    """
    team0_suffix, team1_suffix = '_Team0', '_Team1'
    all_columns = team_stat_base.columns

    # Base names of the stats that have a Team0 column.
    stems = [
        name[:-len(team0_suffix)]
        for name in all_columns
        if name.endswith(team0_suffix)
    ]

    # Team0 minus Team1 for every stem that also has a Team1 column.
    differentials = {
        f'{stem}_diff': (
            team_stat_base[f'{stem}{team0_suffix}'] - team_stat_base[f'{stem}{team1_suffix}']
        ).round(3)
        for stem in stems
        if f'{stem}{team1_suffix}' in all_columns
    }

    # Columns that belong to neither team are passed through as-is.
    shared_columns = [
        name for name in all_columns
        if not name.endswith((team0_suffix, team1_suffix))
    ]

    return team_stat_base[shared_columns].assign(**differentials).reset_index(drop=True)

In [51]:
# Convert the per-team columns to Team0 - Team1 differentials, save the result, and preview the first rows.
dif_training_data = create_differential_training_data(training_data)
dif_training_data.to_csv(f"{folder_name}/differential_training_data.csv", index=False)
dif_training_data.head(25)

Unnamed: 0,match_id,team_0_win,p_total_matches_played_min_diff,p_total_matches_played_max_diff,p_total_matches_played_mean_diff,p_total_matches_played_std_diff,p_total_time_played_min_diff,p_total_time_played_max_diff,p_total_time_played_mean_diff,p_total_time_played_std_diff,...,ph_win_rate_mean_diff,ph_win_rate_std_diff,h_total_win_rate_min_diff,h_total_win_rate_max_diff,h_total_win_rate_mean_diff,h_total_win_rate_std_diff,ph_win_rate_ratio_min_diff,ph_win_rate_ratio_max_diff,ph_win_rate_ratio_mean_diff,ph_win_rate_ratio_std_diff
0,38940475,Y,-177.0,-955.0,-157.0,-190.301,-350396.0,-1635937.0,-336308.667,-382558.404,...,-0.05,0.015,-0.035,-0.0,-0.017,0.013,-0.064,-0.018,-0.064,0.01
1,38940760,Y,174.0,-453.0,-163.833,-116.836,329143.0,-981687.0,-316416.333,-260725.24,...,-0.0,0.036,-0.048,-0.003,-0.009,0.017,0.017,0.065,0.014,0.03
2,38941002,N,-61.0,-889.0,-323.833,-270.677,-112643.0,-1794959.0,-671030.833,-585358.424,...,-0.076,0.034,-0.026,0.007,-0.009,0.012,-0.211,-0.065,-0.136,0.073
3,38941052,N,-688.0,-1001.0,-739.5,-78.668,-1379946.0,-1737812.0,-1384367.333,-59801.308,...,-0.002,0.057,-0.035,-0.0,-0.008,0.012,-0.065,0.174,0.014,0.099
4,38941055,Y,288.0,1294.0,574.667,308.123,500000.0,2092393.0,1021651.5,578057.735,...,0.055,0.023,-0.026,-0.004,-0.02,0.008,0.025,0.265,0.162,0.082
5,38941174,Y,24.0,159.0,221.167,73.443,57002.0,397977.0,397798.833,122502.613,...,0.004,-0.01,-0.042,-0.0,-0.01,0.017,0.022,0.089,0.039,0.024
6,38941189,N,-380.0,352.0,-40.667,323.267,-687686.0,552089.0,-84264.833,552261.357,...,-0.075,-0.036,0.022,0.009,-0.0,-0.006,0.074,-0.355,-0.161,-0.128
7,38941435,N,371.0,1548.0,727.333,471.041,697916.0,2660857.0,1307578.333,791535.185,...,-0.128,-0.037,-0.026,0.001,-0.01,0.013,-0.5,-0.682,-0.248,-0.103
8,38941796,N,35.0,-1965.0,-713.833,-635.094,52031.0,-3681128.0,-1317333.0,-1182391.093,...,0.054,0.104,0.022,-0.014,-0.003,-0.016,-0.179,0.645,0.125,0.265
9,38941925,N,-42.0,-515.0,-246.667,-310.32,68579.0,-988780.0,-433929.667,-615924.543,...,-0.047,-0.057,0.022,-0.014,-0.003,-0.01,0.021,-0.519,-0.099,-0.202
