In [1]:
import requests
import json
import os
import sys
import pandas as pd
import logging
from urllib.parse import urlencode
import time
from datetime import timedelta, datetime, timezone
from data.fetch_data import bulk_fetch_matches
from typing import Optional, Dict, Any, Iterable, List

# Configure root logging once, then create this notebook's logger under its
# own name. Binding the logger to the name `logging` would shadow the stdlib
# module, breaking any later `logging.getLogger(...)` / `logging.DEBUG` lookup.
# Existing calls such as `logging.debug(...)` keep working: they resolve to the
# module-level helpers, which log through the root logger configured here.
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)


In [2]:
# Date window for this run; both values are interpolated into the data folder name.
start_date = "2025-08-19"
end_date = "2025-08-21"
# Folder that the CSV reads and writes in this notebook are built from.
# NOTE(review): the doubled "//" separators look unintentional — confirm before changing.
folder_name = f"v2_data//pred_data//test_pred_v2_{start_date}_{end_date}"

In [16]:
# Load the input tables for this run from the CSV files under `folder_name`.

# Player <> match linking table
player_matches = pd.read_csv(f"{folder_name}/player_matches.csv")

# Player stats across the lifetime of the player (read from p_stats.csv)
player_stats = pd.read_csv(f"{folder_name}/p_stats.csv")

# Player-hero stats across the lifetime of games played
player_hero_stats = pd.read_csv(f"{folder_name}/player_hero_stats.csv")

# Raw hero stats for badge 100+, based on end_date
hero_stats = pd.read_csv(f"{folder_name}/hero_stats.csv")

In [17]:
def check_unique_naming(p_h_stats=None, p_stats=None, h_stats=None
                        ) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Prefix the non-key columns of each stats table so names stay unique after merging.

    Columns named 'account_id', 'hero_id' or 'match_id' are never renamed, and a
    column that already starts with the table's prefix is left as is, so calling
    this twice on the same frames does not double-prefix them.

    Args:
        p_h_stats: player-hero stats frame; columns get the 'ph_' prefix.
        p_stats: player stats frame; columns get the 'p_' prefix.
        h_stats: hero stats frame; columns get the 'h_' prefix.

    Returns:
        (p_h_stats, p_stats, h_stats) as renamed copies, in that order. Any
        argument passed as None comes back as None.
    """
    key_columns = ('account_id', 'hero_id', 'match_id')

    def with_prefix(frame, prefix):
        # A missing table is passed straight through.
        if frame is None:
            return None
        renames = {}
        for name in frame.columns:
            if name in key_columns or name.startswith(prefix):
                continue
            renames[name] = f"{prefix}{name}"
        return frame.rename(columns=renames)

    return (
        with_prefix(p_h_stats, 'ph_'),
        with_prefix(p_stats, 'p_'),
        with_prefix(h_stats, 'h_'),
    )


In [18]:
# Prefix the non-key columns of each table (ph_ / p_ / h_) before merging.
# Argument and result order is player-hero stats, player stats, hero stats.
player_hero_stats, player_stats, hero_stats = check_unique_naming(player_hero_stats, player_stats, hero_stats)

In [None]:
def merge_stats(player_stats, player_hero_stats, hero_stats)->pd.DataFrame:
    """
    Join player, player-hero and hero stats into one wide frame.

    Player stats are merged with player-hero stats on 'account_id', and the
    result is merged with hero stats on 'hero_id'. Both merges use pandas'
    default join, so rows without a match on either key are dropped.

    Args:
        player_stats: frame with an 'account_id' column.
        player_hero_stats: frame with 'account_id' and 'hero_id' columns.
        hero_stats: frame with a 'hero_id' column.

    Returns:
        The merged frame with 'hero_id' moved to the second column position.
    """
    combined = (
        player_stats
        .merge(player_hero_stats, on='account_id')
        .merge(hero_stats, on='hero_id')
    )

    # Put hero_id at index 1 and keep every other column in its merged order.
    ordered = [name for name in combined.columns if name != 'hero_id']
    ordered.insert(1, 'hero_id')
    return combined[ordered]

In [24]:
# Build the combined player / player-hero / hero table from the prefixed frames.
p_ph_h_stats = merge_stats(player_stats, player_hero_stats, hero_stats)

In [32]:
# Save the merged table into the same folder as the inputs, without the row index.
p_ph_h_stats.to_csv(f"{folder_name}/p_ph_h_stats.csv", index=False)

In [None]:
# Outdated function that handled rename and merge in one step; now split into check_unique_naming and merge_stats.
# def merge_pm_h_p_stats(p_m_stats, ph_stats, h_stats,)-> pd.DataFrame:
#     """merges hero_stats on df, where hero_id=hero_id, adds suffix 'h_' to hero_stats columns"""

#     h_stats_copy = h_stats.copy()
#     ph_stats_copy = ph_stats.copy()
#     p_m_stats_copy = p_m_stats.copy()

#     h_stats_copy = h_stats_copy.rename(columns={'h_hero_id': 'hero_id'})

#     ph_stats_copy = ph_stats_copy.add_prefix('ph_')
#     ph_stats_copy = ph_stats_copy.rename(columns={
#         'ph_hero_id': 'hero_id',
#         'ph_account_id': 'account_id',
#         'ph_match_id':'match_id','ph_team':'team',
#         'ph_winning_team':'winning_team','ph_win':'win'})

#     merged = pd.merge(p_m_stats_copy, h_stats_copy, on="hero_id", suffixes=("", ""))
#     print(f'merged: \n\n{merged}')

#     all_merged = pd.merge(merged, ph_stats_copy, on=['account_id', 'hero_id'], how='left')

#     return all_merged

#     merged = pd.merge(merged, ph_stats_copy, on=['account_id', 'hero_id'], suffixes=("", ""))

In [34]:
# Inspect the column names of the merged frame.
p_ph_h_stats.columns

Index(['account_id', 'hero_id', 'p_total_matches_played', 'p_total_kills',
       'p_total_deaths', 'p_total_wins', 'p_total_assists',
       'p_total_time_played', 'p_avg_kills', 'p_win_rate', 'ph_matches_played',
       'ph_last_played', 'ph_time_played', 'ph_wins', 'ph_ending_level',
       'ph_kills', 'ph_deaths', 'ph_assists', 'ph_denies_per_match',
       'ph_kills_per_min', 'ph_deaths_per_min', 'ph_assists_per_min',
       'ph_denies_per_min', 'ph_networth_per_min', 'ph_last_hits_per_min',
       'ph_damage_per_min', 'ph_damage_per_soul',
       'ph_damage_mitigated_per_min', 'ph_damage_taken_per_min',
       'ph_damage_taken_per_soul', 'ph_creeps_per_min',
       'ph_obj_damage_per_min', 'ph_obj_damage_per_soul', 'ph_accuracy',
       'ph_crit_shot_rate', 'h_bucket', 'h_wins', 'h_losses', 'h_matches',
       'h_matches_per_bucket', 'h_players', 'h_total_kills', 'h_total_deaths',
       'h_total_assists', 'h_total_net_worth', 'h_total_last_hits',
       'h_total_denies', 'h_tota

In [42]:
# Scratch copy, so the experiments on `test` do not modify p_ph_h_stats.
test = p_ph_h_stats.copy()
# NOTE(review): numpy is imported here rather than in the top import cell — consider moving it.
import numpy as np

In [43]:
# Average match length per player-hero row: total time played divided by
# matches played, then by 60 (presumably seconds -> minutes; TODO confirm units).
# The guard tests the denominator (ph_matches_played): guarding on
# ph_time_played let rows with zero matches but non-zero time produce inf.
# Rows with zero time and non-zero matches still evaluate to 0, as before.
test['ph_avg_match_length'] = np.where(test['ph_matches_played'] == 0, 0,
                                        test['ph_time_played'] / test['ph_matches_played'] / 60)
test['ph_avg_match_length']

0        31.747236
1        38.366667
2        30.396667
3        31.863842
4        32.718333
           ...    
51500    24.100000
51501    20.656667
51502    27.816667
51503    25.166667
51504    31.204762
Name: ph_avg_match_length, Length: 51505, dtype: float64

In [44]:
# Average damage per match: damage per minute multiplied by the average match
# length computed above. Rows whose average match length is 0 are set to 0.
test['ph_avg_damage_per_match'] = np.where(test['ph_avg_match_length'] == 0, 0,
                                        test['ph_damage_per_min'] * test['ph_avg_match_length'])
test['ph_avg_damage_per_match']

0        23034.805760
1        27868.000000
2        18392.081920
3        29362.850935
4        25468.398393
             ...     
51500    29653.000000
51501    22989.599596
51502    28739.817871
51503    28493.000000
51504    26029.407662
Name: ph_avg_damage_per_match, Length: 51505, dtype: float64

In [51]:
def calculate_ph_stats(p_ph_h_stats: pd.DataFrame) -> pd.DataFrame:
    """
    Add derived player-hero ('ph_') and hero ('h_') ratio columns to a copy of the merged frame.

    Every division is guarded on its denominator: where the denominator is 0
    the result is 0 rather than inf/NaN. Earlier revisions guarded several
    ratios on the numerator, which let a zero hero-side denominator (or zero
    matches played with non-zero time played) leak inf into the output.

    Args:
        p_ph_h_stats: merged stats frame. Must contain the columns
            ph_kills, ph_deaths, ph_assists, ph_wins, ph_matches_played,
            ph_time_played, ph_damage_per_min, h_total_kills, h_total_deaths,
            h_total_assists, h_total_player_damage, h_matches and h_wins.

    Returns:
        A copy of the input with these columns added: ph_total_kd, h_total_kd,
        ph_kd_ratio, ph_hero_xp_ratio, ph_avg_match_length,
        ph_avg_damage_per_match, h_damage_per_match, ph_damage_ratio,
        ph_assists_ratio, ph_win_rate, h_total_win_rate, ph_win_rate_ratio.
        The input frame is not modified.

    Raises:
        KeyError: if a required column is missing.
    """
    import numpy as np

    def _safe_ratio(numerator, denominator):
        """Element-wise numerator / denominator, with 0 wherever the denominator is 0."""
        # np.where evaluates the quotient for every row, so silence the
        # divide-by-zero warnings for rows that the mask replaces anyway.
        with np.errstate(divide='ignore', invalid='ignore'):
            return np.where(denominator == 0, 0, numerator / denominator)

    all_stats = p_ph_h_stats.copy()

    # player_hero kd for the player<>hero combo
    all_stats['ph_total_kd'] = _safe_ratio(all_stats['ph_kills'], all_stats['ph_deaths'])

    # all 100 badge hero stats, total kd - used to compare player performance with hero
    all_stats['h_total_kd'] = _safe_ratio(all_stats['h_total_kills'], all_stats['h_total_deaths'])

    # compares player_hero kd to average hero kd - player skill with hero
    all_stats['ph_kd_ratio'] = _safe_ratio(all_stats['ph_total_kd'], all_stats['h_total_kd'])

    # matches the player played with the hero, relative to all matches played with the hero
    all_stats['ph_hero_xp_ratio'] = _safe_ratio(all_stats['ph_matches_played'], all_stats['h_matches'])

    # damage per match ratio, player_hero to hero
    # (time played / matches / 60: presumably seconds -> minutes; TODO confirm units)
    all_stats['ph_avg_match_length'] = _safe_ratio(all_stats['ph_time_played'],
                                                   all_stats['ph_matches_played']) / 60
    # Multiplication only, so no division guard is needed; kept as in the
    # original so rows with zero match length are forced to 0.
    all_stats['ph_avg_damage_per_match'] = np.where(
        all_stats['ph_avg_match_length'] == 0, 0,
        all_stats['ph_damage_per_min'] * all_stats['ph_avg_match_length'])
    all_stats['h_damage_per_match'] = _safe_ratio(all_stats['h_total_player_damage'],
                                                  all_stats['h_matches'])
    all_stats['ph_damage_ratio'] = _safe_ratio(all_stats['ph_avg_damage_per_match'],
                                               all_stats['h_damage_per_match'])

    # player_hero ratios
    all_stats['ph_assists_ratio'] = _safe_ratio(all_stats['ph_assists'], all_stats['h_total_assists'])
    all_stats['ph_win_rate'] = _safe_ratio(all_stats['ph_wins'], all_stats['ph_matches_played'])
    all_stats['h_total_win_rate'] = _safe_ratio(all_stats['h_wins'], all_stats['h_matches'])
    all_stats['ph_win_rate_ratio'] = _safe_ratio(all_stats['ph_win_rate'], all_stats['h_total_win_rate'])

    return all_stats

In [52]:
# Compute the derived ratio columns on the merged stats table.
all_stats = calculate_ph_stats(p_ph_h_stats)

In [53]:
# Inspect the column names after the derived stats were added.
all_stats.columns

Index(['account_id', 'hero_id', 'p_total_matches_played', 'p_total_kills',
       'p_total_deaths', 'p_total_wins', 'p_total_assists',
       'p_total_time_played', 'p_avg_kills', 'p_win_rate', 'ph_matches_played',
       'ph_last_played', 'ph_time_played', 'ph_wins', 'ph_ending_level',
       'ph_kills', 'ph_deaths', 'ph_assists', 'ph_denies_per_match',
       'ph_kills_per_min', 'ph_deaths_per_min', 'ph_assists_per_min',
       'ph_denies_per_min', 'ph_networth_per_min', 'ph_last_hits_per_min',
       'ph_damage_per_min', 'ph_damage_per_soul',
       'ph_damage_mitigated_per_min', 'ph_damage_taken_per_min',
       'ph_damage_taken_per_soul', 'ph_creeps_per_min',
       'ph_obj_damage_per_min', 'ph_obj_damage_per_soul', 'ph_accuracy',
       'ph_crit_shot_rate', 'h_bucket', 'h_wins', 'h_losses', 'h_matches',
       'h_matches_per_bucket', 'h_players', 'h_total_kills', 'h_total_deaths',
       'h_total_assists', 'h_total_net_worth', 'h_total_last_hits',
       'h_total_denies', 'h_tota

In [54]:
# Save the enriched table into the same folder as the inputs, without the row index.
all_stats.to_csv(f"{folder_name}/all_stats.csv", index=False)