In [1]:
import json
import logging
import os
import sys
import time
from datetime import timedelta, datetime, timezone
from typing import Optional, Dict, Any, Iterable, List
from urllib.parse import urlencode

import numpy as np
import pandas as pd
import requests

from data.fetch_data import bulk_fetch_matches

# Configure root logging, then create this notebook's logger.
# The logger is bound to `logger` rather than `logging`: rebinding `logging`
# would shadow the stdlib module, so re-running this cell would fail on
# `logging.basicConfig` (a Logger has no such attribute). Module-level helpers
# such as `logging.info(...)` keep working because the module stays in scope.
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)


In [2]:
# Date window for this run; both values are interpolated into folder_name below.
start_date = "2025-08-19"
end_date = "2025-08-21"
# Relative directory that the following cells read CSVs from and write CSVs to.
folder_name = f"v2_data//pred_data//test_pred_v2_{start_date}_{end_date}"

In [None]:
# Load the input tables for this run from CSV files under folder_name.
# Initial data set split into players & matches. match_id + account_id = PK
raw_players = pd.read_csv(f"{folder_name}/raw_players.csv")
raw_matches = pd.read_csv(f"{folder_name}/batch_matches.csv")

# Player hero stats across the lifetime of games played
player_hero_stats = pd.read_csv(f"{folder_name}/player_hero_stats.csv")

# Aggregated stats for each player based on p_h_stats
# NOTE(review): a later cell rebuilds player_match_stats from raw_players and
# raw_matches and writes this same CSV; in a folder where that cell has never
# run, this read presumably fails — confirm whether this load is still needed.
player_match_stats = pd.read_csv(f"{folder_name}/player_match_stats.csv")

# Raw hero stats for badge 100+ based on end_date
hero_stats = pd.read_csv(f"{folder_name}/hero_stats.csv")

In [None]:
def prep_player_match_stats(raw_players: pd.DataFrame, raw_matches: pd.DataFrame) -> pd.DataFrame:
    """Attach each match's winning team to its player rows and flag wins.

    Args:
        raw_players: Player rows; must contain ``match_id`` and ``team``.
        raw_matches: Match rows; must contain ``match_id`` and ``winning_team``.

    Returns:
        A new DataFrame with the columns of ``raw_players`` plus
        ``winning_team`` and ``win`` ('Y' when ``team`` equals
        ``winning_team``, otherwise 'N'), minus any of the columns listed in
        ``drop_columns`` that are present. Neither input is modified.
    """
    # Left join keeps every player row; when a match_id is absent from
    # raw_matches, winning_team is NaN and the row is flagged 'N' below.
    player_match_stats = raw_players.merge(
        raw_matches[['match_id', 'winning_team']],
        on='match_id',
        how='left',
    )

    # Vectorized comparison instead of a per-row Python lambda. Missing values
    # never compare equal, so they map to 'N' exactly as the row-wise test did.
    is_win = player_match_stats['team'].eq(player_match_stats['winning_team'])
    player_match_stats['win'] = (
        is_win.fillna(False).astype(bool).map({True: 'Y', False: 'N'})
    )

    drop_columns = [
        'kills', 'deaths', 'assists', 'denies', 'net_worth', 'Unnamed: 0'
    ]

    # errors='ignore' skips any listed column that is not in the frame.
    return player_match_stats.drop(columns=drop_columns, errors='ignore')

In [None]:
def merge_pm_h_p_stats(p_m_stats: pd.DataFrame, ph_stats: pd.DataFrame, h_stats: pd.DataFrame) -> pd.DataFrame:
    """Join player-match rows with hero stats and player-hero stats.

    Hero-stat columns get the prefix ``h_`` and player-hero columns get the
    prefix ``ph_``. The identifier columns (``hero_id``, ``account_id``,
    ``match_id``, ``team``, ``winning_team``, ``win``) are renamed back to
    their un-prefixed names.

    Args:
        p_m_stats: Player-match rows; must contain ``hero_id`` and ``account_id``.
        ph_stats: Player-hero stats; must contain ``account_id``.
        h_stats: Hero stats; must contain ``hero_id``.

    Returns:
        A new DataFrame produced by two inner joins: ``p_m_stats`` with the
        hero stats on ``hero_id``, then with the player-hero stats on
        ``account_id`` plus ``hero_id`` / ``match_id`` when both sides carry
        them. The inputs are not modified.
    """
    # add_prefix/rename return new frames, so no defensive copies are needed.
    h_prefixed = h_stats.add_prefix('h_').rename(columns={'h_hero_id': 'hero_id'})

    ph_prefixed = ph_stats.add_prefix('ph_').rename(columns={
        'ph_hero_id': 'hero_id', 'ph_account_id': 'account_id',
        'ph_match_id': 'match_id', 'ph_team': 'team',
        'ph_winning_team': 'winning_team', 'ph_win': 'win'})

    merged = pd.merge(p_m_stats, h_prefixed, on="hero_id", suffixes=("h_", ""))

    # Join on every identifier both frames share. Joining on account_id alone
    # would pair each match row with every hero the player has stats for and
    # would rename the colliding hero_id columns, leaving no 'hero_id' column.
    join_keys = ["account_id"] + [
        key for key in ("hero_id", "match_id")
        if key in merged.columns and key in ph_prefixed.columns
    ]
    # NOTE(review): the suffixes are kept as originally written; they are only
    # applied to non-key columns that exist on both sides.
    merged = pd.merge(merged, ph_prefixed, on=join_keys, suffixes=("h_", "ph_"))

    return merged

In [None]:
def _safe_ratio(numerator, denominator):
    """Element-wise ``numerator / denominator``, with 0 wherever the denominator is 0."""
    return np.where(denominator == 0, 0, numerator / denominator)


def calculate_ph_stats(p_ph_h_stats: pd.DataFrame) -> pd.DataFrame:
    """
    Add per-hero averages and player-vs-hero ratio columns to a copy of the input.

    Per-hero averages are the mean of a column over all rows sharing the same
    ``hero_id``, broadcast back onto every row. Every ratio is set to 0 where
    its denominator is 0.

    Args:
        p_ph_h_stats: Frame containing ``hero_id``, ``kills``, ``deaths``,
            ``assists``, ``time_played``, ``damage_per_min``,
            ``matches_played``, ``wins`` and ``h_total_kd``.

    Returns:
        A new DataFrame with the added ``ph_*`` / ``h_*`` columns, in which
        ``wins``, ``kills``, ``deaths``, ``assists``, ``damage_per_min`` and
        ``time_played`` are renamed with a ``ph_`` prefix. The input is not
        modified.

    Raises:
        KeyError: If any required column is missing; the message lists all of them.
    """
    required = (
        'hero_id', 'kills', 'deaths', 'assists', 'time_played',
        'damage_per_min', 'matches_played', 'wins', 'h_total_kd',
    )
    missing = [col for col in required if col not in p_ph_h_stats.columns]
    if missing:
        raise KeyError(f"calculate_ph_stats: missing required column(s): {missing}")

    ph_stats = p_ph_h_stats.copy()

    def hero_mean(column):
        # Mean of `column` within each hero_id, aligned to the original rows.
        return ph_stats.groupby('hero_id')[column].transform("mean")

    ph_stats['ph_total_kd'] = _safe_ratio(ph_stats['kills'], ph_stats['deaths'])

    # NOTE(review): this is hero / player, whereas the ratios below are
    # player / hero — confirm the direction is intended.
    ph_stats['ph_kd_ratio'] = _safe_ratio(ph_stats['h_total_kd'], ph_stats['ph_total_kd'])

    ph_stats['h_avg_total_time_played'] = hero_mean('time_played')
    ph_stats['ph_time_played_ratio'] = _safe_ratio(
        ph_stats['time_played'], ph_stats['h_avg_total_time_played'])

    ph_stats['h_total_damage_per_min'] = hero_mean('damage_per_min')
    ph_stats['ph_damage_per_min_ratio'] = _safe_ratio(
        ph_stats['damage_per_min'], ph_stats['h_total_damage_per_min'])

    ph_stats['h_total_assists'] = hero_mean('assists')
    ph_stats['ph_assists_ratio'] = _safe_ratio(
        ph_stats['assists'], ph_stats['h_total_assists'])

    ph_stats['ph_win_rate'] = _safe_ratio(ph_stats['wins'], ph_stats['matches_played'])
    # The per-hero win rate is averaged from the player win rate computed above.
    ph_stats['h_total_win_rate'] = hero_mean('ph_win_rate')
    ph_stats['ph_win_rate_ratio'] = _safe_ratio(
        ph_stats['ph_win_rate'], ph_stats['h_total_win_rate'])

    # Rename last: the calculations above read the un-prefixed column names.
    return ph_stats.rename(columns={
        "wins": "ph_wins",
        "kills": "ph_kills",
        "deaths": "ph_deaths",
        "assists": "ph_assists",
        "damage_per_min": "ph_damage_per_min",
        'time_played': 'ph_time_played'
    })

def create_hero_stats(player_hero_stats: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of ``player_hero_stats`` with an ``h_total_kd`` column.

    ``h_total_kd`` is the mean of ``ph_total_kd`` over all rows that share the
    same ``hero_id``, broadcast back onto each row, so the row count is
    unchanged and the input frame is left untouched.
    """
    per_hero_mean_kd = (
        player_hero_stats
        .groupby('hero_id')['ph_total_kd']
        .transform("mean")
    )
    return player_hero_stats.assign(h_total_kd=per_hero_mean_kd)

def create_player_stats(player_match_stats: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of ``player_match_stats`` as the basis for player stats.

    No aggregation is applied here yet; the copy is returned unchanged.
    TODO: add the player-level aggregations.

    Args:
        player_match_stats: Frame to copy; it is not modified.

    Returns:
        An independent copy of ``player_match_stats``.
    """
    player_stats = player_match_stats.copy()

    # Return explicitly so callers receive a DataFrame, as annotated, not None.
    return player_stats

In [None]:
# Build the player-match frame from the raw inputs and persist it as CSV.
player_match_stats = prep_player_match_stats(raw_players, raw_matches)
player_match_stats.to_csv(f"{folder_name}/player_match_stats.csv", index=False)

In [None]:
# Join the player-match rows with the player-hero stats and the hero stats,
# then persist the result as CSV.
# merge_pm_h_p_stats takes three frames (p_m_stats, ph_stats, h_stats); keyword
# arguments keep each frame bound to the parameter it is meant for.
ph_h_p_stats = merge_pm_h_p_stats(
    p_m_stats=player_match_stats,
    ph_stats=player_hero_stats,
    h_stats=hero_stats,
)
ph_h_p_stats.to_csv(f"{folder_name}/ph_h_p_stats.csv", index=False)

In [None]:
# Derive the per-player-hero ratio columns from the merged frame.
# NOTE(review): calculate_ph_stats indexes un-prefixed columns such as 'kills'
# and 'deaths', while merge_pm_h_p_stats adds a 'ph_' prefix to the player-hero
# columns and prep_player_match_stats drops 'kills'/'deaths'/'assists' —
# confirm ph_h_p_stats carries the column names expected here.
calc_stats = calculate_ph_stats(ph_h_p_stats)