In [2]:
import requests
import json
import os
import sys
import pandas as pd
import logging
from urllib.parse import urlencode
import time
from datetime import timedelta, datetime, timezone
from data.fetch_data import bulk_fetch_matches
from typing import Optional, Dict, Any, Iterable, List

logging.basicConfig(level=logging.DEBUG)
# Keep the stdlib module reachable as `logging`: binding the logger to that
# same name would shadow the module, so later `logging.getLogger(...)`,
# `logging.INFO`, etc. would fail after this cell has run.
logger = logging.getLogger(__name__)


In [4]:
# Date window for this run; both dates are embedded in the data folder name.
start_date, end_date = "2025-08-19", "2025-08-21"

# Folder that the CSV inputs for this run are read from.
folder_name = f"v2_data//pred_data//test_pred_v2_{start_date}_{end_date}"

In [None]:
# Load the input tables for this run from the CSV files in `folder_name`.
# Initial data set, split into players & matches; match_id + account_id = PK.
raw_players = pd.read_csv(f"{folder_name}/raw_players.csv")
raw_matches = pd.read_csv(f"{folder_name}/batch_matches.csv")

# Player stats across the lifetime of the player.
player_stats = pd.read_csv(f"{folder_name}/player_stats.csv")

# Player-hero stats across the lifetime of games played.
player_hero_stats = pd.read_csv(f"{folder_name}/player_hero_stats.csv")

# Aggregated stats for each player, based on the player-hero stats (p_h_stats).
player_match_stats = pd.read_csv(f"{folder_name}/player_match_stats.csv")

# Raw hero stats for badge 100+, based on end_date.
hero_stats = pd.read_csv(f"{folder_name}/hero_stats.csv")

In [4]:
def check_unique_naming(p_h_stats=None,
                        p_stats=None, 
                        h_stats=None
                        ) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Tag the non-key columns of each table with a prefix naming that table.

    Player-hero columns get ``ph_``, player columns get ``p_`` and hero
    columns get ``h_``. The key columns ``account_id``, ``hero_id`` and
    ``match_id`` are never renamed, and a column that already starts with
    its table's prefix is left as it is.

    Args:
        p_h_stats: player-hero stats frame, or None to skip it.
        p_stats: player stats frame, or None to skip it.
        h_stats: hero stats frame, or None to skip it.

    Returns:
        ``(p_h_stats, p_stats, h_stats)`` with renamed columns. The inputs are
        not modified; a table passed as None comes back as None.
    """
    shared_keys = ('account_id', 'hero_id', 'match_id')

    def tagged(frame, tag):
        # A table that was not supplied is passed straight through.
        if frame is None:
            return None
        renames = {}
        for name in frame.columns:
            if name in shared_keys or name.startswith(tag):
                continue
            renames[name] = f"{tag}{name}"
        return frame.rename(columns=renames)

    return tagged(p_h_stats, 'ph_'), tagged(p_stats, 'p_'), tagged(h_stats, 'h_')


In [None]:
def merge_stats(player_stats, player_hero_stats, hero_stats)->pd.DataFrame:
    """
    Join player stats, player-hero stats and hero stats into one frame.

    The first join matches ``player_stats`` to ``player_hero_stats`` on the
    player key. ``'player_id'`` is used when both frames carry it; otherwise
    the join falls back to ``'account_id'``, the player key used by the other
    helpers in this notebook. Overlapping columns from that join get the
    suffixes ``_player`` / ``_player_hero``.

    The second join attaches ``hero_stats`` on ``'hero_id'``; overlapping hero
    columns get the suffix ``_hero``.

    Args:
        player_stats: one row per player.
        player_hero_stats: one row per player/hero pair; must contain 'hero_id'.
        hero_stats: one row per hero; must contain 'hero_id'.

    Returns:
        The merged frame (pandas' default inner joins).

    Raises:
        KeyError: if the two player frames share neither 'player_id' nor
            'account_id'.
    """
    def has_key(frame, key):
        # DataFrame.merge accepts both column labels and index level names.
        return key in frame.columns or key in frame.index.names

    player_key = next(
        (key for key in ('player_id', 'account_id')
         if has_key(player_stats, key) and has_key(player_hero_stats, key)),
        None,
    )
    if player_key is None:
        raise KeyError(
            "player_stats and player_hero_stats share neither 'player_id' nor 'account_id'"
        )

    p_ph_stats = player_stats.merge(
        player_hero_stats, on=player_key, suffixes=('_player', '_player_hero')
    )
    return p_ph_stats.merge(hero_stats, on='hero_id', suffixes=('', '_hero'))

In [47]:
def merge_pm_h_p_stats(p_m_stats, ph_stats, h_stats,)-> pd.DataFrame:
    """
    Attach hero stats and player-hero stats to the player-match rows.

    Steps:
      1. ``h_stats``: the column ``h_hero_id`` (if present) is renamed to
         ``hero_id`` so it can be used as the join key.
      2. ``ph_stats``: every column gets a ``ph_`` prefix, then the key/label
         columns ``hero_id``, ``account_id``, ``match_id``, ``team``,
         ``winning_team`` and ``win`` get their plain names back.
      3. ``p_m_stats`` is joined with the hero stats on ``hero_id`` (pandas'
         default inner join). Both suffixes are empty, so pandas raises
         ValueError if the two frames share any non-key column name.
      4. The result is left-joined with the player-hero stats on
         ``['account_id', 'hero_id']``, so rows without a player-hero match
         are kept.

    None of the input frames is modified.

    NOTE(review): ``add_prefix`` is applied to every column, so a column that
    already starts with ``ph_`` ends up as ``ph_ph_...`` — confirm whether
    pre-prefixed input is expected here.

    Args:
        p_m_stats: player-match rows; must contain 'hero_id' and 'account_id'.
        ph_stats: player-hero stats; must contain 'hero_id' and 'account_id'.
        h_stats: hero stats keyed by 'h_hero_id' or 'hero_id'.

    Returns:
        The merged frame.
    """
    # rename/add_prefix return new frames, so no defensive copies are needed.
    hero_cols = h_stats.rename(columns={'h_hero_id': 'hero_id'})

    player_hero_cols = ph_stats.add_prefix('ph_').rename(columns={
        'ph_hero_id': 'hero_id',
        'ph_account_id': 'account_id',
        'ph_match_id': 'match_id',
        'ph_team': 'team',
        'ph_winning_team': 'winning_team',
        'ph_win': 'win',
    })

    with_hero = pd.merge(p_m_stats, hero_cols, on="hero_id", suffixes=("", ""))

    return pd.merge(with_hero, player_hero_cols, on=['account_id', 'hero_id'], how='left')

In [None]:
#     merged = pd.merge(merged, ph_stats_copy, on=['account_id', 'hero_id'], suffixes=("", ""))

In [9]:
def _ratio_or_zero(numerator, denominator):
    """Element-wise ``numerator / denominator``, with 0 wherever the denominator equals 0."""
    import numpy as np

    return np.where(denominator == 0, 0, numerator / denominator)


def calculate_ph_stats(p_ph_h_stats: pd.DataFrame) -> pd.DataFrame:
    """
    Derive per-row player-hero ratios and per-hero averages from a merged frame.

    The input must contain the columns ``hero_id``, ``kills``, ``deaths``,
    ``assists``, ``time_played``, ``damage_per_min``, ``wins``,
    ``matches_played`` and ``h_total_kd``.

    Columns added:
      - ``ph_total_kd``: kills / deaths.
      - ``ph_kd_ratio``: h_total_kd / ph_total_kd.
      - ``h_avg_total_time_played``, ``h_total_damage_per_min``,
        ``h_total_assists``: mean of the respective column over all rows
        sharing the same ``hero_id``.
      - ``ph_time_played_ratio``, ``ph_damage_per_min_ratio``,
        ``ph_assists_ratio``: the row's value divided by that per-hero mean.
      - ``ph_win_rate``: wins / matches_played.
      - ``h_total_win_rate``: per-hero mean of ``ph_win_rate``.
      - ``ph_win_rate_ratio``: ph_win_rate / h_total_win_rate.

    Every division yields 0 where its denominator is 0. Finally ``wins``,
    ``kills``, ``deaths``, ``assists``, ``damage_per_min`` and ``time_played``
    are renamed with a ``ph_`` prefix.

    Args:
        p_ph_h_stats: merged stats frame; it is not modified.

    Returns:
        A new frame with the added columns and the renamed raw columns.
    """
    all_stats = p_ph_h_stats.copy()

    def hero_mean(column):
        # Mean of `column` over all rows of the same hero, aligned to each row.
        return all_stats.groupby('hero_id')[column].transform("mean")

    all_stats['ph_total_kd'] = _ratio_or_zero(all_stats['kills'], all_stats['deaths'])

    # NOTE(review): this is hero / player, while every other *_ratio below is
    # player / hero average — confirm the orientation is intentional.
    all_stats['ph_kd_ratio'] = _ratio_or_zero(all_stats['h_total_kd'], all_stats['ph_total_kd'])

    all_stats['h_avg_total_time_played'] = hero_mean('time_played')
    all_stats['ph_time_played_ratio'] = _ratio_or_zero(
        all_stats['time_played'], all_stats['h_avg_total_time_played'])

    all_stats['h_total_damage_per_min'] = hero_mean('damage_per_min')
    all_stats['ph_damage_per_min_ratio'] = _ratio_or_zero(
        all_stats['damage_per_min'], all_stats['h_total_damage_per_min'])

    all_stats['h_total_assists'] = hero_mean('assists')
    all_stats['ph_assists_ratio'] = _ratio_or_zero(
        all_stats['assists'], all_stats['h_total_assists'])

    # The per-hero win rate is built from ph_win_rate, so the order matters here.
    all_stats['ph_win_rate'] = _ratio_or_zero(all_stats['wins'], all_stats['matches_played'])
    all_stats['h_total_win_rate'] = hero_mean('ph_win_rate')
    all_stats['ph_win_rate_ratio'] = _ratio_or_zero(
        all_stats['ph_win_rate'], all_stats['h_total_win_rate'])

    # Return the computed frame (the previous code returned an undefined name).
    return all_stats.rename(columns={
        "wins": "ph_wins",
        "kills": "ph_kills",
        "deaths": "ph_deaths",
        "assists": "ph_assists",
        "damage_per_min": "ph_damage_per_min",
        'time_played': 'ph_time_played'
    })

def create_hero_stats(player_hero_stats: pd.DataFrame) -> pd.DataFrame:
    """
    Add the per-hero average K/D to a copy of the player-hero stats.

    Each row of the result carries ``h_total_kd``: the mean of ``ph_total_kd``
    over all rows sharing that row's ``hero_id``. The input frame is left
    untouched.
    """
    mean_kd_by_hero = player_hero_stats.groupby('hero_id')['ph_total_kd'].transform("mean")
    return player_hero_stats.assign(h_total_kd=mean_kd_by_hero)

def create_player_stats(player_match_stats: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of ``player_match_stats`` as the basis for player stats.

    No aggregation is implemented yet; the copy is returned unchanged so the
    function honours its declared ``pd.DataFrame`` return type instead of
    returning None.

    Args:
        player_match_stats: per-player match stats; it is not modified.

    Returns:
        An independent copy of the input frame.
    """
    player_stats = player_match_stats.copy()
    # TODO: add the per-player aggregations here.
    return player_stats

In [None]:
# step 1a
# Check naming and ensure unique across datasets
# NOTE(review): as written, this line only evaluates (and displays) the three
# frames as a tuple; nothing is renamed or reassigned. Presumably the result of
# check_unique_naming(player_hero_stats, player_stats, hero_stats) was meant to
# be assigned here — TODO confirm.
player_hero_stats, player_stats, hero_stats

In [None]:
# step 1
# Merge Player and Player_hero stats


In [None]:
# step 2
# Merge player_player_hero and hero stats

In [None]:
# Step 3
# Calculate player / player_hero stats from the merged frame.
# NOTE(review): `ph_h_p_stats` is not assigned in any cell visible in this
# notebook (steps 1 and 2 are still empty), so unless it is defined elsewhere
# this cell raises NameError on a fresh kernel. Presumably it should be the
# output of the merge steps — TODO confirm.
calc_stats = calculate_ph_stats(ph_h_p_stats)

In [None]:
# Step 4
# Merge p_ph_h_calc_stats with player_matches



In [None]:
# ?