In [None]:
# Release the connection handle left over from a previous run of this notebook.
# NOTE(review): `con` is not bound anywhere before this cell, so running the
# notebook top-to-bottom in a fresh kernel raises NameError here.
con.close()

In [None]:
import duckdb
import pandas as pd
import logging

def fetch_training_data(con, path=None, n=1000):
    """Fetch one wide row of team-vs-team features per sampled match.

    Randomly samples up to ``n`` rows from ``matches``, aggregates per-team
    statistics from ``player_matches``, ``player_trends``, ``hero_trends``
    (30-day window only) and ``player_rolling_stats``, then pivots teams
    ``'0'`` and ``'1'`` side by side and adds differential features plus the
    ``team0_won`` target.

    Args:
        con: Open DB-API style connection (e.g. the result of
            ``duckdb.connect(...)``) exposing the tables named above.
        path: Unused. Kept (now optional) so existing positional callers and
            callers that pass only ``con`` both keep working.
        n: Maximum number of matches to sample; coerced with ``int()``.

    Returns:
        A ``pandas.DataFrame`` with the match identifiers, ``t0_*`` / ``t1_*``
        features, differential features and ``team0_won``; ``None`` when the
        query fails (the error and the query text are printed instead).

    Raises:
        ValueError: If ``n`` is negative.
    """
    sample_size = int(n)
    if sample_size < 0:
        raise ValueError(f"n must be a non-negative integer, got {n!r}")

    # sample_size is a plain int at this point, so interpolating it into the
    # SQL text cannot inject anything.
    complete_query = f"""
    -- Select a random sample of matches for training.
    -- MATERIALIZED: this CTE is referenced several times and uses random();
    -- it must be evaluated exactly once so every reference sees the same sample.
    WITH match_data AS MATERIALIZED (
        SELECT
            m.match_id,
            m.start_time,
            m.winning_team
        FROM
            matches m
        ORDER BY
            random()  -- Randomly select matches
        LIMIT {sample_size}
    ),

    teams_of_players AS (
        SELECT
            pm.match_id,
            pm.account_id,
            pm.hero_id,
            pm.team,
            pm.kills,
            pm.deaths,
            pm.assists,
            CASE WHEN pm.deaths = 0 THEN pm.kills ELSE CAST(pm.kills AS FLOAT) / pm.deaths END AS kd_ratio
        FROM
            player_matches pm
        JOIN
            match_data md ON pm.match_id = md.match_id
    ),

    -- Calculate team-level statistics
    -- NOTE(review): these come from the sampled match's own kills/deaths;
    -- confirm that is intended for a pre-match prediction model.
    team_stats AS (
        SELECT
            match_id,
            team,
            AVG(kd_ratio) AS team_avg_kd,
            MAX(kd_ratio) AS team_max_kd,
            MIN(kd_ratio) AS team_min_kd,
            COUNT(*) AS team_size
        FROM
            teams_of_players
        GROUP BY
            match_id, team
    ),

    -- Get player trends and statistics
    player_performance AS (
        SELECT
            ta.match_id,
            ta.team,
            -- Basic performance stats
            AVG(pt.p_average_kills) AS avg_team_kills,
            AVG(pt.p_average_deaths) AS avg_team_deaths,
            AVG(pt.p_avg_kd) AS avg_team_kd,
            AVG(pt.p_win_rate) AS avg_team_win_rate,
            AVG(pt.p_total_matches) AS avg_player_matches,

            -- Streak information
            AVG(pt.p_win_streak_avg) AS avg_win_streak,
            AVG(pt.p_loss_streak_avg) AS avg_loss_streak,
            SUM(pt.win_streaks_2plus) AS team_win_streaks_2plus,
            SUM(pt.win_streaks_3plus) AS team_win_streaks_3plus,
            SUM(pt.win_streaks_4plus) AS team_win_streaks_4plus,
            SUM(pt.win_streaks_5plus) AS team_win_streaks_5plus,
            SUM(pt.loss_streaks_2plus) AS team_loss_streaks_2plus,
            SUM(pt.loss_streaks_3plus) AS team_loss_streaks_3plus,
            SUM(pt.loss_streaks_4plus) AS team_loss_streaks_4plus,
            SUM(pt.loss_streaks_5plus) AS team_loss_streaks_5plus,

            -- Hero comparison metrics
            -- NOTE(review): confirm player_trends exposes p_v_h_kd_pct
            AVG(pt.p_v_h_kd_pct) AS avg_hero_kd_percentage,

            -- Team strength indicators
            MAX(pt.p_win_rate) AS max_player_win_rate,
            MIN(pt.p_win_rate) AS min_player_win_rate,
            MAX(pt.p_avg_kd) AS max_player_kd,
            MIN(pt.p_avg_kd) AS min_player_kd,

            -- Experience metrics
            SUM(pt.p_total_matches) AS total_team_experience,
            MAX(pt.p_total_matches) AS most_experienced_player,
            MIN(pt.p_total_matches) AS least_experienced_player,

            -- Consistency metrics
            STDDEV(pt.p_win_rate) AS win_rate_consistency,
            STDDEV(pt.p_avg_kd) AS kd_consistency
        FROM
            teams_of_players ta
        JOIN
            player_trends pt ON ta.account_id = pt.account_id
        GROUP BY
            ta.match_id, ta.team
    ),

    -- Calculate hero trend statistics by team
    hero_trend_by_team AS (
        SELECT
            ta.match_id,
            ta.team,
            AVG(ht.win_rate) AS avg_hero_win_rate,
            AVG(ht.average_kd) AS avg_hero_kd,
            AVG(ht.pick_rate) AS avg_hero_pick_rate,
            SUM(ht.pick_rate) AS total_hero_popularity,
            MAX(ht.win_rate) AS max_hero_win_rate,
            MIN(ht.win_rate) AS min_hero_win_rate,
            MAX(ht.average_kd) AS max_hero_kd,
            MIN(ht.average_kd) AS min_hero_kd,
            STDDEV(ht.win_rate) AS hero_win_rate_variety,
            STDDEV(ht.average_kd) AS hero_kd_variety
        FROM
            teams_of_players ta
        JOIN
            hero_trends ht ON ta.hero_id = ht.hero_id
        WHERE
            ht.trend_window_days = 30  -- Using 30-day trends
        GROUP BY
            ta.match_id, ta.team
    ),

    -- Get recent player performance (rolling stats)
    recent_performance AS (
        SELECT
            ta.match_id,
            ta.team,
            AVG(prs.p_win_pct_2) AS avg_recent_win_pct_2,
            AVG(prs.p_win_pct_3) AS avg_recent_win_pct_3,
            AVG(prs.p_win_pct_4) AS avg_recent_win_pct_4,
            AVG(prs.p_win_pct_5) AS avg_recent_win_pct_5,
            AVG(prs.p_loss_pct_2) AS avg_recent_loss_pct_2,
            AVG(prs.p_loss_pct_3) AS avg_recent_loss_pct_3,
            AVG(prs.p_loss_pct_4) AS avg_recent_loss_pct_4,
            AVG(prs.p_loss_pct_5) AS avg_recent_loss_pct_5,
            MAX(prs.p_win_pct_5) AS max_recent_win_pct,
            MIN(prs.p_win_pct_5) AS min_recent_win_pct
        FROM
            teams_of_players ta
        JOIN
            player_rolling_stats prs ON ta.account_id = prs.account_id AND ta.match_id = prs.match_id
        GROUP BY
            ta.match_id, ta.team
    ),

    -- Create team features
    team_features AS (
        SELECT
            ts.match_id,
            ts.team,
            -- Match-specific performance
            ts.team_avg_kd,
            ts.team_max_kd,
            ts.team_min_kd,

            -- Historical player performance
            pp.avg_team_kills,
            pp.avg_team_deaths,
            pp.avg_team_kd,
            pp.avg_team_win_rate,
            pp.max_player_win_rate,
            pp.min_player_win_rate,
            pp.max_player_kd,
            pp.min_player_kd,
            pp.avg_player_matches,
            pp.total_team_experience,
            pp.most_experienced_player,
            pp.least_experienced_player,
            pp.win_rate_consistency,
            pp.kd_consistency,

            -- Streak information
            pp.avg_win_streak,
            pp.avg_loss_streak,
            pp.team_win_streaks_2plus,
            pp.team_win_streaks_3plus,
            pp.team_win_streaks_4plus,
            pp.team_win_streaks_5plus,
            pp.team_loss_streaks_2plus,
            pp.team_loss_streaks_3plus,
            pp.team_loss_streaks_4plus,
            pp.team_loss_streaks_5plus,

            -- Hero comparison
            pp.avg_hero_kd_percentage,

            -- Hero performance
            ht.avg_hero_win_rate,
            ht.avg_hero_kd,
            ht.avg_hero_pick_rate,
            ht.total_hero_popularity,
            ht.max_hero_win_rate,
            ht.min_hero_win_rate,
            ht.max_hero_kd,
            ht.min_hero_kd,
            ht.hero_win_rate_variety,
            ht.hero_kd_variety,

            -- Recent performance (50 = neutral default when no rolling stats exist)
            COALESCE(rp.avg_recent_win_pct_2, 50) AS avg_recent_win_pct_2,
            COALESCE(rp.avg_recent_win_pct_3, 50) AS avg_recent_win_pct_3,
            COALESCE(rp.avg_recent_win_pct_4, 50) AS avg_recent_win_pct_4,
            COALESCE(rp.avg_recent_win_pct_5, 50) AS avg_recent_win_pct_5,
            COALESCE(rp.avg_recent_loss_pct_2, 50) AS avg_recent_loss_pct_2,
            COALESCE(rp.avg_recent_loss_pct_3, 50) AS avg_recent_loss_pct_3,
            COALESCE(rp.avg_recent_loss_pct_4, 50) AS avg_recent_loss_pct_4,
            COALESCE(rp.avg_recent_loss_pct_5, 50) AS avg_recent_loss_pct_5,
            COALESCE(rp.max_recent_win_pct, 50) AS max_recent_win_pct,
            COALESCE(rp.min_recent_win_pct, 50) AS min_recent_win_pct
        FROM
            team_stats ts
        LEFT JOIN
            player_performance pp ON ts.match_id = pp.match_id AND ts.team = pp.team
        LEFT JOIN
            hero_trend_by_team ht ON ts.match_id = ht.match_id AND ts.team = ht.team
        LEFT JOIN
            recent_performance rp ON ts.match_id = rp.match_id AND ts.team = rp.team
    )

    -- Final dataset for model training
    SELECT
        md.match_id,
        md.start_time,
        md.winning_team,

        -- Team 0 features
        t0.team_avg_kd AS t0_avg_kd,
        t0.team_max_kd AS t0_max_kd,
        t0.team_min_kd AS t0_min_kd,
        t0.avg_team_kills AS t0_avg_kills,
        t0.avg_team_deaths AS t0_avg_deaths,
        t0.avg_team_kd AS t0_avg_historical_kd,
        t0.avg_team_win_rate AS t0_win_rate,
        t0.max_player_win_rate AS t0_max_win_rate,
        t0.min_player_win_rate AS t0_min_win_rate,
        t0.max_player_kd AS t0_max_historical_kd,
        t0.min_player_kd AS t0_min_historical_kd,
        t0.avg_win_streak AS t0_win_streak,
        t0.avg_loss_streak AS t0_loss_streak,
        t0.team_win_streaks_2plus AS t0_win_streaks_2,
        t0.avg_player_matches AS t0_player_matches,
        t0.total_team_experience AS t0_total_experience,
        t0.avg_hero_win_rate AS t0_hero_win_rate,
        -- team_features has no plain avg_recent_win_pct; the 5-game window is
        -- used here, matching max_recent_win_pct / min_recent_win_pct.
        t0.avg_recent_win_pct_5 AS t0_recent_wins,

        -- Team 1 features
        t1.team_avg_kd AS t1_avg_kd,
        t1.team_max_kd AS t1_max_kd,
        t1.team_min_kd AS t1_min_kd,
        t1.avg_team_kills AS t1_avg_kills,
        t1.avg_team_deaths AS t1_avg_deaths,
        t1.avg_team_kd AS t1_avg_historical_kd,
        t1.avg_team_win_rate AS t1_win_rate,
        t1.max_player_win_rate AS t1_max_win_rate,
        t1.min_player_win_rate AS t1_min_win_rate,
        t1.max_player_kd AS t1_max_historical_kd,
        t1.min_player_kd AS t1_min_historical_kd,
        t1.avg_win_streak AS t1_win_streak,
        t1.avg_loss_streak AS t1_loss_streak,
        t1.team_win_streaks_2plus AS t1_win_streaks_2,
        t1.avg_player_matches AS t1_player_matches,
        t1.total_team_experience AS t1_total_experience,
        t1.avg_hero_win_rate AS t1_hero_win_rate,
        t1.avg_recent_win_pct_5 AS t1_recent_wins,

        -- Differential features (team 0 relative to team 1)
        (t0.team_avg_kd - t1.team_avg_kd) AS kd_diff,
        (t0.avg_team_kd - t1.avg_team_kd) AS historical_kd_diff,
        (t0.avg_team_win_rate - t1.avg_team_win_rate) AS win_rate_diff,
        (t0.avg_win_streak - t1.avg_win_streak) AS win_streak_diff,
        (t0.avg_recent_win_pct_5 - t1.avg_recent_win_pct_5) AS recent_win_diff,
        (t0.avg_hero_win_rate - t1.avg_hero_win_rate) AS hero_win_rate_diff,

        -- Target variable: 1 if team 0 won, 0 if team 1 won
        CASE WHEN md.winning_team = '0' THEN 1 ELSE 0 END AS team0_won
    FROM
        match_data md
    JOIN
        team_features t0 ON md.match_id = t0.match_id AND t0.team = '0'
    JOIN
        team_features t1 ON md.match_id = t1.match_id AND t1.team = '1'
    """

    # Execute the complete query. The driver's exception type is not known
    # here, so this boundary stays broad and best-effort: report and return None.
    try:
        result_df = pd.read_sql_query(complete_query, con)
    except Exception as e:
        print(f"Error executing query: {e}")
        # Dump the full statement so the failing CTE can be located.
        print(complete_query)
        return None

    print(f"Successfully fetched {len(result_df)} matches for training")
    return result_df

if __name__ == "__main__":
    # Ad-hoc smoke run against the local development database.
    con = duckdb.connect("c:/Code/Local Code/deadlock_match_prediction/data/deadlock.db")
    path = "C:/Code/Local Code/deadlock_match_prediction/data/test_data/"
    df = fetch_training_data(con, path)
    # fetch_training_data returns None when the query fails (it prints the
    # error itself), so only preview the frame when there is one.
    if df is not None:
        print(df.head())

In [None]:
import duckdb
import pandas as pd
import logging
# Open the local DuckDB database file; every exploratory cell below runs its SQL through `con`.
con = duckdb.connect("c:/Code/Local Code/deadlock_match_prediction/data/deadlock.db")

In [None]:
# Draw a random sample of up to 1000 matches (id, start time, winner).
# Later cells reference `match_data` by name inside their SQL (presumably via
# DuckDB's DataFrame replacement scan), so the variable name must not change.
match_data_query = """
    WITH match_data AS (
        SELECT 
            m.match_id,
            m.start_time,
            m.winning_team
        FROM 
            matches m
        ORDER BY 
            random()  -- Randomly select matches
        LIMIT 1000
    )
    SELECT * FROM match_data"""
match_data = con.execute(match_data_query).fetchdf()
print(match_data)

In [None]:
# One row per player per sampled match, with a kills/deaths ratio
# (kills alone when the player has zero deaths, to avoid dividing by zero).
team_of_players_query = """
    WITH teams_of_players AS (
        SELECT
            pm.match_id,
            pm.account_id,
            pm.hero_id,
            pm.team,
            pm.kills,
            pm.deaths,
            pm.assists,
            CASE WHEN pm.deaths = 0 THEN pm.kills ELSE CAST(pm.kills AS FLOAT) / pm.deaths END AS kd_ratio
        FROM 
            player_matches pm
        JOIN 
            match_data md ON pm.match_id = md.match_id
    )
SELECT * FROM teams_of_players"""
team_of_players = con.execute(team_of_players_query).fetchdf()
print(team_of_players)


In [None]:

# Team-level aggregates per (match, team): mean / max / min K/D and head count.
team_stats_query = """
    WITH team_stats AS (
        SELECT
            match_id,
            team,
            AVG(kd_ratio) AS team_avg_kd,
            MAX(kd_ratio) AS team_max_kd,
            MIN(kd_ratio) AS team_min_kd,
            COUNT(*) AS team_size
        FROM 
            team_of_players
        GROUP BY 
            match_id, team
    )
    Select * from team_stats"""
team_stats = con.execute(team_stats_query).fetchdf()
print(team_stats)



In [None]:
# Per (match, team) roll-up of each player's long-run trend row from
# `player_trends`: averages, streak totals, extremes, experience and spread.
player_performance_query = """
    WITH player_performance AS (
        SELECT
            ta.match_id,
            ta.team,
            -- Basic performance stats
            AVG(pt.p_average_kills) AS avg_team_kills,
            AVG(pt.p_average_deaths) AS avg_team_deaths,
            AVG(pt.p_avg_kd) AS avg_team_kd,
            AVG(pt.p_win_rate) AS avg_team_win_rate,
            AVG(pt.p_total_matches) AS avg_player_matches,
            
            -- Streak information
            AVG(pt.p_win_streak_avg) AS avg_win_streak,
            AVG(pt.p_loss_streak_avg) AS avg_loss_streak,
            SUM(pt.win_streaks_2plus) AS team_win_streaks_2plus,
            SUM(pt.win_streaks_3plus) AS team_win_streaks_3plus,
            SUM(pt.win_streaks_4plus) AS team_win_streaks_4plus,
            SUM(pt.win_streaks_5plus) AS team_win_streaks_5plus,
            SUM(pt.loss_streaks_2plus) AS team_loss_streaks_2plus,
            SUM(pt.loss_streaks_3plus) AS team_loss_streaks_3plus,
            SUM(pt.loss_streaks_4plus) AS team_loss_streaks_4plus,
            SUM(pt.loss_streaks_5plus) AS team_loss_streaks_5plus,
            
            
            -- Team strength indicators
            MAX(pt.p_win_rate) AS max_player_win_rate,
            MIN(pt.p_win_rate) AS min_player_win_rate,
            MAX(pt.p_avg_kd) AS max_player_kd,
            MIN(pt.p_avg_kd) AS min_player_kd,
            
            -- Experience metrics
            SUM(pt.p_total_matches) AS total_team_experience,
            MAX(pt.p_total_matches) AS most_experienced_player,
            MIN(pt.p_total_matches) AS least_experienced_player,
            
            -- Consistency metrics
            STDDEV(pt.p_win_rate) AS win_rate_consistency,
            STDDEV(pt.p_avg_kd) AS kd_consistency
        FROM 
            team_of_players ta
        JOIN 
            player_trends pt ON ta.account_id = pt.account_id
        GROUP BY 
            ta.match_id, ta.team
    )
    SELECT * FROM player_performance
    """
player_performance = con.execute(player_performance_query).fetchdf()
print(player_performance)

In [None]:

# Per (match, team) summary of the picked heroes' 30-day trend rows
# from `hero_trends`: win rate, K/D and pick-rate statistics.
hero_trend_by_team_query = """
    WITH hero_trend_by_team AS (
        SELECT
            ta.match_id,
            ta.team,
            AVG(ht.win_rate) AS avg_hero_win_rate,
            AVG(ht.average_kd) AS avg_hero_kd,
            AVG(ht.pick_rate) AS avg_hero_pick_rate,
            SUM(ht.pick_rate) AS total_hero_popularity,
            MAX(ht.win_rate) AS max_hero_win_rate,
            MIN(ht.win_rate) AS min_hero_win_rate,
            MAX(ht.average_kd) AS max_hero_kd,
            MIN(ht.average_kd) AS min_hero_kd,
            STDDEV(ht.win_rate) AS hero_win_rate_variety,
            STDDEV(ht.average_kd) AS hero_kd_variety
        FROM 
            team_of_players ta
        JOIN 
            hero_trends ht ON ta.hero_id = ht.hero_id
        WHERE 
            ht.trend_window_days = 30  -- Using 30-day trends
        GROUP BY 
            ta.match_id, ta.team
    )
        SELECT * from hero_trend_by_team"""
hero_trend_by_team = con.execute(hero_trend_by_team_query).fetchdf()
print(hero_trend_by_team)

In [None]:
# Per (match, team) roll-up of `player_rolling_stats`, joined on both
# account and match: rolling win/loss percentages plus player-on-hero metrics.
recent_performance_query = """
    WITH recent_performance AS (
    SELECT
        ta.match_id,
        ta.team,
        -- Existing rolling win/loss percentages
        AVG(prs.p_win_pct_2) AS avg_recent_win_pct_2,
        AVG(prs.p_win_pct_3) AS avg_recent_win_pct_3,
        AVG(prs.p_win_pct_4) AS avg_recent_win_pct_4,
        AVG(prs.p_win_pct_5) AS avg_recent_win_pct_5,
        AVG(prs.p_loss_pct_2) AS avg_recent_loss_pct_2,
        AVG(prs.p_loss_pct_3) AS avg_recent_loss_pct_3,
        AVG(prs.p_loss_pct_4) AS avg_recent_loss_pct_4,
        AVG(prs.p_loss_pct_5) AS avg_recent_loss_pct_5,
        MAX(prs.p_win_pct_5) AS max_recent_win_pct,
        MIN(prs.p_win_pct_5) AS min_recent_win_pct,
        
        -- Player-hero specific metrics
        AVG(prs.p_v_h_kd_pct) AS avg_hero_kd_percentage,
        AVG(prs.p_v_h_pick_rate) AS avg_hero_pick_percentage,
        AVG(prs.p_h_match_count) AS avg_hero_match_count,
        SUM(prs.p_h_match_count) AS total_hero_experience,
        MAX(prs.p_v_h_kd_pct) AS max_hero_kd_percentage,
        MIN(prs.p_v_h_kd_pct) AS min_hero_kd_percentage,
        STDDEV(prs.p_v_h_kd_pct) AS hero_kd_consistency
    FROM 
        team_of_players ta
    JOIN 
        player_rolling_stats prs ON ta.account_id = prs.account_id AND ta.match_id = prs.match_id
    GROUP BY 
        ta.match_id, ta.team
)
    Select * from recent_performance"""
recent_performance = con.execute(recent_performance_query).fetchdf()
print(recent_performance)

In [None]:
# Combine the per-team frames built above into one feature row per
# (match, team). LEFT JOINs keep teams lacking trend / rolling data; rolling
# percentages then default to 50 and player-hero metrics to 0.
team_features_query = """
    WITH team_features AS (
    SELECT
        ts.match_id,
        ts.team,
        -- Existing features from team stats
        ts.team_avg_kd,
        ts.team_max_kd,
        ts.team_min_kd,
        
        -- Existing features from player performance
        pp.avg_team_kills,
        pp.avg_team_deaths,
        pp.avg_team_kd,
        pp.avg_team_win_rate,
        pp.max_player_win_rate,
        pp.min_player_win_rate,
        pp.max_player_kd,
        pp.min_player_kd,
        pp.avg_player_matches,
        pp.total_team_experience,
        pp.most_experienced_player,
        pp.least_experienced_player,
        pp.win_rate_consistency,
        pp.kd_consistency,
        pp.avg_win_streak,
        pp.avg_loss_streak,
        pp.team_win_streaks_2plus,
        pp.team_win_streaks_3plus,
        pp.team_win_streaks_4plus,
        pp.team_win_streaks_5plus,
        pp.team_loss_streaks_2plus,
        pp.team_loss_streaks_3plus,
        pp.team_loss_streaks_4plus,
        pp.team_loss_streaks_5plus,
        
        -- Hero trend features
        ht.avg_hero_win_rate,
        ht.avg_hero_kd,
        ht.avg_hero_pick_rate,
        ht.total_hero_popularity,
        ht.max_hero_win_rate,
        ht.min_hero_win_rate,
        ht.max_hero_kd,
        ht.min_hero_kd,
        ht.hero_win_rate_variety,
        ht.hero_kd_variety,
        
        -- Recent performance metrics (traditional)
        COALESCE(rps.avg_recent_win_pct_2, 50) AS avg_recent_win_pct_2,
        COALESCE(rps.avg_recent_win_pct_3, 50) AS avg_recent_win_pct_3,
        COALESCE(rps.avg_recent_win_pct_4, 50) AS avg_recent_win_pct_4,
        COALESCE(rps.avg_recent_win_pct_5, 50) AS avg_recent_win_pct_5,
        COALESCE(rps.avg_recent_loss_pct_2, 50) AS avg_recent_loss_pct_2,
        COALESCE(rps.avg_recent_loss_pct_3, 50) AS avg_recent_loss_pct_3,
        COALESCE(rps.avg_recent_loss_pct_4, 50) AS avg_recent_loss_pct_4,
        COALESCE(rps.avg_recent_loss_pct_5, 50) AS avg_recent_loss_pct_5,
        COALESCE(rps.max_recent_win_pct, 50) AS max_recent_win_pct,
        COALESCE(rps.min_recent_win_pct, 50) AS min_recent_win_pct,
        
        -- NEW: Player-hero specific metrics from rolling stats
        COALESCE(rps.avg_hero_kd_percentage, 0) AS avg_hero_kd_percentage,
        COALESCE(rps.avg_hero_pick_percentage, 0) AS avg_hero_pick_percentage,
        COALESCE(rps.avg_hero_match_count, 0) AS avg_hero_match_count,
        COALESCE(rps.total_hero_experience, 0) AS total_hero_experience,
        COALESCE(rps.max_hero_kd_percentage, 0) AS max_hero_kd_percentage,
        COALESCE(rps.min_hero_kd_percentage, 0) AS min_hero_kd_percentage,
        COALESCE(rps.hero_kd_consistency, 0) AS hero_kd_consistency
    FROM 
        team_stats ts
    LEFT JOIN 
        player_performance pp ON ts.match_id = pp.match_id AND ts.team = pp.team
    LEFT JOIN 
        hero_trend_by_team ht ON ts.match_id = ht.match_id AND ts.team = ht.team
    LEFT JOIN 
        recent_performance rps ON ts.match_id = rps.match_id AND ts.team = rps.team
)SELECT * from team_features"""
team_features = con.execute(team_features_query).fetchdf()
print(team_features)


In [None]:
# Pivot the per-team feature rows into one wide row per match: team '0' and
# team '1' side by side, their differences, and the binary team0_won target.
# The bracketed SQL comments mark feature groups still left out of this draft.
training_set_query = """
    SELECT
    md.match_id,
    md.start_time,
    md.winning_team,
    
    -- Team 0 features (existing)
    t0.team_avg_kd AS t0_avg_kd,
    t0.team_max_kd AS t0_max_kd,
    t0.team_min_kd AS t0_min_kd,
    -- [All other existing Team 0 features]
    
    -- NEW: Team 0 player-hero metrics
    t0.avg_hero_kd_percentage AS t0_hero_kd_percentage,
    t0.avg_hero_pick_percentage AS t0_hero_pick_percentage,
    t0.avg_hero_match_count AS t0_hero_match_count,
    t0.total_hero_experience AS t0_hero_experience,
    t0.max_hero_kd_percentage AS t0_max_hero_kd_pct,
    t0.min_hero_kd_percentage AS t0_min_hero_kd_pct,
    t0.hero_kd_consistency AS t0_hero_kd_consistency,
    
    -- Team 1 features (existing)
    t1.team_avg_kd AS t1_avg_kd,
    t1.team_max_kd AS t1_max_kd,
    t1.team_min_kd AS t1_min_kd,
    -- [All other existing Team 1 features]
    
    -- NEW: Team 1 player-hero metrics
    t1.avg_hero_kd_percentage AS t1_hero_kd_percentage,
    t1.avg_hero_pick_percentage AS t1_hero_pick_percentage,
    t1.avg_hero_match_count AS t1_hero_match_count,
    t1.total_hero_experience AS t1_hero_experience,
    t1.max_hero_kd_percentage AS t1_max_hero_kd_pct,
    t1.min_hero_kd_percentage AS t1_min_hero_kd_pct,
    t1.hero_kd_consistency AS t1_hero_kd_consistency,
    
    -- Existing differential features
    (t0.team_avg_kd - t1.team_avg_kd) AS kd_diff,
    -- [All other existing differential features]
    
    -- NEW: Player-hero differential features
    (t0.avg_hero_kd_percentage - t1.avg_hero_kd_percentage) AS hero_kd_pct_diff,
    (t0.avg_hero_pick_percentage - t1.avg_hero_pick_percentage) AS hero_pick_pct_diff,
    (t0.avg_hero_match_count - t1.avg_hero_match_count) AS hero_match_count_diff,
    (t0.total_hero_experience - t1.total_hero_experience) AS hero_experience_diff,
    
    -- Target variable: 1 if team 0 won, 0 if team 1 won
    CASE WHEN md.winning_team = '0' THEN 1 ELSE 0 END AS team0_won
FROM 
    match_data md
JOIN 
    team_features t0 ON md.match_id = t0.match_id AND t0.team = '0'
JOIN 
    team_features t1 ON md.match_id = t1.match_id AND t1.team = '1'
"""
training_set = con.execute(training_set_query).fetchdf()
print(training_set.head())

In [None]:
# Baseline for the coverage check: how many player-match rows, distinct
# matches and distinct players the sampled `team_of_players` set contains.
# `team_of_players` is resolved by name inside the SQL — presumably the
# DataFrame built in an earlier cell; verify it exists before running.
total_players_query = """
SELECT 
    COUNT(*) AS total_player_matches,
    COUNT(DISTINCT match_id) AS total_matches,
    COUNT(DISTINCT account_id) AS total_players
FROM 
    team_of_players
"""
total_counts = con.execute(total_players_query).fetchdf()
print(total_counts)

In [None]:
# Count how many sampled player-match rows (and distinct matches / players)
# have a corresponding row in `player_rolling_stats`. The inner join on both
# account_id and match_id drops every row without rolling stats.
covered_players_query = """
SELECT 
    COUNT(*) AS covered_player_matches,
    COUNT(DISTINCT ta.match_id) AS covered_matches,
    COUNT(DISTINCT ta.account_id) AS covered_players
FROM 
    team_of_players ta
JOIN 
    player_rolling_stats prs ON ta.account_id = prs.account_id AND ta.match_id = prs.match_id
"""
covered_counts = con.execute(covered_players_query).fetchdf()
print(covered_counts)

In [None]:
# Side-by-side coverage table: total vs. rolling-stats-covered counts and the
# covered share as a percentage rounded to two decimals.
total_columns = ['total_player_matches', 'total_matches', 'total_players']
covered_columns = ['covered_player_matches', 'covered_matches', 'covered_players']

coverage = pd.DataFrame({
    'Total': [total_counts[column][0] for column in total_columns],
    'Covered': [covered_counts[column][0] for column in covered_columns],
})
covered_share = coverage['Covered'] / coverage['Total'] * 100
coverage['Coverage %'] = covered_share.round(2)
coverage.index = ['Player-Match Combinations', 'Unique Matches', 'Unique Players']

print("Rolling Stats Coverage:")
print(coverage)

In [None]:
# Distinct account ids present in the sampled player set.
# NOTE(review): despite the name, this selects every sampled player, not only
# those lacking rolling stats — confirm that re-running the ETL for all of
# them is intended.
missing_players_query = """
    SELECT DISTINCT ta.account_id from team_of_players ta"""
missing_players = con.execute(missing_players_query).fetchdf()
print(missing_players)

In [None]:
import services.orchestrators as o
# Hand the collected account ids to the project ETL orchestrator.
account_id_column = missing_players['account_id']
ids = account_id_column.unique().tolist()
small_df = o.run_etl_player_hero_match_trends_from_db(account_ids=ids)

In [None]:

        # Execute the complete query
# The SQL text (`complete_query`) is a local of fetch_training_data and is not
# defined at notebook scope, so run the function instead of re-executing the
# string. It reports failures itself and returns None in that case.
result_df = fetch_training_data(con, None)
if result_df is not None:
    print(result_df.head())




In [None]:
import pandas as pd
import duckdb
from sklearn.model_selection import train_test_split
from fastai.tabular.all import *
import matplotlib.pyplot as plt
import numpy as np
from services import model_queries as mq

# Open the project database (path is relative to the working directory).
con = duckdb.connect("data/deadlock.db")

# Build the structured training frame through the shared query module.
df = mq.fetch_training_data(con)

# Quick sanity check: dimensions and the first few rows.
frame_shape = df.shape
print(f"Dataset shape: {frame_shape}")
first_rows = df.head()
print(f"Sample of data:\n{first_rows}")

# Class balance of the target, as fractions of all rows.
target_share = df['team0_won'].value_counts(normalize=True)
print(f"Target distribution:\n{target_share}")

In [None]:
# Remove identifier / raw-label columns that must not be used as features.
# `df` is the frame loaded from mq.fetch_training_data; no `training_data`
# variable is defined anywhere in this notebook.
feature_df = df.drop(['match_id', 'start_time', 'winning_team'], axis=1)

# Define target variable
target = 'team0_won'

# Split into features and target
X = feature_df.drop(target, axis=1)
y = feature_df[target]

# Split into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# TabularPandas reads the target from the same frame as the features, so
# re-attach it (index-aligned) before stacking train on top of validation.
train_df = X_train.assign(**{target: y_train})
valid_df = X_valid.assign(**{target: y_valid})
model_df = pd.concat([train_df, valid_df], ignore_index=True)

# Positional splits: first block is training, the remainder is validation.
n_train = len(train_df)
splits = (list(range(n_train)), list(range(n_train, len(model_df))))

# Create FastAI TabularPandas object.
# y_block=CategoryBlock() forces classification; a numeric 0/1 target would
# otherwise be treated as a regression target.
procs = [Categorify, FillMissing, Normalize]
to = TabularPandas(model_df, procs,
                   cat_names=[],
                   cont_names=list(X_train.columns),
                   y_names=target,
                   y_block=CategoryBlock(),
                   splits=splits)

# Create DataLoaders
dls = to.dataloaders(bs=64)

In [None]:
# Create the fastai tabular learner (a neural network, not a random forest)
# and track accuracy while training.
learn = tabular_learner(dls, metrics=accuracy)

# Run the learning-rate finder; its suggestion is only inspected visually,
# the rate used below is fixed.
learn.lr_find()

# Train with the one-cycle policy.
n_epochs, max_lr = 5, 1e-2
learn.fit_one_cycle(n_epochs, max_lr)

# Persist the trained learner for later inference.
export_name = 'deadlock_match_prediction_model.pkl'
learn.export(export_name)

In [None]:
# Evaluate model performance on the validation split of `learn`
interp = ClassificationInterpretation.from_learner(learn)

# Confusion matrix
interp.plot_confusion_matrix()

# Feature importance
# NOTE(review): confirm that ClassificationInterpretation actually provides
# `feature_importance()` and that it returns (scores, column_indices) as the
# indexing below assumes — this is not established anywhere in this notebook.
importance = interp.feature_importance()
plt.figure(figsize=(12, 8))
plt.barh(X.columns[importance[1]], importance[0])
plt.xlabel('Importance')
plt.title('Feature Importance')
plt.tight_layout()
plt.show()

# Top losses (most incorrect predictions)
# NOTE(review): verify plot_top_losses is supported for tabular learners.
interp.plot_top_losses(10)