In [39]:
con.close()

Imports and connection

In [1]:
import duckdb
import pandas as pd
import logging
con = duckdb.connect("c:/Code/Local Code/deadlock_match_prediction/data/deadlock.db")

Fetch a set of 1000 matches for a test model training.

In [2]:
training_set = con.execute(f"""
    WITH match_data AS (
        SELECT 
            m.match_id,
            m.start_time,
            m.winning_team
        FROM 
            matches m
        ORDER BY 
            random()  -- Randomly select matches
        LIMIT 1000
    ),
    teams_of_players AS (
        SELECT
            pm.match_id,
            pm.account_id,
            pm.hero_id,
            pm.team,
            pm.kills,
            pm.deaths,
            pm.assists,
            CASE WHEN pm.deaths = 0 THEN pm.kills ELSE CAST(pm.kills AS FLOAT) / pm.deaths END AS kd_ratio
        FROM 
            player_matches pm
        JOIN 
            match_data md ON pm.match_id = md.match_id
    ),
    team_stats AS (
        SELECT
            match_id,
            team,
            SUM(kills) AS total_kills,
            SUM(deaths) AS total_deaths,
            SUM(assists) AS total_assists,
            AVG(kd_ratio) AS avg_kd_ratio
        FROM 
            teams_of_players
        GROUP BY 
            match_id, team
    ),
    player_performance AS (
        SELECT
            ta.match_id,
            ta.team,
            -- Basic performance stats
            AVG(pt.p_average_kills) AS avg_team_kills,
            AVG(pt.p_average_deaths) AS avg_team_deaths,
            AVG(pt.p_avg_kd) AS avg_team_kd,
            AVG(pt.p_win_rate) AS avg_team_win_rate,
            AVG(pt.p_total_matches) AS avg_player_matches,
            
            -- Streak information
            AVG(pt.p_win_streak_avg) AS avg_win_streak,
            AVG(pt.p_loss_streak_avg) AS avg_loss_streak,
            SUM(pt.win_streaks_2plus) AS team_win_streaks_2plus,
            SUM(pt.win_streaks_3plus) AS team_win_streaks_3plus,
            SUM(pt.win_streaks_4plus) AS team_win_streaks_4plus,
            SUM(pt.win_streaks_5plus) AS team_win_streaks_5plus,
            SUM(pt.loss_streaks_2plus) AS team_loss_streaks_2plus,
            SUM(pt.loss_streaks_3plus) AS team_loss_streaks_3plus,
            SUM(pt.loss_streaks_4plus) AS team_loss_streaks_4plus,
            SUM(pt.loss_streaks_5plus) AS team_loss_streaks_5plus,
            
            
            -- Team strength indicators
            MAX(pt.p_win_rate) AS max_player_win_rate,
            MIN(pt.p_win_rate) AS min_player_win_rate,
            MAX(pt.p_avg_kd) AS max_player_kd,
            MIN(pt.p_avg_kd) AS min_player_kd,
            
            -- Experience metrics
            SUM(pt.p_total_matches) AS total_team_experience,
            MAX(pt.p_total_matches) AS most_experienced_player,
            MIN(pt.p_total_matches) AS least_experienced_player,
            
            -- Consistency metrics
            STDDEV(pt.p_win_rate) AS win_rate_consistency,
            STDDEV(pt.p_avg_kd) AS kd_consistency
        FROM 
            teams_of_players ta
        JOIN 
            player_trends pt ON ta.account_id = pt.account_id
        GROUP BY 
            ta.match_id, ta.team
    ),
    hero_trend_by_team AS (
        SELECT
            ta.match_id,
            ta.team,
            AVG(ht.win_rate) AS avg_hero_win_rate,
            AVG(ht.average_kd) AS avg_hero_kd,
            AVG(ht.pick_rate) AS avg_hero_pick_rate,
            SUM(ht.pick_rate) AS total_hero_popularity,
            MAX(ht.win_rate) AS max_hero_win_rate,
            MIN(ht.win_rate) AS min_hero_win_rate,
            MAX(ht.average_kd) AS max_hero_kd,
            MIN(ht.average_kd) AS min_hero_kd,
            STDDEV(ht.win_rate) AS hero_win_rate_variety,
            STDDEV(ht.average_kd) AS hero_kd_variety
        FROM 
            teams_of_players ta
        JOIN 
            hero_trends ht ON ta.hero_id = ht.hero_id
        WHERE 
            ht.trend_window_days = 30  -- Using 30-day trends
        GROUP BY 
            ta.match_id, ta.team
    ),

    recent_performance AS (
    SELECT
        ta.match_id,
        ta.team,
        -- Existing rolling win/loss percentages
        AVG(prs.p_win_pct_2) AS avg_recent_win_pct_2,
        AVG(prs.p_win_pct_3) AS avg_recent_win_pct_3,
        AVG(prs.p_win_pct_4) AS avg_recent_win_pct_4,
        AVG(prs.p_win_pct_5) AS avg_recent_win_pct_5,
        AVG(prs.p_loss_pct_2) AS avg_recent_loss_pct_2,
        AVG(prs.p_loss_pct_3) AS avg_recent_loss_pct_3,
        AVG(prs.p_loss_pct_4) AS avg_recent_loss_pct_4,
        AVG(prs.p_loss_pct_5) AS avg_recent_loss_pct_5,
        MAX(prs.p_win_pct_5) AS max_recent_win_pct,
        MIN(prs.p_win_pct_5) AS min_recent_win_pct,
        
        -- Player-hero specific metrics
        AVG(prs.p_v_h_kd_pct) AS avg_hero_kd_percentage,
        AVG(prs.p_v_h_pick_rate) AS avg_hero_pick_percentage,
        AVG(prs.p_h_match_count) AS avg_hero_match_count,
        SUM(prs.p_h_match_count) AS total_hero_experience,
        MAX(prs.p_v_h_kd_pct) AS max_hero_kd_percentage,
        MIN(prs.p_v_h_kd_pct) AS min_hero_kd_percentage,
        STDDEV(prs.p_v_h_kd_pct) AS hero_kd_consistency
    FROM 
        teams_of_players ta
    JOIN 
        player_rolling_stats prs ON ta.account_id = prs.account_id AND ta.match_id = prs.match_id
    GROUP BY 
        ta.match_id, ta.team
),
    team_features AS (
    SELECT
        ts.match_id,
        ts.team,
        -- Existing features from team stats
        ts.team_avg_kd,
        ts.team_max_kd,
        ts.team_min_kd,
        
        -- Existing features from player performance
        pp.avg_team_kills,
        pp.avg_team_deaths,
        pp.avg_team_kd,
        pp.avg_team_win_rate,
        pp.max_player_win_rate,
        pp.min_player_win_rate,
        pp.max_player_kd,
        pp.min_player_kd,
        pp.avg_player_matches,
        pp.total_team_experience,
        pp.most_experienced_player,
        pp.least_experienced_player,
        pp.win_rate_consistency,
        pp.kd_consistency,
        pp.avg_win_streak,
        pp.avg_loss_streak,
        pp.team_win_streaks_2plus,
        pp.team_win_streaks_3plus,
        pp.team_win_streaks_4plus,
        pp.team_win_streaks_5plus,
        pp.team_loss_streaks_2plus,
        pp.team_loss_streaks_3plus,
        pp.team_loss_streaks_4plus,
        pp.team_loss_streaks_5plus,
        
        -- Hero trend features
        ht.avg_hero_win_rate,
        ht.avg_hero_kd,
        ht.avg_hero_pick_rate,
        ht.total_hero_popularity,
        ht.max_hero_win_rate,
        ht.min_hero_win_rate,
        ht.max_hero_kd,
        ht.min_hero_kd,
        ht.hero_win_rate_variety,
        ht.hero_kd_variety,
        
        -- Recent performance metrics (traditional)
        COALESCE(rps.avg_recent_win_pct_2, 50) AS avg_recent_win_pct_2,
        COALESCE(rps.avg_recent_win_pct_3, 50) AS avg_recent_win_pct_3,
        COALESCE(rps.avg_recent_win_pct_4, 50) AS avg_recent_win_pct_4,
        COALESCE(rps.avg_recent_win_pct_5, 50) AS avg_recent_win_pct_5,
        COALESCE(rps.avg_recent_loss_pct_2, 50) AS avg_recent_loss_pct_2,
        COALESCE(rps.avg_recent_loss_pct_3, 50) AS avg_recent_loss_pct_3,
        COALESCE(rps.avg_recent_loss_pct_4, 50) AS avg_recent_loss_pct_4,
        COALESCE(rps.avg_recent_loss_pct_5, 50) AS avg_recent_loss_pct_5,
        COALESCE(rps.max_recent_win_pct, 50) AS max_recent_win_pct,
        COALESCE(rps.min_recent_win_pct, 50) AS min_recent_win_pct,
        
        -- NEW: Player-hero specific metrics from rolling stats
        COALESCE(rps.avg_hero_kd_percentage, 0) AS avg_hero_kd_percentage,
        COALESCE(rps.avg_hero_pick_percentage, 0) AS avg_hero_pick_percentage,
        COALESCE(rps.avg_hero_match_count, 0) AS avg_hero_match_count,
        COALESCE(rps.total_hero_experience, 0) AS total_hero_experience,
        COALESCE(rps.max_hero_kd_percentage, 0) AS max_hero_kd_percentage,
        COALESCE(rps.min_hero_kd_percentage, 0) AS min_hero_kd_percentage,
        COALESCE(rps.hero_kd_consistency, 0) AS hero_kd_consistency
    FROM 
        team_stats ts
    LEFT JOIN 
        player_performance pp ON ts.match_id = pp.match_id AND ts.team = pp.team
    LEFT JOIN 
        hero_trend_by_team ht ON ts.match_id = ht.match_id AND ts.team = ht.team
    LEFT JOIN 
        recent_performance rps ON ts.match_id = rps.match_id AND ts.team = rps.team
),

team_features_ordered AS (
    -- Ensure consistent ordering of teams
    SELECT 
        match_id, 
        team,
        -- Current features
        team_avg_kd,
        team_max_kd,
        team_min_kd,
        avg_hero_kd_percentage,
        avg_hero_pick_percentage,
        avg_hero_match_count,
        total_hero_experience,
        max_hero_kd_percentage,
        min_hero_kd_percentage,
        
        -- Additional features from player_trends
        avg_team_win_rate,
        avg_win_streak,
        avg_loss_streak,
        team_win_streaks_2plus,
        team_win_streaks_3plus,
        team_win_streaks_4plus,
        team_win_streaks_5plus,
        team_loss_streaks_2plus,
        team_loss_streaks_3plus,
        team_loss_streaks_4plus,
        team_loss_streaks_5plus,
        
        -- Additional features from rolling stats
        avg_recent_win_pct_2,
        avg_recent_win_pct_3,
        avg_recent_win_pct_4,
        avg_recent_win_pct_5,
        avg_recent_loss_pct_2,
        avg_recent_loss_pct_3,
        avg_recent_loss_pct_4,
        avg_recent_loss_pct_5,
        
        ROW_NUMBER() OVER (PARTITION BY match_id ORDER BY team) AS team_order
    FROM 
        team_features
)

SELECT
    md.match_id,
    md.start_time,
    md.winning_team,
    
    -- Team 0 features (existing)
    t0.team_avg_kd AS t0_avg_kd,
    t0.team_max_kd AS t0_max_kd,
    t0.team_min_kd AS t0_min_kd,
    t0.avg_hero_kd_percentage AS t0_hero_kd_percentage,
    t0.avg_hero_pick_percentage AS t0_hero_pick_percentage,
    t0.avg_hero_match_count AS t0_hero_match_count,
    t0.total_hero_experience AS t0_hero_experience,
    t0.max_hero_kd_percentage AS t0_max_hero_kd_pct,
    t0.min_hero_kd_percentage AS t0_min_hero_kd_pct,
    
    -- Team 0 additional features (previously missing)
    t0.avg_team_win_rate AS t0_win_rate,
    t0.avg_win_streak AS t0_win_streak_avg,
    t0.avg_loss_streak AS t0_loss_streak_avg,
    t0.team_win_streaks_2plus AS t0_win_streaks_2,
    t0.team_win_streaks_3plus AS t0_win_streaks_3,
    t0.team_win_streaks_4plus AS t0_win_streaks_4,
    t0.team_win_streaks_5plus AS t0_win_streaks_5,
    t0.team_loss_streaks_2plus AS t0_loss_streaks_2,
    t0.team_loss_streaks_3plus AS t0_loss_streaks_3,
    t0.team_loss_streaks_4plus AS t0_loss_streaks_4,
    t0.team_loss_streaks_5plus AS t0_loss_streaks_5,
    
    -- Team 0 rolling stats (previously missing)
    t0.avg_recent_win_pct_2 AS t0_win_pct_2,
    t0.avg_recent_win_pct_3 AS t0_win_pct_3,
    t0.avg_recent_win_pct_4 AS t0_win_pct_4,
    t0.avg_recent_win_pct_5 AS t0_win_pct_5,
    t0.avg_recent_loss_pct_2 AS t0_loss_pct_2,
    t0.avg_recent_loss_pct_3 AS t0_loss_pct_3,
    t0.avg_recent_loss_pct_4 AS t0_loss_pct_4,
    t0.avg_recent_loss_pct_5 AS t0_loss_pct_5,
    
    -- Team 1 features (existing)
    t1.team_avg_kd AS t1_avg_kd,
    t1.team_max_kd AS t1_max_kd,
    t1.team_min_kd AS t1_min_kd,
    t1.avg_hero_kd_percentage AS t1_hero_kd_percentage,
    t1.avg_hero_pick_percentage AS t1_hero_pick_percentage,
    t1.avg_hero_match_count AS t1_hero_match_count,
    t1.total_hero_experience AS t1_hero_experience,
    t1.max_hero_kd_percentage AS t1_max_hero_kd_pct,
    t1.min_hero_kd_percentage AS t1_min_hero_kd_pct,
    
    -- Team 1 additional features (previously missing)
    t1.avg_team_win_rate AS t1_win_rate,
    t1.avg_win_streak AS t1_win_streak_avg,
    t1.avg_loss_streak AS t1_loss_streak_avg,
    t1.team_win_streaks_2plus AS t1_win_streaks_2,
    t1.team_win_streaks_3plus AS t1_win_streaks_3,
    t1.team_win_streaks_4plus AS t1_win_streaks_4,
    t1.team_win_streaks_5plus AS t1_win_streaks_5,
    t1.team_loss_streaks_2plus AS t1_loss_streaks_2,
    t1.team_loss_streaks_3plus AS t1_loss_streaks_3,
    t1.team_loss_streaks_4plus AS t1_loss_streaks_4,
    t1.team_loss_streaks_5plus AS t1_loss_streaks_5,
    
    -- Team 1 rolling stats (previously missing)
    t1.avg_recent_win_pct_2 AS t1_win_pct_2,
    t1.avg_recent_win_pct_3 AS t1_win_pct_3,
    t1.avg_recent_win_pct_4 AS t1_win_pct_4,
    t1.avg_recent_win_pct_5 AS t1_win_pct_5,
    t1.avg_recent_loss_pct_2 AS t1_loss_pct_2,
    t1.avg_recent_loss_pct_3 AS t1_loss_pct_3,
    t1.avg_recent_loss_pct_4 AS t1_loss_pct_4,
    t1.avg_recent_loss_pct_5 AS t1_loss_pct_5,
    
    -- Existing differential features
    (t0.team_avg_kd - t1.team_avg_kd) AS kd_diff,
    (t0.avg_hero_kd_percentage - t1.avg_hero_kd_percentage) AS hero_kd_pct_diff,
    
    -- Additional differential features
    (t0.avg_team_win_rate - t1.avg_team_win_rate) AS win_rate_diff,
    (t0.avg_win_streak - t1.avg_win_streak) AS win_streak_diff,
    (t0.avg_loss_streak - t1.avg_loss_streak) AS loss_streak_diff,
    (t0.avg_recent_win_pct_5 - t1.avg_recent_win_pct_5) AS recent_win_pct_diff,
    
    -- Target variable
    CASE WHEN md.winning_team = t0.team THEN 1 ELSE 0 END AS team0_won
FROM 
    match_data md
JOIN 
    team_features_ordered t0 ON md.match_id = t0.match_id AND t0.team_order = 1
JOIN 
    team_features_ordered t1 ON md.match_id = t1.match_id AND t1.team_order = 2""").fetchdf()
print(f"Robust training set has {len(training_set)} rows")
print(training_set)
output_path = "data/final_training_data.csv"
training_set.to_csv(output_path, index=False)

BinderException: Binder Error: Referenced table "ts" not found!
Candidate tables: "md"

LINE 146:         ts.team_avg_kd,
                  ^

In [3]:
match_data = con.execute(f"""
    WITH match_data AS (
        SELECT 
            m.match_id,
            m.start_time,
            m.winning_team
        FROM 
            matches m
        ORDER BY 
            random()  -- Randomly select matches
        LIMIT 1000
    )
    SELECT * FROM match_data""").fetchdf()
print(match_data)

     match_id          start_time winning_team
0    34313556 2025-03-27 01:18:24        Team0
1    35328264 2025-05-01 14:37:05        Team1
2    34839529 2025-04-13 18:38:38        Team0
3    35038607 2025-04-20 20:06:53        Team0
4    34276957 2025-03-25 21:06:35        Team0
..        ...                 ...          ...
995  35546493 2025-05-09 01:16:16        Team0
996  34833454 2025-04-13 15:03:12        Team0
997  34085693 2025-03-20 01:04:40        Team1
998  35560949 2025-05-09 08:14:59        Team0
999  34164424 2025-03-22 14:38:33        Team1

[1000 rows x 3 columns]


Fetch the players from each of the matches

In [4]:
team_of_players = con.execute(f"""
    WITH teams_of_players AS (
        SELECT
            pm.match_id,
            pm.account_id,
            pm.hero_id,
            pm.team,
            pm.kills,
            pm.deaths,
            pm.assists,
            CASE WHEN pm.deaths = 0 THEN pm.kills ELSE CAST(pm.kills AS FLOAT) / pm.deaths END AS kd_ratio
        FROM 
            player_matches pm
        JOIN 
            match_data md ON pm.match_id = md.match_id
    )
SELECT * FROM teams_of_players""").fetchdf()
print(team_of_players)


       match_id  account_id  hero_id   team  kills  deaths  assists  kd_ratio
0      34204836    58363380       25  Team1     10      12        9  0.833333
1      34204836  1254518067       58  Team1     12       8        4  1.500000
2      34204836   209762137        7  Team0      5       8       11  0.625000
3      34204836  1044770420        3  Team1      5       6       16  0.833333
4      34204836  1774631088       20  Team1      5       7        7  0.714286
...         ...         ...      ...    ...    ...     ...      ...       ...
11995  35644069  1591197043        4  Team0      5      11        6  0.454545
11996  35644069   386039958        3  Team1      5       6       18  0.833333
11997  35644069  1519487905       16  Team0      8      12       20  0.666667
11998  35644069  1138122560       58  Team0     13      11        5  1.181818
11999  35644069  1849896648       20  Team0     11       3       14  3.666667

[12000 rows x 8 columns]


Combine normalized match_players into their respective teams and aggreggate some stats for each team.

In [5]:

#-- Calculate team-level statistics
team_stats= con.execute(f"""
    WITH team_stats AS (
        SELECT
            match_id,
            team,
            AVG(kd_ratio) AS team_avg_kd,
            MAX(kd_ratio) AS team_max_kd,
            MIN(kd_ratio) AS team_min_kd,
            COUNT(*) AS team_size
        FROM 
            team_of_players
        GROUP BY 
            match_id, team
    )
    Select * from team_stats""").fetchdf()
print(team_stats)



      match_id   team  team_avg_kd  team_max_kd  team_min_kd  team_size
0     34200032  Team0     0.678307        1.000     0.300000          6
1     35545025  Team1     2.350000        5.000     0.600000          6
2     34163631  Team0     4.277778        7.000     1.666667          6
3     34164424  Team0     0.795785        1.375     0.222222          6
4     35573603  Team0     1.625000        5.500     0.000000          6
...        ...    ...          ...          ...          ...        ...
1995  34310343  Team1     1.447619        3.000     1.000000          6
1996  34275587  Team1     1.530519        6.000     0.300000          6
1997  34230727  Team0     2.434217        9.500     0.363636          6
1998  34232282  Team0     1.833333        4.000     0.500000          6
1999  34246596  Team1     1.009921        1.625     0.625000          6

[2000 rows x 6 columns]


Combine additional player stats into team-wide stats using averages. Winrate, streaks, KD, and experience include a min, max, and average for the team.

In [6]:
player_performance = con.execute(f"""
    WITH player_performance AS (
        SELECT
            ta.match_id,
            ta.team,
            -- Basic performance stats
            AVG(pt.p_average_kills) AS avg_team_kills,
            AVG(pt.p_average_deaths) AS avg_team_deaths,
            AVG(pt.p_avg_kd) AS avg_team_kd,
            AVG(pt.p_win_rate) AS avg_team_win_rate,
            AVG(pt.p_total_matches) AS avg_player_matches,
            
            -- Streak information
            AVG(pt.p_win_streak_avg) AS avg_win_streak,
            AVG(pt.p_loss_streak_avg) AS avg_loss_streak,
            SUM(pt.win_streaks_2plus) AS team_win_streaks_2plus,
            SUM(pt.win_streaks_3plus) AS team_win_streaks_3plus,
            SUM(pt.win_streaks_4plus) AS team_win_streaks_4plus,
            SUM(pt.win_streaks_5plus) AS team_win_streaks_5plus,
            SUM(pt.loss_streaks_2plus) AS team_loss_streaks_2plus,
            SUM(pt.loss_streaks_3plus) AS team_loss_streaks_3plus,
            SUM(pt.loss_streaks_4plus) AS team_loss_streaks_4plus,
            SUM(pt.loss_streaks_5plus) AS team_loss_streaks_5plus,
            
            
            -- Team strength indicators
            MAX(pt.p_win_rate) AS max_player_win_rate,
            MIN(pt.p_win_rate) AS min_player_win_rate,
            MAX(pt.p_avg_kd) AS max_player_kd,
            MIN(pt.p_avg_kd) AS min_player_kd,
            
            -- Experience metrics
            SUM(pt.p_total_matches) AS total_team_experience,
            MAX(pt.p_total_matches) AS most_experienced_player,
            MIN(pt.p_total_matches) AS least_experienced_player,
            
            -- Consistency metrics
            STDDEV(pt.p_win_rate) AS win_rate_consistency,
            STDDEV(pt.p_avg_kd) AS kd_consistency
        FROM 
            team_of_players ta
        JOIN 
            player_trends pt ON ta.account_id = pt.account_id
        GROUP BY 
            ta.match_id, ta.team
    )
    SELECT * FROM player_performance
    """).fetchdf()
print(player_performance)

      match_id   team  avg_team_kills  avg_team_deaths  avg_team_kd  \
0     34273855  Team1        8.251667         6.186667     1.351667   
1     34423078  Team1        7.805000         5.570000     1.425000   
2     35617282  Team1        7.566667         7.908333     1.025000   
3     34816369  Team1        7.853333         6.045000     1.365000   
4     33845318  Team0        9.211667         7.073333     1.323333   
...        ...    ...             ...              ...          ...   
1995  35047765  Team0        7.953333         5.463333     1.543333   
1996  33840479  Team1        6.958333         5.775000     1.213333   
1997  35328478  Team1        7.538333         5.880000     1.470000   
1998  34099778  Team0        8.743333         6.043333     1.546667   
1999  34105174  Team0        7.315000         6.903333     1.231667   

      avg_team_win_rate  avg_player_matches  avg_win_streak  avg_loss_streak  \
0             54.115000          823.000000        2.216855        

Calculate hero trends from db

In [7]:

#-- Calculate hero trend statistics by team
hero_trend_by_team= con.execute(f"""
    WITH hero_trend_by_team AS (
        SELECT
            ta.match_id,
            ta.team,
            AVG(ht.win_rate) AS avg_hero_win_rate,
            AVG(ht.average_kd) AS avg_hero_kd,
            AVG(ht.pick_rate) AS avg_hero_pick_rate,
            SUM(ht.pick_rate) AS total_hero_popularity,
            MAX(ht.win_rate) AS max_hero_win_rate,
            MIN(ht.win_rate) AS min_hero_win_rate,
            MAX(ht.average_kd) AS max_hero_kd,
            MIN(ht.average_kd) AS min_hero_kd,
            STDDEV(ht.win_rate) AS hero_win_rate_variety,
            STDDEV(ht.average_kd) AS hero_kd_variety
        FROM 
            team_of_players ta
        JOIN 
            hero_trends ht ON ta.hero_id = ht.hero_id
        WHERE 
            ht.trend_window_days = 30  -- Using 30-day trends
        GROUP BY 
            ta.match_id, ta.team
    )
        SELECT * from hero_trend_by_team""").fetchdf()
print(hero_trend_by_team)

      match_id   team  avg_hero_win_rate  avg_hero_kd  avg_hero_pick_rate  \
0     34206261  Team1          49.098333     0.978333            4.631667   
1     34208766  Team1          50.005000     0.961667            3.840000   
2     35546493  Team0          49.603334     1.001667            4.111667   
3     34163535  Team0          49.660000     0.961667            3.295000   
4     34162727  Team0          49.941666     1.018333            4.200000   
...        ...    ...                ...          ...                 ...   
1995  34304075  Team1          50.765000     1.030000            4.798333   
1996  34273948  Team1          49.623333     1.018333            4.068333   
1997  34245771  Team1          50.818334     1.011667            4.183333   
1998  34249049  Team1          51.655000     1.008333            4.363333   
1999  34250805  Team0          49.838333     1.018333            4.111667   

      total_hero_popularity  max_hero_win_rate  min_hero_win_rate  \
0     

Calculate recency trends (where within what streaks is the team?)

In [8]:
#-- Get player trends and statistics
recent_performance= con.execute(f"""
    WITH recent_performance AS (
    SELECT
        ta.match_id,
        ta.team,
        -- Existing rolling win/loss percentages
        AVG(prs.p_win_pct_2) AS avg_recent_win_pct_2,
        AVG(prs.p_win_pct_3) AS avg_recent_win_pct_3,
        AVG(prs.p_win_pct_4) AS avg_recent_win_pct_4,
        AVG(prs.p_win_pct_5) AS avg_recent_win_pct_5,
        AVG(prs.p_loss_pct_2) AS avg_recent_loss_pct_2,
        AVG(prs.p_loss_pct_3) AS avg_recent_loss_pct_3,
        AVG(prs.p_loss_pct_4) AS avg_recent_loss_pct_4,
        AVG(prs.p_loss_pct_5) AS avg_recent_loss_pct_5,
        MAX(prs.p_win_pct_5) AS max_recent_win_pct,
        MIN(prs.p_win_pct_5) AS min_recent_win_pct,
        
        -- Player-hero specific metrics
        AVG(prs.p_v_h_kd_pct) AS avg_hero_kd_percentage,
        AVG(prs.p_v_h_pick_rate) AS avg_hero_pick_percentage,
        AVG(prs.p_h_match_count) AS avg_hero_match_count,
        SUM(prs.p_h_match_count) AS total_hero_experience,
        MAX(prs.p_v_h_kd_pct) AS max_hero_kd_percentage,
        MIN(prs.p_v_h_kd_pct) AS min_hero_kd_percentage,
        STDDEV(prs.p_v_h_kd_pct) AS hero_kd_consistency
    FROM 
        team_of_players ta
    JOIN 
        player_rolling_stats prs ON ta.account_id = prs.account_id AND ta.match_id = prs.match_id
    GROUP BY 
        ta.match_id, ta.team
)
    Select * from recent_performance""").fetchdf()
print(recent_performance)

      match_id   team  avg_recent_win_pct_2  avg_recent_win_pct_3  \
0     33930435  Team0             33.333333             38.888334   
1     34653328  Team0             50.000000             66.666667   
2     34149140  Team1             83.333333             72.223333   
3     33850425  Team1             75.000000             66.668332   
4     34860474  Team0             66.666667             61.111666   
...        ...    ...                   ...                   ...   
1325  34204836  Team0             58.333333             55.555000   
1326  34489740  Team0             58.333333             72.224998   
1327  34072049  Team1             66.666667             66.666667   
1328  34360295  Team0             41.666667             55.555000   
1329  34032646  Team1             50.000000             38.889999   

      avg_recent_win_pct_4  avg_recent_win_pct_5  avg_recent_loss_pct_2  \
0                41.666667             53.333333              66.666667   
1                66.6

Create list of features for each match

In [9]:
team_features = con.execute(f"""
    WITH team_features AS (
    SELECT
        ts.match_id,
        ts.team,
        -- Existing features from team stats
        ts.team_avg_kd,
        ts.team_max_kd,
        ts.team_min_kd,
        
        -- Existing features from player performance
        pp.avg_team_kills,
        pp.avg_team_deaths,
        pp.avg_team_kd,
        pp.avg_team_win_rate,
        pp.max_player_win_rate,
        pp.min_player_win_rate,
        pp.max_player_kd,
        pp.min_player_kd,
        pp.avg_player_matches,
        pp.total_team_experience,
        pp.most_experienced_player,
        pp.least_experienced_player,
        pp.win_rate_consistency,
        pp.kd_consistency,
        pp.avg_win_streak,
        pp.avg_loss_streak,
        pp.team_win_streaks_2plus,
        pp.team_win_streaks_3plus,
        pp.team_win_streaks_4plus,
        pp.team_win_streaks_5plus,
        pp.team_loss_streaks_2plus,
        pp.team_loss_streaks_3plus,
        pp.team_loss_streaks_4plus,
        pp.team_loss_streaks_5plus,
        
        -- Hero trend features
        ht.avg_hero_win_rate,
        ht.avg_hero_kd,
        ht.avg_hero_pick_rate,
        ht.total_hero_popularity,
        ht.max_hero_win_rate,
        ht.min_hero_win_rate,
        ht.max_hero_kd,
        ht.min_hero_kd,
        ht.hero_win_rate_variety,
        ht.hero_kd_variety,
        
        -- Recent performance metrics (traditional)
        COALESCE(rps.avg_recent_win_pct_2, 50) AS avg_recent_win_pct_2,
        COALESCE(rps.avg_recent_win_pct_3, 50) AS avg_recent_win_pct_3,
        COALESCE(rps.avg_recent_win_pct_4, 50) AS avg_recent_win_pct_4,
        COALESCE(rps.avg_recent_win_pct_5, 50) AS avg_recent_win_pct_5,
        COALESCE(rps.avg_recent_loss_pct_2, 50) AS avg_recent_loss_pct_2,
        COALESCE(rps.avg_recent_loss_pct_3, 50) AS avg_recent_loss_pct_3,
        COALESCE(rps.avg_recent_loss_pct_4, 50) AS avg_recent_loss_pct_4,
        COALESCE(rps.avg_recent_loss_pct_5, 50) AS avg_recent_loss_pct_5,
        COALESCE(rps.max_recent_win_pct, 50) AS max_recent_win_pct,
        COALESCE(rps.min_recent_win_pct, 50) AS min_recent_win_pct,
        
        -- NEW: Player-hero specific metrics from rolling stats
        COALESCE(rps.avg_hero_kd_percentage, 0) AS avg_hero_kd_percentage,
        COALESCE(rps.avg_hero_pick_percentage, 0) AS avg_hero_pick_percentage,
        COALESCE(rps.avg_hero_match_count, 0) AS avg_hero_match_count,
        COALESCE(rps.total_hero_experience, 0) AS total_hero_experience,
        COALESCE(rps.max_hero_kd_percentage, 0) AS max_hero_kd_percentage,
        COALESCE(rps.min_hero_kd_percentage, 0) AS min_hero_kd_percentage,
        COALESCE(rps.hero_kd_consistency, 0) AS hero_kd_consistency
    FROM 
        team_stats ts
    LEFT JOIN 
        player_performance pp ON ts.match_id = pp.match_id AND ts.team = pp.team
    LEFT JOIN 
        hero_trend_by_team ht ON ts.match_id = ht.match_id AND ts.team = ht.team
    LEFT JOIN 
        recent_performance rps ON ts.match_id = rps.match_id AND ts.team = rps.team
)SELECT * from team_features""").fetchdf()
print(team_features)


      match_id   team  team_avg_kd  team_max_kd  team_min_kd  avg_team_kills  \
0     34273855  Team1     1.327910     3.000000     0.428571        8.251667   
1     34423078  Team1     0.423611     0.750000     0.000000        7.805000   
2     34816369  Team1     2.544444     5.333333     0.800000        7.853333   
3     33845318  Team0     1.043162     2.250000     0.266667        9.211667   
4     34365243  Team1     1.069571     1.571429     0.400000        8.840000   
...        ...    ...          ...          ...          ...             ...   
1995  35199715  Team1     4.361111     9.000000     1.000000        8.978333   
1996  35168333  Team0     1.791751     3.200000     0.272727        7.328333   
1997  35067650  Team1     1.298413     2.500000     0.333333        8.375000   
1998  35609992  Team0     1.334560     2.200000     0.545455        7.520000   
1999  35614675  Team1     0.622024     1.166667     0.000000        8.021667   

      avg_team_deaths  avg_team_kd  avg

Combine data into dataset for training

In [10]:
training_set = con.execute(f"""
WITH team_features_ordered AS (
    -- Ensure consistent ordering of teams
    SELECT 
        match_id, 
        team,
        -- Current features
        team_avg_kd,
        team_max_kd,
        team_min_kd,
        avg_hero_kd_percentage,
        avg_hero_pick_percentage,
        avg_hero_match_count,
        total_hero_experience,
        max_hero_kd_percentage,
        min_hero_kd_percentage,
        
        -- Additional features from player_trends
        avg_team_win_rate,
        avg_win_streak,
        avg_loss_streak,
        team_win_streaks_2plus,
        team_win_streaks_3plus,
        team_win_streaks_4plus,
        team_win_streaks_5plus,
        team_loss_streaks_2plus,
        team_loss_streaks_3plus,
        team_loss_streaks_4plus,
        team_loss_streaks_5plus,
        
        -- Additional features from rolling stats
        avg_recent_win_pct_2,
        avg_recent_win_pct_3,
        avg_recent_win_pct_4,
        avg_recent_win_pct_5,
        avg_recent_loss_pct_2,
        avg_recent_loss_pct_3,
        avg_recent_loss_pct_4,
        avg_recent_loss_pct_5,
        
        ROW_NUMBER() OVER (PARTITION BY match_id ORDER BY team) AS team_order
    FROM 
        team_features
)

SELECT
    md.match_id,
    md.start_time,
    md.winning_team,
    
    -- Team 0 features (existing)
    t0.team_avg_kd AS t0_avg_kd,
    t0.team_max_kd AS t0_max_kd,
    t0.team_min_kd AS t0_min_kd,
    t0.avg_hero_kd_percentage AS t0_hero_kd_percentage,
    t0.avg_hero_pick_percentage AS t0_hero_pick_percentage,
    t0.avg_hero_match_count AS t0_hero_match_count,
    t0.total_hero_experience AS t0_hero_experience,
    t0.max_hero_kd_percentage AS t0_max_hero_kd_pct,
    t0.min_hero_kd_percentage AS t0_min_hero_kd_pct,
    
    -- Team 0 additional features (previously missing)
    t0.avg_team_win_rate AS t0_win_rate,
    t0.avg_win_streak AS t0_win_streak_avg,
    t0.avg_loss_streak AS t0_loss_streak_avg,
    t0.team_win_streaks_2plus AS t0_win_streaks_2,
    t0.team_win_streaks_3plus AS t0_win_streaks_3,
    t0.team_win_streaks_4plus AS t0_win_streaks_4,
    t0.team_win_streaks_5plus AS t0_win_streaks_5,
    t0.team_loss_streaks_2plus AS t0_loss_streaks_2,
    t0.team_loss_streaks_3plus AS t0_loss_streaks_3,
    t0.team_loss_streaks_4plus AS t0_loss_streaks_4,
    t0.team_loss_streaks_5plus AS t0_loss_streaks_5,
    
    -- Team 0 rolling stats (previously missing)
    t0.avg_recent_win_pct_2 AS t0_win_pct_2,
    t0.avg_recent_win_pct_3 AS t0_win_pct_3,
    t0.avg_recent_win_pct_4 AS t0_win_pct_4,
    t0.avg_recent_win_pct_5 AS t0_win_pct_5,
    t0.avg_recent_loss_pct_2 AS t0_loss_pct_2,
    t0.avg_recent_loss_pct_3 AS t0_loss_pct_3,
    t0.avg_recent_loss_pct_4 AS t0_loss_pct_4,
    t0.avg_recent_loss_pct_5 AS t0_loss_pct_5,
    
    -- Team 1 features (existing)
    t1.team_avg_kd AS t1_avg_kd,
    t1.team_max_kd AS t1_max_kd,
    t1.team_min_kd AS t1_min_kd,
    t1.avg_hero_kd_percentage AS t1_hero_kd_percentage,
    t1.avg_hero_pick_percentage AS t1_hero_pick_percentage,
    t1.avg_hero_match_count AS t1_hero_match_count,
    t1.total_hero_experience AS t1_hero_experience,
    t1.max_hero_kd_percentage AS t1_max_hero_kd_pct,
    t1.min_hero_kd_percentage AS t1_min_hero_kd_pct,
    
    -- Team 1 additional features (previously missing)
    t1.avg_team_win_rate AS t1_win_rate,
    t1.avg_win_streak AS t1_win_streak_avg,
    t1.avg_loss_streak AS t1_loss_streak_avg,
    t1.team_win_streaks_2plus AS t1_win_streaks_2,
    t1.team_win_streaks_3plus AS t1_win_streaks_3,
    t1.team_win_streaks_4plus AS t1_win_streaks_4,
    t1.team_win_streaks_5plus AS t1_win_streaks_5,
    t1.team_loss_streaks_2plus AS t1_loss_streaks_2,
    t1.team_loss_streaks_3plus AS t1_loss_streaks_3,
    t1.team_loss_streaks_4plus AS t1_loss_streaks_4,
    t1.team_loss_streaks_5plus AS t1_loss_streaks_5,
    
    -- Team 1 rolling stats (previously missing)
    t1.avg_recent_win_pct_2 AS t1_win_pct_2,
    t1.avg_recent_win_pct_3 AS t1_win_pct_3,
    t1.avg_recent_win_pct_4 AS t1_win_pct_4,
    t1.avg_recent_win_pct_5 AS t1_win_pct_5,
    t1.avg_recent_loss_pct_2 AS t1_loss_pct_2,
    t1.avg_recent_loss_pct_3 AS t1_loss_pct_3,
    t1.avg_recent_loss_pct_4 AS t1_loss_pct_4,
    t1.avg_recent_loss_pct_5 AS t1_loss_pct_5,
    
    -- Existing differential features
    (t0.team_avg_kd - t1.team_avg_kd) AS kd_diff,
    (t0.avg_hero_kd_percentage - t1.avg_hero_kd_percentage) AS hero_kd_pct_diff,
    
    -- Additional differential features
    (t0.avg_team_win_rate - t1.avg_team_win_rate) AS win_rate_diff,
    (t0.avg_win_streak - t1.avg_win_streak) AS win_streak_diff,
    (t0.avg_loss_streak - t1.avg_loss_streak) AS loss_streak_diff,
    (t0.avg_recent_win_pct_5 - t1.avg_recent_win_pct_5) AS recent_win_pct_diff,
    
    -- Target variable
    CASE WHEN md.winning_team = t0.team THEN 1 ELSE 0 END AS team0_won
FROM 
    match_data md
JOIN 
    team_features_ordered t0 ON md.match_id = t0.match_id AND t0.team_order = 1
JOIN 
    team_features_ordered t1 ON md.match_id = t1.match_id AND t1.team_order = 2""").fetchdf()
print(f"Robust training set has {len(training_set)} rows")
print(training_set)
output_path = "data/final_training_data.csv"
training_set.to_csv(output_path, index=False)

Robust training set has 1000 rows
     match_id          start_time winning_team  t0_avg_kd  t0_max_kd  \
0    33864219 2025-03-13 08:20:50        Team1   0.697097   1.875000   
1    33875284 2025-03-13 17:42:05        Team0   1.346080   4.250000   
2    33881073 2025-03-13 20:41:06        Team1   0.268452   0.375000   
3    34002696 2025-03-17 10:01:15        Team1   0.773611   1.750000   
4    34050431 2025-03-18 21:23:20        Team0   7.236111  11.000000   
..        ...                 ...          ...        ...        ...   
995  34104844 2025-03-20 18:20:41        Team1   0.469841   2.000000   
996  34499947 2025-04-02 00:03:06        Team0   3.840278   9.000000   
997  34104805 2025-03-20 18:19:27        Team1   0.376263   0.666667   
998  35644069 2025-05-10 21:42:36        Team0   1.094950   3.666667   
999  35463639 2025-05-06 08:04:09        Team1   0.736111   2.000000   

     t0_min_kd  t0_hero_kd_percentage  t0_hero_pick_percentage  \
0     0.142857              29.3333

Load and train model.

In [13]:
import os
import logging
import duckdb
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import joblib

# ── CONFIG ────────────────────────────────────────────────────────────────
USE_STAGED_CSV = False
STAGED_CSV_PATH = "data/final_training_data.csv"
DUCKDB_PATH      = "match_player_raw.duckdb"
DUCKDB_TABLE     = "training_set"

# ── SETUP LOGGING ─────────────────────────────────────────────────────────
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(message)s",
    level=logging.INFO
)

# ── LOAD DATA ────────────────────────────────────────────────────────────
def load_data():
    if USE_STAGED_CSV and os.path.exists(STAGED_CSV_PATH):
        logging.info(f"Loading staged CSV: {STAGED_CSV_PATH}")
        return pd.read_csv(STAGED_CSV_PATH)
    logging.info("Querying DuckDB for training data…")
    con = duckdb.connect(DUCKDB_PATH)
    try:
        df = con.execute(f"SELECT * FROM {DUCKDB_TABLE}").fetchdf()
        df.to_csv(STAGED_CSV_PATH, index=False)
        logging.info(f"Staged CSV written: {STAGED_CSV_PATH}")
    except Exception as e:
        logging.exception("Failed to stage CSV")
        raise
    finally:
        con.close()
    return df

# ── VALIDATE ─────────────────────────────────────────────────────────────
def validate(df):
    required = [c for c in df.columns if c not in ("match_id","start_time","team0_won")]
    missing = [c for c in required if c not in df.columns]
    if missing:
        raise ValueError(f"Missing columns: {missing}")
    if df[required].isnull().any().any():
        logging.warning("Null values detected in feature columns")
    return df

# ── TRAIN/EVAL ───────────────────────────────────────────────────────────
def train_and_evaluate(df):
    X = df.drop(["match_id","start_time","winning_team","team0_won"], axis=1)
    y = df["team0_won"]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    preds = model.predict(X_test)
    probs = model.predict_proba(X_test)[:,1]
    print(classification_report(y_test, preds))

    # return everything you need for later
    return model, X_test, y_test, preds, probs

# ── MAIN ─────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    df    = load_data()
    df    = validate(df)
    model, X_test, y_test, preds, probs = train_and_evaluate(df)
    joblib.dump(model, "models/rf_team_classifier.joblib")
    logging.info("Model saved!")

    # ── LOOK AT A FEW ROWS ───────────────────────────────────────────────
    import pandas as pd

    # 1) match_id / start_time for each test row
    match_info = df.loc[X_test.index, ["match_id", "start_time"]].reset_index(drop=True)

    # 2) build the results table
    results = pd.DataFrame({
        "match_id":             match_info["match_id"],
        "start_time":           match_info["start_time"],
        "predicted_probability": probs,
        "predicted_label":      preds,
        "actual_label":         y_test.values,
    })
    results["success"] = results["predicted_label"] == results["actual_label"]

    # 3) print a random sample
    sample = results.sample(10, random_state=42).reset_index(drop=True)
    print(sample.to_string(index=False))


2025-05-21 16:08:36,841 INFO Querying DuckDB for training data…
2025-05-21 16:08:36,887 INFO Staged CSV written: data/final_training_data.csv
2025-05-21 16:08:37,120 INFO Model saved!


              precision    recall  f1-score   support

           0       0.85      0.89      0.87        91
           1       0.90      0.87      0.89       109

    accuracy                           0.88       200
   macro avg       0.88      0.88      0.88       200
weighted avg       0.88      0.88      0.88       200

 match_id          start_time  predicted_probability  predicted_label  actual_label  success
 34903405 2025-04-16 02:11:42                   0.21                0             0     True
 34415255 2025-03-30 06:22:37                   0.72                1             1     True
 34842599 2025-04-13 20:11:55                   0.11                0             0     True
 35597211 2025-05-09 23:35:04                   0.00                0             0     True
 35472192 2025-05-06 17:16:54                   0.04                0             1    False
 34552996 2025-04-03 22:22:26                   0.86                1             1     True
 34715730 2025-04-09 1

In [12]:
import pandas as pd

# Assuming you still have:
# df           – your full DataFrame with match_id/start_time
# X_test       – your features for the test set
# y_test       – the true labels for the test set
# preds        – model.predict(X_test)
# probs        – model.predict_proba(X_test)[:,1]

# 1) Grab the match identifiers for those test rows
match_info = df.loc[X_test.index, ["match_id", "start_time"]].reset_index(drop=True)

# 2) Build a results table
results = pd.DataFrame({
    "match_id":            match_info["match_id"],
    "start_time":          match_info["start_time"],
    "predicted_probability": probs,
    "predicted_label":     preds,
    "actual_label":        y_test.values,
})
results["success"] = results["predicted_label"] == results["actual_label"]

# 3) Peek at a few examples
sample = results.sample(10, random_state=42).reset_index(drop=True)
print(sample.to_string(index=False))

NameError: name 'X_test' is not defined

confusion matrix

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# 1) Split & train (instead of hiding inside a function)
X = df.drop(["match_id","start_time","team0_won","winning_team"], axis=1)
y = df["team0_won"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# 2) Get preds
preds = model.predict(X_test)

# 3) Confusion matrix
cm = confusion_matrix(y_test, preds)
disp = ConfusionMatrixDisplay(cm, display_labels=[0,1])
disp.plot(cmap="Blues")
plt.title("Confusion Matrix")
plt.show()


attach true and pred to x_test

In [None]:
# attach true & pred to X_test
X_err = X_test.copy()
X_err["y_true"] = y_test.values
X_err["y_pred"] = preds
mis = X_err[X_err["y_true"] != X_err["y_pred"]]
print("Sample misclassifications:\n", mis.head())


Feature Importance

In [None]:
import pandas as pd

# get importances
fi = pd.Series(model.feature_importances_, index=X_train.columns)
fi = fi.sort_values(ascending=False)

# print top 10
print("Top 10 features:\n", fi.head(10))

# simple bar plot
fi.head(10).plot.bar()
plt.ylabel("Importance")
plt.title("Top 10 Feature Importances")
plt.show()


Calibration wrapper

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.calibration import CalibratedClassifierCV, calibration_curve

param_dist = {
    "n_estimators": [50,100,200,300],
    "max_depth":    [None, 10,20,30],
    "min_samples_split": [2,5,10],
    "min_samples_leaf":  [1,2,4],
    "max_features": ["sqrt","log2"]
}

rscv = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    param_distributions=param_dist,
    n_iter=20,
    cv=5,
    scoring="accuracy",
    n_jobs=-1,
    random_state=42
)

rscv.fit(X_train, y_train)
print("Best params:", rscv.best_params_)
print("CV best score:", rscv.best_score_)

# retrain best estimator on full train set if desired
best_rf = rscv.best_estimator_



Investigating anomoly with training % being static

In [None]:
from sklearn.calibration import CalibratedClassifierCV, calibration_curve

# wrap your best RF (or base model) in a calibrator
calibrated = CalibratedClassifierCV(
    base_estimator=best_rf, cv=5, method="sigmoid"
)
calibrated.fit(X_train, y_train)

# get calibrated probabilities
probs = calibrated.predict_proba(X_test)[:,1]

# plot reliability curve
prob_true, prob_pred = calibration_curve(y_test, probs, n_bins=10)
plt.plot(prob_pred, prob_true, marker="o")
plt.plot([0,1],[0,1], linestyle="--")
plt.title("Calibration Curve")
plt.xlabel("Predicted probability")
plt.ylabel("True probability")
plt.show()


In [None]:
query = con.execute(f"SELECT count(*) FROM matches")
print(query.fetchone())