In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from scipy import stats
from scipy.stats import chi2_contingency, ttest_ind, mannwhitneyu, pointbiserialr

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, roc_auc_score, brier_score_loss, log_loss
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, precision_recall_curve
from sklearn.calibration import calibration_curve, CalibratedClassifierCV

from sklearn.ensemble import RandomForestClassifier, VotingClassifier, StackingClassifier
from sklearn.feature_selection import SelectKBest, f_classif, RFE, mutual_info_classif

import xgboost as xgb
import lightgbm as lgb

import requests
import json
from datetime import datetime, timedelta
import calendar
import os
import time
import pickle

pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 100)
pd.set_option('display.float_format', '{:.3f}'.format)

plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')

In [None]:
USERNAME = "EricRosen"  
TOKEN = os.environ.get('LICHESS_TOKEN', '')  

MAX_GAMES = 5000  
GAME_TYPE = "blitz" 

# Analysis settings
MIN_GAMES_FOR_OPENING = 5  
RECENT_FORM_WINDOW = 10 
TEST_SIZE = 0.2  
RANDOM_STATE = 42  
CV_FOLDS = 5  

CHART_TEMPLATE = "plotly_white"  
COLOR_WIN = "#2ecc71"  
COLOR_DRAW = "#f39c12" 
COLOR_LOSS = "#e74c3c"  
COLOR_PRIMARY = "#3498db"  
COLOR_SECONDARY = "#9b59b6"  

print(f"Configuration loaded:")
print(f"  - Username: {USERNAME}")
print(f"  - Game type: {GAME_TYPE}")
print(f"  - Max games: {MAX_GAMES}")
print(f"  - CV Folds: {CV_FOLDS}")

Configuration loaded:
  - Username: EricRosen
  - Game type: blitz
  - Max games: 5000
  - CV Folds: 5


In [None]:
def fetch_user_games(username, token, max_games=1000, game_type='blitz'):
    """
    Fetch user games from Lichess API
    """
    url = f"https://lichess.org/api/games/user/{username}"
    headers = {
        "Authorization": f"Bearer {token}",
        "Accept": "application/x-ndjson"
    }
    params = {
        "max": max_games,
        "rated": "true",
        "pgnInJson": "true",
        "clocks": "true",
        "opening": "true"
    }
    
    if game_type != 'all':
        params["perf"] = game_type
    
    games = []
    
    try:
        print(f"Fetching games from Lichess API...")
        response = requests.get(url, headers=headers, params=params, stream=True, timeout=60)
        response.raise_for_status()
        
        for i, line in enumerate(response.iter_lines()):
            if line:
                game = json.loads(line.decode('utf-8'))
                games.append(game)
                
                if (i + 1) % 500 == 0:
                    print(f"  Fetched {i + 1} games...")
        
        print(f"✅ Successfully fetched {len(games)} games")
        
    except requests.exceptions.RequestException as e:
        print(f"Error fetching games: {e}")
    
    return games

In [4]:
def process_games_to_dataframe(games, username):
    """
    Process raw game data into a structured DataFrame
    """
    games_data = []
    
    for game in games:
        try:
            players = game.get('players', {})
            white_player = players.get('white', {}).get('user', {}).get('name', '').lower()
            black_player = players.get('black', {}).get('user', {}).get('name', '').lower()
            
            is_white = white_player == username.lower()
            player_color = 'white' if is_white else 'black'
            opponent_color = 'black' if is_white else 'white'
            
            winner = game.get('winner')
            if winner == player_color:
                outcome = 'win'
                outcome_numeric = 1
            elif winner is None:
                outcome = 'draw'
                outcome_numeric = 0.5
            else:
                outcome = 'loss'
                outcome_numeric = 0
            
            player_rating = players.get(player_color, {}).get('rating', None)
            opponent_rating = players.get(opponent_color, {}).get('rating', None)
            rating_diff = players.get(player_color, {}).get('ratingDiff', 0)
            
            opening = game.get('opening', {})
            opening_name = opening.get('name', 'Unknown')
            opening_eco = opening.get('eco', 'Unknown')
            
            created_at = game.get('createdAt', 0)
            game_date = pd.to_datetime(created_at / 1000, unit='s') if created_at else None
            
            clocks = game.get('clocks', [])
            time_trouble = 0
            final_clock = None
            avg_time_per_move = None
            min_clock = None
            time_variance = None
            
            if clocks and len(clocks) > 0:
                player_clocks = []
                for i, clock in enumerate(clocks):
                    if (i % 2 == 0 and is_white) or (i % 2 == 1 and not is_white):
                        player_clocks.append(clock / 100)
                
                if player_clocks:
                    final_clock = player_clocks[-1]
                    min_clock = min(player_clocks)
                    time_trouble = 1 if min_clock < 30 else 0
                    
                    if len(player_clocks) > 1:
                        time_spent = [player_clocks[i-1] - player_clocks[i] 
                                      for i in range(1, len(player_clocks))
                                      if player_clocks[i-1] - player_clocks[i] > 0]
                        if time_spent:
                            avg_time_per_move = np.mean(time_spent)
                            time_variance = np.var(time_spent)
            
            moves_str = game.get('moves', '')
            num_moves = len(moves_str.split()) // 2 if moves_str else 0
            
            # Extract time control
            clock_info = game.get('clock', {})
            initial_time = clock_info.get('initial', 0)
            increment = clock_info.get('increment', 0)
            
            game_data = {
                'game_id': game.get('id'),
                'date': game_date,
                'speed': game.get('speed'),
                'status': game.get('status'),
                'player_color': player_color,
                'player_rating': player_rating,
                'opponent_rating': opponent_rating,
                'rating_diff_received': rating_diff,
                'rating_gap': opponent_rating - player_rating if (opponent_rating and player_rating) else None,
                'outcome': outcome,
                'outcome_numeric': outcome_numeric,
                'outcome_binary': 1 if outcome == 'win' else 0,
                'opening_name': opening_name,
                'opening_eco': opening_eco,
                'opening_family': opening_eco[:1] if opening_eco != 'Unknown' else 'Unknown',
                'opening_subfamily': opening_eco[:2] if opening_eco != 'Unknown' else 'Unknown',
                'num_moves': num_moves,
                'time_trouble': time_trouble,
                'final_clock': final_clock,
                'min_clock': min_clock,
                'avg_time_per_move': avg_time_per_move,
                'time_variance': time_variance,
                'initial_time': initial_time,
                'increment': increment,
                'hour_of_day': game_date.hour if game_date else None,
                'day_of_week': game_date.dayofweek if game_date else None,
                'month': game_date.month if game_date else None
            }
            
            games_data.append(game_data)
            
        except Exception as e:
            continue
    
    df = pd.DataFrame(games_data)
    df = df.sort_values('date').reset_index(drop=True)
    
    print(f"Processed {len(df)} games into DataFrame")
    
    return df

In [5]:
raw_games = fetch_user_games(USERNAME, TOKEN, MAX_GAMES, GAME_TYPE)

Fetching games from Lichess API...
  Fetched 500 games...
  Fetched 1000 games...
  Fetched 1500 games...
  Fetched 2000 games...
  Fetched 2500 games...
  Fetched 3000 games...
  Fetched 3500 games...
  Fetched 4000 games...
  Fetched 4500 games...
  Fetched 5000 games...
✅ Successfully fetched 5010 games


In [6]:
df = process_games_to_dataframe(raw_games, USERNAME)
df.head()

Processed 5010 games into DataFrame


Unnamed: 0,game_id,date,speed,status,player_color,player_rating,opponent_rating,rating_diff_received,rating_gap,outcome,outcome_numeric,outcome_binary,opening_name,opening_eco,opening_family,opening_subfamily,num_moves,time_trouble,final_clock,min_clock,avg_time_per_move,time_variance,initial_time,increment,hour_of_day,day_of_week,month
0,IrZxa0OO,2023-11-20 05:21:41.770999908,ultraBullet,mate,white,2307,2018,2,-289,win,1.0,1,Englund Gambit,A40,A,A4,19,1,9.43,9.43,0.505,0.065,15,0,5,0,11
1,WiKPy6ra,2023-11-20 05:22:05.135999918,ultraBullet,outoftime,black,2309,2013,2,-296,win,1.0,1,Queen's Pawn Game: Torre Attack,D03,D,D0,45,1,2.28,2.28,0.374,0.1,15,0,5,0,11
2,vegkAErg,2023-11-20 05:22:47.364000082,ultraBullet,mate,white,2311,2008,2,-303,win,1.0,1,Pirc Defense,B00,B,B0,26,1,5.6,5.6,0.552,0.083,15,0,5,0,11
3,MEuxbTcQ,2023-11-20 05:23:19.673000097,ultraBullet,outoftime,black,2313,2003,2,-310,win,1.0,1,"Alekhine Defense: Scandinavian Variation, Gesc...",B02,B,B0,37,1,3.43,3.43,0.385,0.057,15,0,5,0,11
4,FKf1uUo3,2023-11-20 05:24:00.244999886,ultraBullet,outoftime,white,2315,1998,2,-317,win,1.0,1,Pirc Defense,B00,B,B0,35,1,2.39,2.39,0.485,0.086,15,0,5,0,11


In [7]:
print("Missing Values Summary:")
print("=" * 50)
missing = df.isnull().sum()
missing_pct = (df.isnull().sum() / len(df)) * 100
missing_df = pd.DataFrame({
    'Missing Count': missing,
    'Missing %': missing_pct
}).sort_values('Missing Count', ascending=False)

missing_df[missing_df['Missing Count'] > 0]

Missing Values Summary:


Unnamed: 0,Missing Count,Missing %
time_variance,26,0.519
avg_time_per_move,26,0.519
final_clock,9,0.18
min_clock,9,0.18


In [8]:
print("Data Quality Checks:")
print("=" * 50)

missing_ratings = df[(df['player_rating'].isnull()) | (df['opponent_rating'].isnull())]
print(f"Games with missing ratings: {len(missing_ratings)} ({len(missing_ratings)/len(df)*100:.1f}%)")

short_games = df[df['num_moves'] < 5]
print(f"Very short games (<5 moves): {len(short_games)} ({len(short_games)/len(df)*100:.1f}%)")

timeout_games = df[df['status'].isin(['timeout', 'outoftime'])]
print(f"Timeout games: {len(timeout_games)} ({len(timeout_games)/len(df)*100:.1f}%)")

print(f"\nRating Gap Statistics:")
print(df['rating_gap'].describe())

Data Quality Checks:
Games with missing ratings: 0 (0.0%)
Very short games (<5 moves): 30 (0.6%)
Timeout games: 1841 (36.7%)

Rating Gap Statistics:
count    5010.000
mean     -202.852
std       287.150
min     -1776.000
25%      -315.750
50%      -132.000
75%       -13.000
max       392.000
Name: rating_gap, dtype: float64


In [9]:
df_clean = df.copy()
df_clean = df_clean.dropna(subset=['player_rating', 'opponent_rating', 'date'])

print(f"Original dataset: {len(df)} games")
print(f"Cleaned dataset: {len(df_clean)} games")
print(f"Removed: {len(df) - len(df_clean)} games ({(len(df) - len(df_clean))/len(df)*100:.1f}%)")

Original dataset: 5010 games
Cleaned dataset: 5010 games
Removed: 0 games (0.0%)


In [10]:
df = df_clean.copy()

print("\n" + "=" * 60)
print("FINAL DATASET SUMMARY")
print("=" * 60)
print(f"Total games: {len(df):,}")
print(f"Date range: {df['date'].min().strftime('%Y-%m-%d')} to {df['date'].max().strftime('%Y-%m-%d')}")
print(f"Unique openings: {df['opening_name'].nunique()}")
print(f"\nGame speeds:")
print(df['speed'].value_counts())


FINAL DATASET SUMMARY
Total games: 5,010
Date range: 2023-11-20 to 2025-12-03
Unique openings: 620

Game speeds:
speed
blitz          2054
ultraBullet    1354
bullet         1199
rapid           334
classical        69
Name: count, dtype: int64


In [11]:
total_games = len(df)
wins = len(df[df['outcome'] == 'win'])
draws = len(df[df['outcome'] == 'draw'])
losses = len(df[df['outcome'] == 'loss'])

win_rate = wins / total_games * 100
draw_rate = draws / total_games * 100
loss_rate = losses / total_games * 100

print("overall performance statistics")
print(f"\nTotal Games: {total_games:,}")
print(f"\nResults:")
print(f"  Wins:   {wins:,} ({win_rate:.1f}%)")
print(f"  Draws:  {draws:,} ({draw_rate:.1f}%)")
print(f"  Losses: {losses:,} ({loss_rate:.1f}%)")
print(f"\nPerformance Score: {(wins + 0.5*draws) / total_games * 100:.1f}%")

overall performance statistics

Total Games: 5,010

Results:
  Wins:   3,250 (64.9%)
  Draws:  271 (5.4%)
  Losses: 1,489 (29.7%)

Performance Score: 67.6%


In [12]:
# Overall results pie chart
fig_overall = px.pie(
    values=[wins, draws, losses],
    names=['Wins', 'Draws', 'Losses'],
    title=f'Overall Game Results (n={total_games:,})',
    color_discrete_sequence=[COLOR_WIN, COLOR_DRAW, COLOR_LOSS],
    hole=0.4
)
fig_overall.update_traces(textposition='outside', textinfo='percent+label+value')
fig_overall.update_layout(template=CHART_TEMPLATE, font=dict(size=14))
fig_overall.show()

In [13]:
# Game termination status breakdown
status_counts = df['status'].value_counts()

fig_status = px.bar(
    x=status_counts.index,
    y=status_counts.values,
    title='Game Termination Status',
    labels={'x': 'Status', 'y': 'Number of Games'},
    color=status_counts.values,
    color_continuous_scale='Viridis'
)
fig_status.update_layout(template=CHART_TEMPLATE, showlegend=False, xaxis_tickangle=-45)
fig_status.show()

In [14]:
# Game length distribution
fig_moves = px.histogram(
    df, x='num_moves', nbins=50,
    title='Distribution of Game Length (Number of Moves)',
    labels={'num_moves': 'Number of Moves', 'count': 'Frequency'},
    color_discrete_sequence=[COLOR_PRIMARY]
)
fig_moves.add_vline(x=df['num_moves'].median(), line_dash="dash", line_color="red",
                    annotation_text=f"Median: {df['num_moves'].median():.0f}")
fig_moves.update_layout(template=CHART_TEMPLATE)
fig_moves.show()

print(f"Game length statistics:")
print(df['num_moves'].describe())

Game length statistics:
count   5010.000
mean      43.398
std       19.063
min        0.000
25%       30.000
50%       42.000
75%       56.000
max      191.000
Name: num_moves, dtype: float64


In [15]:
# Player rating over time
df_sorted = df.sort_values('date')

fig_rating = go.Figure()
fig_rating.add_trace(go.Scatter(
    x=df_sorted['date'], y=df_sorted['player_rating'],
    mode='lines', name='Rating',
    line=dict(color=COLOR_PRIMARY, width=1), opacity=0.7
))

rolling_avg = df_sorted['player_rating'].rolling(window=20, min_periods=1).mean()
fig_rating.add_trace(go.Scatter(
    x=df_sorted['date'], y=rolling_avg,
    mode='lines', name='20-Game Moving Average',
    line=dict(color='red', width=3)
))

fig_rating.update_layout(
    title='Rating Progression Over Time',
    xaxis_title='Date', yaxis_title='Rating',
    template=CHART_TEMPLATE, hovermode='x unified'
)
fig_rating.show()

print(f"\nRating Statistics:")
print(f"  Starting: {df_sorted['player_rating'].iloc[0]:.0f}")
print(f"  Current:  {df_sorted['player_rating'].iloc[-1]:.0f}")
print(f"  Change:   {df_sorted['player_rating'].iloc[-1] - df_sorted['player_rating'].iloc[0]:+.0f}")
print(f"  Peak:     {df['player_rating'].max():.0f}")
print(f"  Low:      {df['player_rating'].min():.0f}")


Rating Statistics:
  Starting: 2307
  Current:  2627
  Change:   +320
  Peak:     2961
  Low:      2001


In [16]:
# Rating gap distribution
fig_gap = px.histogram(
    df, x='rating_gap', nbins=50,
    title='Rating Gap Distribution (Opponent Rating - Your Rating)',
    labels={'rating_gap': 'Rating Gap', 'count': 'Number of Games'},
    color_discrete_sequence=[COLOR_PRIMARY]
)
fig_gap.add_vline(x=0, line_dash="dash", line_color="red", annotation_text="Equal rating")
fig_gap.add_vline(x=df['rating_gap'].mean(), line_dash="dot", line_color="green", 
                  annotation_text=f"Mean: {df['rating_gap'].mean():.0f}")
fig_gap.update_layout(template=CHART_TEMPLATE)
fig_gap.show()

In [17]:
# ELO expected score function
def expected_score_elo(rating_diff):
    """Calculate expected score using ELO formula"""
    return 1 / (1 + 10 ** (rating_diff / 400))

# Create rating gap buckets
df['rating_gap_bucket'] = pd.cut(
    df['rating_gap'],
    bins=[-500, -200, -100, -50, 0, 50, 100, 200, 500],
    labels=['<-200', '-200 to -100', '-100 to -50', '-50 to 0', 
            '0 to 50', '50 to 100', '100 to 200', '>200']
)

gap_performance = df.groupby('rating_gap_bucket').agg({
    'outcome_binary': ['mean', 'count'],
    'outcome_numeric': 'mean'
}).round(3)
gap_performance.columns = ['win_rate', 'games', 'score']
gap_performance['win_rate_pct'] = gap_performance['win_rate'] * 100

print("Win Rate by Rating Gap:")
print("=" * 60)
print(gap_performance)

Win Rate by Rating Gap:
                   win_rate  games  score  win_rate_pct
rating_gap_bucket                                      
<-200                 0.784   1202  0.804        78.400
-200 to -100          0.655    882  0.680        65.500
-100 to -50           0.610    526  0.644        61.000
-50 to 0              0.475    592  0.514        47.500
0 to 50               0.499    417  0.541        49.900
50 to 100             0.409    298  0.445        40.900
100 to 200            0.289    232  0.341        28.900
>200                  0.252    127  0.291        25.200


In [18]:
# Compare actual vs ELO expected
df['elo_expected'] = df['rating_gap'].apply(expected_score_elo)
df['rating_gap_rounded'] = (df['rating_gap'] / 25).round() * 25

comparison = df.groupby('rating_gap_rounded').agg({
    'outcome_numeric': 'mean',
    'elo_expected': 'mean',
    'game_id': 'count'
}).reset_index()
comparison.columns = ['rating_gap', 'actual_score', 'expected_score', 'games']
comparison = comparison[comparison['games'] >= 10]

fig_compare = go.Figure()
fig_compare.add_trace(go.Scatter(
    x=comparison['rating_gap'], y=comparison['actual_score'] * 100,
    mode='markers+lines', name='Your Actual Score',
    line=dict(color=COLOR_PRIMARY, width=3), marker=dict(size=8)
))
fig_compare.add_trace(go.Scatter(
    x=comparison['rating_gap'], y=comparison['expected_score'] * 100,
    mode='lines', name='ELO Expected Score',
    line=dict(color='red', width=2, dash='dash')
))
fig_compare.add_hline(y=50, line_dash="dot", line_color="gray")
fig_compare.update_layout(
    title='Actual Performance vs ELO Expected Score',
    xaxis_title='Rating Gap (Opponent - You)',
    yaxis_title='Score (%)',
    template=CHART_TEMPLATE, hovermode='x unified'
)
fig_compare.show()

df['elo_deviation'] = df['outcome_numeric'] - df['elo_expected']
mean_deviation = df['elo_deviation'].mean()
print(f"\nELO Deviation: {mean_deviation:+.3f}")
print(f"You perform {'BETTER' if mean_deviation > 0 else 'WORSE'} than ELO predicts by {abs(mean_deviation)*100:.1f}%")


ELO Deviation: -0.002
You perform WORSE than ELO predicts by 0.2%


In [19]:
# Opening performance analysis
opening_stats = df.groupby('opening_name').agg({
    'outcome_binary': ['mean', 'sum'],
    'outcome_numeric': 'mean',
    'game_id': 'count',
    'rating_gap': 'mean'
}).round(3)
opening_stats.columns = ['win_rate', 'wins', 'score', 'games', 'avg_rating_gap']
opening_stats = opening_stats.reset_index()
opening_stats['win_rate_pct'] = opening_stats['win_rate'] * 100

opening_stats_filtered = opening_stats[opening_stats['games'] >= MIN_GAMES_FOR_OPENING].copy()
opening_stats_filtered = opening_stats_filtered.sort_values('games', ascending=False)

print(f"Openings played: {len(opening_stats)}")
print(f"Openings with {MIN_GAMES_FOR_OPENING}+ games: {len(opening_stats_filtered)}")

Openings played: 620
Openings with 5+ games: 184


In [20]:
# Top openings visualization
top_openings = opening_stats_filtered.nlargest(15, 'games')

fig_openings = px.bar(
    top_openings, x='games', y='opening_name', orientation='h',
    title='Top 15 Most Played Openings',
    labels={'games': 'Number of Games', 'opening_name': 'Opening'},
    color='win_rate_pct', color_continuous_scale='RdYlGn',
    text='win_rate_pct'
)
fig_openings.update_traces(texttemplate='%{text:.1f}%', textposition='outside')
fig_openings.update_layout(
    template=CHART_TEMPLATE,
    yaxis={'categoryorder': 'total ascending'},
    coloraxis_colorbar_title='Win Rate %'
)
fig_openings.show()

In [21]:
# Performance by color
color_stats = df.groupby('player_color').agg({
    'outcome_binary': ['mean', 'sum'],
    'outcome_numeric': 'mean',
    'game_id': 'count',
    'rating_gap': 'mean'
}).round(3)
color_stats.columns = ['win_rate', 'wins', 'score', 'games', 'avg_rating_gap']
color_stats = color_stats.reset_index()
color_stats['win_rate_pct'] = color_stats['win_rate'] * 100

print("Performance by Color:")
print("=" * 60)
print(color_stats.to_string(index=False))

white_wr = color_stats[color_stats['player_color'] == 'white']['win_rate_pct'].values[0]
black_wr = color_stats[color_stats['player_color'] == 'black']['win_rate_pct'].values[0]
print(f"\nYour white advantage: {white_wr - black_wr:+.1f}%")

Performance by Color:
player_color  win_rate  wins  score  games  avg_rating_gap  win_rate_pct
       black     0.633  1585  0.662   2503        -201.957        63.300
       white     0.664  1665  0.690   2507        -203.745        66.400

Your white advantage: +3.1%


In [22]:
# Time trouble analysis
time_trouble_stats = df.groupby('time_trouble').agg({
    'outcome_binary': 'mean',
    'outcome_numeric': 'mean',
    'game_id': 'count'
}).round(3)
time_trouble_stats.columns = ['win_rate', 'score', 'games']
time_trouble_stats = time_trouble_stats.reset_index()
time_trouble_stats['time_trouble'] = time_trouble_stats['time_trouble'].map({0: 'No Time Trouble', 1: 'Time Trouble'})
time_trouble_stats['win_rate_pct'] = time_trouble_stats['win_rate'] * 100

print("Time Trouble Impact:")
print("=" * 60)
print(time_trouble_stats.to_string(index=False))

time_trouble_rate = df['time_trouble'].mean() * 100
print(f"\nTime trouble rate: {time_trouble_rate:.1f}%")

Time Trouble Impact:
   time_trouble  win_rate  score  games  win_rate_pct
No Time Trouble     0.908  0.917   1278        90.800
   Time Trouble     0.560  0.593   3732        56.000

Time trouble rate: 74.5%


In [23]:
# Time trouble visualization
if len(time_trouble_stats) > 1:
    fig_time = px.bar(
        time_trouble_stats, x='time_trouble', y='win_rate_pct',
        color='time_trouble',
        title='Win Rate: Time Trouble vs Normal',
        labels={'time_trouble': '', 'win_rate_pct': 'Win Rate (%)'},
        color_discrete_map={'No Time Trouble': COLOR_WIN, 'Time Trouble': COLOR_LOSS},
        text='games'
    )
    fig_time.update_traces(texttemplate='n=%{text}', textposition='outside')
    fig_time.update_layout(template=CHART_TEMPLATE, showlegend=False)
    fig_time.show()

In [24]:
# Calculate rolling form
df_sorted = df.sort_values('date').reset_index(drop=True)

for window in [5, 10, 20]:
    df_sorted[f'form_{window}'] = df_sorted['outcome_binary'].rolling(window=window, min_periods=1).mean().shift(1)
    df_sorted[f'score_form_{window}'] = df_sorted['outcome_numeric'].rolling(window=window, min_periods=1).mean().shift(1)

# Streak calculation
df_sorted['streak'] = 0
current_streak = 0
last_outcome = None

for idx, row in df_sorted.iterrows():
    if last_outcome is None:
        current_streak = 1 if row['outcome_binary'] == 1 else -1
    elif row['outcome_binary'] == 1:
        current_streak = current_streak + 1 if current_streak > 0 else 1
    else:
        current_streak = current_streak - 1 if current_streak < 0 else -1
    
    df_sorted.at[idx, 'streak'] = current_streak
    last_outcome = row['outcome_binary']

df_sorted['prev_streak'] = df_sorted['streak'].shift(1)

print("Form features calculated!")
df_sorted[['date', 'outcome', 'form_5', 'form_10', 'streak', 'prev_streak']].tail(10)

Form features calculated!


Unnamed: 0,date,outcome,form_5,form_10,streak,prev_streak
5000,2025-12-03 08:15:18.173000097,win,0.6,0.4,4,3.0
5001,2025-12-03 08:18:31.339999914,win,0.8,0.5,5,4.0
5002,2025-12-03 08:21:01.112999916,win,1.0,0.6,6,5.0
5003,2025-12-03 08:28:39.062000036,win,1.0,0.7,7,6.0
5004,2025-12-03 08:35:55.558000088,win,1.0,0.7,8,7.0
5005,2025-12-03 08:38:12.983999968,win,1.0,0.8,9,8.0
5006,2025-12-03 08:41:30.121999979,win,1.0,0.9,10,9.0
5007,2025-12-03 08:46:32.815000057,win,1.0,1.0,11,10.0
5008,2025-12-03 08:51:04.030999899,loss,1.0,1.0,-1,11.0
5009,2025-12-03 08:55:25.174000025,loss,0.8,0.9,-2,-1.0


In [25]:
# Form visualization
fig_form = go.Figure()
fig_form.add_trace(go.Scatter(
    x=df_sorted['date'], y=df_sorted['form_10'] * 100,
    mode='lines', name='10-Game Form',
    line=dict(color=COLOR_PRIMARY, width=2)
))
fig_form.add_trace(go.Scatter(
    x=df_sorted['date'], y=df_sorted['form_20'] * 100,
    mode='lines', name='20-Game Form',
    line=dict(color='orange', width=2)
))
fig_form.add_hline(y=50, line_dash="dash", line_color="gray")
fig_form.update_layout(
    title='Form (Rolling Win Rate) Over Time',
    xaxis_title='Date', yaxis_title='Win Rate (%)',
    template=CHART_TEMPLATE, hovermode='x unified'
)
fig_form.show()

In [26]:
# Hour of day analysis
hourly_stats = df.groupby('hour_of_day').agg({
    'outcome_binary': ['mean', 'count']
}).round(3)
hourly_stats.columns = ['win_rate', 'games']
hourly_stats = hourly_stats.reset_index()
hourly_stats['win_rate_pct'] = hourly_stats['win_rate'] * 100

fig_hourly = px.bar(
    hourly_stats, x='hour_of_day', y='games',
    color='win_rate_pct',
    title='Games Played and Win Rate by Hour of Day',
    labels={'hour_of_day': 'Hour (24h)', 'games': 'Number of Games'},
    color_continuous_scale='RdYlGn'
)
fig_hourly.update_layout(template=CHART_TEMPLATE)
fig_hourly.show()

In [27]:
# Day/Hour heatmap
day_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
heatmap_data = df.groupby(['day_of_week', 'hour_of_day'])['outcome_binary'].mean().unstack(fill_value=0) * 100

fig_heatmap = px.imshow(
    heatmap_data,
    labels=dict(x="Hour of Day", y="Day of Week", color="Win Rate %"),
    x=heatmap_data.columns, y=day_names,
    color_continuous_scale='RdYlGn',
    title='Win Rate Heatmap: Day of Week vs Hour'
)
fig_heatmap.update_layout(template=CHART_TEMPLATE)
fig_heatmap.show()

In [28]:
# Create feature set
df_features = df_sorted.copy()

# Feature 1: Rating-based
df_features['rating_gap'] = df_features['opponent_rating'] - df_features['player_rating']
df_features['rating_gap_abs'] = df_features['rating_gap'].abs()
df_features['is_underdog'] = (df_features['rating_gap'] > 0).astype(int)
df_features['rating_gap_squared'] = df_features['rating_gap'] ** 2

# Feature 2: Color encoding
df_features['is_white'] = (df_features['player_color'] == 'white').astype(int)

# Feature 3: Opening performance
opening_win_rates = df_features.groupby('opening_name')['outcome_binary'].transform('mean')
df_features['opening_personal_wr'] = opening_win_rates

# Feature 4: Color-specific performance
color_win_rates = df_features.groupby('player_color')['outcome_binary'].transform('mean')
df_features['color_personal_wr'] = color_win_rates

# Feature 5: ELO expected score
df_features['elo_expected'] = df_features['rating_gap'].apply(expected_score_elo)

# Feature 6: Time period
df_features['time_period'] = pd.cut(
    df_features['hour_of_day'],
    bins=[0, 6, 12, 18, 24],
    labels=['Night', 'Morning', 'Afternoon', 'Evening'],
    include_lowest=True
)

print("Basic features created!")
print(f"Total features: {len(df_features.columns)}")

Basic features created!
Total features: 46


In [30]:
df_features.head()

Unnamed: 0,game_id,date,speed,status,player_color,player_rating,opponent_rating,rating_diff_received,rating_gap,outcome,outcome_numeric,outcome_binary,opening_name,opening_eco,opening_family,opening_subfamily,num_moves,time_trouble,final_clock,min_clock,avg_time_per_move,time_variance,initial_time,increment,hour_of_day,day_of_week,month,rating_gap_bucket,elo_expected,rating_gap_rounded,elo_deviation,form_5,score_form_5,form_10,score_form_10,form_20,score_form_20,streak,prev_streak,rating_gap_abs,is_underdog,rating_gap_squared,is_white,opening_personal_wr,color_personal_wr,time_period
0,IrZxa0OO,2023-11-20 05:21:41.770999908,ultraBullet,mate,white,2307,2018,2,-289,win,1.0,1,Englund Gambit,A40,A,A4,19,1,9.43,9.43,0.505,0.065,15,0,5,0,11,<-200,0.841,-300.0,0.159,,,,,,,1,,289,0,83521,1,0.647,0.664,Night
1,WiKPy6ra,2023-11-20 05:22:05.135999918,ultraBullet,outoftime,black,2309,2013,2,-296,win,1.0,1,Queen's Pawn Game: Torre Attack,D03,D,D0,45,1,2.28,2.28,0.374,0.1,15,0,5,0,11,<-200,0.846,-300.0,0.154,1.0,1.0,1.0,1.0,1.0,1.0,2,1.0,296,0,87616,0,0.5,0.633,Night
2,vegkAErg,2023-11-20 05:22:47.364000082,ultraBullet,mate,white,2311,2008,2,-303,win,1.0,1,Pirc Defense,B00,B,B0,26,1,5.6,5.6,0.552,0.083,15,0,5,0,11,<-200,0.851,-300.0,0.149,1.0,1.0,1.0,1.0,1.0,1.0,3,2.0,303,0,91809,1,0.618,0.664,Night
3,MEuxbTcQ,2023-11-20 05:23:19.673000097,ultraBullet,outoftime,black,2313,2003,2,-310,win,1.0,1,"Alekhine Defense: Scandinavian Variation, Gesc...",B02,B,B0,37,1,3.43,3.43,0.385,0.057,15,0,5,0,11,<-200,0.856,-300.0,0.144,1.0,1.0,1.0,1.0,1.0,1.0,4,3.0,310,0,96100,0,0.6,0.633,Night
4,FKf1uUo3,2023-11-20 05:24:00.244999886,ultraBullet,outoftime,white,2315,1998,2,-317,win,1.0,1,Pirc Defense,B00,B,B0,35,1,2.39,2.39,0.485,0.086,15,0,5,0,11,<-200,0.861,-325.0,0.139,1.0,1.0,1.0,1.0,1.0,1.0,5,4.0,317,0,100489,1,0.618,0.664,Night


In [31]:
# Opening-specific features

df_features['opening_games_count'] = df_features.groupby('opening_name').cumcount()

def calc_opening_rolling_wr(group):
    return group.rolling(window=10, min_periods=1).mean().shift(1)

df_features['opening_rolling_wr'] = df_features.groupby('opening_name')['outcome_binary'].transform(calc_opening_rolling_wr)

df_features['opening_color_key'] = df_features['opening_name'] + '_' + df_features['player_color']
opening_color_wr = df_features.groupby('opening_color_key')['outcome_binary'].transform('mean')
df_features['opening_color_wr'] = opening_color_wr

eco_family_wr = df_features.groupby('opening_family')['outcome_binary'].transform('mean')
df_features['eco_family_wr'] = eco_family_wr

print("Opening-specific features created!")

Opening-specific features created!


In [32]:
# Player style features

df_features['avg_game_length'] = df_features['num_moves'].expanding().mean().shift(1)

df_features['is_short_game'] = (df_features['num_moves'] < df_features['num_moves'].median()).astype(int)
short_game_wr = df_features.groupby('is_short_game')['outcome_binary'].transform('mean')
df_features['game_length_preference_wr'] = short_game_wr

df_features['decisive_game'] = (df_features['outcome'] != 'draw').astype(int)
df_features['aggression_index'] = df_features['outcome_binary'].expanding().sum() / (df_features['decisive_game'].expanding().sum() + 1)
df_features['aggression_index'] = df_features['aggression_index'].shift(1)

df_features['is_draw'] = (df_features['outcome'] == 'draw').astype(int)
df_features['draw_tendency'] = df_features['is_draw'].expanding().mean().shift(1)

print("Player style features created!")

Player style features created!


In [33]:
# Time control normalization

df_features['initial_time_minutes'] = df_features['initial_time'] / 60
df_features['time_control_total'] = df_features['initial_time'] + 40 * df_features['increment']

df_features['time_efficiency'] = df_features['final_clock'] / (df_features['initial_time'] + 1)
df_features['time_efficiency'] = df_features['time_efficiency'].clip(0, 1)

df_features['time_control_category'] = pd.cut(
    df_features['time_control_total'],
    bins=[0, 180, 480, 900, 3600],
    labels=['bullet', 'blitz', 'rapid', 'classical']
)

tc_wr = df_features.groupby('time_control_category')['outcome_binary'].transform('mean')
df_features['time_control_wr'] = tc_wr

df_features['avg_time_normalized'] = df_features['avg_time_per_move'] / (df_features['initial_time_minutes'] + 1)

print("Time control features created!")

Time control features created!


In [34]:
# Opponent history features

df_features['opponent_rating_bucket'] = pd.cut(
    df_features['opponent_rating'],
    bins=[0, 1200, 1400, 1600, 1800, 2000, 2200, 3500],
    labels=['<1200', '1200-1400', '1400-1600', '1600-1800', '1800-2000', '2000-2200', '2200+']
)

opponent_bucket_wr = df_features.groupby('opponent_rating_bucket')['outcome_binary'].transform('mean')
df_features['opponent_bucket_wr'] = opponent_bucket_wr

df_features['vs_higher_rated'] = (df_features['rating_gap'] > 50).astype(int)
df_features['vs_higher_rated_wr'] = df_features.groupby('vs_higher_rated')['outcome_binary'].transform('mean')

underdog_games = df_features[df_features['is_underdog'] == 1]
if len(underdog_games) > 0:
    underdog_wr = underdog_games['outcome_binary'].mean()
    df_features['upset_potential'] = underdog_wr
else:
    df_features['upset_potential'] = 0.5

print("Opponent history features created!")

Opponent history features created!


In [35]:
# Momentum and consistency features

df_features['result_volatility'] = df_features['outcome_binary'].rolling(window=20, min_periods=5).std().shift(1)
df_features['consistency_score'] = 1 - df_features['result_volatility'].fillna(0.5)

df_features['is_hot'] = (df_features['form_10'] > 0.6).astype(int) if 'form_10' in df_features.columns else 0
df_features['is_cold'] = (df_features['form_10'] < 0.4).astype(int) if 'form_10' in df_features.columns else 0

df_features['date_only'] = df_features['date'].dt.date
df_features['games_today'] = df_features.groupby('date_only').cumcount()

df_features['session_games'] = 0
for idx in range(len(df_features)):
    current_time = df_features.loc[idx, 'date']
    two_hours_ago = current_time - timedelta(hours=2)
    session_mask = (df_features['date'] >= two_hours_ago) & (df_features['date'] < current_time)
    df_features.loc[idx, 'session_games'] = session_mask.sum()

print("Momentum and consistency features created!")

Momentum and consistency features created!


In [36]:
all_feature_cols = [
    # Rating features
    'rating_gap', 'rating_gap_abs', 'is_underdog', 'rating_gap_squared', 'elo_expected',
    # Color features
    'is_white', 'color_personal_wr',
    # Form features
    'form_5', 'form_10', 'form_20', 'prev_streak',
    # Opening features
    'opening_personal_wr', 'opening_games_count', 'opening_rolling_wr', 
    'opening_color_wr', 'eco_family_wr',
    # Time features
    'time_trouble', 'time_efficiency', 'time_control_wr', 'avg_time_normalized',
    # Style features
    'avg_game_length', 'aggression_index', 'draw_tendency',
    # Opponent features
    'opponent_bucket_wr', 'vs_higher_rated_wr', 'upset_potential',
    # Momentum features
    'result_volatility', 'consistency_score', 'is_hot', 'is_cold',
    'games_today', 'session_games',
    # Temporal
    'hour_of_day', 'day_of_week'
]

available_features = [f for f in all_feature_cols if f in df_features.columns]

print(f"\nTotal available features: {len(available_features)}")
print("\nFeature categories:")
print("  Rating: rating_gap, rating_gap_abs, is_underdog, elo_expected")
print("  Color: is_white, color_personal_wr")
print("  Form: form_5, form_10, form_20, prev_streak")
print("  Opening: opening_personal_wr, opening_rolling_wr, eco_family_wr")
print("  Time: time_trouble, time_efficiency, time_control_wr")
print("  Style: avg_game_length, aggression_index, draw_tendency")
print("  Opponent: opponent_bucket_wr, vs_higher_rated_wr")
print("  Momentum: result_volatility, consistency_score, is_hot, is_cold")


Total available features: 34

Feature categories:
  Rating: rating_gap, rating_gap_abs, is_underdog, elo_expected
  Color: is_white, color_personal_wr
  Form: form_5, form_10, form_20, prev_streak
  Opening: opening_personal_wr, opening_rolling_wr, eco_family_wr
  Time: time_trouble, time_efficiency, time_control_wr
  Style: avg_game_length, aggression_index, draw_tendency
  Opponent: opponent_bucket_wr, vs_higher_rated_wr
  Momentum: result_volatility, consistency_score, is_hot, is_cold


In [37]:
# Correlation matrix
correlation_cols = available_features + ['outcome_binary']
df_corr = df_features[correlation_cols].dropna()

correlation_matrix = df_corr.corr()

# Correlation heatmap
fig_corr = px.imshow(
    correlation_matrix,
    labels=dict(color="Correlation"),
    color_continuous_scale='RdBu_r',
    title='Feature Correlation Matrix',
    text_auto='.2f',
    aspect='auto'
)
fig_corr.update_layout(template=CHART_TEMPLATE, height=800, width=1000)
fig_corr.show()

In [38]:
# Correlation with outcome
outcome_correlations = correlation_matrix['outcome_binary'].drop('outcome_binary').sort_values()

fig_outcome_corr = px.bar(
    x=outcome_correlations.values,
    y=outcome_correlations.index,
    orientation='h',
    title='Feature Correlation with Win',
    labels={'x': 'Correlation', 'y': 'Feature'},
    color=outcome_correlations.values,
    color_continuous_scale='RdBu_r'
)
fig_outcome_corr.add_vline(x=0, line_dash="dash", line_color="gray")
fig_outcome_corr.update_layout(template=CHART_TEMPLATE, height=600)
fig_outcome_corr.show()

print("\nTop 10 Most Correlated Features:")
print(outcome_correlations.sort_values(key=abs, ascending=False).head(10).round(4))


Top 10 Most Correlated Features:
time_efficiency        0.404
elo_expected           0.399
rating_gap            -0.373
opening_color_wr       0.341
time_trouble          -0.322
rating_gap_abs         0.312
opponent_bucket_wr     0.303
opening_personal_wr    0.293
is_underdog           -0.274
vs_higher_rated_wr     0.254
Name: outcome_binary, dtype: float64


In [39]:
# Prepare data for feature selection
feature_selection_cols = [f for f in available_features if f in df_features.columns]
df_fs = df_features[feature_selection_cols + ['outcome_binary']].dropna()

X = df_fs[feature_selection_cols]
y = df_fs['outcome_binary']

print(f"Feature selection dataset: {len(df_fs)} samples, {len(feature_selection_cols)} features")

Feature selection dataset: 4364 samples, 34 features


In [40]:
# Method 1: Univariate Feature Selection (F-test)
selector_f = SelectKBest(f_classif, k='all')
selector_f.fit(X, y)

f_scores = pd.DataFrame({
    'feature': feature_selection_cols,
    'f_score': selector_f.scores_,
    'p_value': selector_f.pvalues_
}).sort_values('f_score', ascending=False)

print("F-Test Feature Importance:")
print("=" * 60)
print(f_scores.head(15).to_string(index=False))

F-Test Feature Importance:
            feature  f_score  p_value
    time_efficiency  849.542    0.000
       elo_expected  824.137    0.000
         rating_gap  704.691    0.000
   opening_color_wr  572.882    0.000
       time_trouble  504.571    0.000
     rating_gap_abs  472.203    0.000
 opponent_bucket_wr  440.208    0.000
opening_personal_wr  408.639    0.000
        is_underdog  353.737    0.000
 vs_higher_rated_wr  301.135    0.000
 rating_gap_squared  299.909    0.000
            form_10  181.396    0.000
            form_20  177.114    0.000
             is_hot  146.234    0.000
             form_5  123.677    0.000


In [41]:
# Method 2: Mutual Information
mi_scores = mutual_info_classif(X, y, random_state=RANDOM_STATE)

mi_df = pd.DataFrame({
    'feature': feature_selection_cols,
    'mi_score': mi_scores
}).sort_values('mi_score', ascending=False)

print("\nMutual Information Feature Importance:")
print("=" * 60)
print(mi_df.head(15).to_string(index=False))


Mutual Information Feature Importance:
            feature  mi_score
    time_efficiency     0.211
avg_time_normalized     0.104
         rating_gap     0.093
       elo_expected     0.083
 opponent_bucket_wr     0.068
     rating_gap_abs     0.066
 rating_gap_squared     0.065
       time_trouble     0.065
   opening_color_wr     0.059
 vs_higher_rated_wr     0.034
        is_underdog     0.033
opening_personal_wr     0.032
        games_today     0.029
            form_10     0.029
        hour_of_day     0.021


In [42]:
# Method 3: Recursive Feature Elimination (RFE)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

lr_rfe = LogisticRegression(random_state=RANDOM_STATE, max_iter=1000)
rfe = RFE(lr_rfe, n_features_to_select=10, step=1)
rfe.fit(X_scaled, y)

rfe_ranking = pd.DataFrame({
    'feature': feature_selection_cols,
    'selected': rfe.support_,
    'ranking': rfe.ranking_
}).sort_values('ranking')

print("\nRFE Selected Features (Top 10):")
print("=" * 60)
print(rfe_ranking[rfe_ranking['selected']].to_string(index=False))


RFE Selected Features (Top 10):
           feature  selected  ranking
        rating_gap      True        1
    rating_gap_abs      True        1
rating_gap_squared      True        1
      elo_expected      True        1
opening_rolling_wr      True        1
  opening_color_wr      True        1
   avg_game_length      True        1
   time_efficiency      True        1
            is_hot      True        1
     draw_tendency      True        1


In [43]:
# Combine all feature selection methods
combined_scores = f_scores.merge(mi_df, on='feature').merge(rfe_ranking, on='feature')

# Normalize scores
combined_scores['f_score_norm'] = (combined_scores['f_score'] - combined_scores['f_score'].min()) / \
                                  (combined_scores['f_score'].max() - combined_scores['f_score'].min())
combined_scores['mi_score_norm'] = (combined_scores['mi_score'] - combined_scores['mi_score'].min()) / \
                                   (combined_scores['mi_score'].max() - combined_scores['mi_score'].min())
combined_scores['rfe_score_norm'] = 1 - (combined_scores['ranking'] - 1) / (combined_scores['ranking'].max() - 1)

# Combined score (weighted average)
combined_scores['combined_score'] = (combined_scores['f_score_norm'] * 0.3 + 
                                      combined_scores['mi_score_norm'] * 0.3 + 
                                      combined_scores['rfe_score_norm'] * 0.4)

combined_scores = combined_scores.sort_values('combined_score', ascending=False)

print("\nCombined Feature Importance:")
print("=" * 60)
print(combined_scores[['feature', 'f_score_norm', 'mi_score_norm', 'rfe_score_norm', 'combined_score']].head(15).to_string(index=False))


Combined Feature Importance:
            feature  f_score_norm  mi_score_norm  rfe_score_norm  combined_score
    time_efficiency         1.000          1.000           1.000           1.000
       elo_expected         0.972          0.396           1.000           0.810
         rating_gap         0.838          0.441           1.000           0.784
   opening_color_wr         0.691          0.282           1.000           0.692
     rating_gap_abs         0.578          0.314           1.000           0.668
 rating_gap_squared         0.385          0.308           1.000           0.608
opening_personal_wr         0.507          0.152           0.958           0.581
       time_trouble         0.614          0.307           0.750           0.576
 opponent_bucket_wr         0.542          0.324           0.708           0.543
             is_hot         0.213          0.062           1.000           0.483
avg_time_normalized         0.054          0.491           0.792           0.48

In [44]:
# Visualize combined feature importance
fig_importance = px.bar(
    combined_scores.head(20),
    x='combined_score',
    y='feature',
    orientation='h',
    title='Combined Feature Importance Score (Top 20)',
    labels={'combined_score': 'Importance Score', 'feature': 'Feature'},
    color='combined_score',
    color_continuous_scale='Viridis'
)
fig_importance.update_layout(
    template=CHART_TEMPLATE,
    yaxis={'categoryorder': 'total ascending'},
    height=600
)
fig_importance.show()

In [45]:
# Select final features
TOP_N_FEATURES = 15
selected_features = combined_scores.head(TOP_N_FEATURES)['feature'].tolist()

print(f"\nFinal Selected Features ({TOP_N_FEATURES}):")
print("=" * 60)
for i, feat in enumerate(selected_features, 1):
    print(f"  {i}. {feat}")


Final Selected Features (15):
  1. time_efficiency
  2. elo_expected
  3. rating_gap
  4. opening_color_wr
  5. rating_gap_abs
  6. rating_gap_squared
  7. opening_personal_wr
  8. time_trouble
  9. opponent_bucket_wr
  10. is_hot
  11. avg_time_normalized
  12. opening_rolling_wr
  13. result_volatility
  14. prev_streak
  15. draw_tendency


In [49]:
# Prepare modeling data
model_features = selected_features.copy()
df_model = df_features[model_features + ['outcome_binary', 'elo_expected']].dropna()

X = df_model[model_features]
y = df_model['outcome_binary']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
)

print(f"Training set: {len(X_train)} samples")
print(f"Test set: {len(X_test)} samples")
print(f"Features: {len(model_features)}")

Training set: 3491 samples
Test set: 873 samples
Features: 15


In [53]:
# ELO Baseline 
y_pred_elo = df_model.loc[X_test.index, 'elo_expected']

if isinstance(y_pred_elo, pd.DataFrame):
    y_pred_elo = y_pred_elo.iloc[:, 0]

y_pred_elo = y_pred_elo.values.flatten()
y_pred_elo_binary = (y_pred_elo > 0.5).astype(int)

y_test_array = y_test.values.flatten().astype(int)

print("Shapes:", y_test_array.shape, y_pred_elo_binary.shape)

elo_metrics = {
    'accuracy': accuracy_score(y_test_array, y_pred_elo_binary),
    'roc_auc': roc_auc_score(y_test_array, y_pred_elo),
    'brier_score': brier_score_loss(y_test_array, y_pred_elo),
    'log_loss': log_loss(y_test_array, y_pred_elo)
}

print("\nELO Baseline Model Performance:")
for metric, value in elo_metrics.items():
    print(f"  {metric}: {value:.4f}")

Shapes: (873,) (873,)

ELO Baseline Model Performance:
  accuracy: 0.6861
  roc_auc: 0.7339
  brier_score: 0.1978
  log_loss: 0.5741


In [None]:
# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Logistic Regression with regularization
lr_model = LogisticRegression(
    random_state=RANDOM_STATE,
    max_iter=1000,
    C=1.0,
    penalty='l2',
    solver='lbfgs'
)

lr_model.fit(X_train_scaled, y_train)

# Predictions
y_pred_lr_proba = lr_model.predict_proba(X_test_scaled)[:, 1]
y_pred_lr = lr_model.predict(X_test_scaled)

lr_metrics = {
    'accuracy': accuracy_score(y_test, y_pred_lr),
    'roc_auc': roc_auc_score(y_test, y_pred_lr_proba),
    'brier_score': brier_score_loss(y_test, y_pred_lr_proba),
    'log_loss': log_loss(y_test, y_pred_lr_proba)
}

print("Logistic Regression Performance:")
print("=" * 60)
for metric, value in lr_metrics.items():
    print(f"  {metric}: {value:.4f}")

print(f"\nImprovement over ELO: {(lr_metrics['roc_auc'] - elo_metrics['roc_auc'])*100:+.2f}% AUC")

Logistic Regression Performance:
  accuracy: 0.7709
  roc_auc: 0.8365
  brier_score: 0.1578
  log_loss: 0.4734

Improvement over ELO: +10.27% AUC


In [None]:
# Logistic Regression feature importance 
actual_features = X_train.columns.tolist()

print("Actual features:", actual_features)
print("Count:", len(actual_features))

lr_importance = pd.DataFrame({
    'feature': actual_features,
    'coefficient': lr_model.coef_[0],
    'abs_coefficient': np.abs(lr_model.coef_[0])
}).sort_values('abs_coefficient', ascending=False)

fig_lr_imp = px.bar(
    lr_importance,
    x='coefficient',
    y='feature',
    orientation='h',
    title='Logistic Regression Feature Coefficients',
    labels={'coefficient': 'Coefficient', 'feature': 'Feature'},
    color='coefficient',
    color_continuous_scale='RdBu_r'
)
fig_lr_imp.add_vline(x=0, line_dash="dash", line_color="gray")
fig_lr_imp.update_layout(template=CHART_TEMPLATE, yaxis={'categoryorder': 'total ascending'})
fig_lr_imp.show()

Actual features: ['time_efficiency', 'elo_expected', 'elo_expected', 'rating_gap', 'opening_color_wr', 'rating_gap_abs', 'rating_gap_squared', 'opening_personal_wr', 'time_trouble', 'opponent_bucket_wr', 'is_hot', 'avg_time_normalized', 'opening_rolling_wr', 'result_volatility', 'prev_streak', 'draw_tendency']
Count: 16


In [59]:
# XGBoost model
xgb_model = xgb.XGBClassifier(
    n_estimators=200,
    max_depth=5,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=3,
    reg_alpha=0.1,
    reg_lambda=1.0,
    random_state=RANDOM_STATE,
    eval_metric='logloss',
    verbosity=0
)

X_train_np = X_train.values.astype(np.float32)
X_test_np = X_test.values.astype(np.float32)
y_train_np = y_train.values.astype(np.float32)
y_test_np = y_test.values.astype(np.float32)

# Fit
xgb_model.fit(
    X_train_np, y_train_np,
    eval_set=[(X_test_np, y_test_np)],
    verbose=False
)

# Predictions
y_pred_xgb_proba = xgb_model.predict_proba(X_test_np)[:, 1]
y_pred_xgb = xgb_model.predict(X_test_np)

xgb_metrics = {
    'accuracy': accuracy_score(y_test_np, y_pred_xgb),
    'roc_auc': roc_auc_score(y_test_np, y_pred_xgb_proba),
    'brier_score': brier_score_loss(y_test_np, y_pred_xgb_proba),
    'log_loss': log_loss(y_test_np, y_pred_xgb_proba)
}

print("XGBoost Performance:")
for metric, value in xgb_metrics.items():
    print(f"  {metric}: {value:.4f}")

print(f"\nImprovement over ELO: {(xgb_metrics['roc_auc'] - elo_metrics['roc_auc'])*100:+.2f}% AUC")

XGBoost Performance:
  accuracy: 0.8007
  roc_auc: 0.8800
  brier_score: 0.1350
  log_loss: 0.4129

Improvement over ELO: +14.61% AUC


In [61]:
# XGBoost feature importance
actual_features = X_train.columns.tolist()

xgb_importance = pd.DataFrame({
    'feature': actual_features,
    'importance': xgb_model.feature_importances_
}).sort_values('importance', ascending=False)

fig_xgb_imp = px.bar(
    xgb_importance,
    x='importance',
    y='feature',
    orientation='h',
    title='XGBoost Feature Importance',
    labels={'importance': 'Importance', 'feature': 'Feature'},
    color='importance',
    color_continuous_scale='Viridis'
)
fig_xgb_imp.update_layout(template=CHART_TEMPLATE, yaxis={'categoryorder': 'total ascending'})
fig_xgb_imp.show()

print("\nTop 10 XGBoost Features:")
print(xgb_importance.head(10).to_string(index=False))


Top 10 XGBoost Features:
            feature  importance
 opponent_bucket_wr       0.258
    time_efficiency       0.146
       time_trouble       0.130
         rating_gap       0.062
       elo_expected       0.062
   opening_color_wr       0.051
       elo_expected       0.044
             is_hot       0.036
opening_personal_wr       0.030
  result_volatility       0.029


In [63]:
X_train_clean = X_train.loc[:, ~X_train.columns.duplicated()]
X_test_clean = X_test.loc[:, ~X_test.columns.duplicated()]

print(f"Columns before: {X_train.shape[1]}, after: {X_train_clean.shape[1]}")

# LightGBM model
lgb_model = lgb.LGBMClassifier(
    n_estimators=200,
    max_depth=5,
    learning_rate=0.1,
    num_leaves=31,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=1.0,
    random_state=RANDOM_STATE,
    verbosity=-1
)

lgb_model.fit(
    X_train_clean, y_train,
    eval_set=[(X_test_clean, y_test)]
)

# Predictions
y_pred_lgb_proba = lgb_model.predict_proba(X_test_clean)[:, 1]
y_pred_lgb = lgb_model.predict(X_test_clean)

lgb_metrics = {
    'accuracy': accuracy_score(y_test, y_pred_lgb),
    'roc_auc': roc_auc_score(y_test, y_pred_lgb_proba),
    'brier_score': brier_score_loss(y_test, y_pred_lgb_proba),
    'log_loss': log_loss(y_test, y_pred_lgb_proba)
}

print("LightGBM Performance:")
for metric, value in lgb_metrics.items():
    print(f"  {metric}: {value:.4f}")

print(f"\nImprovement over ELO: {(lgb_metrics['roc_auc'] - elo_metrics['roc_auc'])*100:+.2f}% AUC")

Columns before: 16, after: 15
LightGBM Performance:
  accuracy: 0.8087
  roc_auc: 0.8879
  brier_score: 0.1270
  log_loss: 0.3834

Improvement over ELO: +15.40% AUC


In [64]:
# LightGBM feature importance
lgb_importance = pd.DataFrame({
    'feature': model_features,
    'importance': lgb_model.feature_importances_
}).sort_values('importance', ascending=False)
    
fig_lgb_imp = px.bar(
    lgb_importance,
    x='importance',
    y='feature',
    orientation='h',
    title='LightGBM Feature Importance',
    labels={'importance': 'Importance', 'feature': 'Feature'},
    color='importance',
    color_continuous_scale='Plasma'
)
fig_lgb_imp.update_layout(template=CHART_TEMPLATE, yaxis={'categoryorder': 'total ascending'})
fig_lgb_imp.show()

In [65]:
# Stratified K-Fold Cross-Validation
cv = StratifiedKFold(n_splits=CV_FOLDS, shuffle=True, random_state=RANDOM_STATE)

# Prepare full dataset for CV
X_full = df_model[model_features]
y_full = df_model['outcome_binary']
X_full_scaled = scaler.fit_transform(X_full)

print(f"Running {CV_FOLDS}-Fold Cross-Validation...")
print("=" * 60)

Running 5-Fold Cross-Validation...


In [66]:
# CV for Logistic Regression
lr_cv_scores = cross_val_score(lr_model, X_full_scaled, y_full, cv=cv, scoring='roc_auc')
lr_cv_acc = cross_val_score(lr_model, X_full_scaled, y_full, cv=cv, scoring='accuracy')

print(f"\nLogistic Regression CV Results:")
print(f"  AUC: {lr_cv_scores.mean():.4f} (+/- {lr_cv_scores.std()*2:.4f})")
print(f"  Accuracy: {lr_cv_acc.mean():.4f} (+/- {lr_cv_acc.std()*2:.4f})")
print(f"  Fold scores: {lr_cv_scores.round(4)}")


Logistic Regression CV Results:
  AUC: 0.8600 (+/- 0.0160)
  Accuracy: 0.7883 (+/- 0.0095)
  Fold scores: [0.8683 0.8448 0.8613 0.8621 0.8633]


In [68]:
X_full_np = X_full.values.astype(np.float32)
y_full_np = y_full.values.astype(np.float32)

# CV for XGBoost
xgb_cv_scores = cross_val_score(xgb_model, X_full_np, y_full_np, cv=cv, scoring='roc_auc')
xgb_cv_acc = cross_val_score(xgb_model, X_full_np, y_full_np, cv=cv, scoring='accuracy')

print(f"\nXGBoost CV Results:")
print(f"  AUC: {xgb_cv_scores.mean():.4f} (+/- {xgb_cv_scores.std()*2:.4f})")
print(f"  Accuracy: {xgb_cv_acc.mean():.4f} (+/- {xgb_cv_acc.std()*2:.4f})")
print(f"  Fold scores: {xgb_cv_scores.round(4)}")


XGBoost CV Results:
  AUC: 0.8886 (+/- 0.0221)
  Accuracy: 0.8213 (+/- 0.0271)
  Fold scores: [0.9041 0.8696 0.8881 0.89   0.8914]


In [70]:
X_full_clean = X_full.loc[:, ~X_full.columns.duplicated()]

print(f"Before: {X_full.shape[1]} columns")
print(f"After: {X_full_clean.shape[1]} columns")
print(f"Removed: {X_full.columns[X_full.columns.duplicated()].tolist()}")

# CV for LightGBM
lgb_cv_scores = cross_val_score(lgb_model, X_full_clean, y_full, cv=cv, scoring='roc_auc')
lgb_cv_acc = cross_val_score(lgb_model, X_full_clean, y_full, cv=cv, scoring='accuracy')

print(f"\nLightGBM CV Results:")
print(f"  AUC: {lgb_cv_scores.mean():.4f} (+/- {lgb_cv_scores.std()*2:.4f})")
print(f"  Accuracy: {lgb_cv_acc.mean():.4f} (+/- {lgb_cv_acc.std()*2:.4f})")
print(f"  Fold scores: {lgb_cv_scores.round(4)}")

Before: 16 columns
After: 15 columns
Removed: ['elo_expected']

LightGBM CV Results:
  AUC: 0.8918 (+/- 0.0232)
  Accuracy: 0.8201 (+/- 0.0206)
  Fold scores: [0.9107 0.8747 0.8878 0.8914 0.8945]


In [71]:
# Visualize CV results
cv_results = {
    'Logistic Regression': lr_cv_scores,
}
cv_results['XGBoost'] = xgb_cv_scores
cv_results['LightGBM'] = lgb_cv_scores

cv_df = pd.DataFrame(cv_results)
cv_df_melted = cv_df.melt(var_name='Model', value_name='AUC')

fig_cv = px.box(
    cv_df_melted,
    x='Model',
    y='AUC',
    title=f'{CV_FOLDS}-Fold Cross-Validation AUC Distribution',
    color='Model',
    points='all'
)
fig_cv.add_hline(y=elo_metrics['roc_auc'], line_dash="dash", line_color="red",
                 annotation_text=f"ELO Baseline: {elo_metrics['roc_auc']:.4f}")
fig_cv.update_layout(template=CHART_TEMPLATE, showlegend=False)
fig_cv.show()

In [72]:
# Calibration analysis for best models
print("Probability Calibration Analysis")
print("=" * 60)

def plot_calibration_curve(y_true, y_pred_proba, model_name, n_bins=10):
    prob_true, prob_pred = calibration_curve(y_true, y_pred_proba, n_bins=n_bins)
    return prob_true, prob_pred

Probability Calibration Analysis


In [73]:
# Calculate calibration curves
fig_calibration = go.Figure()

# ELO calibration
prob_true_elo, prob_pred_elo = plot_calibration_curve(y_test, y_pred_elo, 'ELO')
fig_calibration.add_trace(go.Scatter(
    x=prob_pred_elo, y=prob_true_elo,
    mode='lines+markers', name='ELO Baseline',
    line=dict(color='red')
))

# LR calibration
prob_true_lr, prob_pred_lr = plot_calibration_curve(y_test, y_pred_lr_proba, 'LR')
fig_calibration.add_trace(go.Scatter(
    x=prob_pred_lr, y=prob_true_lr,
    mode='lines+markers', name='Logistic Regression',
    line=dict(color='blue')
))

# XGBoost calibration
prob_true_xgb, prob_pred_xgb = plot_calibration_curve(y_test, y_pred_xgb_proba, 'XGBoost')
fig_calibration.add_trace(go.Scatter(
    x=prob_pred_xgb, y=prob_true_xgb,
    mode='lines+markers', name='XGBoost',
    line=dict(color='green')
))

# LightGBM calibration
prob_true_lgb, prob_pred_lgb = plot_calibration_curve(y_test, y_pred_lgb_proba, 'LightGBM')
fig_calibration.add_trace(go.Scatter(
    x=prob_pred_lgb, y=prob_true_lgb,
    mode='lines+markers', name='LightGBM',
    line=dict(color='purple')
))

# Perfect calibration line
fig_calibration.add_trace(go.Scatter(
    x=[0, 1], y=[0, 1],
    mode='lines', name='Perfect Calibration',
    line=dict(color='gray', dash='dash')
))

fig_calibration.update_layout(
    title='Calibration Curves (Reliability Diagram)',
    xaxis_title='Mean Predicted Probability',
    yaxis_title='Fraction of Positives',
    template=CHART_TEMPLATE
)
fig_calibration.show()

In [75]:
# Apply Platt Scaling (Isotonic/Sigmoid Calibration)
print("\nApplying Probability Calibration (Platt Scaling)...")

X_train_np = X_train.values.astype(np.float32)
X_test_np = X_test.values.astype(np.float32)
y_train_np = y_train.values.astype(np.float32)
y_test_np = y_test.values.astype(np.float32)

xgb_calibrated = CalibratedClassifierCV(xgb_model, method='isotonic', cv=3)
xgb_calibrated.fit(X_train_np, y_train_np)

y_pred_xgb_cal = xgb_calibrated.predict_proba(X_test_np)[:, 1]

xgb_cal_metrics = {
    'accuracy': accuracy_score(y_test_np, (y_pred_xgb_cal > 0.5).astype(int)),
    'roc_auc': roc_auc_score(y_test_np, y_pred_xgb_cal),
    'brier_score': brier_score_loss(y_test_np, y_pred_xgb_cal),
    'log_loss': log_loss(y_test_np, y_pred_xgb_cal)
}

print(f"\nXGBoost (Calibrated) Performance:")
for metric, value in xgb_cal_metrics.items():
    print(f"  {metric}: {value:.4f}")
print(f"  Brier improvement: {(xgb_metrics['brier_score'] - xgb_cal_metrics['brier_score'])*100:+.2f}%")


Applying Probability Calibration (Platt Scaling)...

XGBoost (Calibrated) Performance:
  accuracy: 0.8087
  roc_auc: 0.8852
  brier_score: 0.1255
  log_loss: 0.3804
  Brier improvement: +0.95%


In [77]:
X_train_clean = X_train.loc[:, ~X_train.columns.duplicated()]
X_test_clean = X_test.loc[:, ~X_test.columns.duplicated()]

print(f"Columns: {X_train_clean.shape[1]}")

# Calibrate LightGBM
lgb_calibrated = CalibratedClassifierCV(lgb_model, method='isotonic', cv=3)
lgb_calibrated.fit(X_train_clean, y_train)

y_pred_lgb_cal = lgb_calibrated.predict_proba(X_test_clean)[:, 1]

lgb_cal_metrics = {
    'accuracy': accuracy_score(y_test, (y_pred_lgb_cal > 0.5).astype(int)),
    'roc_auc': roc_auc_score(y_test, y_pred_lgb_cal),
    'brier_score': brier_score_loss(y_test, y_pred_lgb_cal),
    'log_loss': log_loss(y_test, y_pred_lgb_cal)
}

print(f"\nLightGBM (Calibrated) Performance:")
for metric, value in lgb_cal_metrics.items():
    print(f"  {metric}: {value:.4f}")
print(f"  Brier improvement: {(lgb_metrics['brier_score'] - lgb_cal_metrics['brier_score'])*100:+.2f}%")

Columns: 15

LightGBM (Calibrated) Performance:
  accuracy: 0.8121
  roc_auc: 0.8871
  brier_score: 0.1235
  log_loss: 0.3698
  Brier improvement: +0.35%


In [78]:
# Compare calibration before and after
fig_cal_compare = go.Figure()

# Before calibration
prob_true_before, prob_pred_before = calibration_curve(y_test, y_pred_xgb_proba, n_bins=10)
fig_cal_compare.add_trace(go.Scatter(
    x=prob_pred_before, y=prob_true_before,
    mode='lines+markers', name='XGBoost (Before)',
    line=dict(color='green', dash='dash')
))
    
# After calibration
prob_true_after, prob_pred_after = calibration_curve(y_test, y_pred_xgb_cal, n_bins=10)
fig_cal_compare.add_trace(go.Scatter(
    x=prob_pred_after, y=prob_true_after,
    mode='lines+markers', name='XGBoost (Calibrated)',
    line=dict(color='green', width=3)
))

# Perfect line
fig_cal_compare.add_trace(go.Scatter(
    x=[0, 1], y=[0, 1],
    mode='lines', name='Perfect',
    line=dict(color='gray', dash='dot')
))

fig_cal_compare.update_layout(
    title='Calibration: Before vs After Platt Scaling',
    xaxis_title='Mean Predicted Probability',
    yaxis_title='Fraction of Positives',
    template=CHART_TEMPLATE
)
fig_cal_compare.show()

### Ensemble Model

In [81]:
# Simple Averaging Ensemble
y_pred_ensemble_avg = (y_pred_lr_proba + y_pred_xgb_proba + y_pred_lgb_proba) / 3

avg_metrics = {
    'accuracy': accuracy_score(y_test, (y_pred_ensemble_avg > 0.5).astype(int)),
    'roc_auc': roc_auc_score(y_test, y_pred_ensemble_avg),
    'brier_score': brier_score_loss(y_test, y_pred_ensemble_avg),
    'log_loss': log_loss(y_test, y_pred_ensemble_avg)
}

print("\nSimple Averaging Ensemble:")
for metric, value in avg_metrics.items():
    print(f"  {metric}: {value:.4f}")


Simple Averaging Ensemble:
  accuracy: 0.8110
  roc_auc: 0.8841
  brier_score: 0.1300
  log_loss: 0.4004


In [82]:
# Weighted Averaging (based on CV performance)
weights = {'lr': 1.0}
total_weight = 1.0

y_pred_weighted = y_pred_lr_proba * weights['lr']

weights['xgb'] = xgb_cv_scores.mean() / lr_cv_scores.mean()
y_pred_weighted += y_pred_xgb_proba * weights['xgb']
total_weight += weights['xgb']

weights['lgb'] = lgb_cv_scores.mean() / lr_cv_scores.mean()
y_pred_weighted += y_pred_lgb_proba * weights['lgb']
total_weight += weights['lgb']

y_pred_weighted /= total_weight

weighted_metrics = {
    'accuracy': accuracy_score(y_test, (y_pred_weighted > 0.5).astype(int)),
    'roc_auc': roc_auc_score(y_test, y_pred_weighted),
    'brier_score': brier_score_loss(y_test, y_pred_weighted),
    'log_loss': log_loss(y_test, y_pred_weighted)
}

print(f"\nWeighted Averaging Ensemble (weights: {weights}):")
for metric, value in weighted_metrics.items():
    print(f"  {metric}: {value:.4f}")


Weighted Averaging Ensemble (weights: {'lr': 1.0, 'xgb': np.float64(1.0333280122632675), 'lgb': np.float64(1.037034952445635)}):
  accuracy: 0.8099
  roc_auc: 0.8842
  brier_score: 0.1299
  log_loss: 0.4000


In [83]:
#  Voting Classifier
estimators = [('lr', LogisticRegression(random_state=RANDOM_STATE, max_iter=1000))]

estimators.append(('xgb', xgb.XGBClassifier(
    n_estimators=100, max_depth=5, learning_rate=0.1,
    random_state=RANDOM_STATE, use_label_encoder=False,
    eval_metric='logloss', verbosity=0
)))

estimators.append(('lgb', lgb.LGBMClassifier(
    n_estimators=100, max_depth=5, learning_rate=0.1,
    random_state=RANDOM_STATE, verbosity=-1
)))

voting_clf = VotingClassifier(
    estimators=estimators,
    voting='soft'
)

# Need to use scaled data for LR in voting classifier
voting_clf.fit(X_train_scaled, y_train)
y_pred_voting = voting_clf.predict_proba(X_test_scaled)[:, 1]

voting_metrics = {
    'accuracy': accuracy_score(y_test, (y_pred_voting > 0.5).astype(int)),
    'roc_auc': roc_auc_score(y_test, y_pred_voting),
    'brier_score': brier_score_loss(y_test, y_pred_voting),
    'log_loss': log_loss(y_test, y_pred_voting)
}

print("\nVoting Classifier (Soft Voting):")
for metric, value in voting_metrics.items():
    print(f"  {metric}: {value:.4f}")


Voting Classifier (Soft Voting):
  accuracy: 0.8041
  roc_auc: 0.8868
  brier_score: 0.1283
  log_loss: 0.3933


In [84]:
# Stacking Classifier
base_estimators = [('lr', LogisticRegression(random_state=RANDOM_STATE, max_iter=1000))]

base_estimators.append(('xgb', xgb.XGBClassifier(
    n_estimators=100, max_depth=4, learning_rate=0.1,
    random_state=RANDOM_STATE, use_label_encoder=False,
    eval_metric='logloss', verbosity=0
)))

base_estimators.append(('lgb', lgb.LGBMClassifier(
    n_estimators=100, max_depth=4, learning_rate=0.1,
    random_state=RANDOM_STATE, verbosity=-1
)))

stacking_clf = StackingClassifier(
    estimators=base_estimators,
    final_estimator=LogisticRegression(random_state=RANDOM_STATE),
    cv=3,
    passthrough=False
)

stacking_clf.fit(X_train_scaled, y_train)
y_pred_stacking = stacking_clf.predict_proba(X_test_scaled)[:, 1]

stacking_metrics = {
    'accuracy': accuracy_score(y_test, (y_pred_stacking > 0.5).astype(int)),
    'roc_auc': roc_auc_score(y_test, y_pred_stacking),
    'brier_score': brier_score_loss(y_test, y_pred_stacking),
    'log_loss': log_loss(y_test, y_pred_stacking)
}

print("\nStacking Classifier:")
for metric, value in stacking_metrics.items():
    print(f"  {metric}: {value:.4f}")


Stacking Classifier:
  accuracy: 0.8076
  roc_auc: 0.8892
  brier_score: 0.1302
  log_loss: 0.4012


### Model Comparison

In [87]:
# Compile all model results
print("="*80)
print("FINAL MODEL COMPARISON")
print("="*80)

all_models = {
    'ELO Baseline': elo_metrics,
    'Logistic Regression': lr_metrics,
}

all_models['XGBoost'] = xgb_metrics
all_models['XGBoost (Calibrated)'] = xgb_cal_metrics

all_models['LightGBM'] = lgb_metrics
all_models['LightGBM (Calibrated)'] = lgb_cal_metrics

all_models['Ensemble (Averaging)'] = avg_metrics
all_models['Ensemble (Weighted)'] = weighted_metrics
all_models['Ensemble (Voting)'] = voting_metrics
all_models['Ensemble (Stacking)'] = stacking_metrics

# Create comparison DataFrame
comparison_df = pd.DataFrame(all_models).T
comparison_df = comparison_df.round(4)
comparison_df['AUC Improvement vs ELO'] = (comparison_df['roc_auc'] - elo_metrics['roc_auc']) * 100

print(comparison_df.to_string())

FINAL MODEL COMPARISON
                       accuracy  roc_auc  brier_score  log_loss  AUC Improvement vs ELO
ELO Baseline              0.686    0.734        0.198     0.574                   0.002
Logistic Regression       0.771    0.837        0.158     0.473                  10.262
XGBoost                   0.801    0.880        0.135     0.413                  14.612
XGBoost (Calibrated)      0.809    0.885        0.126     0.380                  15.132
LightGBM                  0.809    0.888        0.127     0.383                  15.402
LightGBM (Calibrated)     0.812    0.887        0.123     0.370                  15.322
Ensemble (Averaging)      0.811    0.884        0.130     0.400                  15.022
Ensemble (Weighted)       0.810    0.884        0.130     0.400                  15.032
Ensemble (Voting)         0.804    0.887        0.128     0.393                  15.292
Ensemble (Stacking)       0.808    0.889        0.130     0.401                  15.532


In [88]:
# Visualize model comparison
fig_comparison = make_subplots(
    rows=2, cols=2,
    subplot_titles=['ROC-AUC Score', 'Accuracy', 'Brier Score (lower is better)', 'Log Loss (lower is better)']
)

models = list(all_models.keys())
colors = px.colors.qualitative.Set2[:len(models)]

# ROC-AUC
fig_comparison.add_trace(
    go.Bar(x=models, y=comparison_df['roc_auc'], marker_color=colors, showlegend=False),
    row=1, col=1
)

# Accuracy
fig_comparison.add_trace(
    go.Bar(x=models, y=comparison_df['accuracy'], marker_color=colors, showlegend=False),
    row=1, col=2
)

# Brier Score
fig_comparison.add_trace(
    go.Bar(x=models, y=comparison_df['brier_score'], marker_color=colors, showlegend=False),
    row=2, col=1
)

# Log Loss
fig_comparison.add_trace(
    go.Bar(x=models, y=comparison_df['log_loss'], marker_color=colors, showlegend=False),
    row=2, col=2
)

fig_comparison.update_layout(
    title='Model Performance Comparison',
    template=CHART_TEMPLATE,
    height=700,
    showlegend=False
)
fig_comparison.update_xaxes(tickangle=-45)
fig_comparison.show()

In [89]:
# ROC Curves Comparison
fig_roc = go.Figure()

# ELO Baseline
fpr_elo, tpr_elo, _ = roc_curve(y_test, y_pred_elo)
fig_roc.add_trace(go.Scatter(
    x=fpr_elo, y=tpr_elo,
    name=f"ELO Baseline (AUC={elo_metrics['roc_auc']:.3f})",
    line=dict(color='red', dash='dash')
))

# Logistic Regression
fpr_lr, tpr_lr, _ = roc_curve(y_test, y_pred_lr_proba)
fig_roc.add_trace(go.Scatter(
    x=fpr_lr, y=tpr_lr,
    name=f"Logistic Regression (AUC={lr_metrics['roc_auc']:.3f})",
    line=dict(color='blue')
))

# XGBoost
fpr_xgb, tpr_xgb, _ = roc_curve(y_test, y_pred_xgb_proba)
fig_roc.add_trace(go.Scatter(
    x=fpr_xgb, y=tpr_xgb,
    name=f"XGBoost (AUC={xgb_metrics['roc_auc']:.3f})",
    line=dict(color='green')
))

# LightGBM
fpr_lgb, tpr_lgb, _ = roc_curve(y_test, y_pred_lgb_proba)
fig_roc.add_trace(go.Scatter(
    x=fpr_lgb, y=tpr_lgb,
    name=f"LightGBM (AUC={lgb_metrics['roc_auc']:.3f})",
    line=dict(color='purple')
))

# Best Ensemble
best_ensemble_pred = y_pred_stacking
fpr_ens, tpr_ens, _ = roc_curve(y_test, best_ensemble_pred)
fig_roc.add_trace(go.Scatter(
    x=fpr_ens, y=tpr_ens,
    name=f"Stacking Ensemble (AUC={stacking_metrics['roc_auc']:.3f})",
    line=dict(color='orange', width=3)
))

# Random baseline
fig_roc.add_trace(go.Scatter(
    x=[0, 1], y=[0, 1],
    name='Random (AUC=0.500)',
    line=dict(color='gray', dash='dot')
))

fig_roc.update_layout(
    title='ROC Curves - All Models',
    xaxis_title='False Positive Rate',
    yaxis_title='True Positive Rate',
    template=CHART_TEMPLATE,
    legend=dict(x=0.6, y=0.1)
)
fig_roc.show()

In [91]:
# Select best model
best_model_name = comparison_df['roc_auc'].idxmax()
best_model_metrics = comparison_df.loc[best_model_name]

print("BEST MODEL: " + best_model_name)
print(f"\n  ROC-AUC:    {best_model_metrics['roc_auc']:.4f}")
print(f"  Accuracy:   {best_model_metrics['accuracy']:.4f}")
print(f"  Brier:      {best_model_metrics['brier_score']:.4f}")
print(f"  Log Loss:   {best_model_metrics['log_loss']:.4f}")
print(f"\n  Improvement over ELO: +{best_model_metrics['AUC Improvement vs ELO']:.2f}% AUC")

BEST MODEL: Ensemble (Stacking)

  ROC-AUC:    0.8892
  Accuracy:   0.8076
  Brier:      0.1302
  Log Loss:   0.4012

  Improvement over ELO: +15.53% AUC


In [92]:
# Feature importance summary (combined from all models)
print("FEATURE IMPORTANCE SUMMARY")

importance_combined = lr_importance[['feature', 'abs_coefficient']].copy()
importance_combined.columns = ['feature', 'lr_importance']
importance_combined['lr_importance'] = importance_combined['lr_importance'] / importance_combined['lr_importance'].max()

xgb_imp_norm = xgb_importance.copy()
xgb_imp_norm['importance'] = xgb_imp_norm['importance'] / xgb_imp_norm['importance'].max()
importance_combined = importance_combined.merge(
    xgb_imp_norm[['feature', 'importance']].rename(columns={'importance': 'xgb_importance'}),
    on='feature'
)

lgb_imp_norm = lgb_importance.copy()
lgb_imp_norm['importance'] = lgb_imp_norm['importance'] / lgb_imp_norm['importance'].max()
importance_combined = importance_combined.merge(
    lgb_imp_norm[['feature', 'importance']].rename(columns={'importance': 'lgb_importance'}),
    on='feature'
)

# Average importance
imp_cols = [c for c in importance_combined.columns if 'importance' in c]
importance_combined['avg_importance'] = importance_combined[imp_cols].mean(axis=1)
importance_combined = importance_combined.sort_values('avg_importance', ascending=False)

print("\nTop 10 Most Important Features (averaged across models):")
print("-" * 60)
for i, row in importance_combined.head(10).iterrows():
    print(f"  {importance_combined.head(10).index.get_loc(i)+1}. {row['feature']}: {row['avg_importance']:.3f}")

FEATURE IMPORTANCE SUMMARY

Top 10 Most Important Features (averaged across models):
------------------------------------------------------------
  1. time_efficiency: 0.855
  2. opening_color_wr: 0.479
  3. opponent_bucket_wr: 0.364
  4. elo_expected: 0.319
  5. elo_expected: 0.319
  6. avg_time_normalized: 0.298
  7. elo_expected: 0.296
  8. elo_expected: 0.296
  9. rating_gap_abs: 0.286
  10. draw_tendency: 0.268


In [93]:
# Visualize combined feature importance
fig_imp_combined = px.bar(
    importance_combined.head(15),
    x='avg_importance',
    y='feature',
    orientation='h',
    title='Combined Feature Importance (Average Across Models)',
    labels={'avg_importance': 'Normalized Importance', 'feature': 'Feature'},
    color='avg_importance',
    color_continuous_scale='Viridis'
)
fig_imp_combined.update_layout(
    template=CHART_TEMPLATE,
    yaxis={'categoryorder': 'total ascending'},
    height=500
)
fig_imp_combined.show()