In [1]:
import os
from datetime import timedelta, datetime, date

import polars as pl
from supabase import create_client
from google.cloud import storage, secretmanager

from data_wrangling import load_season, record_current_season, load_schedule
from elo_rating import elo_season
from modelling import lgbm_model
from data_collection import (
    season,
    collect_season_statistics,
    collect_season_filtered_table,
    collect_all_data,
    collect_season_data
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Google Cloud Storage handle for the "lgbm" bucket.
client = storage.Client()
bucket = client.get_bucket("lgbm")

# Pull the Supabase credentials out of Secret Manager.
secret_client = secretmanager.SecretManagerServiceClient()
secret_name = 'projects/898760610238/secrets/supabase/versions/1'
response = secret_client.access_secret_version(request={'name': secret_name})
# NOTE(review): eval() executes whatever text the secret contains. If the payload is a
# plain dict literal / JSON, a literal parser (ast.literal_eval or json.loads) would be safer.
creds = eval(response.payload.data.decode("UTF-8"))

# Supabase client built from the `postgres` section of the decoded credentials.
postgres_creds = creds['postgres']
connection = create_client(postgres_creds['project_url'], postgres_creds['api_key'])

In [3]:
# Instantiate the model wrapper with the Supabase connection and the GCS bucket.
mod = lgbm_model(
    connection=connection,
    bucket=bucket,
    data_origin="google",
)

In [5]:
# Run feature selection on the model wrapper.
# NOTE(review): force=True presumably bypasses a previously stored selection -- confirm in modelling.lgbm_model.
mod.feature_selection(force=True)

Running Feature Selection for 650 features.
Running Feature Selection for 600 features.
Running Feature Selection for 550 features.
Running Feature Selection for 500 features.
Running Feature Selection for 450 features.
Running Feature Selection for 400 features.
Running Feature Selection for 350 features.
Running Feature Selection for 300 features.
Running Feature Selection for 250 features.
Running Feature Selection for 200 features.
Running Feature Selection for 150 features.
Running Feature Selection for 100 features.
Running Feature Selection for 50 features.
Running Feature Selection for 4 features.
Running Feature Selection for 3 features.
Running Feature Selection for 2 features.
Feature selection completed.


In [6]:
# Run the hyperparameter search with the wrapper's default settings
# (the log below shows one line per finished trial).
mod.tune_hyperparameters()

[I 2025-03-12 12:08:29,192] A new study created in memory with name: no-name-c96894b5-0867-43e3-a506-b1e2edfb2f1c


Downloaded newest matching file: lgbm_evaluation_february_2025.json to lgbm_evaluation_february_2025.json


[I 2025-03-12 12:08:30,545] Trial 0 finished with value: 0.6063829787234043 and parameters: {'iterations': 604, 'learning_rate': 0.587073365581447, 'min_child_weight': 1.7520412014283067, 'subsample': 0.719168986869312, 'colsample_bytree': 0.32585041475316434}. Best is trial 0 with value: 0.6063829787234043.
[I 2025-03-12 12:08:31,251] Trial 1 finished with value: 0.6340425531914894 and parameters: {'iterations': 96, 'learning_rate': 0.2705437801616401, 'min_child_weight': 4.725832421536743, 'subsample': 0.8099549356247112, 'colsample_bytree': 0.6763486713924032}. Best is trial 1 with value: 0.6340425531914894.
[I 2025-03-12 12:08:33,201] Trial 2 finished with value: 0.6170212765957447 and parameters: {'iterations': 655, 'learning_rate': 0.010144492606921749, 'min_child_weight': 3.822391535996362, 'subsample': 0.7569564222284471, 'colsample_bytree': 0.012079257425279743}. Best is trial 1 with value: 0.6340425531914894.
[I 2025-03-12 12:08:34,148] Trial 3 finished with value: 0.6 and pa

Hyperparameter tuning completed.


In [51]:
# Download every source table used to assemble the modelling frame.
df_1 = collect_all_data('schedule', connection)
# The `id` column of the elo table is discarded right away.
df_2 = collect_all_data('elo', connection).drop('id')
df_3 = collect_all_data('statistics_previous', connection)
df_4 = collect_all_data('statistics_recent_games', connection)
df_5 = collect_all_data('statistics_season', connection)
df_6 = collect_all_data('statistics_remainder', connection)

In [114]:
# Fetch a fresh copy of the elo table (without its `id` column), replacing whatever df_2 held before.
df_2 = collect_all_data('elo', connection).drop('id')

In [115]:
# Per-team ELO momentum: the ELO from 8 games earlier plus the absolute / relative change since then.
# NOTE: this rebinds df_2 and parses its string `date` column, so the cell cannot simply be
# re-run -- fetch df_2 from the `elo` table again before executing it a second time.
df_2 = df_2.with_columns([
    pl.col("date").str.to_date()
]).with_columns([
    pl.col("elo_before").shift(8).over(pl.col("team_id"), order_by=pl.col("date")).alias("elo_before_8_games_ago"),
    pl.col("elo_after").shift(8).over(pl.col("team_id"), order_by=pl.col("date")).alias("elo_after_8_games_ago")
]).with_columns([
    (pl.col("elo_before") - pl.col("elo_before_8_games_ago")).alias("elo_before_change_absolute"),
    (pl.col("elo_before") / pl.col("elo_before_8_games_ago") - 1).alias("elo_before_change_relative"),
    (pl.col("elo_after") - pl.col("elo_after_8_games_ago")).alias("elo_after_change_absolute"),
    (pl.col("elo_after") / pl.col("elo_after_8_games_ago") - 1).alias("elo_after_change_relative")
])

# Most recent ELO row per team. Sort by date first so tail(1) is the latest game
# and does not depend on the row order the table happened to be returned in.
current_elo = df_2.sort('date').group_by('team_id').tail(1).select(
    ['team_id', 'elo_after', 'elo_after_change_absolute', 'elo_after_change_relative']
)

# Pre-game ELO columns only; joined once for the home side and once for the away side.
elo_pre_game = df_2.drop([
    'elo_after', 'elo_after_change_absolute', 'elo_after_change_relative', 'date',
    'elo_before_8_games_ago', 'elo_after_8_games_ago'
])

# Schedule + statistics + ELO. Games without a row in the elo table (left joins yield nulls)
# fall back to the team's latest ELO values through the coalesce() calls below.
# NOTE(review): the joined-in `elo_after_change_*` columns are not dropped at the end and stay
# in temp_df for every game -- confirm they are not fed to the model as features.
games_with_elo = df_1.join(
    df_3, on='game_id'
).join(
    df_4, on='game_id'
).join(
    df_5, on='game_id'
).join(
    df_6, on='game_id'
).join(
    elo_pre_game,
    left_on=['game_id', 'home_team_id'],
    right_on=['game_id', 'team_id'],
    how='left'
).join(
    elo_pre_game,
    left_on=['game_id', 'away_team_id'],
    right_on=['game_id', 'team_id'],
    how='left'
).join(
    current_elo,
    left_on='home_team_id',
    right_on='team_id'
).join(
    current_elo,
    left_on='away_team_id',
    right_on='team_id'
).with_columns([
    pl.coalesce(pl.col('elo_before'), pl.col('elo_after')).alias('elo_home_team'),
    pl.coalesce(pl.col('elo_before_right'), pl.col('elo_after_right')).alias('elo_away_team'),
    pl.coalesce(pl.col('elo_before_change_absolute'), pl.col('elo_after_change_absolute')).alias('elo_change_absolute_home_team'),
    pl.coalesce(pl.col('elo_before_change_absolute_right'), pl.col('elo_after_change_absolute_right')).alias('elo_change_absolute_away_team'),
    pl.coalesce(pl.col('elo_before_change_relative'), pl.col('elo_after_change_relative')).alias('elo_change_relative_home_team'),
    pl.coalesce(pl.col('elo_before_change_relative_right'), pl.col('elo_after_change_relative_right')).alias('elo_change_relative_away_team'),
    pl.col('date').str.to_date()
])


def _count_prior_upsets(games, team_id, game_date, season_id):
    """Count the upset wins of `team_id` in `games` before `game_date` within `season_id`.

    A game counts when either
      * the team played at home, the home side won, and home ELO + 100 was below the away ELO, or
      * the team played away, the home side lost, and home ELO + 100 was at least the away ELO.
    (The +100 looks like a home-court adjustment -- TODO confirm.)
    Games whose `is_home_win` is null match neither case.

    Args:
        games: frame with home_team_id, away_team_id, is_home_win, elo_home_team,
            elo_away_team, date and season_id columns.
        team_id: team whose upset wins are counted.
        game_date: only games strictly before this date are considered.
        season_id: only games of this season are considered.

    Returns:
        Number of matching rows as an int.
    """
    won_at_home_as_underdog = (
        (pl.col('home_team_id') == team_id) &
        (pl.col('is_home_win')) &
        ((100 + pl.col('elo_home_team')) < pl.col('elo_away_team'))
    )
    won_away_as_underdog = (
        (pl.col('away_team_id') == team_id) &
        (~pl.col('is_home_win')) &
        ((100 + pl.col('elo_home_team')) >= pl.col('elo_away_team'))
    )
    return games.filter(
        (won_at_home_as_underdog | won_away_as_underdog) &
        (pl.col('date') < game_date) &
        (pl.col('season_id') == season_id)
    ).shape[0]


# The upset counts are computed against `games_with_elo`, the frame built just above.
# Previously the lambdas looked up the global `temp_df` while it was still being assigned,
# which fails on a fresh kernel and otherwise counts against a stale frame from an earlier run.
temp_df = games_with_elo.with_columns([
    pl.struct([
        'home_team_id', 'date', 'season_id'
    ]).map_elements(
        lambda row: _count_prior_upsets(
            games_with_elo, row['home_team_id'], row['date'], row['season_id']
        ),
        return_dtype=pl.Int64
    ).alias('upsets_this_year_home'),
    pl.struct([
        'away_team_id', 'date', 'season_id'
    ]).map_elements(
        lambda row: _count_prior_upsets(
            games_with_elo, row['away_team_id'], row['date'], row['season_id']
        ),
        return_dtype=pl.Int64
    ).alias('upsets_this_year_away')
]).drop([
    'elo_before', 'elo_before_right', 'elo_after', 'elo_after_right',
    'elo_before_change_absolute', 'elo_before_change_absolute_right',
    'elo_before_change_relative', 'elo_before_change_relative_right',
    'season_id'
])

In [116]:
# Display the assembled frame (bare last expression, so the notebook renders it).
temp_df

date,game_id,home_team_id,away_team_id,fieldGoalsMade_previous_game_home_team,fieldGoalsAttempted_previous_game_home_team,threePointersMade_previous_game_home_team,threePointersAttempted_previous_game_home_team,freeThrowsMade_previous_game_home_team,freeThrowsAttempted_previous_game_home_team,reboundsOffensive_previous_game_home_team,reboundsDefensive_previous_game_home_team,reboundsTotal_previous_game_home_team,assists_previous_game_home_team,steals_previous_game_home_team,blocks_previous_game_home_team,turnovers_previous_game_home_team,foulsPersonal_previous_game_home_team,points_previous_game_home_team,plusMinusPoints_previous_game_home_team,estimatedPace_previous_game_home_team,pace_previous_game_home_team,pacePer40_previous_game_home_team,possessions_previous_game_home_team,contestedShots_previous_game_home_team,contestedShots2pt_previous_game_home_team,contestedShots3pt_previous_game_home_team,deflections_previous_game_home_team,chargesDrawn_previous_game_home_team,screenAssists_previous_game_home_team,screenAssistPoints_previous_game_home_team,looseBallsRecoveredOffensive_previous_game_home_team,looseBallsRecoveredDefensive_previous_game_home_team,looseBallsRecoveredTotal_previous_game_home_team,offensiveBoxOuts_previous_game_home_team,defensiveBoxOuts_previous_game_home_team,boxOutPlayerTeamRebounds_previous_game_home_team,…,game_type,is_home_win,previous_games,h2h_current_year,h2h_previous_year,winning_percentage_home_team,days_since_last_game_home_team,has_won_last_game_home_team,games_last_7_days_home_team,current_winning_streak_home_team,current_losing_streak_home_team,winning_percentage_away_team,days_since_last_game_away_team,has_won_last_game_away_team,games_last_7_days_away_team,current_winning_streak_away_team,current_losing_streak_away_team,previous_home_wins,previous_home_games,previous_away_wins,previous_away_games,winning_percentage_home,winning_percentage_away,month,weekday,elo_after_change_absolute,elo_after_change_relative,elo_after_change_ab
solute_right,elo_after_change_relative_right,elo_home_team,elo_away_team,elo_change_absolute_home_team,elo_change_absolute_away_team,elo_change_relative_home_team,elo_change_relative_away_team,upsets_this_year_home,upsets_this_year_away
date,str,str,str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,f64,f64,f64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,…,str,bool,i64,f64,f64,f64,i64,bool,i64,i64,i64,f64,i64,bool,i64,i64,i64,i64,i64,i64,i64,f64,f64,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,i64
2021-10-19,"""0022100002""","""1610612747""","""1610612744""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,…,"""Regular Season""",false,0,,0.75,,,,0,0,0,,,,0,0,0,0,0,0,0,,,"""10""","""2""",9.360779,0.005833,36.851355,0.023779,1538.263294,1527.121958,-9.313939,11.955392,-0.006018,0.00789,0,0
2021-10-19,"""0022100001""","""1610612749""","""1610612751""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,…,"""Regular Season""",true,0,,0.6,,,,0,0,0,,,,0,0,0,0,0,0,0,,,"""10""","""2""",22.620177,0.014777,-27.073619,-0.019756,1651.783977,1600.111393,-8.685877,-25.881699,-0.005231,-0.015917,0,0
2021-10-20,"""0022100008""","""1610612750""","""1610612745""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,…,"""Regular Season""",true,0,,0.666667,,,,0,0,0,,,,0,0,0,0,0,0,0,,,"""10""","""3""",24.36242,0.015148,1.146096,0.000737,1442.995056,1341.447128,43.604328,51.328944,0.03116,0.039786,0,0
2021-10-20,"""0022100012""","""1610612756""","""1610612743""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,…,"""Regular Season""",false,0,,0.714286,,,,0,0,0,,,,0,0,0,0,0,0,0,,,"""10""","""3""",-8.559859,-0.005837,38.608897,0.023851,1642.604818,1568.866251,-59.622121,-45.221511,-0.035026,-0.028017,0,0
2021-10-20,"""0022100010""","""1610612759""","""1610612753""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,…,"""Regular Season""",true,0,,1.0,,,,0,0,0,,,,0,0,0,0,0,0,0,,,"""10""","""3""",-17.894784,-0.012437,-30.718901,-0.021304,1472.422552,1328.226834,-1.848385,22.135496,-0.001254,0.016948,0,0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
2025-03-30,"""0022401085""","""1610612740""","""1610612766""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,…,"""Regular Season""",,1,0.0,1.0,,2,,3,0,0,,2,,3,0,0,11,37,6,36,0.297297,0.166667,"""3""","""7""",9.716507,0.00712,-16.824608,-0.013169,1374.393844,1260.759942,9.716507,-16.824608,0.00712,-0.013169,10,10
2025-03-30,"""0022401082""","""1610612752""","""1610612757""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,…,"""Regular Season""",,1,0.0,1.0,,2,,3,0,0,,3,,3,0,0,21,36,12,36,0.583333,0.333333,"""3""","""7""",4.995726,0.003125,17.975733,0.012337,1603.61282,1474.993238,4.995726,17.975733,0.003125,0.012337,9,21
2025-03-30,"""0022401081""","""1610612739""","""1610612746""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,…,"""Regular Season""",,1,0.0,0.5,,2,,4,0,0,,2,,3,0,0,29,36,13,37,0.805556,0.351351,"""3""","""7""",39.244182,0.022744,4.79502,0.003149,1764.69477,1527.270026,39.244182,4.79502,0.022744,0.003149,7,8
2025-03-30,"""0022401083""","""1610612749""","""1610612737""",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,…,"""Regular Season""",,4,0.5,0.666667,,2,,3,0,0,,3,,3,0,0,22,37,15,37,0.594595,0.405405,"""3""","""7""",22.620177,0.014777,11.40871,0.007872,1553.362276,1460.661485,22.620177,11.40871,0.014777,0.007872,11,17


In [102]:
# Same join as the ELO feature cell, but without the upset counts and keeping `season_id`.
# Relies on df_2 already carrying the derived *_8_games_ago / *_change_* columns and on
# `current_elo` being defined.

# Pre-game ELO columns only; reused for the home-side and the away-side join.
elo_pre_game = df_2.drop([
    'elo_after', 'elo_after_change_absolute', 'elo_after_change_relative', 'date',
    'elo_before_8_games_ago', 'elo_after_8_games_ago'
])

# Schedule joined with the four statistics tables on game_id, in the original order.
schedule_with_stats = df_1
for stats_frame in (df_3, df_4, df_5, df_6):
    schedule_with_stats = schedule_with_stats.join(stats_frame, on='game_id')

temp_df = (
    schedule_with_stats
    .join(
        elo_pre_game,
        left_on=['game_id', 'home_team_id'],
        right_on=['game_id', 'team_id'],
        how='left'
    )
    .join(
        elo_pre_game,
        left_on=['game_id', 'away_team_id'],
        right_on=['game_id', 'team_id'],
        how='left'
    )
    .join(current_elo, left_on='home_team_id', right_on='team_id')
    .join(current_elo, left_on='away_team_id', right_on='team_id')
    .with_columns([
        # Prefer the pre-game value of the game itself; fall back to the team's latest value.
        pl.coalesce(pl.col('elo_before'), pl.col('elo_after')).alias('elo_home_team'),
        pl.coalesce(pl.col('elo_before_right'), pl.col('elo_after_right')).alias('elo_away_team'),
        pl.coalesce(pl.col('elo_before_change_absolute'), pl.col('elo_after_change_absolute')).alias('elo_change_absolute_home_team'),
        pl.coalesce(pl.col('elo_before_change_absolute_right'), pl.col('elo_after_change_absolute_right')).alias('elo_change_absolute_away_team'),
        pl.coalesce(pl.col('elo_before_change_relative'), pl.col('elo_after_change_relative')).alias('elo_change_relative_home_team'),
        pl.coalesce(pl.col('elo_before_change_relative_right'), pl.col('elo_after_change_relative_right')).alias('elo_change_relative_away_team'),
        pl.col('date').str.to_date()
    ])
    .drop([
        'elo_before', 'elo_before_right', 'elo_after', 'elo_after_right',
        'elo_before_change_absolute', 'elo_before_change_absolute_right',
        'elo_before_change_relative', 'elo_before_change_relative_right'
    ])
)

In [None]:
# Download the complete boxscore table.
df_7 = collect_all_data('boxscore', connection)

In [113]:
# Preview of the upset counters on a slim view of temp_df.
# Requires a temp_df that still contains `season_id`.
upset_preview_base = temp_df.select([
    "date", "game_id", "home_team_id", "away_team_id", "season_id",
    "elo_home_team", "elo_away_team", "is_home_win"
])


def _season_upsets_before(team_id, game_date, season_id):
    """Number of rows in temp_df where `team_id` won an upset before `game_date` in `season_id`.

    Upset means: home win with home ELO + 100 below the away ELO, or home loss
    with home ELO + 100 at or above the away ELO (team on the away side).
    """
    as_home_side = (
        (pl.col('home_team_id') == team_id) &
        (pl.col('is_home_win')) &
        ((100 + pl.col('elo_home_team')) < pl.col('elo_away_team'))
    )
    as_away_side = (
        (pl.col('away_team_id') == team_id) &
        (~pl.col('is_home_win')) &
        ((100 + pl.col('elo_home_team')) >= pl.col('elo_away_team'))
    )
    earlier_this_season = (
        (pl.col('date') < game_date) &
        (pl.col('season_id') == season_id)
    )
    return temp_df.filter((as_home_side | as_away_side) & earlier_this_season).shape[0]


upset_preview_base.with_columns([
    pl.struct(['home_team_id', 'date', 'season_id']).map_elements(
        lambda row: _season_upsets_before(row['home_team_id'], row['date'], row['season_id']),
        return_dtype=pl.Int64
    ).alias('upsets_this_year_home'),
    pl.struct(['away_team_id', 'date', 'season_id']).map_elements(
        lambda row: _season_upsets_before(row['away_team_id'], row['date'], row['season_id']),
        return_dtype=pl.Int64
    ).alias('upsets_this_year_away')
])

date,game_id,home_team_id,away_team_id,season_id,elo_home_team,elo_away_team,is_home_win,upsets_this_year_home,upsets_this_year_away
date,str,str,str,str,f64,f64,bool,i64,i64
2021-10-19,"""0022100002""","""1610612747""","""1610612744""","""2021""",1538.263294,1527.121958,false,0,0
2021-10-19,"""0022100001""","""1610612749""","""1610612751""","""2021""",1651.783977,1600.111393,true,0,0
2021-10-20,"""0022100008""","""1610612750""","""1610612745""","""2021""",1442.995056,1341.447128,true,0,0
2021-10-20,"""0022100012""","""1610612756""","""1610612743""","""2021""",1642.604818,1568.866251,false,0,0
2021-10-20,"""0022100010""","""1610612759""","""1610612753""","""2021""",1472.422552,1328.226834,true,0,0
…,…,…,…,…,…,…,…,…,…
2025-03-30,"""0022401085""","""1610612740""","""1610612766""","""2024""",1363.556808,1260.759942,,10,10
2025-03-30,"""0022401082""","""1610612752""","""1610612757""","""2024""",1603.61282,1474.993238,,9,21
2025-03-30,"""0022401081""","""1610612739""","""1610612746""","""2024""",1764.221662,1538.107061,,7,8
2025-03-30,"""0022401083""","""1610612749""","""1610612737""","""2024""",1556.138603,1460.661485,,11,17


In [49]:
# Upset counters on the `games` frame (points-based variant).
# NOTE(review): `games` is not defined in any visible cell -- it must already exist in the kernel.


def _upset_wins_before(team_id, game_date):
    """Number of rows in `games` dated before `game_date` in which `team_id` won an upset.

    Home side: more home points than away points while adjusted_elo_home < elo_away.
    Away side: fewer home points than away points while adjusted_elo_home >= elo_away.
    """
    home_upset = (
        (pl.col('home_team_id') == team_id) &
        (pl.col('points_home') > pl.col('points_away')) &
        (pl.col('adjusted_elo_home') < pl.col('elo_away'))
    )
    away_upset = (
        (pl.col('away_team_id') == team_id) &
        (pl.col('points_home') < pl.col('points_away')) &
        (pl.col('adjusted_elo_home') >= pl.col('elo_away'))
    )
    played_earlier = pl.col('date') < game_date
    return games.filter((home_upset | away_upset) & played_earlier).shape[0]


games.sort("date").with_columns([
    pl.struct(["home_team_id", "date"]).map_elements(
        lambda row: _upset_wins_before(row['home_team_id'], row['date']),
        return_dtype=pl.Int64
    ).alias('upsets_this_year_home_team'),
    pl.struct(["away_team_id", "date"]).map_elements(
        lambda row: _upset_wins_before(row['away_team_id'], row['date']),
        return_dtype=pl.Int64
    ).alias('upsets_this_year_away_team')
])

game_id,date,home_team_id,away_team_id,points_home,points_away,adjusted_elo_home,elo_away,upsets_this_year_home_team,upsets_this_year_away_team
str,date,str,str,i64,i64,f64,f64,i64,i64
"""0022400062""",2024-10-22,"""1610612747""","""1610612750""",110,103,1662.852873,1623.810174,0,0
"""0022400061""",2024-10-22,"""1610612738""","""1610612752""",132,109,1788.916473,1562.644899,0,0
"""0022400071""",2024-10-23,"""1610612746""","""1610612756""",113,116,1607.674867,1573.633687,0,0
"""0022400064""",2024-10-23,"""1610612737""","""1610612751""",120,116,1563.68,1436.480791,0,0
"""0022400069""",2024-10-23,"""1610612740""","""1610612741""",123,111,1641.05591,1482.365879,0,0
…,…,…,…,…,…,…,…,…,…
"""0022400926""",2025-03-09,"""1610612757""","""1610612765""",112,119,1588.118233,1572.762085,21,22
"""0022400923""",2025-03-09,"""1610612740""","""1610612763""",104,107,1467.003963,1584.367287,10,15
"""0022400925""",2025-03-09,"""1610612750""","""1610612759""",141,124,1728.976664,1432.591516,12,15
"""0022400927""",2025-03-09,"""1610612746""","""1610612758""",111,110,1635.148932,1556.648489,8,11


In [7]:
print('Collecting Schedule and Boxscores')
# Collect from `newest_date` through three days later.
# NOTE(review): `newest_date` and `season_id` are not defined in any visible cell.
collection_end = newest_date + timedelta(days=3)
season(
    start_date=newest_date,
    end_date=collection_end,
    connection=connection,
    season_id=season_id,
)

Collecting Schedule and Boxscores


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [01:36<00:00, 32.01s/it]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [28:29<00:00, 106.84s/it]


In [28]:
print('Collecting Season Statistics')
# Key dates of the season being processed (looked up once, read three times).
current_season_dates = season_dates.filter(pl.col('season_id') == season_id)
season_2024 = load_season(
    current_season_dates['all_star_date'][0],
    current_season_dates['play_in_start'][0],
    current_season_dates['play_in_end'][0],
    connection=connection,
    season_id=season_id
)
previous_df, recent_games_df, remainder_df, season_df = collect_season_statistics(season_id, connection)

# For each stored statistics table: the freshly computed games whose game_id is not stored yet,
# restricted to that table's columns.
new_data_1, new_data_2, new_data_3, new_data_4 = (
    season_2024.filter(~pl.col('game_id').is_in(stored['game_id'])).select(stored.columns)
    for stored in (previous_df, recent_games_df, remainder_df, season_df)
)

Collecting Season Statistics


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [00:01<00:00, 29.62it/s]


In [29]:
# Earliest date of a game whose stored result is still null but whose fresh result is known.
newly_decided = remainder_df.select(["game_id", "is_home_win"]).join(
    season_2024.select(["game_id", "date", "is_home_win"]),
    on="game_id"
).filter(
    pl.col("is_home_win").is_null() & (pl.col("is_home_win_right").is_not_null())
)
update_date = newly_decided["date"].min()


def _rows_to_refresh(fresh_inserts):
    """Games on or after `update_date` that are not part of the given insert batch."""
    return season_2024.filter(
        (pl.col("date") >= update_date) & (~pl.col("game_id").is_in(fresh_inserts["game_id"]))
    )


# Tables 1 and 4 additionally skip rows where the listed statistic columns are still null.
update_data_1 = _rows_to_refresh(new_data_1).drop_nulls(
    ["fieldGoalsMade_previous_game_home_team", "fieldGoalsMade_previous_game_away_team"]
).select(previous_df.columns)
update_data_2 = _rows_to_refresh(new_data_2).select(recent_games_df.columns)
update_data_3 = _rows_to_refresh(new_data_3).select(remainder_df.columns)
update_data_4 = _rows_to_refresh(new_data_4).drop_nulls(
   ["fieldGoalsMade_109_home_team", "fieldGoalsMade_109_away_team"]
).select(season_df.columns)

In [31]:
# Insert rows for games that are not stored yet; empty frames are skipped.
for table_name, new_rows in (
    ('statistics_previous', new_data_1),
    ('statistics_recent_games', new_data_2),
    ('statistics_remainder', new_data_3),
    ('statistics_season', new_data_4),
):
    if new_rows.shape[0] > 0:
        response = connection.table(table_name).insert(new_rows.to_dicts()).execute()

# Upsert refreshed rows for games that are already stored; empty frames are skipped.
for table_name, refreshed_rows in (
    ('statistics_previous', update_data_1),
    ('statistics_recent_games', update_data_2),
    ('statistics_remainder', update_data_3),
    ('statistics_season', update_data_4),
):
    if refreshed_rows.shape[0] > 0:
        response = connection.table(table_name).upsert(refreshed_rows.to_dicts()).execute()

In [32]:
# Head-to-head rows for the current season (load_season called with return_h2h=True).
current_season_dates = season_dates.filter(pl.col('season_id') == season_id)
h2h_current_year = load_season(
    current_season_dates['all_star_date'][0],
    current_season_dates['play_in_start'][0],
    current_season_dates['play_in_end'][0],
    connection=connection,
    season_id=season_id,
    return_h2h=True
)
h2h_supabase = collect_season_filtered_table(season_id, 'h2h', connection)
# Keep only the games whose game_id is not in the stored h2h table yet.
already_stored = pl.col('game_id').is_in(h2h_supabase['game_id'])
new_data_5  = h2h_current_year.filter(~already_stored).to_dicts()

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [00:01<00:00, 29.67it/s]


In [35]:
# Store the new head-to-head rows, if there are any.
if new_data_5:
    response = connection.table('h2h').insert(new_data_5).execute()

# Current-season record rows; rows with a null games_this_year_home_team are discarded.
current_season_dates = season_dates.filter(pl.col('season_id') == season_id)
rec_current_year = record_current_season(
    current_season_dates['all_star_date'][0],
    current_season_dates['play_in_start'][0],
    current_season_dates['play_in_end'][0],
    connection=connection,
    season_id=season_id,
).drop_nulls('games_this_year_home_team')
rec_current_year_supabase = collect_season_filtered_table(season_id, 'record', connection)
schedule = collect_season_data(season_id, 'schedule', connection)

# New record rows: not stored yet and scheduled no later than today.
# The schedule date is joined in only for that check and removed again afterwards.
not_stored_yet = ~pl.col('game_id').is_in(rec_current_year_supabase['game_id'])
new_data_6 = (
    rec_current_year
    .filter(not_stored_yet)
    .join(schedule.select(["game_id", "date"]), on="game_id")
    .filter(pl.col("date").str.to_date() <= date.today())
    .drop("date")
    .to_dicts()
)

In [None]:
# Insert record rows that are not stored yet.
if new_data_6:
    response = connection.table('record').insert(new_data_6).execute()

# Rows that now have points_home while the stored copy (points_home_right) is still null.
stored_points = rec_current_year_supabase.select(["game_id", "points_home"])
update_data_6 = (
    rec_current_year
    .join(stored_points, on="game_id")
    .filter(pl.col("points_home").is_not_null() & pl.col("points_home_right").is_null())
    .drop("points_home_right")
    .to_dicts()
)
if update_data_6:
    response = connection.table('record').upsert(update_data_6).execute()

In [42]:
# One `record` frame per season: the in-memory frame for the current season, the stored
# table for every other season. (The list used to be built twice, which fetched every
# past season from Supabase two times; the first, discarded build has been removed.)
df_list = [
    rec_current_year if s_id == season_id else collect_season_filtered_table(s_id, 'record', connection)
    for s_id in season_dates['season_id']
]

# Attach the schedule columns to every season and drop season_id.
schedule_df = collect_all_data('schedule', connection)
df_list = [df.join(schedule_df, on='game_id').drop('season_id') for df in df_list]
# Align the last frame's column order with the first so the seasons line up.
# NOTE(review): presumably the last entry is the current season -- confirm season_dates ordering.
df_list[-1] = df_list[-1][df_list[0].columns]

# ELO is computed season by season. Every season after the first receives the previous
# season's result as second argument and ignores games without points_home.
elo_df_list = []
for i, season_games in enumerate(df_list):
    if i == 0:
        elo_df_list.append(
            elo_season(season_games)
        )
    else:
        elo_df_list.append(
            elo_season(season_games.drop_nulls('points_home'), elo_df_list[i - 1])
        )
elo_df = pl.concat(elo_df_list)

# New ELO rows: (game_id, team_id) pairs with no match in the stored elo table.
elo_df_supabase = collect_all_data('elo', connection)
new_data_7 = elo_df.join(
    elo_df_supabase[['game_id', 'team_id', 'elo_before']],
    on=['game_id', 'team_id'],
    how='left'
).filter(pl.col('elo_before_right').is_null()).drop('elo_before_right').to_dicts()

In [44]:
# Store the newly computed ELO rows; nothing is sent when the list is empty.
if new_data_7:
    response = connection.table('elo').insert(new_data_7).execute()

In [45]:
# Fresh GCS handle, then a model wrapper that reads its data from Supabase.
client = storage.Client()
bucket = client.get_bucket("lgbm")
mod = lgbm_model(connection=connection, bucket=bucket, data_origin="supabase")
# Load the stored model into the wrapper.
mod.load_model()

In [54]:
# Run the wrapper's prediction step (the output below reports how many predictions were added).
mod.predict()

Downloaded newest matching file: lgbm_evaluation_february_2025.json to lgbm_evaluation_february_2025.json
7 predictions added


In [48]:
# Step-by-step replay of the prediction logic on the notebook's model instance.
# `self` is never bound at notebook level, so this uses `mod` instead.
# NOTE(review): assumes `self` stood for the `mod` instance -- confirm.
best_features, cutoff_date = mod.load_best_features()
previous_predictions = collect_all_data('predictions', mod.connection)

Downloaded newest matching file: lgbm_evaluation_february_2025.json to lgbm_evaluation_february_2025.json


In [49]:
# `self` is never bound at notebook level, so this uses `mod` instead.
# NOTE(review): assumes `self` stood for the `mod` instance -- confirm.

# Latest date among games that already have a stored prediction.
cutoff_date = mod.full_data.join(
    previous_predictions,
    on='game_id',
    how='inner'
)['date'].max()

# Candidate rows: one-hot encoded categoricals, on or after the cutoff date,
# with a non-null is_home_win, and without a stored prediction.
X_new = mod.full_data.to_dummies([
    'game_type', 'month', 'weekday'
]).filter(
    (pl.col('date') >= cutoff_date) &
    (pl.col('is_home_win').is_not_null()) &
    (~pl.col('game_id').is_in(previous_predictions['game_id']))
)

datetime.date(2025, 3, 2)

In [53]:
# `self` is never bound at notebook level, so this uses `mod` instead.
# NOTE(review): assumes `self` stood for the `mod` instance -- confirm.

# Keep the ids before X_new is narrowed to the selected feature columns.
# NOTE: X_new is overwritten here, so re-running this cell requires rebuilding X_new first.
game_ids = X_new['game_id']
X_new = X_new.select(best_features).drop('date')
predictions = mod.model.predict(X_new.to_numpy())
# A probability of at least 0.5 is reported as a predicted home win.
prediction_df = pl.DataFrame({
    'game_id': game_ids,
    'probability': predictions,
    'is_home_win': predictions >= 0.5
})
prediction_df

game_id,probability,is_home_win
str,f64,bool
"""0022400881""",0.022659,False
"""0022400879""",0.447127,False
"""0022400877""",0.994281,True
"""0022400876""",0.304303,False
"""0022400875""",0.185216,False
"""0022400880""",0.05036,False
"""0022400878""",0.982777,True
