In [6]:
# Reload imported modules automatically before each cell executes, so edits to
# project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Show the value of every top-level expression in a cell, not only the last one.
%config InteractiveShell.ast_node_interactivity = "all"

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
import sys
import os

# Make the directory one level above the current working directory importable
# (appended to the end of the module search path).
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), os.pardir)))

In [4]:
import pandas as pd
import pyarrow.parquet as pq
from pathlib import Path

In [5]:
# Processed shot data: one parquet file per season, stored relative to the notebook.
processed_dir = Path('..') / 'data' / 'processed'
path2023 = processed_dir / 'processed_shots_2023.parquet'
path2024 = processed_dir / 'processed_shots_2024.parquet'
path2025 = processed_dir / 'processed_shots_2025.parquet'

processed2023 = pd.read_parquet(path2023, engine='pyarrow')
processed2024 = pd.read_parquet(path2024, engine='pyarrow')
processed2025 = pd.read_parquet(path2025, engine='pyarrow')

# Stack the seasons into a single frame. ignore_index=True assigns a fresh
# 0..n-1 index; without it each frame keeps its own labels, so the combined
# frame could contain repeated index values and label-based lookups would be
# ambiguous.
full_df = pd.concat([processed2023, processed2024, processed2025], ignore_index=True)
full_df.head()

Unnamed: 0,home,last_play,rebound,rush,home_skaters,away_skaters,x_coord,y_coord,shot_type,zone,shot_class,position,shooter_hand,shooting_pct,glove_hand,save_pct,angle,shot_on_glove,situation,target
0,Home,giveaway,No rebound,Rush,5,5,77,6,wrist,O,shot-on-goal,C,L,0.088,L,0.915213,26.57,LL,EV,0
1,Home,hit,No rebound,No rush,5,5,30,30,slap,O,shot-on-goal,D,R,0.0427,L,0.915213,26.95,RL,EV,0
2,Away,shot-on-goal,No rebound,No rush,5,5,35,30,wrist,O,shot-on-goal,D,L,0.0439,L,0.917598,29.05,LL,EV,0
3,Home,hit,No rebound,No rush,5,5,41,14,wrist,O,shot-on-goal,R,R,0.1086,L,0.915213,16.26,RL,EV,0
4,Home,hit,No rebound,No rush,5,5,46,17,wrist,O,shot-on-goal,R,R,0.1086,L,0.915213,21.57,RL,EV,0


In [7]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [8]:
# Keep only rows without any missing value.
cleaned_df = full_df.dropna()

# Label is the `target` column; features are every remaining column except
# `shot_class`.
# NOTE(review): presumably `shot_class` is excluded because it encodes the shot
# outcome — confirm.
y = cleaned_df['target']
X = cleaned_df.drop(columns=['target', 'shot_class'])

# 80/20 split, stratified on the label, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [9]:
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, FunctionTransformer, StandardScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_selector, make_column_transformer

# Object-dtype columns are one-hot encoded; categories not seen during fit are
# ignored at transform time instead of raising.
cat_pipeline = make_pipeline(OneHotEncoder(handle_unknown="ignore"))

# Every other column is standardized and then min-max scaled to [-1, 1].
# NOTE(review): min-max scaling directly after standardization makes the
# StandardScaler step look redundant — confirm whether both are intended.
num_pipeline = make_pipeline(StandardScaler(), MinMaxScaler(feature_range=(-1, 1)))

preprocessing = ColumnTransformer(
    transformers=[
        ("cat", cat_pipeline, make_column_selector(dtype_include=object)),
    ],
    remainder=num_pipeline,
)

In [10]:
# Fit the full pipeline (preprocessing + logistic regression) on the training split.
log_reg = make_pipeline(preprocessing, LogisticRegression())
log_reg.fit(X_train, y_train)

# Sanity check: first few training predictions next to their true labels.
# - Predictions are 0/1 class labels and are shown unrounded; rounding to the
#   nearest hundred would display every predicted 1 as 0.
# - Labels are taken from y_train so each one belongs to the same row as the
#   corresponding prediction (the split shuffles rows, so the first rows of the
#   full label series are different shots).
goal_predictions = log_reg.predict(X_train)
goal_predictions[:5]
y_train.iloc[:5].values

array([0, 0, 0, 0, 0])

array([0, 0, 0, 0, 0])

In [11]:
# Hard class predictions and per-class probabilities for the held-out split.
pred_test = log_reg.predict(X=X_test)
probabilities_log = log_reg.predict_proba(X=X_test)

In [12]:
# Inspect the predicted probabilities for the test split: one row per shot, one
# column per class (column 1 is used as the positive-class score further down).
probabilities_log

array([[0.97693399, 0.02306601],
       [0.98286758, 0.01713242],
       [0.79802156, 0.20197844],
       ...,
       [0.79955193, 0.20044807],
       [0.85104428, 0.14895572],
       [0.97689727, 0.02310273]], shape=(136670, 2))

In [13]:
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report

# ROC AUC on both splits, scored with the probability in column 1 of predict_proba.
auc_test = roc_auc_score(y_true=y_test, y_score=probabilities_log[:, 1])
auc_train = roc_auc_score(y_true=y_train, y_score=log_reg.predict_proba(X_train)[:, 1])

# Threshold-based view of the test predictions: confusion matrix (rows are true
# classes, columns are predicted classes) followed by per-class metrics.
confusion_matrix(y_true=y_test, y_pred=pred_test)
print(classification_report(y_true=y_test, y_pred=pred_test))

array([[127515,      1],
       [  9147,      7]])

              precision    recall  f1-score   support

           0       0.93      1.00      0.97    127516
           1       0.88      0.00      0.00      9154

    accuracy                           0.93    136670
   macro avg       0.90      0.50      0.48    136670
weighted avg       0.93      0.93      0.90    136670



In [14]:
# Display test and train ROC AUC together; a small gap between them suggests the
# model is not overfitting the training split.
auc_test
auc_train

np.float64(0.7004215724443303)

np.float64(0.7028205862841514)

In [15]:
from src.experiment_utils import set_mlflow_tracking, log_model_to_mlflow
from dotenv import load_dotenv
# Load variables from a .env file into the process environment.
# NOTE(review): presumably this supplies the MLflow tracking credentials used
# by the helper below — confirm.
load_dotenv()

# Configure MLflow tracking via the project helper and keep its return value.
# NOTE(review): the result is bound to the name `mlflow`; presumably the helper
# returns the configured mlflow module — verify in src.experiment_utils.
mlflow = set_mlflow_tracking()

True

INFO:src.experiment_utils:MLflow tracking URI and credentials set.


In [16]:
# Log the fitted pipeline and its test AUC to MLflow under the
# "Logistic_regression" experiment; X_test is passed along with the model
# (the helper's log reports inferring a model signature).
# NOTE(review): the run output shows the model registered under the name
# 'Pipeline' — check whether the helper should use a more descriptive name.
log_model_to_mlflow(log_reg, X_test, "Logistic_regression", "AUC", score=auc_test)

2025/03/31 16:35:01 INFO mlflow.tracking.fluent: Experiment with name 'Logistic_regression' does not exist. Creating a new experiment.
INFO:src.experiment_utils:Experiment set to: Logistic_regression
INFO:src.experiment_utils:Logged AUC: 0.7004215724443303
INFO:src.experiment_utils:Model signature inferred.
Successfully registered model 'Pipeline'.
2025/03/31 16:35:50 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Pipeline, version 1
Created version '1' of model 'Pipeline'.
INFO:src.experiment_utils:Model logged with name: Pipeline


🏃 View run painted-jay-155 at: https://dagshub.com/nolantphillips/xG_models.mlflow/#/experiments/0/runs/26020dd486854d2e832e745fd1bb76cc
🧪 View experiment at: https://dagshub.com/nolantphillips/xG_models.mlflow/#/experiments/0


<mlflow.models.model.ModelInfo at 0x24e57569f70>