In [1]:
import os
import sys

import pandas as pd
from sklearn.base import clone
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# ensure project root on path for src imports
sys.path.append(os.path.abspath(".."))

from src.features import build_feature_pipeline

# Locate the cleaned shot data: try the path relative to the working directory
# first, then the same path one directory up.
DATA_PATH = "data/nba_shots_clean.csv"
for candidate_path in (DATA_PATH, os.path.join("..", "data", "nba_shots_clean.csv")):
    if os.path.exists(candidate_path):
        DATA_PATH = candidate_path
        break
else:
    # Neither location holds the file, so stop with an actionable message.
    raise FileNotFoundError(
        "nba_shots_clean.csv not found. Run the cleaning notebook to generate it."
    )

# Read the cleaned shot log and report its size.
df = pd.read_csv(DATA_PATH)
print("Loaded dataset shape:", df.shape)

# Number of columns per dtype, keyed by the dtype's string name.
dtype_counts = df.dtypes.astype(str).value_counts()
print("Datatype coverage (cleaned data):\n", dtype_counts)

# Every column except the target label, in original column order.
is_feature_column = df.columns != "SHOT_MADE_FLAG"
available_feature_columns = df.columns[is_feature_column].tolist()
print(f"Available feature columns (excluding target): {len(available_feature_columns)}")
print("Sample preserved columns:", available_feature_columns[:10])

# Columns used as model inputs; the label is SHOT_MADE_FLAG.
FEATURE_COLUMNS = ["LOC_X", "LOC_Y", "SHOT_DISTANCE", "YEAR", "SHOT_TYPE", "ACTION_TYPE"]
X = df.loc[:, FEATURE_COLUMNS]
y = df.loc[:, "SHOT_MADE_FLAG"]

# Hold out 20% of the rows for testing. The split is stratified on the label
# and seeded so that re-running the cell reproduces the same partition.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

# Preview the first training rows and labels as the cell output.
X_train.head(), y_train.head()


Loaded dataset shape: (4231262, 27)
Datatype coverage (cleaned data):
 object     14
int64      10
float64     2
bool        1
Name: count, dtype: int64
Available feature columns (excluding target): 26
Sample preserved columns: ['TEAM_NAME', 'PLAYER_NAME', 'LOC_X', 'LOC_Y', 'SHOT_DISTANCE', 'SHOT_TYPE', 'ACTION_TYPE', 'YEAR', 'SEASON_1', 'SEASON_2']


(         LOC_X  LOC_Y  SHOT_DISTANCE  YEAR       SHOT_TYPE  \
 899817    -9.0  18.75             16  2008  2PT Field Goal   
 2984657  -10.5  18.55             16  2019  2PT Field Goal   
 498554     3.4   8.35              4  2006  2PT Field Goal   
 3820607    8.8  27.85             24  2023  3PT Field Goal   
 348900     4.3  12.45              8  2005  2PT Field Goal   
 
                       ACTION_TYPE  
 899817                  Jump Shot  
 2984657       Step Back Jump shot  
 498554   Driving Finger Roll Shot  
 3820607                 Jump Shot  
 348900         Fadeaway Jump Shot  ,
 899817     1
 2984657    0
 498554     0
 3820607    0
 348900     0
 Name: SHOT_MADE_FLAG, dtype: int64)

In [2]:
# Build the preprocessing step from the project helper.
# NOTE(review): the full frame (test rows included) is passed to the builder —
# confirm it only inspects column names/dtypes and does not learn from the data.
preprocessor, feature_list = build_feature_pipeline(df)

# Baseline model: preprocessing followed by logistic regression.
# NOTE(review): the recorded run of this cell stopped at the iteration limit
# (max_iter=500) without converging, so the printed metrics come from an
# unconverged fit. Scaling the inputs or a different solver may be needed.
baseline_steps = [
    ("preprocess", preprocessor),
    ("clf", LogisticRegression(max_iter=500)),
]
log_reg_model = Pipeline(steps=baseline_steps)
log_reg_model.fit(X_train, y_train)

# Probability of the second class (column 1 of predict_proba) and hard labels
# for the held-out rows.
y_prob_lr = log_reg_model.predict_proba(X_test)[:, 1]
y_pred_lr = log_reg_model.predict(X_test)

lr_accuracy = accuracy_score(y_test, y_pred_lr)
lr_roc_auc = roc_auc_score(y_test, y_prob_lr)
print("Accuracy (LR):", round(lr_accuracy, 3))
print("ROC-AUC (LR):", round(lr_roc_auc, 3))


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=500).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy (LR): 0.626
ROC-AUC (LR): 0.644


In [3]:
# XGBoost model (optional; requires libomp on macOS)
#
# This cell is best-effort: if xgboost cannot be imported, or training fails,
# it prints a hint, sets `xgb_available` to False and lets the notebook go on.
# Later cells should check `xgb_available` before using `model_xgb`.
xgb_available = True
try:
    from xgboost import XGBClassifier
except Exception as e:  # ImportError or missing libomp
    xgb_available = False
    print("XGBoost not available. Install with `pip install xgboost` and on macOS run `brew install libomp`.")
    print("Skipping XGBoost training. Error:", e)

if xgb_available:
    try:
        model_xgb = Pipeline(
            steps=[
                # Use a fresh, unfitted copy of the preprocessor. Pipeline.fit
                # fits its steps in place, so reusing the very instance that
                # log_reg_model holds would make both models (and the saved
                # artifact) share one mutable transformer.
                ("preprocess", clone(preprocessor)),
                (
                    "clf",
                    XGBClassifier(
                        n_estimators=300,
                        learning_rate=0.05,
                        max_depth=6,
                        subsample=0.8,
                        colsample_bytree=0.8,
                        eval_metric="logloss",
                        n_jobs=-1,
                    ),
                ),
            ]
        )

        model_xgb.fit(X_train, y_train)

        # Column 1 of predict_proba is the probability of the second class.
        y_prob_xgb = model_xgb.predict_proba(X_test)[:, 1]
        print("ROC-AUC (XGB):", round(roc_auc_score(y_test, y_prob_xgb), 3))
    except Exception as e:
        # Any failure while building, fitting or scoring marks the model as
        # unavailable, so the save step is skipped rather than persisting a
        # half-built pipeline.
        xgb_available = False
        print("XGBoost failed to train; skipping. Error:", e)
        print("On macOS, run: brew install libomp")


ROC-AUC (XGB): 0.675


In [4]:
import joblib

# Persist the XGBoost pipeline only when the training cell both defined it and
# reported success.
# NOTE(review): the output path is relative to the working directory, whereas
# the data lookup also tries the parent directory — confirm this is intended.
xgb_ready = "model_xgb" in locals() and xgb_available
if not xgb_ready:
    print("XGBoost model not available; skipping save.")
else:
    model_path = "models/shot_model_xgb.pkl"
    # Create the target directory ("models") if it does not exist yet.
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    joblib.dump(model_xgb, model_path)
    print("Saved XGB model to", model_path)


Saved XGB model to models/shot_model_xgb.pkl
