In [None]:
import os
import sys

import pandas as pd
from sklearn.base import clone
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# ensure project root on path for src imports
sys.path.append(os.path.abspath(".."))

from src.features import build_feature_pipeline

# Locate the cleaned shot data: try ./data first, then fall back to ../data.
DATA_PATH = "data/nba_shots_clean.csv"
if not os.path.exists(DATA_PATH):
    alt_path = os.path.join("..", "data", "nba_shots_clean.csv")
    if not os.path.exists(alt_path):
        raise FileNotFoundError(
            "nba_shots_clean.csv not found. Run the cleaning notebook to generate it."
        )
    DATA_PATH = alt_path

df = pd.read_csv(DATA_PATH)
print("Loaded dataset shape:", df.shape)

# Tally how many columns carry each dtype (dtype names rendered as strings).
dtype_counts = df.dtypes.astype(str).value_counts()
print("Datatype coverage (cleaned data):\n", dtype_counts)

# Every column other than the label counts as an available feature.
TARGET_COLUMN = "SHOT_MADE_FLAG"
available_feature_columns = [name for name in df.columns if name != TARGET_COLUMN]
print(f"Available feature columns (excluding target): {len(available_feature_columns)}")
print("Sample preserved columns:", available_feature_columns[:10])

# Columns selected as model inputs for this notebook.
MODEL_INPUT_COLUMNS = [
    "LOC_X",
    "LOC_Y",
    "SHOT_DISTANCE",
    "YEAR",
    "SHOT_TYPE",
    "ACTION_TYPE",
]
X = df[MODEL_INPUT_COLUMNS]
y = df[TARGET_COLUMN]

# Hold out 20% of the rows, stratified on the label, with a fixed seed so the
# split is the same on every run.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Cell output: first rows of the training features and labels.
X_train.head(), y_train.head()


In [None]:
# Preprocessing transformer, plus the feature list reported by src.features.
# NOTE(review): feature_list is not used in the cells visible here, while X is a
# hand-picked column list -- confirm the preprocessor expects exactly those columns.
preprocessor, feature_list = build_feature_pipeline(df)

# Baseline model: preprocessing followed by logistic regression.
baseline_clf = LogisticRegression(max_iter=500)
log_reg_model = Pipeline(steps=[("preprocess", preprocessor), ("clf", baseline_clf)])
log_reg_model.fit(X_train, y_train)

# Hard class predictions feed accuracy; column 1 of predict_proba feeds ROC-AUC.
y_pred_lr = log_reg_model.predict(X_test)
y_prob_lr = log_reg_model.predict_proba(X_test)[:, 1]

lr_accuracy = accuracy_score(y_test, y_pred_lr)
lr_roc_auc = roc_auc_score(y_test, y_prob_lr)
print("Accuracy (LR):", round(lr_accuracy, 3))
print("ROC-AUC (LR):", round(lr_roc_auc, 3))


In [None]:
# XGBoost model (optional; requires libomp on macOS)
#
# This cell is best-effort: if xgboost cannot be imported or training fails, it
# prints the error, sets xgb_available to False and lets the notebook continue.
xgb_available = True
try:
    from xgboost import XGBClassifier
except Exception as e:  # ImportError or missing libomp
    xgb_available = False
    print("XGBoost not available. Install with `pip install xgboost` and on macOS run `brew install libomp`.")
    print("Skipping XGBoost training. Error:", e)

if xgb_available:
    try:
        model_xgb = Pipeline(
            steps=[
                # Pipeline fits its steps in place without cloning them, so use
                # an unfitted copy here; passing `preprocessor` itself would
                # re-fit the very instance that log_reg_model already holds.
                ("preprocess", clone(preprocessor)),
                (
                    "clf",
                    XGBClassifier(
                        n_estimators=300,
                        learning_rate=0.05,
                        max_depth=6,
                        subsample=0.8,
                        colsample_bytree=0.8,
                        eval_metric="logloss",
                        n_jobs=-1,
                    ),
                ),
            ]
        )

        model_xgb.fit(X_train, y_train)

        # Column 1 of predict_proba is scored against the held-out labels.
        y_prob_xgb = model_xgb.predict_proba(X_test)[:, 1]
        print("ROC-AUC (XGB):", round(roc_auc_score(y_test, y_prob_xgb), 3))
    except Exception as e:
        # Mark the model unusable so later cells skip it even if model_xgb
        # was assigned before the failure.
        xgb_available = False
        print("XGBoost failed to train; skipping. Error:", e)
        print("On macOS, run: brew install libomp")


In [None]:
import joblib

# Persist the XGBoost pipeline only when the training cell produced a usable one.
# The name check comes first so xgb_available is never read if model_xgb is absent.
xgb_ready = xgb_available if "model_xgb" in locals() else False

if not xgb_ready:
    print("XGBoost model not available; skipping save.")
else:
    # NOTE(review): "models" is relative to the current working directory, and
    # unlike the data path there is no ".." fallback -- confirm this is the
    # intended location when the notebook is run from a subdirectory.
    model_dir = "models"
    os.makedirs(model_dir, exist_ok=True)
    model_path = "models/shot_model_xgb.pkl"
    joblib.dump(model_xgb, model_path)
    print("Saved XGB model to", model_path)
