In [54]:
from pathlib import Path
import pandas as pd
import numpy as np

# Directory holding the processed datasets, relative to the notebook's working directory.
DATA = Path("../data/processed")

# Read the training table into a DataFrame.
# NOTE(review): the recorded columns include `seed_diff.1` next to `seed_diff`,
# which looks like a repeated header in train.csv -- confirm upstream.
train = pd.read_csv(DATA.joinpath("train.csv"))

# Report the (rows, columns) shape, then preview the first five rows.
print("train shape:", train.shape)
train.head()


train shape: (5036, 9)


Unnamed: 0,Season,win_pct_diff,avg_margin_diff,avg_pf_diff,avg_pa_diff,std_margin_diff,seed_diff,seed_diff.1,label
0,1985,-0.030303,-6.830303,-4.4,2.430303,-4.413265,-1,-1,1
1,1985,-0.05931,-0.110345,1.224828,1.335172,-1.382197,-5,-5,1
2,1985,0.546616,20.114943,9.98212,-10.132822,-1.22493,15,15,1
3,1985,0.062169,2.177249,3.199735,1.022487,0.221971,-1,-1,1
4,1985,0.025926,1.077778,8.477778,7.4,1.282731,11,11,1


In [55]:
from pathlib import Path
import pandas as pd

# Reload the training table from the processed-data directory and inspect it.
# NOTE(review): this repeats the load already performed in cell In [54].
DATA = Path("../data/processed")
train = pd.read_csv(DATA.joinpath("train.csv"))

# Shape on the first line, column labels after it, then a preview of the first rows.
print(train.shape, train.columns, sep="\n")
train.head()


(5036, 9)
Index(['Season', 'win_pct_diff', 'avg_margin_diff', 'avg_pf_diff',
       'avg_pa_diff', 'std_margin_diff', 'seed_diff', 'seed_diff.1', 'label'],
      dtype='object')


Unnamed: 0,Season,win_pct_diff,avg_margin_diff,avg_pf_diff,avg_pa_diff,std_margin_diff,seed_diff,seed_diff.1,label
0,1985,-0.030303,-6.830303,-4.4,2.430303,-4.413265,-1,-1,1
1,1985,-0.05931,-0.110345,1.224828,1.335172,-1.382197,-5,-5,1
2,1985,0.546616,20.114943,9.98212,-10.132822,-1.22493,15,15,1
3,1985,0.062169,2.177249,3.199735,1.022487,0.221971,-1,-1,1
4,1985,0.025926,1.077778,8.477778,7.4,1.282731,11,11,1


In [56]:
from sklearn.model_selection import GroupShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, log_loss, roc_auc_score

# Feature columns: every column whose name ends in "_diff". `seed_diff` already
# matches that suffix, so no separate equality check is needed; `seed_diff.1`
# does not match and therefore stays out of the feature set.
X_cols = [c for c in train.columns if c.endswith("_diff")]
y = train["label"].astype(int).values
groups = train["Season"].values

# Split by Season so that all rows of one season fall on the same side of the split.
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(gss.split(train[X_cols], y, groups=groups))

# split() yields positional row indices, so rows are selected with .iloc.
# Label-based .loc only gives the same rows while `train` keeps its default
# 0..n-1 index and would silently pick wrong rows (or raise) otherwise.
X_train, X_test = train[X_cols].iloc[train_idx], train[X_cols].iloc[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

# Standardize the features, then fit a logistic regression on them.
pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(max_iter=2000)),
])

pipe.fit(X_train, y_train)

# Column 1 of predict_proba is the probability of the second class in
# clf.classes_ (label 1, assuming the labels are 0/1 -- TODO confirm).
proba = pipe.predict_proba(X_test)[:, 1]
pred = (proba >= 0.5).astype(int)

# Hold-out metrics: thresholded accuracy plus two probability-based scores.
print("Accuracy:", accuracy_score(y_test, pred))
print("ROC AUC :", roc_auc_score(y_test, proba))
print("LogLoss :", log_loss(y_test, proba))


Accuracy: 0.6316793893129771
ROC AUC : 0.6925040790163743
LogLoss : 0.6389514457520175


In [57]:
# Coefficients of the fitted classifier, labelled by feature name and ordered
# from largest to smallest absolute value.
# NOTE(review): the recorded output shows seed_diff at about -1.4e-16, i.e.
# effectively zero -- worth verifying how seed_diff is built upstream.
coefs = (
    pd.Series(pipe.named_steps["clf"].coef_[0], index=X_cols)
    .sort_values(key=lambda values: abs(values), ascending=False)
)
coefs.head(15)

avg_margin_diff    7.982399e-01
avg_pf_diff        4.351215e-01
avg_pa_diff       -2.496565e-01
win_pct_diff      -1.051894e-01
std_margin_diff   -4.920149e-02
seed_diff         -1.371344e-16
dtype: float64

In [58]:
import joblib

# Directory that receives serialized models, relative to the notebook's working directory.
MODEL_DIR = Path("../models")
# parents=True also creates missing intermediate directories, so this cell keeps
# working if MODEL_DIR is later pointed at a nested location.
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# Build the artifact path once so the dump and the log line cannot drift apart.
model_path = MODEL_DIR / "logreg_baseline.joblib"
joblib.dump(pipe, model_path)
print("Saved:", model_path.resolve())


Saved: /Users/owen/mm_model/models/logreg_baseline.joblib


In [59]:
# Print the column labels of `train`; the recorded output lists a `seed_diff.1`
# column alongside `seed_diff`.
print(train.columns)


Index(['Season', 'win_pct_diff', 'avg_margin_diff', 'avg_pf_diff',
       'avg_pa_diff', 'std_margin_diff', 'seed_diff', 'seed_diff.1', 'label'],
      dtype='object')


In [60]:
# Feature columns: names ending in "_diff" (or exactly "seed_diff"), kept in
# the order they appear in `train`.
X_cols = list(
    filter(
        lambda name: name.endswith("_diff") or name == "seed_diff",
        train.columns,
    )
)

# Target as an integer array, and the Season of each row for grouped splitting.
y = train["label"].astype(int).values
groups = train["Season"].values

print("Features used:", X_cols)


Features used: ['win_pct_diff', 'avg_margin_diff', 'avg_pf_diff', 'avg_pa_diff', 'std_margin_diff', 'seed_diff']
