In [None]:
import pandas as pd
import google
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
import joblib
from google.colab import drive
import numpy as np

In [None]:
# Load the raw games table into `df`.
# This cell is written for Google Colab: it mounts Google Drive under
# /content/drive/ and then reads a CSV. Replace the mount call and the path
# with whatever file access paradigm your environment uses.
drive.mount('/content/drive/', force_remount=True)
# Placeholder value -- must be edited to point at the real CSV before running.
file_path = "{Your filepath here}"
df = pd.read_csv(file_path)

**All Columns**

In [None]:

# Initial set of features from the data file that can be considered for
# modelling. The order of this list fixes the column order of any frame
# selected with it. The group headings below are based on column names only.
features = [
    # --- game context ---
    "season", "week", "neutral", "conference_game", "venue_id",
    # --- home_* columns ---
    "home_team_id",
    "home_in_acc", "home_in_aac", "home_in_big12", "home_in_big10",
    "home_in_cusa", "home_independent", "home_in_mac", "home_in_mwc",
    "home_in_pac12", "home_in_sec", "home_in_sunbelt",
    "home_time_change",
    # --- away_* columns ---
    "away_team_id",
    "away_in_acc", "away_in_aac", "away_in_big12", "away_in_big10",
    "away_in_cusa", "away_independent", "away_in_mac", "away_in_mwc",
    "away_in_pac12", "away_in_sec", "away_in_sunbelt",
    "away_fcs", "away_time_change",
    # --- coaching ---
    "home_coach_interim", "away_coach_interim", "tenure_delta",
    # --- line ---
    "spread", "home_favorite",
    # --- weather ---
    "temperature", "dew_point", "humidity", "precipitation", "wind_speed",
    "weather_condition",
    # --- ratings ---
    "home_pregame_elo", "replaced_home_elo", "away_pregame_elo",
    # --- remaining columns ---
    "over_under", "game_indoors", "wind_dir", "atm_pressure",
]

In [None]:
# Separate games that already have an outcome from the upcoming ones.
# A row counts as "past" when its "covered" label is present and as "future"
# when it is missing. The file only has games through the upcoming week, week 6.
has_outcome = df["covered"].notna()
past_games_df = df.loc[has_outcome].copy()
future_games_df = df.loc[~has_outcome].copy()

# Design matrix / target for the labelled games.
X, y = past_games_df[features], past_games_df["covered"]

# Hold out 20% of the labelled games, keeping the class balance of `y` in both
# parts and seeding the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

**Baseline Models**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report

# Baseline classifiers, keyed by the display name used in the report below.
# Each one is fit on the same train split and evaluated on the same test split.
models = {
    "Logistic Regression": LogisticRegression(max_iter=500),
    # random_state added: the tree's split search is randomised, so without a
    # seed this baseline could change between runs while the other seeded
    # models stayed fixed.
    "Decision Tree": DecisionTreeClassifier(max_depth=10, random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=300, random_state=42, class_weight="balanced"),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42)
}

# Fit every baseline and print its per-class precision / recall / F1 on the
# held-out test split.
for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    print(f"=== {name} ===")
    print(classification_report(y_test, preds))
    print()


In [None]:
# Look at mutual information values to determine which features might be irrelevant
from sklearn.feature_selection import mutual_info_classif

# random_state is fixed because the estimator adds small random noise to
# continuous features; without a seed the scores (and the ordering of the
# near-zero features) change on every run.
# NOTE(review): with a dense DataFrame, discrete_features='auto' treats every
# column as continuous, including the 0/1 flag columns -- consider passing an
# explicit boolean mask if the ranking of those flags matters.
mi = mutual_info_classif(X, y, discrete_features='auto', random_state=42)

# Label each score with its column name and list the most informative first.
mi_series = pd.Series(mi, index=X.columns).sort_values(ascending=False)
print(mi_series)

**Try to remove some features and encode team IDs**

In [None]:
# Reduced feature list. Some columns are kept because they seem like they
# could become relevant in combination with features yet to be identified.
# The group headings below are based on column names only.
trimmed_features = [
    # --- game context ---
    "neutral", "conference_game",
    # --- home_* columns ---
    "home_in_acc", "home_in_aac", "home_in_big12", "home_in_big10",
    "home_in_cusa", "home_independent", "home_in_mac", "home_in_mwc",
    "home_in_pac12", "home_in_sec", "home_in_sunbelt",
    "home_time_change",
    # --- away_* columns ---
    "away_in_acc", "away_in_aac", "away_in_big12", "away_in_big10",
    "away_in_cusa", "away_independent", "away_in_mac", "away_in_mwc",
    "away_in_pac12", "away_in_sec", "away_in_sunbelt",
    "away_fcs", "away_time_change",
    # --- coaching ---
    "home_coach_interim", "away_coach_interim", "tenure_delta",
    # --- line ---
    "spread", "home_favorite",
    # --- weather ---
    "temperature", "dew_point", "humidity", "precipitation",
    "weather_condition",
    # --- ratings ---
    "home_pregame_elo", "away_pregame_elo",
    # --- remaining columns ---
    "wind_dir", "atm_pressure",
]

In [None]:
pip install scikit-learn category-encoders

Create a random forest model for now; other model types can be tried later.

In [None]:

from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import make_scorer, f1_score, roc_auc_score, precision_score, recall_score
from category_encoders import TargetEncoder

# Partition the table on whether the "covered" label is present:
# labelled rows are the observed games, unlabelled rows are the future games.
is_labelled = df["covered"].notna()
df_train = df.loc[is_labelled].copy()
df_future = df.loc[~is_labelled].copy()

# Column groups handed to the modelling pipeline.
num_cols = trimmed_features                        # reduced feature list (irrelevant columns removed)
highcard_cols = ["home_team_id", "away_team_id"]   # target-encode these
use_cols = num_cols + highcard_cols

# Feature frame and target for the observed games.
X = df_train[use_cols].copy()
y = df_train["covered"]

# --- Preprocess: impute the numeric features, target-encode the team IDs ---
# Both transformers are steps of the pipeline, so during cross-validation they
# are re-fit on each training fold only => no leakage from the validation fold.
imputed_cols = [c for c in num_cols if c in X.columns]
encoded_cols = [c for c in highcard_cols if c in X.columns]

preprocess = ColumnTransformer(
    transformers=[
        # Previously only the team-ID encoder was listed; combined with
        # remainder="drop" that discarded every column in num_cols, so the
        # classifier was trained on just the two encoded team IDs.
        # Median imputation assumes all num_cols are numeric -- TODO confirm.
        ("num", SimpleImputer(strategy="median"), imputed_cols),
        ("tgt", TargetEncoder(cols=encoded_cols,
                              smoothing=5.0, min_samples_leaf=20),
                encoded_cols),
    ],
    # Anything not explicitly listed above is intentionally excluded.
    remainder="drop",
    verbose_feature_names_out=False
)

# Random forest settings: 500 trees limited to depth 5 with at least 2 samples
# per leaf, class weights balanced, seeded, and using all available cores.
rf_params = dict(
    n_estimators=500,
    max_depth=5,
    min_samples_leaf=2,
    class_weight="balanced",
    random_state=42,
    n_jobs=-1,
)
model = RandomForestClassifier(**rf_params)

# Chain preprocessing and the classifier so they are always fit and applied
# together.
pipe = Pipeline(steps=[("prep", preprocess), ("clf", model)])

# --- Stratified 5-fold cross-validation ---
# Folds are shuffled with a fixed seed so the reported numbers are repeatable.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Built-in scorer names are used instead of make_scorer(<metric>).
# This matters for ROC AUC: make_scorer(roc_auc_score) scores the hard class
# labels returned by .predict(), which is not a ranking-based AUC. The
# "roc_auc" scorer passes the classifier's continuous scores to the metric.
# The remaining entries are the built-in counterparts of the plain metric
# functions, so those numbers are unchanged.
scoring = {
    "acc": "accuracy",
    "f1": "f1",
    "roc_auc": "roc_auc",
    "prec": "precision",
    "rec": "recall",
}

cv_res = cross_validate(pipe, X, y, scoring=scoring, cv=cv, n_jobs=-1, return_train_score=False)

# Keep only the per-fold "test_<metric>" arrays and reduce each to
# (mean, std) across folds.
summary = {k: (np.mean(v), np.std(v)) for k, v in cv_res.items() if k.startswith("test_")}
print({metric.replace("test_",""): f"{m:.3f} ± {s:.3f}" for metric, (m,s) in summary.items()})

# --- Final fit on every labelled game, then score the upcoming games ---
pipe.fit(X, y)
print()

# Only produce picks when there is at least one game without an outcome.
if not df_future.empty:
    X_future = df_future[use_cols].copy()

    # The fitted pipeline applies its own "prep" step before the classifier,
    # so the raw feature columns are passed in directly.
    class_probs = pipe.predict_proba(X_future)
    probs = class_probs[:, 1]                 # probability of the second class
    picks = (probs >= 0.5).astype(int)        # 1 when that probability is at least 0.5

    out = df_future.assign(pred_prob=probs, home_cover=picks)
    report_cols = ["home_team", "away_team", "spread", "home_cover", "pred_prob"]
    print(out[report_cols])
