In [168]:
import subprocess
import os
import argparse
import pandas as pd
import numpy as np
import requests
from sqlalchemy import create_engine
from dotenv import load_dotenv
import logging
from sqlalchemy import text
import duckdb
import datetime
from bs4 import BeautifulSoup
pd.set_option('display.max_columns', None)

load_dotenv()
DATABASE_URL = f"postgresql://{os.getenv('DB_USER')}:{os.getenv('DB_PASSWORD')}@{os.getenv('DB_HOST')}:{os.getenv('DB_PORT')}/{os.getenv('DB_NAME')}"
engine = create_engine(DATABASE_URL)

In [169]:
engine 
start_year=2024 
end_year=2024 
parquet_folder = 'data/raw/nflfastr/'

In [170]:
weekly_df = pd.read_sql("SELECT * FROM weekly_stats", engine)
players_df = pd.read_sql("SELECT * FROM players", engine)
teams_df = pd.read_sql('SELECT * FROM teams', engine)
games_df = pd.read_sql('SELECT * FROM games', engine)
depth_df = pd.read_sql('SELECT * FROM depth_chart', engine)
injuries_df = pd.read_sql('SELECT * FROM injuries', engine)
schedule_df = pd.read_sql("SELECT season, week, stadium, game_date FROM games", engine)
base_features_df = pd.read_sql("SELECT * FROM player_weekly_features", engine)
pre_features_df = pd.read_sql("SELECT * FROM player_weekly_features", engine)
final_df = pd.read_sql("SELECT * FROM features", engine)
qb_df = pd.read_sql("SELECT * FROM qb_features", engine)
rb_df = pd.read_sql("SELECT * FROM rb_features", engine)
wr_df = pd.read_sql("SELECT * FROM wr_features", engine)
te_df = pd.read_sql("SELECT * FROM te_features", engine)

In [210]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from xgboost import XGBRegressor
import joblib
import argparse
from dotenv import load_dotenv
import os
from datetime import datetime
pd.set_option('mode.chained_assignment', None)

In [253]:
POSITION = 'qb'
MODEL_DIR = "models"
TABLE_NAME = f"{POSITION}_features"
TARGET = "fantasy_points"
EXCLUDE_COLS = ["player_id", "player_name", "season", "week", "player_display_name", "position", "team_abbreviation", TARGET]

BASE_DIR = os.path.dirname(os.path.abspath(__file__))
MODEL_DIR = os.path.join(BASE_DIR, "models")
LOG_DIR = os.path.join(BASE_DIR, "model_logs")
# ------------------------
# FUNCTIONS
# ------------------------
def load_data():
    load_dotenv()
    DATABASE_URL = f"postgresql://{os.getenv('DB_USER')}:{os.getenv('DB_PASSWORD')}@{os.getenv('DB_HOST')}:{os.getenv('DB_PORT')}/{os.getenv('DB_NAME')}"
    engine = create_engine(DATABASE_URL)

    df = pd.read_sql(f"SELECT * FROM {TABLE_NAME}", engine)
    return df


def train_test_split_for_week(df, season, week):
    """
    Simulate predicting for a specific season/week by holding out that week as test.
    All prior weeks are used for training.
    """
    train_df = df[(df["season"] < season) | ((df["season"] == season) & (df["week"] < week))]
    test_df = df[(df["season"] == season) & (df["week"] == week)]
    return train_df, test_df


def get_features(df, train_df, test_df):
    """Select feature columns dynamically and handle categorical/bool types automatically."""
    features = [col for col in df.columns if col not in EXCLUDE_COLS]

    # Detect categorical columns (object or category types)
    cat_cols = train_df.select_dtypes(include=["object", "category"]).columns
    for col in cat_cols:
        train_df[col] = train_df[col].astype("category")
        test_df[col] = test_df[col].astype("category")

    # Detect boolean-like columns (0/1 or True/False)
    bool_cols = [col for col in train_df.columns 
                 if train_df[col].dropna().nunique() == 2 and 
                 sorted(train_df[col].dropna().unique()) in [[0, 1], [False, True], ['false','true']]]
    for col in bool_cols:
        train_df[col] = train_df[col].astype(bool)
        test_df[col] = test_df[col].astype(bool)

    train_df['dome'] = train_df['dome'].astype(bool)
    test_df['dome'] = test_df['dome'].astype(bool)

    X_train = train_df[features]
    y_train = train_df[TARGET]

    X_test = test_df[features]
    y_test = test_df[TARGET]

    return features, X_train, y_train, X_test, y_test

def train_model(train_df, features):
    """Train an XGBoost regression model."""
    model = XGBRegressor(
        n_estimators=500,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        enable_categorical=True
    )
    model.fit(train_df[features], train_df[TARGET])
    return model

def evaluate_model(model, X_train, y_train, X_test, y_test, season, week,
                   position=POSITION, model_name=None, log_dir=LOG_DIR):
    if model_name is None:
        model_name = f"{position}_model_{datetime.now().strftime('%Y%m%d%H%M%S')}"
    print("Evaluating model performance...")

    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    metrics = {
        "season": season,
        "week": week,
        "position": position,
        "rmse_train": np.sqrt(mean_squared_error(y_train, y_pred_train)),
        "mae_train": mean_absolute_error(y_train, y_pred_train),
        "r2_train": r2_score(y_train, y_pred_train),
        "rmse_test": np.sqrt(mean_squared_error(y_test, y_pred_test)),
        "mae_test": mean_absolute_error(y_test, y_pred_test),
        "r2_test": r2_score(y_test, y_pred_test),
        "timestamp": datetime.now().strftime("%Y%m%d%H%M%S"),
        "model_name": model_name
    }

    print("\nModel Evaluation Metrics:")
    for k, v in metrics.items():
        if isinstance(v, float):
            print(f"{k}: {v:.4f}")
        else:
            print(f"{k}: {v}")

    # Logging path
    os.makedirs(log_dir, exist_ok=True)
    csv_path = os.path.join(log_dir, "metrics_log.csv")

    if os.path.exists(csv_path):
        df_log = pd.read_csv(csv_path)
    else:
        df_log = pd.DataFrame()

    # Find best previous model for this season/week/position by test MAE
    best_mae = df_log.loc[
        (df_log['season'] == season) &
        (df_log['week'] == week) &
        (df_log['position'] == position),
        'mae_test'
    ].min() if not df_log.empty else np.inf

    if metrics['mae_test'] < best_mae:
        print(f"New model outperforms existing best MAE ({best_mae:.4f}). Will save model.")
        save_flag = True
    else:
        print(f"Model did NOT outperform existing best MAE ({best_mae:.4f}). Model will NOT be saved.")
        save_flag = False

    # Append new metrics anyway
    df_log = pd.concat([df_log, pd.DataFrame([metrics])], ignore_index=True)
    df_log.to_csv(csv_path, index=False)
    print(f"Metrics appended to {csv_path}")

    return metrics, save_flag


def save_model(model, filename = f"qb_model_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.pkl"):
    """Save model to disk."""
    os.makedirs(MODEL_DIR, exist_ok=True)
    path = os.path.join(MODEL_DIR, filename)
    joblib.dump(model, path)
    print(f"Model saved to {path}")

NameError: name '__file__' is not defined

In [None]:
df = load_data()
train_df, test_df = train_test_split_for_week(df, season=2024, week=8)
features, X_train, y_train, X_test, y_test = get_features(df, train_df, test_df)
model = train_model(train_df, features)
metrics, save_flag = evaluate_model(model, X_train, y_train, X_test, y_test)
save_model(model)


Model Evaluation Metrics:
rmse_train: 0.1099
mae_train: 0.0776
r2_train: 0.9998
rmse_test: 2.7148
mae_test: 1.9712
r2_test: 0.8896
timestamp: 2025-08-08 00:13:43
model_name: qb_model_2025-08-07 23:41:23
Metrics appended to model_logs/metrics_log.csv


ValueError: too many values to unpack (expected 2)

In [206]:
evaluate_model(model, X_train, y_train, X_test, y_test)


Model Evaluation Metrics:
rmse_train: 0.1549
mae_train: 0.1099
r2_train: 0.9997
rmse_test: 2.4957
mae_test: 1.7383
r2_test: 0.9067
timestamp: 2025-08-07 23:31:30
model_name: qb_model_2025-08-07 23:31:26
Metrics appended to modeling/model_logs/metrics_log.csv


{'rmse_train': np.float64(0.15489133389522242),
 'mae_train': 0.10993546221180071,
 'r2_train': 0.9997002206054424,
 'rmse_test': np.float64(2.4956915882256125),
 'mae_test': 1.7383012952449475,
 'r2_test': 0.9066947024917293,
 'timestamp': '2025-08-07 23:31:30',
 'model_name': 'qb_model_2025-08-07 23:31:26'}

In [254]:
df = pd.read_csv('../src/modeling/model_logs/metrics_log.csv')

In [255]:
df

Unnamed: 0,season,week,position,rmse_train,mae_train,r2_train,rmse_test,mae_test,r2_test,timestamp,model_name
0,2024.0,16.0,qb,0.138277,0.099496,0.999763,1.953717,1.550636,0.927511,20250808162936,qb_model_season2024_week16_20250808162936.pkl
1,,,,0.15236,0.083271,0.999283,0.379498,0.208738,0.995105,20250808162939,wr_model_season2024_week16_20250808162939.pkl
2,,,,0.119383,0.077948,0.999703,1.227154,0.613614,0.97792,20250808162942,rb_model_season2024_week16_20250808162942.pkl
3,,,,0.052052,0.034624,0.999852,0.405181,0.205259,0.985529,20250808162944,te_model_season2024_week16_20250808162944.pkl


In [243]:
datetime.now().strftime("%Y%m%d%H%M%S")

'20250807234827'