# 02 - Model Training & Prediction
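Trains multiple regression models (Random Forest, XGBoost, CatBoost, Ridge) on the master dataset to predict rookie fantasy points (Half-PPR + TE Premium) from college stats. The draft-related columns (`draft_round`, `draft_pick_overall`, `age_on_draft_day`) are present in the data but are dropped before training, so they are not currently used as features.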

**Input:** `data/processed/df_master.csv`, `data/processed/df_master_rookies_2025.csv`
**Output:** `data/output/df_rookie_predictions_2025.csv`

### If only using fantasy points as target, remove other rookie stats from df

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
from catboost import CatBoostRegressor, Pool

In [None]:
# Load the master training dataset (path is relative to the working directory).
df = pd.read_csv("data/processed/df_master.csv")

In [None]:
# Step 1: discard every rookie-season ('R_'-prefixed) column except the
# fantasy-points column, which is the prediction target.
columns_to_keep = [
    col for col in df.columns
    if col == 'R_fantasy_points_halfppr_tep' or not col.startswith('R_')
]

# Step 2: discard identifier and draft columns, which are not used as features.
columns_to_remove = [
    'player_name', 'player_id_x', 'player_id_y',
    'draft_year', 'draft_round', 'draft_pick_overall', 'age_on_draft_day',
]

# Column selection already yields a new frame, so `df` itself is left untouched.
df_filtered = df[columns_to_keep].drop(columns=columns_to_remove)

In [5]:
target = 'R_fantasy_points_halfppr_tep'  # column to predict

# Separate the label vector from the feature matrix
y = df_filtered[target]
X = df_filtered.drop(columns=[target])

# Columns listed here are handled as categorical; every remaining feature
# column is handled as numeric.
categorical_cols = ['position', 'team', 'C_conference', 'C_team']
numeric_cols = X.columns[~X.columns.isin(categorical_cols)].tolist()

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)


In [6]:
# Preprocessing used by the Random Forest and XGBoost pipelines:
#   * numeric columns are standardised,
#   * categorical columns are one-hot encoded with the first level of each
#     column dropped; categories not seen during fit are ignored instead of
#     raising an error.
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numeric_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore', drop='first'), categorical_cols),
])

# Random Forest

In [8]:
# Random Forest pipeline: preprocessing followed by a 200-tree forest
model_RF = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42, n_estimators=200)),
])

# Fit on the training split, then predict the held-out split
# (Pipeline.fit returns the fitted pipeline, so the calls can be chained)
y_pred = model_RF.fit(X_train, y_train).predict(X_test)

# Hold-out metrics
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print(f'MSE: {mse:.2f}, R2: {r2:.2f}')


MSE: 2468.86, R2: 0.29




### Random Forest Analysis
This model shows poor performance, likely due to the small dataset (n=228). The RMSE (square root of the MSE, √2468.86 ≈ 50) indicates that predictions are typically off by about 50 fantasy points. The R2 value of 0.29 means the model explains only ~30% of the variance in rookie fantasy points.

Increasing the sample to 569 (2014-2024) did not improve results (MSE: 3867.75, R2: 0.25), suggesting the additional older data may introduce noise rather than signal.

# XGBoost

In [10]:
# XGBoost regressor: 500 depth-4 trees, with 80% row and 80% column
# subsampling per tree and a fixed seed.
xgb_model = XGBRegressor(
    random_state=42,
    n_estimators=500,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
)

# Full pipeline: preprocessing, then the regressor
pipeline_xgb = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', xgb_model),
])

# Train the model; as the last expression of the cell, the fitted pipeline
# is also rendered as the cell output.
pipeline_xgb.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [11]:
# Hold-out evaluation on the test split
y_pred = pipeline_xgb.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
print(f"MSE: {mse:.2f}, R2: {r2:.2f}")

# Optional: 5-fold cross-validated R2 on the full dataset. The pipeline is
# passed in whole, so preprocessing is included in what gets cross-validated.
cv_scores = cross_val_score(pipeline_xgb, X, y, scoring='r2', cv=5)
print(f"5-fold CV R2: {cv_scores.mean():.2f} ± {cv_scores.std():.2f}")



MSE: 2909.64, R2: 0.16




5-fold CV R2: 0.05 ± 0.45




# CatBoost

In [85]:
from catboost import CatBoostRegressor, Pool

# Wrap both splits in CatBoost Pools, declaring the categorical columns so
# CatBoost handles them itself (no one-hot preprocessing is applied here).
train_pool, test_pool = (
    Pool(features, labels, cat_features=categorical_cols)
    for features, labels in ((X_train, y_train), (X_test, y_test))
)

# CatBoost model: 500 iterations, depth-6 trees, fixed seed, training log silenced
cat_model = CatBoostRegressor(
    random_seed=42,
    iterations=500,
    depth=6,
    learning_rate=0.05,
    verbose=0,
)

cat_model.fit(train_pool)
y_pred = cat_model.predict(test_pool)

# Hold-out metrics
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
print(f"CatBoost -> MSE: {mse:.2f}, R2: {r2:.2f}")


CatBoost -> MSE: 2616.79, R2: 0.24


# Linear Regression (Ridge)

In [None]:
# Fit one Ridge pipeline per position. Each position gets its own 80/20 split,
# drawn from the full dataset (X, y) rather than from the earlier global split.
results = {}  # position -> (MSE, R2) on that position's hold-out rows
models = {}   # position -> fitted pipeline, reused later to score the rookies

# Categorical columns that remain once 'position' has been dropped
cat_cols = ['team', 'C_conference', 'C_team']

for pos in X['position'].unique():
    in_position = X['position'] == pos
    X_pos = X.loc[in_position].drop(columns=['position'])
    y_pos = y.loc[in_position]

    num_cols = [col for col in X_pos.columns if col not in cat_cols]

    # Same preprocessing recipe as the pooled models, minus the 'position' column
    pos_preprocessor = ColumnTransformer([
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore', drop='first'), cat_cols),
    ])

    linear_model = Pipeline(steps=[
        ('preprocessor', pos_preprocessor),
        ('ridge', Ridge(alpha=1.0)),
    ])

    X_train_pos, X_test_pos, y_train_pos, y_test_pos = train_test_split(
        X_pos, y_pos, random_state=42, test_size=0.2
    )

    y_pred_pos = linear_model.fit(X_train_pos, y_train_pos).predict(X_test_pos)

    mse = mean_squared_error(y_test_pos, y_pred_pos)
    r2 = r2_score(y_test_pos, y_pred_pos)
    models[pos] = linear_model
    results[pos] = (mse, r2)

print("Per-position Ridge regression results:")
for pos, (mse, r2) in results.items():
    print(f"  {pos} -> MSE: {mse:.2f}, R2: {r2:.2f}")

# Results


## Model Results: 2014-2024 Dataset
| Model | MSE | R2 |
|-------|-----|-----|
| CatBoost | 3664.83 | 0.29 |
| XGBoost | 4066.48 | 0.21 |
| Random Forest | 3867.75 | 0.25 |
| Ridge (QB) | 13757.22 | -0.51 |
| Ridge (WR) | 3909.36 | -0.05 |
| Ridge (RB) | 3505.07 | 0.06 |
| Ridge (TE) | 20680.57 | -16.27 |

## Model Results: 2019-2024 Dataset
| Model | MSE | R2 |
|-------|-----|-----|
| Random Forest | 2468.86 | 0.29 |
| CatBoost | 2616.79 | 0.24 |
| XGBoost | 2909.64 | 0.16 |
| Ridge (QB) | 5107.94 | 0.47 |
| Ridge (WR) | 2225.32 | 0.21 |
| Ridge (RB) | 3264.00 | -0.35 |
| Ridge (TE) | 57577.92 | -61.77 |

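On the 2019-2024 dataset, Random Forest performed best among the models trained on all positions (lowest MSE, highest R2), while CatBoost was best on the 2014-2024 dataset. Ridge regression for QBs showed the highest R2 (0.47, 2019-2024 dataset), but per-position models suffer from small sample sizes (especially TE).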

# Predict Fantasy Points


In [None]:
# Load the 2025 rookie dataset to be scored and list its columns for reference.
rookies = pd.read_csv('data/processed/df_master_rookies_2025.csv')
print(rookies.columns)

In [17]:
# Show every rookie row that has at least one missing value
# (an empty frame means there is nothing to impute or drop).
has_missing = rookies.isna().any(axis=1)
missing_rows = rookies.loc[has_missing]
print(missing_rows)


Empty DataFrame
Columns: [player_name, position, team, draft_year, draft_round, draft_pick_overall, age_on_draft_day, C_season, player_id, C_team, C_conference, C_passing_TD, C_passing_YDS, C_passing_INT, C_rushing_TD, C_rushing_YDS, C_receiving_REC, C_receiving_TD, C_receiving_YDS, C_fumbles_LOST, C_passing_ATT, C_passing_COMPLETIONS, C_passing_PCT, C_passing_YPA, C_rushing_CAR, C_rushing_YPC, C_rushing_LONG, C_receiving_YPR, C_receiving_LONG, C_fumbles_FUM, C_conference_strength, height_in, weight_lb]
Index: []

[0 rows x 33 columns]


In [18]:
# Score the rookies with both fitted pipelines. The full frame is passed in;
# each pipeline's ColumnTransformer picks the columns it was fitted on by name.
rookies_predictions_rf = model_RF.predict(rookies)
rookies_predictions_xgb = pipeline_xgb.predict(rookies)

# Attach the predictions (XGBoost column first, then Random Forest)
rookies['predicted_fp_xgb'] = rookies_predictions_xgb
rookies['predicted_fp_rf'] = rookies_predictions_rf

print(rookies.loc[:, ['player_name', 'position', 'predicted_fp_rf']])

         player_name position  predicted_fp_rf
0           Cam Ward       QB         231.1271
1      Travis Hunter       WR         155.6807
2      Ashton Jeanty       RB         200.2985
3   Colston Loveland       TE         156.3228
4       Tyler Warren       TE         163.9444
..               ...      ...              ...
68  Konata Mumpfield       WR          29.9110
69    Dominic Lovett       WR          19.5389
70    Moliki Matavao       TE          33.2332
71     Junior Bergen       WR          25.1842
72       Luke Lachey       TE          37.5547

[73 rows x 3 columns]




In [21]:
# Score every rookie with the Ridge model trained for their position.
#
# Predictions are made in one batch per position instead of one row at a time,
# and the result keeps the row index of `rookies`. Keeping that index matters:
# rookies whose position has no trained model are left out of the result, and
# any later index-aligned assignment back into `rookies` would otherwise
# attach predictions to the wrong players.

# Only rookies whose position has a trained model can be scored
scored = rookies[rookies['position'].isin(list(models))]

# One slot per scorable rookie, filled position by position below
predicted_fp = pd.Series(index=scored.index, dtype='float64')

for pos, pos_rows in scored.groupby('position', sort=False):
    # Drop 'position' because models were trained without it
    X_new = pos_rows.drop(columns=['position'])
    predicted_fp.loc[pos_rows.index] = models[pos].predict(X_new)

# Same columns as before (player_name, position, predicted_fp), in the
# original row order of `rookies`, indexed by the original row labels.
rookie_preds_df = scored[['player_name', 'position']].assign(predicted_fp=predicted_fp)
print(rookie_preds_df.head())




        player_name position  predicted_fp
0          Cam Ward       QB    192.361710
1     Travis Hunter       WR    130.341975
2     Ashton Jeanty       RB    144.881038
3  Colston Loveland       TE     87.275760
4      Tyler Warren       TE    138.175403




In [22]:
# Add linear predictions to the dataframe.
# NOTE: assigning a Series aligns on index labels, not on row position, so any
# rookie whose index label is absent from rookie_preds_df receives NaN.
rookies['predicted_fp_linear'] = rookie_preds_df["predicted_fp"]


In [None]:
# Save predictions to CSV: the rookie columns plus the predicted_fp_* columns
# added in the cells above. index=False leaves the row index out of the file.
rookies.to_csv('data/output/df_rookie_predictions_2025.csv', index=False)