## WR_FFPG_NN
Artificial Neural Network (ANN) that predicts Fantasy Football Points per Game (FFPG) from Wide Receiver (WR) metrics.
The ANN is trained on WR data from the 2018-2022 NFL seasons, and the same model is then tested on the 2023 season.
ANN performance is compared against a degree-3 polynomial regression baseline (scikit-learn `PolynomialFeatures` + `LinearRegression`); `np.polyfit` is used only to draw trend lines in the plots.

Made by: Nikhil Gupta

### Libraries

In [2]:
import os
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn, optim
import numpy as np
import plotly.graph_objects as go
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score

### Set-up

In [None]:
# PyTorch device: everything runs on the CPU, which is enough for this use case
device = torch.device("cpu")

# CSV inputs: 2018-2022 seasons for training, 2023 season for validation
train_filename, valid_filename = "data/wr_data_18-22.csv", "data/wr_data_23.csv"

# File used to persist the trained model parameters
model_filename = "ffpg_model.pt"

# Column names used as model inputs
input_metrics = [
    'targets_per_route_run',
    'yards_per_route_run',
    'air_yards_share',
    'target_share',
    'yards_after_catch_per_reception',
]

# Column name of the quantity being predicted
output_metrics = ['fantasy_points_per_game']

# Hyperparameters: one entry per hidden layer, each value is that layer's width
n_hidden = [16]

### Define custom dataset

In [23]:
class CustomDataset(Dataset):
    """Wide receiver samples read from a CSV file and exposed as feature/target tensors."""

    # Multipliers applied to specific rate columns right after the CSV is read
    _COLUMN_SCALES = {'targets_per_route_run': 100, 'yards_per_route_run': 10}

    def __init__(self, filename, input_metrics, output_metrics):
        """Read the CSV, order and rescale it, then build the tensors.
        Args:
            filename (str): Path to the CSV file containing the dataset.
            input_metrics (list): Column names used as input features.
            output_metrics (list): Column names used as prediction targets.
        """
        # Load the table, then order rows from highest to lowest target value
        frame = pd.read_csv(filename)
        frame = frame.sort_values(by=output_metrics, ascending=False)

        # Rescale the rate columns listed in the scale table
        for column, factor in self._COLUMN_SCALES.items():
            frame[column] = frame[column]*factor

        # Player labels are kept as string arrays (used for plot hover text)
        self.names = frame['name'].astype('string').values
        self.seasons = frame['season'].astype('string').values

        # Features and targets become float32 tensors
        feature_values = frame[input_metrics].values
        target_values = frame[output_metrics].values
        self.X = torch.tensor(feature_values, dtype=torch.float32)
        self.Y = torch.tensor(target_values, dtype=torch.float32)

    def __len__(self):
        """Number of samples (rows) held by the dataset."""
        return self.X.shape[0]

    def __getitem__(self, index):
        """Fetch one sample.
        Args:
            index (int): Position of the sample to retrieve.
        Returns:
            tuple: ``(features, target)`` tensors for that sample.
        """
        features = self.X[index]
        target = self.Y[index]
        return features, target

### Define ANN

In [24]:
class SimpleNN(nn.Module):
    """Fully connected regression network with LeakyReLU hidden activations and a ReLU-clamped scalar output."""

    def __init__(self, num_features, n_hidden):
        """Build the layer stack.
        Args:
            num_features (int): Number of input features.
            n_hidden (list): Width of each hidden layer, in order.
        """
        super().__init__()

        # Layer widths from the input through the last hidden layer
        widths = [num_features, *n_hidden]

        # One Linear + LeakyReLU pair per hidden layer
        stack = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            stack += [nn.Linear(fan_in, fan_out), nn.LeakyReLU()]

        # Single-value output head; the trailing ReLU keeps predictions >= 0
        stack += [nn.Linear(widths[-1], 1), nn.ReLU()]

        # Chain everything into one sequential module
        self.network = nn.Sequential(*stack)

    def forward(self, x):
        """Run the input through every layer in order.
        Args:
            x (torch.Tensor): Input tensor.
        Returns:
            torch.Tensor: Network output for ``x``.
        """
        prediction = self.network(x)
        return prediction

### Define training loop

In [25]:
def train_model(filename, input_metrics, output_metrics, n_hidden, batch_size=32, epochs=1000):
    """Train a neural network model on the provided dataset.

    The parameters from the epoch with the lowest average training loss are
    restored into the model before it is returned.

    Args:
        filename (str): Path to the CSV file containing the dataset.
        input_metrics (list): List of input feature names.
        output_metrics (list): List of output target names.
        n_hidden (list): List containing the number of neurons in each hidden layer.
        batch_size (int): Size of each batch for training.
        epochs (int): Maximum number of epochs to train.
    Returns:
        tuple: ``(model, dataset, best_loss)`` -- the trained model (on CPU),
        the dataset it was trained on, and the lowest average epoch loss seen.
    """
    # Create dataset and dataloader
    dataset = CustomDataset(filename, input_metrics, output_metrics)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    # Model, loss function, and optimizer
    num_features = len(input_metrics)
    model = SimpleNN(num_features, n_hidden)
    model.to(device)  # send model to device
    loss_func = nn.MSELoss()
    optimizer = optim.Adam(model.parameters())

    # Training loop with best model tracking
    best_loss = float("inf")
    best_model_state = None
    for epoch in range(epochs):
        epoch_losses = []

        for X_batch, Y_batch in dataloader:
            # send tensors to device
            X_batch, Y_batch = X_batch.to(device), Y_batch.to(device)

            # Forward pass
            outputs = model(X_batch)
            loss = loss_func(outputs, Y_batch)
            epoch_losses.append(loss.item())

            # Backward pass and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Average of the per-batch losses recorded during this epoch.
        # NOTE: each batch loss is measured before that batch's optimizer step,
        # so this is a running average, not the loss of the end-of-epoch weights.
        avg_epoch_loss = sum(epoch_losses) / len(epoch_losses)

        # Save best model if this epoch is better
        if avg_epoch_loss < best_loss:
            best_loss = avg_epoch_loss
            # state_dict() hands back references to the live parameter tensors and
            # dict.copy() is shallow, so each tensor must be cloned; otherwise the
            # snapshot keeps changing as training continues.
            best_model_state = {
                key: tensor.detach().clone()
                for key, tensor in model.state_dict().items()
            }
            print(f'Epoch {epoch+1}/{epochs}, avg loss {avg_epoch_loss:.6f} ✅ NEW BEST!')
        else:
            print(f'Epoch {epoch+1}/{epochs}, avg loss {avg_epoch_loss:.6f}')

    # Load the best model parameters before returning
    if best_model_state is not None:
        model.load_state_dict(best_model_state)
        print(f'🏆 Loaded best model with loss: {best_loss:.6f}')

    return model.cpu(), dataset, best_loss

### Train ANN

In [26]:
# Reuse a previously saved model when one exists; otherwise train and save one
if os.path.exists(model_filename):
    print(f"Loading existing model from {model_filename}")

    # Create model architecture (must match saved model)
    num_features = len(input_metrics)
    model = SimpleNN(num_features, n_hidden)

    # Load the saved parameters. The file holds only a state_dict (see the
    # torch.save call below), so weights_only=True is sufficient and avoids
    # unpickling arbitrary objects from disk.
    model.load_state_dict(torch.load(model_filename, map_location='cpu', weights_only=True))
    model.eval()  # Set to evaluation mode

    # Load the training dataset so later cells have it in both branches
    train_dataset = CustomDataset(train_filename, input_metrics, output_metrics)

    print("✅ Model loaded successfully!")

else:
    print("No existing model found. Training new model...")

    # Train new model (train_model restores the best parameters before returning)
    model, train_dataset, best_loss = train_model(train_filename, input_metrics, output_metrics,
                                                  n_hidden)

    # Save the best model parameters (already loaded in model)
    torch.save(model.state_dict(), model_filename)
    print(f"✅ Best model trained and saved to {model_filename}")
    print(f"Best training loss: {best_loss:.4f}")

Loading existing model from ffpg_model.pt
✅ Model loaded successfully!


### Train polynomial regression model

In [27]:
# Baseline model: degree-3 polynomial expansion of the same input features,
# followed by a linear regression on the expanded features
print("Training Polynomial Regression model...")
poly_steps = [
    ('poly', PolynomialFeatures(degree=3, include_bias=True)),  # Try degree 3
    ('linear', LinearRegression()),
]
poly_model = Pipeline(poly_steps)

# Fit on exactly the data the neural network was trained on
# (targets are flattened to a 1-D array)
poly_train_features = train_dataset.X.numpy()
poly_train_targets = train_dataset.Y.numpy().flatten()
poly_model.fit(poly_train_features, poly_train_targets)

print("✅ Polynomial Regression model trained!")

Training Polynomial Regression model...
✅ Polynomial Regression model trained!


### Compare training performance of ANN and Polynomial Regression

In [28]:
# Pull features, targets and labels out of the training dataset
train_x, train_y = train_dataset.X, train_dataset.Y
train_names, train_seasons = train_dataset.names, train_dataset.seasons
train_samples = range(len(train_dataset.X))  # sample positions, used as plot x-axis

# Neural network predictions (no gradient tracking needed for inference)
with torch.no_grad():
    train_nn_pred = model(train_x)

# Polynomial regression predictions on the same inputs
train_poly_pred = poly_model.predict(train_x.numpy())

# Report header
train_banner = "="*50
print("\n" + train_banner)
print("PERFORMANCE COMPARISON: Neural Network vs Polynomial Regression")
print(train_banner)

# Convert to NumPy once, then score both models against the same truth values
train_truth_np = train_dataset.Y.numpy()
train_nn_pred_np = train_nn_pred.detach().numpy()
train_nn_mse = mean_squared_error(train_truth_np, train_nn_pred_np)
train_poly_mse = mean_squared_error(train_truth_np, train_poly_pred)
train_nn_r2 = r2_score(train_truth_np, train_nn_pred_np)
train_poly_r2 = r2_score(train_truth_np, train_poly_pred)

# The model with the strictly lower MSE wins; ties go to polynomial regression
if train_nn_mse < train_poly_mse:
    train_winner = 'Neural Network'
else:
    train_winner = 'Polynomial Regression'

print("TRAINING DATA:")
print(f"  Neural Network    - MSE: {train_nn_mse:.4f}, R²: {train_nn_r2:.4f}")
print(f"  Polynomial Reg    - MSE: {train_poly_mse:.4f}, R²: {train_poly_r2:.4f}")
print(f"  Winner: {train_winner}")


PERFORMANCE COMPARISON: Neural Network vs Polynomial Regression
TRAINING DATA:
  Neural Network    - MSE: 2.2942, R²: 0.8984
  Polynomial Reg    - MSE: 2.3143, R²: 0.8975
  Winner: Neural Network


### Visualize training performance

In [None]:
# Start from an empty figure; traces are layered on below
fig = go.Figure()

# Shared x positions and flattened y series for the scatter traces
train_positions = list(train_samples)
train_truth_flat = train_y.flatten()
train_pred_flat = train_nn_pred.flatten()

# Truth markers; hover text reads "name (season): value"
train_truth_hover = [
    f'{name} ({season}): {val:.2f}'
    for name, season, val in zip(train_names, train_seasons, train_truth_flat)
]
fig.add_trace(go.Scatter(
    x=train_positions,
    y=train_truth_flat,
    mode='markers',
    name='Truth FFPG',
    text=train_truth_hover,
    hovertemplate='%{text}<extra></extra>',
    marker=dict(size=6, color='blue'),
))

# ANN prediction markers, semi-transparent so the truth points stay visible
train_pred_hover = [
    f'{name} ({season}): {val:.2f}'
    for name, season, val in zip(train_names, train_seasons, train_pred_flat)
]
fig.add_trace(go.Scatter(
    x=train_positions,
    y=train_pred_flat,
    mode='markers',
    name='ANN Pred FFPG',
    opacity=0.5,
    text=train_pred_hover,
    hovertemplate='%{text}<extra></extra>',
    marker=dict(size=6, color='red'),
))

# ANN trend line: cubic fit of the ANN predictions against sample position
coeffs_ann = np.polyfit(train_samples, train_pred_flat, 3)
poly_ann = np.poly1d(coeffs_ann)
train_trend = poly_ann(train_samples)
fig.add_trace(go.Scatter(
    x=train_positions,
    y=train_trend,
    mode='lines',
    name='ANN - Line of Best Fit',
    line=dict(color='red', width=2, dash='dash'),
    hovertemplate='ANN trend line<extra></extra>',
))

# Titles, legend placement (top-right corner) and figure size
fig.update_layout(
    title='Training Performance | WR data: 2018-2022<br><sub>Neural Network Results</sub>',
    xaxis_title='WR samples',
    yaxis_title='Fantasy Points per Game',
    showlegend=True,
    legend=dict(x=1, y=1, xanchor='right', yanchor='top'),
    hovermode='closest',
    width=1000,
    height=800,
)

# Light gray grid on both axes
grid_style = dict(showgrid=True, gridwidth=1, gridcolor='lightgray')
fig.update_xaxes(**grid_style)
fig.update_yaxes(**grid_style)

fig.show()

### Compare validation performance of ANN and Polynomial Regression

In [35]:
# Create validation dataset (2023 season) with the same input/output columns
valid_dataset = CustomDataset(valid_filename, input_metrics, output_metrics)

# Extract validation data
valid_x = valid_dataset.X
valid_y = valid_dataset.Y
valid_names = valid_dataset.names
valid_seasons = valid_dataset.seasons
valid_samples = range(len(valid_dataset.X))  # sample positions, used as plot x-axis

# Get predictions from the neural network (no gradient tracking for inference)
with torch.no_grad():
    valid_nn_pred = model(valid_x)

# Get predictions from the polynomial regression model
valid_poly_pred = poly_model.predict(valid_x.numpy())

# Performance Comparison
print("\n" + "="*50)
print("VALIDATION PERFORMANCE: Neural Network vs Polynomial Regression")
print("="*50)

# Convert to NumPy once and score both models against the same truth values
valid_truth_np = valid_y.numpy()
valid_nn_pred_np = valid_nn_pred.detach().numpy()
valid_nn_mse = mean_squared_error(valid_truth_np, valid_nn_pred_np)
valid_poly_mse = mean_squared_error(valid_truth_np, valid_poly_pred)
valid_nn_r2 = r2_score(valid_truth_np, valid_nn_pred_np)
valid_poly_r2 = r2_score(valid_truth_np, valid_poly_pred)

print("VALIDATION DATA (2023):")
print(f"  Neural Network    - MSE: {valid_nn_mse:.4f}, R²: {valid_nn_r2:.4f}")
print(f"  Polynomial Reg    - MSE: {valid_poly_mse:.4f}, R²: {valid_poly_r2:.4f}")
# The model with the strictly lower MSE wins; ties go to polynomial regression
print(f"  Winner: {'Neural Network' if valid_nn_mse < valid_poly_mse else 'Polynomial Regression'}")
print("="*50)


VALIDATION PERFORMANCE: Neural Network vs Polynomial Regression
VALIDATION DATA (2023):
  Neural Network    - MSE: 1.4047, R²: 0.9493
  Polynomial Reg    - MSE: 1.7019, R²: 0.9385
  Winner: Neural Network


### Visualize validation performance

In [36]:
# Start from an empty figure; traces are layered on below
fig = go.Figure()

# Shared x positions and flattened y series for the scatter traces
valid_positions = list(valid_samples)
valid_truth_flat = valid_y.flatten()
valid_pred_flat = valid_nn_pred.flatten()

# Truth markers; hover text reads "name (season): value"
valid_truth_hover = [
    f'{name} ({season}): {val:.2f}'
    for name, season, val in zip(valid_names, valid_seasons, valid_truth_flat)
]
fig.add_trace(go.Scatter(
    x=valid_positions,
    y=valid_truth_flat,
    mode='markers',
    name='Truth FFPG',
    text=valid_truth_hover,
    hovertemplate='%{text}<extra></extra>',
    marker=dict(size=6, color='blue'),
))

# ANN prediction markers, semi-transparent so the truth points stay visible
valid_pred_hover = [
    f'{name} ({season}): {val:.2f}'
    for name, season, val in zip(valid_names, valid_seasons, valid_pred_flat)
]
fig.add_trace(go.Scatter(
    x=valid_positions,
    y=valid_pred_flat,
    mode='markers',
    name='ANN Pred FFPG',
    opacity=0.5,
    text=valid_pred_hover,
    hovertemplate='%{text}<extra></extra>',
    marker=dict(size=6, color='red'),
))

# ANN trend line: cubic fit of the ANN predictions against sample position
coeffs_ann = np.polyfit(valid_samples, valid_pred_flat, 3)
poly_ann = np.poly1d(coeffs_ann)
valid_trend = poly_ann(valid_samples)
fig.add_trace(go.Scatter(
    x=valid_positions,
    y=valid_trend,
    mode='lines',
    name='ANN - Line of Best Fit',
    line=dict(color='red', width=2, dash='dash'),
    hovertemplate='ANN trend line<extra></extra>',
))

# Titles, legend placement (top-right corner) and figure size
fig.update_layout(
    title='Validation Performance | WR data: 2023<br><sub>Neural Network Results</sub>',
    xaxis_title='WR samples',
    yaxis_title='Fantasy Points per Game',
    showlegend=True,
    legend=dict(x=1, y=1, xanchor='right', yanchor='top'),
    hovermode='closest',
    width=1000,
    height=800,
)

# Light gray grid on both axes
grid_style = dict(showgrid=True, gridwidth=1, gridcolor='lightgray')
fig.update_xaxes(**grid_style)
fig.update_yaxes(**grid_style)

fig.show()

### Conclusion:
Results from training and validation show that the 5 chosen WR metrics have strong predictive power for FFPG (R² ≈ 0.90 on the 2018-2022 training data and R² ≈ 0.94-0.95 on the 2023 validation data). On the 2023 validation data, the neural network outperforms polynomial regression with about 17% lower MSE (1.40 vs. 1.70), though both approaches achieve high accuracy.