In [None]:
import os
import pandas as pd

# Define paths
pro_football_focus_data = 'data/pro_football_ref.xlsx'
model_path = 'model_path/trained_model.pth'

# Load and preprocess the pro football focus data.
df = pd.read_excel(pro_football_focus_data)
# Filter to only include wide receivers (WR)
df = df[df['FantPos'] == 'RB'].copy()
df.replace([float('inf'), -float('inf')], 0, inplace=True)
df.fillna(0, inplace=True)

# Calculate points per game.
df.loc[:, 'PPR/G'] = df['PPR'] / df['G']

print(df.head(10))

    YEAR  Rk           Player   Tm FantPos  Age   G  GS  QBCmp  QBAtt  ...  \
0   2024   1   Saquon Barkley  PHI      RB   27  16  16      0      0  ...   
1   2024   2    Derrick Henry  BAL      RB   30  17  17      0      0  ...   
2   2024   3     Jahmyr Gibbs  DET      RB   22  17   4      0      0  ...   
5   2024   6   Bijan Robinson  ATL      RB   22  17  17      0      0  ...   
6   2024   7      Josh Jacobs  GNB      RB   26  17  17      0      0  ...   
9   2024  10   Kyren Williams  LAR      RB   24  16  16      0      0  ...   
10  2024  11       James Cook  BUF      RB   25  16  16      0      0  ...   
13  2024  14  Jonathan Taylor  IND      RB   25  14  13      0      0  ...   
16  2024  17    De'Von Achane  MIA      RB   23  17  16      0      0  ...   
20  2024  21     James Conner  ARI      RB   29  16  16      0      0  ...   

    RecYds    Y/R  RecTD  Fmb  FL  TotTD  14:00:00  2PP    PPR      PPR/G  
0      278   8.42      2    2   1     15       3.0  0.0  355.3  2

In [23]:
# Copy the 2024 data into separate dataframe.
df_2024 = df[df['YEAR'] == 2024].copy()
player_names_2024 = df_2024['Player'].reset_index(drop=True)
df_2024 = df_2024.drop(columns=['Player'])

print(df_2024.head(10))

    YEAR  Rk   Tm FantPos  Age   G  GS  QBCmp  QBAtt  QBYds  ...  RecYds  \
0   2024   1  PHI      RB   27  16  16      0      0      0  ...     278   
1   2024   2  BAL      RB   30  17  17      0      0      0  ...     193   
2   2024   3  DET      RB   22  17   4      0      0      0  ...     517   
5   2024   6  ATL      RB   22  17  17      0      0      0  ...     431   
6   2024   7  GNB      RB   26  17  17      0      0      0  ...     342   
9   2024  10  LAR      RB   24  16  16      0      0      0  ...     182   
10  2024  11  BUF      RB   25  16  16      0      0      0  ...     258   
13  2024  14  IND      RB   25  14  13      0      0      0  ...     136   
16  2024  17  MIA      RB   23  17  16      0      0      0  ...     592   
20  2024  21  ARI      RB   29  16  16      0      0      0  ...     414   

      Y/R  RecTD  Fmb  FL  TotTD  14:00:00  2PP    PPR      PPR/G  
0    8.42      2    2   1     15       3.0  0.0  355.3  22.206250  
1   10.16      2    3   1  

In [24]:
# Shift to represent the following year's points per game
df['NextYearPPR/G'] = df.groupby('Player')['PPR/G'].shift(-1)

# Remove rows where the target is NaN (i.e., no following year data)
df = df[df['NextYearPPR/G'].notna()]

print(df.size)

13200


In [25]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Define features and target.
feature_names = ['Age', 'Tgt', 'Rec', 'RecTD', 'RecYds', 'RushTD', 'RushYds', 'PPR/G']
target = 'NextYearPPR/G'

# Split the data into training, validation, and test sets
X = df[feature_names]
y = df[target]
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
print(f'{X_train}')
print(f'{y_train}')
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
X_2024 = df_2024[feature_names]

# Standardize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)
X_2024 = scaler.transform(X_2024)

# Ensure X_train, X_val, X_test, and X_2024 are correctly shaped for LSTM
X_train = X_train.reshape(X_train.shape[0], 1, X_train.shape[1])
X_val = X_val.reshape(X_val.shape[0], 1, X_val.shape[1])
X_test = X_test.reshape(X_test.shape[0], 1, X_test.shape[1])
X_2024 = X_2024.reshape(X_2024.shape[0], 1, X_2024.shape[1])

# Check to see standardized data.
#print(f'Size {y_train.size}')
#print(f'Size {X_val.size}')
#print(f'Size {y_val.size}')

      Age  Tgt  Rec  RecTD  RecYds  RushTD  RushYds      PPR/G
999    24   46   39      0     337       2      719  10.840000
997    24   63   49      4     234       3      612  10.211765
10     25   38   32      2     258      16     1009  16.668750
1509   30   29   26      5     247      10     1018  16.166667
1702   22  124  107      6     867       7     1098  24.093750
...   ...  ...  ...    ...     ...     ...      ...        ...
493    26   41   27      0     154       1      432   5.475000
1296   24   76   63      4     482       3      380  12.075000
1639   24  104   79      3     456       0      213  10.243750
2081   28   18    9      0      84       3      406   6.909091
469    28   56   50      0     455       1      238   7.135294

[264 rows x 8 columns]
999      7.550000
997     16.420000
10      13.688235
1509    11.875000
1702    14.287500
          ...    
493     12.747059
1296     7.368750
1639    14.618750
2081    10.687500
469      8.881250
Name: NextYearPPR/G, L

In [44]:
from nnclass.simple_nn import SimpleLSTM
from torch.utils.data import TensorDataset, DataLoader
import torch

# Create SimpleLTSM.
input_size = X_train.shape[2]
hidden_size = 32
output_size = 1
lstm = SimpleLSTM(input_size, hidden_size, output_size)

X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val.values, dtype=torch.float32).view(-1, 1)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [45]:
import torch.optim as optim
import torch.nn as nn

# Train the model.
criterion = nn.MSELoss()
optimizer = optim.Adam(lstm.parameters())

num_epochs = 350
count = 0
for epoch in range(num_epochs):
    lstm.train() # Set model to training mode
    for inputs, labels in train_loader:
        # Forward pass
        outputs = lstm(inputs)
        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        count += 1
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')
    print(f'Count: {count}')

Epoch [1/350], Loss: 100.5155
Count: 9
Epoch [2/350], Loss: 150.1387
Count: 18
Epoch [3/350], Loss: 178.5220
Count: 27
Epoch [4/350], Loss: 141.5231
Count: 36
Epoch [5/350], Loss: 134.2557
Count: 45
Epoch [6/350], Loss: 182.0955
Count: 54
Epoch [7/350], Loss: 118.5887
Count: 63
Epoch [8/350], Loss: 130.1607
Count: 72
Epoch [9/350], Loss: 106.9088
Count: 81
Epoch [10/350], Loss: 121.9189
Count: 90
Epoch [11/350], Loss: 120.4822
Count: 99
Epoch [12/350], Loss: 128.7321
Count: 108
Epoch [13/350], Loss: 107.3495
Count: 117
Epoch [14/350], Loss: 94.3045
Count: 126
Epoch [15/350], Loss: 95.6247
Count: 135
Epoch [16/350], Loss: 191.7386
Count: 144
Epoch [17/350], Loss: 44.8503
Count: 153
Epoch [18/350], Loss: 95.9307
Count: 162
Epoch [19/350], Loss: 109.0004
Count: 171
Epoch [20/350], Loss: 115.1640
Count: 180
Epoch [21/350], Loss: 105.1117
Count: 189
Epoch [22/350], Loss: 102.4305
Count: 198
Epoch [23/350], Loss: 33.2707
Count: 207
Epoch [24/350], Loss: 117.7615
Count: 216
Epoch [25/350], Lo

In [46]:
# Evaluate the model.
lstm.eval() # Set model to eval mode
total_loss = 0
for inputs, labels in val_loader:
    # Forward pass
    outputs = lstm(inputs)
    loss = criterion(outputs, labels)
    total_loss += loss
print(f'Evaluation, Total Loss: {total_loss.item():.4f}')

Evaluation, Total Loss: 40.2699


In [47]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1)
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)

y_train_pred = lstm(X_train_tensor).detach().numpy()
y_test_pred = lstm(X_test_tensor).detach().numpy()

# Get the diff values for evaluation.
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
mae_train = mean_absolute_error(y_train, y_train_pred)
r2_train = r2_score(y_train, y_train_pred)

rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
mae_test = mean_absolute_error(y_test, y_test_pred)
r2_test = r2_score(y_test, y_test_pred)

print(f'Training RMSE: {rmse_train:.4f}, MAE: {mae_train:.4f}, R²: {r2_train:.4f}')
print(f'Testing RMSE: {rmse_test:.4f}, MAE: {mae_test:.4f}, R²: {r2_test:.4f}')

Training RMSE: 3.4108, MAE: 2.7154, R²: 0.5017
Testing RMSE: 3.9281, MAE: 3.0760, R²: 0.5056


In [48]:
# Return results using model.
predictions = lstm(torch.tensor(X_2024, dtype=torch.float32)).detach().numpy()
predictions_df = pd.DataFrame(predictions, columns=['Prediction'])
predictions_df['Player'] = player_names_2024
print(f'{predictions_df}')

    Prediction                 Player
0    17.025743         Saquon Barkley
1    14.789425          Derrick Henry
2    14.506031           Jahmyr Gibbs
3    14.234646         Bijan Robinson
4    14.734671            Josh Jacobs
..         ...                    ...
77    5.971714            Blake Corum
78    8.246283         D'Onta Foreman
79    7.096632         Patrick Taylor
80    9.067770  Cordarrelle Patterson
81   14.593166         Michael Carter

[82 rows x 2 columns]


In [49]:
# Save the new DataFrame to an Excel file
file_path = 'results/simple_lstm_rb.xlsx'
predictions_df.to_excel(file_path, index=False)