In [2]:
import os
import pandas as pd

# Define paths
pro_football_focus_data = 'data/pro_football_ref.xlsx'
model_path = 'model_path/trained_model.pth'

# Load and preprocess the pro football focus data.
df = pd.read_excel(pro_football_focus_data)
# Filter to only include wide receivers (WR)
df = df[df['FantPos'] == 'WR'].copy()
df.replace([float('inf'), -float('inf')], 0, inplace=True)
df.fillna(0, inplace=True)

# Calculate points per game.
df.loc[:, 'PPR/G'] = df['PPR'] / df['G']

print(df.head(10))

  machar = _get_machar(dtype)


    YEAR  Rk             Player   Tm FantPos  Age   G  GS  Cmp  Att  ...  \
4   2024   5      Ja'Marr Chase  CIN      WR   24  17  16    0    0  ...   
11  2024  12   Justin Jefferson  MIN      WR   25  17  17    1    1  ...   
17  2024  18  Amon-Ra St. Brown  DET      WR   25  17  17    1    1  ...   
18  2024  19       Brian Thomas  JAX      WR   22  17  16    0    0  ...   
22  2024  23     Terry McLaurin  WAS      WR   29  17  17    0    0  ...   
25  2024  26       Drake London  ATL      WR   23  17  17    0    0  ...   
34  2024  35         Mike Evans  TAM      WR   31  14  14    0    0  ...   
36  2024  37       Malik Nabers  NYG      WR   21  15  13    0    1  ...   
39  2024  40        CeeDee Lamb  DAL      WR   25  15  15    0    0  ...   
41  2024  42   Courtland Sutton  DEN      WR   29  17  13    2    2  ...   

    Yds.2    Y/R  TD.2  Fmb  FL  TD.3  14:00:00  2PP    PPR      PPR/G  
4    1708  13.45    17    0   0    17       0.0  0.0  403.0  23.705882  
11   1533  14.88 

In [3]:
# Copy the 2024 data into separate dataframe.
df_2024 = df[df['YEAR'] == 2024].copy()
player_names_2024 = df_2024['Player'].reset_index(drop=True)
df_2024 = df_2024.drop(columns=['Player'])

print(df_2024.head(10))

    YEAR  Rk   Tm FantPos  Age   G  GS  Cmp  Att  Yds  ...  Yds.2    Y/R  \
4   2024   5  CIN      WR   24  17  16    0    0    0  ...   1708  13.45   
11  2024  12  MIN      WR   25  17  17    1    1   22  ...   1533  14.88   
17  2024  18  DET      WR   25  17  17    1    1    7  ...   1263  10.98   
18  2024  19  JAX      WR   22  17  16    0    0    0  ...   1282  14.74   
22  2024  23  WAS      WR   29  17  17    0    0    0  ...   1096  13.37   
25  2024  26  ATL      WR   23  17  17    0    0    0  ...   1271  12.71   
34  2024  35  TAM      WR   31  14  14    0    0    0  ...   1004  13.57   
36  2024  37  NYG      WR   21  15  13    0    1    0  ...   1204  11.05   
39  2024  40  DAL      WR   25  15  15    0    0    0  ...   1194  11.82   
41  2024  42  DEN      WR   29  17  13    2    2   30  ...   1081  13.35   

    TD.2  Fmb  FL  TD.3  14:00:00  2PP    PPR      PPR/G  
4     17    0   0    17       0.0  0.0  403.0  23.705882  
11    10    1   0    10       0.0  0.0  317.5

In [4]:
# Shift to represent the following year's points per game
df['NextYearPPR/G'] = df.groupby('Player')['PPR/G'].shift(-1)

# Remove rows where the target is NaN (i.e., no following year data)
df = df[df['NextYearPPR/G'].notna()]

print(df.size)

18540


In [27]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Define features and target.
feature_names = ['Age', 'Tgt', 'Rec', 'Yds.2', 'PPR/G']
target = 'NextYearPPR/G'

# Split the data into training, validation, and test sets
X = df[feature_names]
y = df[target]
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
print(f'{X_train}')
print(f'{y_train}')
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
X_2024 = df_2024[feature_names]

# Standardize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)
X_2024 = scaler.transform(X_2024)

# Ensure X_train, X_val, X_test, and X_2024 are correctly shaped for LSTM
X_train = X_train.reshape(X_train.shape[0], 1, X_train.shape[1])
X_val = X_val.reshape(X_val.shape[0], 1, X_val.shape[1])
X_test = X_test.reshape(X_test.shape[0], 1, X_test.shape[1])
X_2024 = X_2024.reshape(X_2024.shape[0], 1, X_2024.shape[1])

# Check to see standardized data.
#print(f'Size {y_train.size}')
#print(f'Size {X_val.size}')
#print(f'Size {y_val.size}')

      Age  Tgt  Rec  Yds.2      PPR/G
1034   29   69   45    556  15.244444
1625   27   64   42    651  10.221429
1025   25   94   66    770  11.642857
2076   24   83   61    631   8.043750
109    32  121   70    744  12.293333
...   ...  ...  ...    ...        ...
214    28   67   33    375   7.425000
333    31  175  103   1144  15.611765
881    29   44   20    282   3.262500
1445   27   32   20    338  13.160000
324    25  105   75   1342  15.575000

[370 rows x 5 columns]
1034    15.318750
1625     8.612500
1025    11.318750
2076     8.866667
109     21.453846
          ...    
214     11.746154
333     19.735294
881      8.147059
1445    15.500000
324     13.400000
Name: NextYearPPR/G, Length: 370, dtype: float64


In [28]:
from nnclass.simple_nn import SimpleLSTM
from torch.utils.data import TensorDataset, DataLoader
import torch

# Create SimpleLTSM.
input_size = X_train.shape[2]
hidden_size = 32
output_size = 1
lstm = SimpleLSTM(input_size, hidden_size, output_size)

X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val.values, dtype=torch.float32).view(-1, 1)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [29]:
import torch.optim as optim
import torch.nn as nn

# Train the model.
criterion = nn.MSELoss()
optimizer = optim.Adam(lstm.parameters())

num_epochs = 750
count = 0
for epoch in range(num_epochs):
    lstm.train() # Set model to training mode
    for inputs, labels in train_loader:
        # Forward pass
        outputs = lstm(inputs)
        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        count += 1
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')
    print(f'Count: {count}')

Epoch [1/750], Loss: 127.4833
Count: 12
Epoch [2/750], Loss: 121.2383
Count: 24
Epoch [3/750], Loss: 116.1457
Count: 36
Epoch [4/750], Loss: 157.8345
Count: 48
Epoch [5/750], Loss: 115.1941
Count: 60
Epoch [6/750], Loss: 151.1011
Count: 72
Epoch [7/750], Loss: 170.7756
Count: 84
Epoch [8/750], Loss: 101.7715
Count: 96
Epoch [9/750], Loss: 156.7645
Count: 108
Epoch [10/750], Loss: 91.4719
Count: 120
Epoch [11/750], Loss: 132.3840
Count: 132
Epoch [12/750], Loss: 104.3698
Count: 144
Epoch [13/750], Loss: 125.1248
Count: 156
Epoch [14/750], Loss: 124.1508
Count: 168
Epoch [15/750], Loss: 92.7383
Count: 180
Epoch [16/750], Loss: 113.3530
Count: 192
Epoch [17/750], Loss: 107.9936
Count: 204
Epoch [18/750], Loss: 72.3917
Count: 216
Epoch [19/750], Loss: 83.3252
Count: 228
Epoch [20/750], Loss: 77.5191
Count: 240
Epoch [21/750], Loss: 83.0116
Count: 252
Epoch [22/750], Loss: 68.1567
Count: 264
Epoch [23/750], Loss: 57.2378
Count: 276
Epoch [24/750], Loss: 48.3545
Count: 288
Epoch [25/750], Lo

In [30]:
# Evaluate the model.
lstm.eval() # Set model to eval mode
total_loss = 0
for inputs, labels in val_loader:
    # Forward pass
    outputs = lstm(inputs)
    loss = criterion(outputs, labels)
    total_loss += loss
print(f'Evaluation, Total Loss: {total_loss.item():.4f}')

Evaluation, Total Loss: 31.1009


In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1)
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)

y_train_pred = lstm(X_train_tensor).detach().numpy()
y_test_pred = lstm(X_test_tensor).detach().numpy()

# Get the diff values for evaluation.
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
mae_train = mean_absolute_error(y_train, y_train_pred)
r2_train = r2_score(y_train, y_train_pred)

rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
mae_test = mean_absolute_error(y_test, y_test_pred)
r2_test = r2_score(y_test, y_test_pred)

print(f'Training RMSE: {rmse_train:.4f}, MAE: {mae_train:.4f}, R²: {r2_train:.4f}')
print(f'Testing RMSE: {rmse_test:.4f}, MAE: {mae_test:.4f}, R²: {r2_test:.4f}')

Training RMSE: 2.9204, MAE: 2.2155, R²: 0.5565
Testing RMSE: 3.2233, MAE: 2.4749, R²: 0.5042


In [32]:
# Return results using model.
predictions = lstm(torch.tensor(X_2024, dtype=torch.float32)).detach().numpy()
predictions_df = pd.DataFrame(predictions, columns=['Prediction'])
predictions_df['Player'] = player_names_2024
print(f'{predictions_df}')

     Prediction              Player
0     16.548477       Ja'Marr Chase
1     16.039267    Justin Jefferson
2     14.806870   Amon-Ra St. Brown
3     12.582546        Brian Thomas
4     14.714628      Terry McLaurin
..          ...                 ...
113    8.018454           DJ Turner
114    4.933768  Jordan Whittington
115    7.055645  Cedrick Wilson Jr.
116    5.482774    Malik Washington
117    5.606704        Derius Davis

[118 rows x 2 columns]


In [33]:
# Save the new DataFrame to an Excel file
file_path = 'results/simple_lstm.xlsx'
predictions_df.to_excel(file_path, index=False)