In [393]:
import torch
import torch.nn as nn
import pandas as pd
from sklearn.model_selection import train_test_split

In [394]:
# converts 'MM:SS' strings to float minutes
def convert_mp(mp):
    """Convert a minutes-played value to float minutes.

    Args:
        mp: Either a string ('MM:SS' or a plain number such as '34.5'),
            or an int/float that is already a minute count.

    Returns:
        The value as float minutes, or None when it cannot be interpreted
        (unsupported type, non-numeric text, or a string with more than
        one ':').
    """
    # already numeric: just normalise the type
    if isinstance(mp, (int, float)):
        return float(mp)
    if not isinstance(mp, str):
        return None
    try:
        if ':' in mp:
            # unpacking raises ValueError unless there are exactly two parts
            mins, secs = mp.split(':')
            return int(mins) + int(secs) / 60
        return float(mp)
    except ValueError:
        # only parsing failures mean "unparseable"; anything else
        # (e.g. KeyboardInterrupt) should propagate instead of being swallowed
        return None


In [395]:
# import data
data = pd.read_csv('../data-collection/bbref_players_games_simple 2/g/gilgesh01_Shai_Gilgeous-Alexander_last3.csv')

# make sure games are ordered oldest -> newest
# change 'Date' if your CSV uses a different column for game order
if 'Date' in data.columns:
    data = data.sort_values('Date')

# engineer simple context features directly on the DataFrame (do not modify source CSV)
# NOTE(review): is_home / result_win are computed here but are not listed in
# `features` below, so the model never sees them.
# is_home: 1 if home, 0 if away
data['is_home'] = (data['Unnamed: 5'] != '@').astype(float)
# result_win: 1 if team won, 0 if lost
data['result_win'] = data['Result'].str.startswith('W').astype(float)

# features (from previous game) and target (binary above/below average)
features = [
    'MP', 'FGA', '3PA', 'FTA',
    'FG%', '3P%', '2P%', 'eFG%', 'FT%',
    'TRB', 'AST', 'TOV',
    'GmSc', '+/-'
 ]

# convert MP to float minutes
data['MP'] = data['MP'].apply(convert_mp)

# convert all feature and target columns to numeric
# (non-numeric cells are coerced to NaN)
for col in features + ['PTS']:
    data[col] = pd.to_numeric(data[col], errors='coerce')

# use previous game's stats as features: shift by 1 row
data[features] = data[features].shift(1)

current_ppg = 32.4
# NaN > x evaluates to False, so without the mask a row with missing PTS
# would be labelled 0.0 ("below") and survive the dropna below.
# Keep the label missing wherever PTS itself is missing.
data['above_ppg'] = (data['PTS'] > current_ppg).astype(float).where(data['PTS'].notna())

# target is now binary
target = ['above_ppg']

# drop first row and any row with NaNs in prev-game features or target
data = data.dropna(subset=features + target)

# use ALL remaining games
X_np = data[features].values
# `target` is a list, so this stays 2-D: one column per row
y_np = data[target].values

# 60 train / 20 test if you really want fixed counts:
# NOTE(review): train_test_split shuffles by default, so the test games are
# not necessarily the most recent ones, and with fixed counts it raises if
# fewer than 80 rows remain -- confirm both are intended.
X_train_np, X_test_np, y_train_np, y_test_np = train_test_split(
    X_np, y_np, train_size=60, test_size=20, random_state=42
)

# convert to torch tensors
X_train = torch.tensor(X_train_np, dtype=torch.float32)
y_train = torch.tensor(y_train_np, dtype=torch.float32)
X_test = torch.tensor(X_test_np, dtype=torch.float32)
y_test = torch.tensor(y_test_np, dtype=torch.float32)

In [396]:
# define model, loss function, and optimizer
# Fix the RNG seed first: nn.Linear draws its initial weights from torch's
# global generator, so without this the printed weights (and everything
# computed from the model afterwards) change on every fresh run.
torch.manual_seed(42)

# single linear layer + sigmoid, i.e. logistic regression on the input features
model = nn.Sequential(
    nn.Linear(X_train.shape[1], 1),
    nn.Sigmoid()
)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=1e-4)

# number of epochs
epochs = 1000

# training loop: full-batch gradient step on the whole training set each epoch
for _ in range(epochs):
    optimizer.zero_grad()
    loss = criterion(model(X_train), y_train)
    loss.backward()
    optimizer.step()

print("Weights:", model[0].weight.data)
print("Bias:", model[0].bias.data)

Weights: tensor([[ 0.0605, -0.1945,  0.1250,  0.0464, -0.1524, -0.4987,  0.3967, -0.4800,
          0.5223, -0.1544, -0.0459,  0.2977,  0.0533,  0.0152]])
Bias: tensor([-0.7449])


In [None]:
# run the model on the test inputs without tracking gradients
with torch.no_grad():
    y_pred = model(X_test)

# turn model outputs and 0/1 targets into labels using a 0.5 cut-off
predicted_above_below = [
    "Above Average" if prob.item() > 0.5 else "Below Average"
    for prob in y_pred
]
actual_above_below = [
    "Above Average" if truth.item() > 0.5 else "Below Average"
    for truth in y_test
]

# tally matches vs. mismatches between predicted and actual labels
correct = sum(
    guess == truth
    for guess, truth in zip(predicted_above_below, actual_above_below)
)
incorrect = len(y_test) - correct

# report as "<correct>-<incorrect>"
print(f"{correct}-{incorrect}")
# accuracy is hovering right around 50%
# we will want to include the option to run against
# the sports betting line as well and compare to odds and
# implied probabilities and return on investment

10-10
