In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from collections import OrderedDict
from ipynb.fs.full.UtilFunctions import format_season
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


In [2]:
position_to_number = {
    "PG": 0,
    "SG": 1,
    "SF": 2,
    "PF": 3,
    "C": 4
}

In [3]:
def pre_process(seasons):
    df = None
    for season in seasons:
        cols_to_drop = None
        cols_to_keep = ['Player', 'Pos', 'Tm', 'TRB%', 'AST%', 'DRB%', 'ORB%', 'BLK%', '3PAr', '3PA', 'TOV%', 'STL%', 'PF', 'FGA', 'DBPM', '3P%', 'PTS', 'FTr']
        first_year, second_year = format_season(season)
        player_data = pd.read_csv(f"DataCollection/Player_Stats/player_stats_{first_year}-{second_year}.csv")
        if season > 1997:
            cols_to_drop = ['index', 'TRB', 'DRB', 'ORB', 'AST', 'G', 'GS', '2PAr', 'FT', 'FG', 'FG%', 'BLK', 'STL', 'Dunks', 
                                  'Heaves', 'HeavesAttempted', 'WS', 'FTA', 'TOV', 'Age', '2P', '3P', 'VORP', 'FGA', '3PA', '2PA', 'PF', 'PTS', 'OWS', 'DWS', 'OBPM', 'DBPM', 'BPM']
            shooting_data = pd.read_csv(f"DataCollection/Player_Shooting_Stats/Regular_Season/player_shooting_stats_{first_year}-{second_year}.csv")
            player_data = pd.merge(player_data, shooting_data.loc[:,~shooting_data.columns.isin(['Pos', 'Age', 'G', 'MP', 'FG%', "3PAr", "2P%", "3P%"])], on=['Player', 'Tm'])

        else:
            cols_to_drop = ['index', 'TRB', 'DRB', 'ORB', 'AST', 'G', 'GS', 'FT', 'FG', 'FG%', 'BLK', 'STL',
                                'WS', 'FTA', 'TOV', 'Age', '2P', '3P', 'VORP', 'FGA', '3PA', '2PA', 'PF', 'PTS', 'OWS', 'DWS', 'OBPM', 'DBPM', 'BPM']
            
        player_data = player_data.fillna(0)
        player_data = player_data[(player_data["MP"] > 15) & (player_data["G"] >= 30)]
        player_data = player_data.sort_values(by=['G'], ascending=False)
        player_data = player_data[player_data["Tm"] != "TOT"]
        player_data = player_data.drop_duplicates(subset ="Player",keep = "first")
        player_data.reset_index(inplace=True)
        player_data = player_data[cols_to_keep]        

        if df is None:
            df = player_data
        else:
            df = pd.concat([df, player_data])
            
    df.reset_index(inplace=True)
    df.drop('index', axis=1, inplace=True)
    return df

In [4]:
#def train_test_split(seasons, training_seasons):
 #   train_df = pre_process(seasons[:-1], training_seasons)
  #  test_df = pre_process(seasons[-1:], training_seasons)
   # return train_df.loc[:, ~train_df.columns.isin(['Pos'])], test_df.loc[:, ~test_df.columns.isin(['Pos'])], train_df['Pos'], test_df['Pos']

    

In [5]:
def model_by_year(seasons, training_seasons=4):
    output_layers = []
    accuracies = pd.DataFrame()
    for season in seasons:
        X_train, X_test, y_train, y_test = train_test_split(range(season,season+training_seasons+1), training_seasons)
        y_train = [position_to_number[y] for y in y_train.values]
        y_test = [position_to_number[y] for y in y_test.values]
        X_train = torch.tensor(X_train.values, dtype=torch.float32)
        X_test = torch.tensor(X_test.values, dtype=torch.float32)
        y_train = torch.tensor(y_train, dtype=torch.long)
        y_test = torch.tensor(y_test, dtype=torch.long)

        n_samples, n_features = X_train.shape
        model = nn.Sequential(OrderedDict([
                ('dense1', nn.Linear(n_features, 21)),
                ('act1', nn.ReLU()),
                ('dense2', nn.Linear(21, 10)),
                ('act1', nn.ReLU()),
                ('output', nn.Linear(10, 5))
            ]))

        optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
        loss_fn = nn.CrossEntropyLoss()
        for epoch in range(iterations):
            y_pred = model(X_train)
            loss = loss_fn(y_pred, y_train)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
        
        with torch.no_grad():
            y_pred = model(X_test)
            _, preds = torch.max(y_pred, 1)
            accuracies[season+training_seasons] = accuracy_score(preds, y_test)
            output_layers.append(model.output.weight) 
    return output_layers, accuracies

In [6]:
def generate_train_test_split(seasons):
    #Pre process data and select features/target
    df = pre_process(seasons)
    features = df.loc[:, ~df.columns.isin(["Player", "Pos", "Tm"])]
    target = df['Pos']

    #Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

    #Convert target to numbers
    y_train = [position_to_number[y] for y in y_train.values]   
    y_test = [position_to_number[y] for y in y_test.values]

    #Convert data to tensors
    X_train = torch.tensor(X_train.values, dtype=torch.float32)
    X_test = torch.tensor(X_test.values, dtype=torch.float32)
    y_train = torch.tensor(y_train, dtype=torch.long)
    y_test = torch.tensor(y_test, dtype=torch.long)

    return X_train, X_test, y_train, y_test

In [7]:
X_train, X_test, y_train, y_test = generate_train_test_split(range(2000,2023))

In [8]:
n_samples, n_features = X_train.shape

In [9]:
model = nn.Sequential(OrderedDict([
    ('dense1', nn.Linear(n_features, 512)),
    ('act1', nn.ReLU()),
    ('dense2', nn.Linear(512, 256)),
    ('act2', nn.ReLU()),
    ('dense3', nn.Linear(256, 128)),
    ('act3', nn.ReLU()),
    ('dense4', nn.Linear(128, 64)),
    ('act4', nn.ReLU()),
    ('dense5', nn.Linear(64, 32)),
    ('act5', nn.ReLU()),
    ('output', nn.Linear(32, 5))
]))

In [10]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [11]:
loss_fn = nn.CrossEntropyLoss()

In [12]:
iterations = 1200

In [13]:
for epoch in range(iterations):
    y_pred = model(X_train)
    loss = loss_fn(y_pred, y_train)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    if epoch % 100 == 0:
        print(f"epoch: {epoch}, loss: {loss.item():.3f}")


epoch: 0, loss: 1.614
epoch: 100, loss: 0.673
epoch: 200, loss: 0.620
epoch: 300, loss: 0.510
epoch: 400, loss: 0.401
epoch: 500, loss: 0.356
epoch: 600, loss: 0.511
epoch: 700, loss: 0.441
epoch: 800, loss: 0.402
epoch: 900, loss: 0.277
epoch: 1000, loss: 0.119
epoch: 1100, loss: 0.491


In [14]:
with torch.no_grad():
    y_pred = model(X_test)
    _, preds = torch.max(y_pred, 1)
    print(accuracy_score(preds, y_test))

0.6333853354134166


In [15]:
model_specialist = nn.Sequential(OrderedDict([
    ('dense1', nn.Linear(n_features, 512)),
    ('act1', nn.ReLU()),
    ('dense2', nn.Linear(512, 256)),
    ('act2', nn.ReLU()),
    ('dense3', nn.Linear(256, 128)),
    ('act3', nn.ReLU()),
    ('dense4', nn.Linear(128, 64)),
    ('act4', nn.ReLU()),
    ('dense5', nn.Linear(64, 32)),
    ('act5', nn.ReLU()),
    ('output', nn.Linear(32, 5))
]))

model_specialist.load_state_dict(model.state_dict())



<All keys matched successfully>

In [16]:
X_train_sp, X_test_sp, y_train_sp, y_test_sp = generate_train_test_split(range(2000, 2001))

In [17]:
optimizer_sp = torch.optim.Adam(model.parameters(), lr=0.01)

In [18]:
loss_fn_sp = nn.CrossEntropyLoss()

In [22]:
iterations_sp = 500

In [23]:
for epoch in range(iterations_sp):
    y_pred_sp = model_specialist(X_train_sp)
    loss = loss_fn(y_pred_sp, y_train_sp)
    loss.backward()
    optimizer_sp.step()
    optimizer_sp.zero_grad()

    if epoch % 100 == 0:
        print(f"epoch: {epoch}, loss: {loss.item():.3f}")


epoch: 0, loss: 0.542
epoch: 100, loss: 0.542
epoch: 200, loss: 0.542
epoch: 300, loss: 0.542
epoch: 400, loss: 0.542


In [21]:
with torch.no_grad():
    y_pred_sp = model(X_test_sp)
    _, preds = torch.max(y_pred_sp, 1)
    print(accuracy_score(preds, y_test_sp))

0.7735849056603774
