In [99]:
import os
import glob
from typing import Tuple, List, Union

import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from torch import optim

In [100]:
def get_data(_type: str) -> Tuple[torch.tensor, torch.tensor]:
    """Retrieve X, Y data from the proper directory. You can
    specify whether you want it to be pulled from /train, /dev, 
    or /test, and all the *-data.csv files will be loaded in.
    """
    
    if _type not in {'train', 'dev', 'test'}:
        msg = f"{_type} not supported. Try 'train', 'dev', or 'test'."
        raise RuntimeError(msg)
    
    df = pd.DataFrame()
    
    data_path = os.path.join('data', _type, '*-data.csv')
    for fp in glob.glob(data_path):
        # Cast to float because othewise we run into a type
        # mismatch error in PyTorch
        season_df = pd.read_csv(
            fp, index_col=[0, 1], header=[0, 1, 2], dtype='float32')
        
        df = df.append(season_df)
    
    features = df[['this', 'other']]
    scores = df['TEAM_PTS']
    msg = 'Uh oh, you might be losing features!'
    assert len(features.columns) + len(scores.columns) == len(df.columns), msg
    
    features = torch.from_numpy(features.values)
    scores = torch.from_numpy(scores.values)
    
    return features, scores

In [101]:
def log_two_layers(n: int) -> List[Union[nn.Linear, nn.ReLU]]:
    """Given an input of size n, construct a series
    of neural network layers that decrease logarithmically.
    """
    shift_bit_length = lambda x: 1 << (x - 1).bit_length() - 1
    
    layers = []
    while n > 4:
        power_of_two = shift_bit_length(n)
        layers.append( nn.Linear(n, power_of_two) )
        layers.append( nn.ReLU() )
        n = power_of_two
    # Now n == 4 and we add a final regression layer
    layers.append( nn.Linear(n, 1) )
    return layers

## Main Driver Logic

In [102]:
# Roughly follows https://pytorch.org/tutorials/beginner/nn_tutorial.html#

# Model hyperparameters
num_epochs = 200
batch_size = 100
learning_rate = 0.0001

# Load data as torch.tensors
x_train, y_train = get_data('train')
x_validate, y_validate = get_data('dev')

# Define our model layers by decreasing powers of two
model = nn.Sequential(*log_two_layers(x_train.shape[1]))
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

train_ds = TensorDataset(x_train, y_train)
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)

validate_ds = TensorDataset(x_validate, y_validate)
validate_dl = DataLoader(validate_ds, batch_size=batch_size * 2)

# L1 loss more robust to outliers
# loss_func = F.l1_loss
loss_func = F.mse_loss

for epoch in range(num_epochs):
    model.train()
    for xb, yb in train_dl:
        pred = model(xb)
        loss = loss_func(pred, yb)
        
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    model.eval()
    with torch.no_grad():
        validate_loss = sum(loss_func(model(xb), yb) for xb, yb in validate_dl)

    # Print epoch number and average validation loss
    print(epoch, validate_loss / len(validate_dl))

In [103]:
x_test, y_test = get_data('test')

pred = model(x_test)

In [105]:
pred

tensor([[101.3214],
        [101.3214],
        [101.3214],
        ...,
        [101.3214],
        [101.3214],
        [101.3214]], grad_fn=<AddmmBackward>)