In [1]:
import os
import pickle
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from sklearn.metrics import mean_squared_error, mean_absolute_error

import train
from model import NNModelEx, NNModelBCE

# Raise the column display limit so wide frames are rendered without eliding columns.
pd.options.display.max_columns = 999

In [2]:
# For this model, data preprocessing is already complete except for scaling,
# so scaling is the only step that still needs to be done here.

In [3]:
def get_ref_X_y(df):
    """Split a frame into feature columns and target columns by name prefix.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are strings.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` holds every column whose name starts with
        ``'tc2x_'`` and ``y`` holds every column whose name starts with ``'y'``.
        Column order follows ``df.columns``.
    """
    feature_names = []
    target_names = []
    for name in df.columns:
        # The two prefixes cannot both match one name, so each column lands in
        # at most one list.
        if name.startswith('tc2x_'):
            feature_names.append(name)
        if name.startswith('y'):
            target_names.append(name)
    return df[feature_names], df[target_names]

In [4]:
# Cache of loaded frames, keyed by the string produced by dataset_key().
raw_data = {}


def dataset_key(dataset='', validation=False):
    '''
    Build the cache key (and pickle file stem) for one split of a dataset.

    Parameters
    ----------
    dataset : str
        prefix placed in front of the split name, e.g. '', 'sample_', 'secret_' or 't2/'

    validation : bool
        if true the split name is 'test', otherwise 'train'

    Returns
    -------
    str
        dataset prefix followed by 'test' or 'train'
    '''
    return dataset+('test' if validation else 'train')


def load_data(raw, dataset='', validation=False):
    '''
    Return dataframe matching data set and validation. Dictionary input will be updated.

    Parameters
    ----------
    raw : dict
        dictionary which caches the dataframes and will be updated accordingly

    dataset : str
        which dataset to use? this is a prefix for the file name under ./data/:
        empty str for full set, sample_, secret_, or a sub-directory such as t2/

    validation : bool
        load validation set? if true then use _test, otherwise use _train.  Note secret_ doesn't have _train

    Returns
    -------
    object
        whatever is cached under the key; on a cache miss this is the object
        unpickled from ./data/<key>.pkl
    '''
    # Reuse the shared helper so the cache key is defined in exactly one place.
    key = dataset_key(dataset, validation)
    if key not in raw:
        print(f"Loading data to cache for: {key}")
        # NOTE: unpickling can execute arbitrary code; only point this at trusted files.
        raw[key] = pd.read_pickle(f'./data/{key}.pkl')
    return raw[key]

In [5]:
# Single settings dictionary passed to the helpers in `train`.
configurations = {
    # Prefix handed to load_data(); other values noted by the author: '', 'sample_', 'secret_'
    'dataset': 't2/',
    'model_identifier': 'tc2_2',
    'model_path': './models',
    'model': NNModelBCE,
    'device': 'cpu',
    'random_seed': 0,
    'lr': 3e-3,
    'weight_decay': 0.3,  # author's note: Adam
    'max_epochs': 50000,
    'do_validate': True,
    # Eight identical ('l', (500,)) + ('r', (True,)) pairs.
    # NOTE(review): presumably 'l' is a 500-unit linear layer and 'r' an activation — confirm in model.py.
    'model_definition': [('l', (500,)), ('r', (True,))] * 8,
    'train_params': dict(
        batch_size=10000,
        shuffle=True,
        num_workers=3,
        pin_memory=True,
    ),
    'test_params': dict(
        batch_size=200000,
        num_workers=1,
        pin_memory=True,
    ),
}

In [6]:
%%time

# Load both splits through the cache; the train split (validation=False) is read first, then test.
train_df, test_df = (
    load_data(raw_data, dataset=configurations['dataset'], validation=is_validation)
    for is_validation in (False, True)
)

# Separate each split into its 'tc2x_*' feature columns and 'y*' target columns.
(X_train, y_train), (X_test, y_test) = (
    get_ref_X_y(train_df),
    get_ref_X_y(test_df),
)

Loading data to cache for: t2/train
Loading data to cache for: t2/test
CPU times: user 150 ms, sys: 349 ms, total: 499 ms
Wall time: 498 ms


In [7]:
# Fetch the network, loss function, optimizer, loss history and epoch counter for this configuration.
# NOTE(review): the output recorded for this cell is
#   "ValueError: The truth value of a DataFrame is ambiguous".
# X_train is the only DataFrame passed in this cell, so it is presumably evaluated in a
# boolean context somewhere inside `train` — confirm and fix it there.
# NOTE(review): the meaning of the third positional argument (False) is not visible from this notebook.
net, loss_func, optimizer, mean_losses, next_epoch = train.load_model_with_config(configurations, X_train, False)

# Replace the returned optimizer with a fresh AdamW over the same parameters.
# lr here (5e-4) differs from configurations['lr'] (3e-3); weight_decay matches configurations['weight_decay'].
optimizer = torch.optim.AdamW(net.parameters(), lr=5e-4, weight_decay=3e-1, eps=1e-8, amsgrad=False)

# Save everything back with the new optimizer; the stored epoch counter is advanced by one.
train.save_model_with_config(
    configurations,
    net=net,
    loss_func=loss_func,
    optimizer=optimizer,
    mean_losses=mean_losses,
    next_epoch=next_epoch + 1,
)


ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [None]:
# Run training through the project helper; it hands back the model and its loss history.
# NOTE(review): force_train=False presumably allows reuse of an existing saved model — confirm in train.py.
model, mean_losses = train.train_model(
    X_train, y_train,
    X_test, y_test,
    configurations,
    force_train=False,
)

In [None]:
# Reload the saved model together with its recorded loss history (the other returned values are ignored).
model, _, _, mean_losses, _ = train.load_model_with_config(configurations)

# mean_losses is unpacked as a sequence of (training loss, validation loss) pairs.
tl, vl = zip(*mean_losses)

fig,ax = plt.subplots()
ax.plot(tl, label="Training Loss")
ax.plot(vl, label="Validation Loss")

# Label the figure so it can be read on its own when the notebook is skimmed.
ax.set_xlabel("Epoch")  # NOTE(review): assumes one mean_losses entry per epoch — confirm in train.py
ax.set_ylabel("Mean loss")
ax.set_title(f"Loss history: {configurations['model_identifier']}")

fig.legend()
plt.show()

In [None]:
# Alias the `model` bound by the earlier cells; the prediction cell below reads `trained_model`.
trained_model = model

In [None]:
# Predict on the training split and wrap the result in a frame labelled with the target column names.
y_train_pred = train.predict(
    trained_model, X_train, y_train, device="cpu"
)
y_train_pred_df = pd.DataFrame(data=y_train_pred, columns=y_train.columns)

# Same for the test split.
y_test_pred = train.predict(
    trained_model, X_test, y_test, device="cpu"
)
y_test_pred_df = pd.DataFrame(data=y_test_pred, columns=y_test.columns)

In [None]:
# Report both error metrics for the training split, one per line.
print(
    *(
        f'    Train set {tag} loss: {metric_fn(y_train, y_train_pred_df)}'
        for tag, metric_fn in (
            ('MAE (L1)', mean_absolute_error),
            ('MSE (L2)', mean_squared_error),
        )
    ),
    sep='\n',
)

# Optional fixed sample of rows (disabled):
# random.seed(0)
# sample = random.sample(list(y_train_pred_df.index), 10)

# Targets and predictions are shown as stored (normalized).
print("Train - Ground Truth (normalized):")
display(y_train)
# De-normalized view (disabled):
# print("Train - Ground Truth (non-normalized):")
# display(normalize_data.normalize_all_columns(y_train.iloc[:,3:].loc[sample].copy(), reverse=True))  # see ground truths
print("Train - Prediction (normalized):")
display(y_train_pred_df)
# De-normalized view (disabled):
# print("Train - Prediction (non-normalized):")
# display(normalize_data.normalize_all_columns(y_train_pred_df.loc[sample].copy(), reverse=True))  # See predictions

In [None]:
# Per-column summary statistics of the training-split predictions.
y_train_pred_df.describe()

In [None]:
# Report both error metrics for the test split.
print(f'    Test set MAE (L1) loss: {mean_absolute_error(y_test, y_test_pred_df)}')
print(f'    Test set MSE (L2) loss: {mean_squared_error(y_test, y_test_pred_df)}')

# Seed kept for the (currently disabled) row sampling below.
random.seed(0)
# sample = random.sample(list(y_test_pred_df.index), 10)
# sample = [0,1]

# Show the test frames themselves. The earlier version displayed the `.loc`
# indexer object of the *training* frames under "Train" headings.
print("Test - Ground Truth (normalized):")
display(y_test)
# print("Test - Ground Truth (non-normalized):")
# display(normalize_data.normalize_all_columns(y_test.iloc[:,3:].loc[sample].copy(), reverse=True))  # see ground truths
print("Test - Prediction (normalized):")
display(y_test_pred_df)
# print("Test - Prediction (non-normalized):")
# display(normalize_data.normalize_all_columns(y_test_pred_df.loc[sample].copy(), reverse=True))  # See predictions