In [1]:
# !gdown https://drive.google.com/file/d/1Hv4RAltBumSfOkRacoX8qrfDYfd_NDss/view?usp=drive_link --fuzzy

In [2]:
# !unzip Dataset_AML_Assignment1_Part1.zip

In [3]:
# from google.colab import drive
# drive.mount('/content/drive')

In [9]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch import optim
from torch.utils.data import Dataset, DataLoader, random_split, ConcatDataset
import torch.nn.functional as F
from IPython import display 
from tqdm.notebook import tqdm
import random
import math, time, os
from matplotlib import pyplot as plt
import pickle as pkl


# Prefer the second GPU, but only when the machine actually exposes one; otherwise use
# the first GPU, and fall back to the CPU when CUDA is unavailable.
if torch.cuda.is_available():
    device = torch.device('cuda:1' if torch.cuda.device_count() > 1 else 'cuda:0')
else:
    device = torch.device('cpu')
print(device)

# Checkpoints, plots and pickles written by this notebook all go under `prefix`;
# create it up front so the first write on a fresh checkout does not fail.
prefix = './data_dump_lstm_rerun'
os.makedirs(prefix, exist_ok=True)

cuda:1


In [2]:
class SinusodialDatasetSequential(Dataset):
    """Sliding-window dataset over the rows of a DataFrame.

    Sample ``i`` is the block of rows ``i .. i + window_size - 1`` restricted to the
    first ``n_features`` columns, paired with the label (column ``mode``) of the
    LAST row of that block.
    """

    def __init__(self, df, mode='era', window_size=10, n_features=26):
        """Capture features and labels from ``df``.

        Args:
            df: DataFrame whose first ``n_features`` columns are numeric inputs and
                which contains a column named ``mode`` holding int-convertible labels.
            mode: name of the label column.
            window_size: number of consecutive rows per sample (must be >= 1).
            n_features: how many leading columns are used as features.

        Raises:
            ValueError: if ``window_size`` is smaller than 1.
        """
        if window_size < 1:
            raise ValueError(f"window_size must be >= 1, got {window_size}")

        self.X = df.iloc[:, :n_features]
        self.y = df[f'{mode}']
        self.window_size = window_size

        # Convert once here instead of slicing the DataFrame on every __getitem__;
        # per-item pandas indexing dominates data-loading time otherwise.
        self._features = torch.tensor(self.X.values, dtype=torch.float32)
        self._labels = torch.tensor([int(v) for v in self.y.tolist()], dtype=torch.long)

    def __len__(self):
        # Fewer rows than one window means "no samples", never a negative length
        # (a negative value makes len() raise ValueError).
        return max(0, self._features.shape[0] - self.window_size + 1)

    def __getitem__(self, idx):
        """Return ``(X, y)``: a ``(window_size, n_features)`` float32 window and the
        int64 label of the window's last row.

        Negative indices count from the end; out-of-range indices raise IndexError
        instead of silently yielding a truncated window.
        """
        n_windows = len(self)
        idx = int(idx)
        if idx < 0:
            idx += n_windows
        if not 0 <= idx < n_windows:
            raise IndexError(f"window index out of range for dataset of length {n_windows}")

        end = idx + self.window_size
        # clone() so callers get independent tensors, as with the per-call construction.
        X = self._features[idx:end].clone()
        y = self._labels[end - 1].clone()
        return X, y

In [3]:
def make_data_splits_sequential(df, mode, batch_size=32, train_perc=0.7, val_test_perc=0.5,
                                window_size=10, seed=None):
    """Build train/val/test DataLoaders of sliding windows, split separately per era.

    For every distinct value of ``df['era']`` a windowed dataset is built and randomly
    split into train / val / test; the per-era pieces are then concatenated.

    NOTE: ``df`` is modified in place: column ``mode`` is replaced by integer codes
    (0..K-1 in order of first appearance) and the index is reset.

    NOTE(review): windows advance one row at a time, so neighbouring windows share
    most of their rows. Splitting windows at random therefore places near-duplicate
    samples in different splits; val/test metrics are likely optimistic.

    Args:
        df: DataFrame with an ``era`` column, the label column ``mode`` and the
            feature columns expected by ``SinusodialDatasetSequential``.
        mode: name of the label column.
        batch_size: batch size for all three loaders.
        train_perc: fraction of each era's windows assigned to training.
        val_test_perc: fraction of the remaining windows assigned to validation.
        window_size: rows per window, forwarded to the dataset.
        seed: optional int; when given, the splits and the training shuffle use a
            dedicated generator seeded with it. ``None`` uses the global RNG.

    Returns:
        ``(loader_train, loader_val, loader_test)``.

    Raises:
        ValueError: if no era has at least ``window_size`` rows.
    """
    # Encode labels as 0..K-1. A dict lookup replaces the linear list.index() scan.
    class_values = df[f'{mode}'].unique().tolist()
    class_to_index = {value: index for index, value in enumerate(class_values)}
    df[f'{mode}'] = df[f'{mode}'].apply(class_to_index.__getitem__)
    df.reset_index(drop=True, inplace=True)

    # Only pass a generator when seeding was requested, so the unseeded path is unchanged.
    rng_kwargs = {} if seed is None else {'generator': torch.Generator().manual_seed(seed)}

    train_dataset_list = []
    val_dataset_list = []
    test_dataset_list = []

    for era in df['era'].unique():
        df_era = df[df['era'] == era]

        # An era shorter than one window contributes no samples; skip it rather than
        # letting a non-positive dataset length break the split.
        if len(df_era) < window_size:
            continue

        data = SinusodialDatasetSequential(df_era, mode=mode, window_size=window_size)

        train_samples = int(len(data) * train_perc)
        val_test_samples = len(data) - train_samples
        data_train, data_holdout = random_split(data, [train_samples, val_test_samples], **rng_kwargs)

        val_samples = int(len(data_holdout) * val_test_perc)
        test_samples = len(data_holdout) - val_samples
        data_val, data_test = random_split(data_holdout, [val_samples, test_samples], **rng_kwargs)

        train_dataset_list.append(data_train)
        val_dataset_list.append(data_val)
        test_dataset_list.append(data_test)

    if not train_dataset_list:
        raise ValueError(f"no era has at least window_size={window_size} rows; nothing to split")

    dataset_train = ConcatDataset(train_dataset_list)
    dataset_val = ConcatDataset(val_dataset_list)
    dataset_test = ConcatDataset(test_dataset_list)

    print("Train-val-test lengths: ", len(dataset_train), len(dataset_val), len(dataset_test))

    loader_train = DataLoader(dataset_train, batch_size=batch_size, shuffle=True, **rng_kwargs)
    loader_val = DataLoader(dataset_val, batch_size=batch_size, shuffle=False)
    loader_test = DataLoader(dataset_test, batch_size=batch_size, shuffle=False)

    return loader_train, loader_val, loader_test

#LSTM

In [4]:
import torch.nn.init as init

class LSTMClassifier(nn.Module):
    """Stacked LSTM followed by a linear layer and log-softmax over classes.

    The model owns its Adam optimizer (``self.optimizer``) and a gradient-clipping
    threshold (``self.max_grad_norm``) for use by a training loop.
    """

    def __init__(self, input_dim, output_size, hidden_size = 64, num_layers = 4, lr=5e-4, weight_decay=1e-3,
                 max_grad_norm=1.0):
        """
        Args:
            input_dim: number of features per time step.
            output_size: number of classes.
            hidden_size: LSTM hidden width.
            num_layers: number of stacked LSTM layers.
            lr: Adam learning rate.
            weight_decay: Adam weight decay.
            max_grad_norm: gradient-norm threshold used by ``clip_gradients``;
                ``None`` disables clipping.
        """
        super().__init__()
        self.lstm = nn.LSTM(input_dim, hidden_size, num_layers, batch_first=True)
        self.classifier = nn.Linear(hidden_size, output_size)
        self.log_softmax = nn.LogSoftmax(dim=1)
        self.optimizer = optim.Adam(self.parameters(), lr=lr, weight_decay=weight_decay)

        # clip_grad_norm_ rescales gradients that already exist, so calling it here
        # (before any backward pass) did nothing. Keep the threshold instead and clip
        # between backward() and optimizer.step() via clip_gradients().
        self.max_grad_norm = max_grad_norm

        # Xavier-normal LSTM weights, zero LSTM biases; the linear head keeps its default init.
        for name, param in self.lstm.named_parameters():
            if 'weight' in name:
                nn.init.xavier_normal_(param.data)
            if 'bias' in name:
                nn.init.constant_(param.data, 0.0)

    def clip_gradients(self):
        """Clip the global gradient norm to ``self.max_grad_norm`` in place.

        Call after ``loss.backward()`` and before ``self.optimizer.step()``.
        Returns the pre-clipping total norm, or ``None`` when clipping is disabled.
        """
        if self.max_grad_norm is None:
            return None
        return torch.nn.utils.clip_grad_norm_(self.parameters(), max_norm=self.max_grad_norm)

    def forward(self, x):
        """Return per-class log-probabilities computed from the last time step.

        ``x`` is indexed as ``[:, -1, :]`` after the batch-first LSTM, i.e. it is
        consumed as (batch, time, features).
        """
        x, _ = self.lstm(x)
        return self.log_softmax(self.classifier(x[:, -1, :]))

In [5]:
def accuracy(y_pred, y_test, verbose=True):
    """Score a batch of class predictions.

    The predicted class of each row of ``y_pred`` is the index of its largest
    entry along dim 1; it is compared element-wise with ``y_test``.

    Returns:
        ``(fraction_correct, n_correct)`` where ``n_correct`` is a Python float.
        When ``verbose`` is true, ``n_correct`` and the batch size are printed.
    """
    _, predicted_labels = y_pred.max(dim=1)
    n_correct = predicted_labels.eq(y_test).float().sum().item()
    n_samples = y_test.shape[0]
    if verbose:
        print(n_correct, n_samples)
    return n_correct / n_samples, n_correct

In [6]:
def Test(net, loader_test, mode, noise_level, device='cpu', Loss=nn.NLLLoss(reduction='sum')):
    """Evaluate ``net`` on every batch of ``loader_test``.

    Args:
        net: model to evaluate; assumed to already live on ``device``.
        loader_test: iterable of ``(X, y)`` batches.
        mode, noise_level: accepted for call-site compatibility; not used here.
        device: device the batches are moved to.
        Loss: criterion returning the batch SUM of the loss (the accumulated value is
            divided by the sample count to obtain a per-sample loss).

    Returns:
        ``(mean_loss, accuracy)`` over all samples, or ``(nan, nan)`` when the
        loader yields no samples.
    """
    was_training = net.training
    net.eval()
    total_samples = 0
    correct_samples = 0
    loss = 0.0
    try:
        # No autograd graph is needed for evaluation; skipping it saves memory and time.
        with torch.no_grad():
            for (X, y) in loader_test:
                X = X.to(device)
                y = y.to(device)
                total_samples += y.shape[0]
                y_out = net(X)
                _, i_cor_sam = accuracy(y_out, y, verbose=False)
                correct_samples += i_cor_sam
                loss += Loss(y_out, y).item()
    finally:
        # Put the model back in whichever mode the caller had it in.
        net.train(was_training)

    if total_samples == 0:
        return float('nan'), float('nan')
    return loss / total_samples, correct_samples / total_samples

In [7]:
def Train(Net, data, mode, noise_level, epochs=20, lr=5e-2, Loss=nn.NLLLoss(reduction='sum'), verbose=False, device='cpu',
          val_ds=None, plot_accs=False, plot_losses=False, max_grad_norm=None):
    """Train ``Net`` with its own optimizer and checkpoint it after every epoch.

    Args:
        Net: model exposing ``Net.optimizer``.
        data: iterable of ``(X, y)`` training batches.
        mode, noise_level: used only to build the checkpoint file name.
        epochs: number of passes over ``data``.
        lr, plot_accs, plot_losses: accepted for backward compatibility; not used
            (the learning rate is fixed by ``Net.optimizer``).
        Loss: criterion returning the batch SUM of the loss.
        verbose: when true, also track and print training accuracy per epoch.
        device: device the model and batches are moved to.
        val_ds: optional validation loader; validation is skipped when ``None``.
        max_grad_norm: if given, gradients are clipped to this global norm before
            each optimizer step. When ``None``, ``Net.max_grad_norm`` is used if the
            model defines one; otherwise no clipping is applied.

    Returns:
        ``(Net, losses, accs, val_losses, val_accL)``; ``accs`` is empty unless
        ``verbose`` is true, and the validation lists are empty without ``val_ds``.
    """
    model_save_time = time.time()
    # The checkpoint is written under the notebook-level `prefix`; make sure it exists.
    os.makedirs(prefix, exist_ok=True)

    if max_grad_norm is None:
        max_grad_norm = getattr(Net, 'max_grad_norm', None)

    losses = []
    accs = []
    val_losses = []
    val_accL = []
    Net.to(device)
    for e in range(epochs):
        Net.train()
        tot_loss = 0.0
        start_time = time.time()
        correct_samples = 0
        total_samples = 0
        for (X, y) in data:
            X = X.to(device)
            y = y.to(device)
            total_samples += y.shape[0]
            y_out = Net(X)
            loss = Loss(y_out, y)
            Net.optimizer.zero_grad()
            loss.backward()
            if max_grad_norm is not None:
                # Clipping only has an effect here, between backward() and step().
                torch.nn.utils.clip_grad_norm_(Net.parameters(), max_norm=max_grad_norm)
            Net.optimizer.step()
            # detach() so the running total does not keep each batch's autograd graph alive.
            tot_loss += loss.detach()
            if verbose:
                _, i_cor_sam = accuracy(y_out, y, verbose=False)
                correct_samples += i_cor_sam
        t = time.time() - start_time

        # float() handles both the tensor total and the untouched 0.0 of an empty loader.
        l = float(tot_loss) / total_samples if total_samples else float('nan')
        losses += [l]
        if verbose:
            a = correct_samples / total_samples if total_samples else float('nan')
            accs += [a]
            print('Epoch %2d Loss: %2.5e Accuracy: %2.5f Epoch Time: %2.5f' %(e,l,a,t))

        if val_ds is not None:
            val_loss, val_acc = Test(Net, val_ds, mode, noise_level, device)
            val_losses.append(val_loss)
            val_accL.append(val_acc)

        # Same path every epoch (time stamp taken once above), so the file is overwritten.
        torch.save(Net.state_dict(), f'{prefix}/net_{noise_level}_{mode}_{str(model_save_time)}.pth')

    return Net, losses, accs, val_losses, val_accL

In [8]:
def plot_loss_acc(losses, accs, val_losses, val_accs, mode, noise_level):
    """Save train-vs-validation accuracy and loss curves as PNG files.

    Writes ``{prefix}/acc_{mode}_{noise_level}.png`` and
    ``{prefix}/loss_{mode}_{noise_level}.png`` (``prefix`` is the notebook-level
    output directory). Each sequence is plotted against its index.

    Args:
        losses, accs: per-epoch training loss / accuracy.
        val_losses, val_accs: per-epoch validation loss / accuracy.
        mode, noise_level: used only to build the file names.
    """
    os.makedirs(prefix, exist_ok=True)

    curves = (
        ('acc', 'Accuracy', accs, val_accs, 'Train accuracy', 'Val accuracy'),
        ('loss', 'Loss', losses, val_losses, 'Train loss', 'Val loss'),
    )
    for stem, y_label, train_values, val_values, train_label, val_label in curves:
        # Use a dedicated figure and close it, so nothing is drawn on (or left behind
        # in) pyplot's implicit current figure.
        fig, ax = plt.subplots()
        ax.plot(np.array(train_values), color='red', label=train_label)
        ax.plot(np.array(val_values), color='blue', label=val_label)
        ax.set_xlabel('Epoch')
        ax.set_ylabel(y_label)
        ax.legend()
        fig.savefig(f'{prefix}/{stem}_{mode}_{noise_level}.png')
        plt.close(fig)
    return

In [12]:
# Target GPU 2 only when the machine really exposes at least three GPUs; otherwise use
# the first GPU, or the CPU when CUDA is unavailable, so the cells below still run.
if torch.cuda.is_available():
    device = "cuda:2" if torch.cuda.device_count() > 2 else "cuda:0"
else:
    device = "cpu"

In [16]:
# One CSV per noise level; only the noise-free dataset is currently enabled.
df_paths = ['../Datasets/df_syn_train_0_0_.csv',
            # '../Datasets/df_synA_train_shuffled.csv',
            # '../Datasets/df_synA_test_hard_shuffled_sample.csv'
            ]

modes = ['era', 'target_5_val', 'target_10_val']   # label columns, one model each
noise_levels = ['none', 'low', 'high']             # indexed by dataset (same order as df_paths)
epoch_num = [30, 30, 30]                           # indexed by mode
# epoch_num = [50, 10, 10]
batch_sizes = [32, 128, 128]                       # indexed by dataset
# lrs = [1e-4, 1e-3, 5e-3]
learning_rate = 1e-4
n_features = 26                                    # leading feature columns fed to the LSTM

# Test metrics: one inner list per dataset, one entry per mode.
losses_arr = []
accs_arr = []

# Iterate over whatever is enabled in df_paths instead of a hard-coded count.
for i, df_path in enumerate(df_paths):
    df = pd.read_csv(df_path)
    df.sort_values(by=['era', 'day', 'row_num'], inplace=True)
    noise_level = noise_levels[i]

    loss_per_mode = []
    acc_per_mode = []

    for j, label_column in enumerate(modes):
        epochs = epoch_num[j]
        print("Noise Level:", noise_level, "Mode:", label_column)
        loader_train, loader_val, loader_test = make_data_splits_sequential(
            df, mode=label_column, batch_size=batch_sizes[i])

        # Single multi-class classifier: one log-softmax output per distinct label.
        net = LSTMClassifier(n_features, len(df[f'{label_column}'].unique()), lr=learning_rate)

        # `mode` is the run tag used in checkpoint / plot file names and in the log line.
        mode = label_column + '_lstm'
        net, losses, accs, val_losses, val_accL = Train(net, loader_train, mode, noise_level,
                                                        epochs=epochs, verbose=True, device=device,
                                                        val_ds=loader_val, plot_accs=True, plot_losses=True)
        plot_loss_acc(losses, accs, val_losses, val_accL, mode, noise_level)

        ########### Testing code
        test_loss, test_acc = Test(net, loader_test, mode, noise_level, device=device)
        print("Noise Level:", noise_level, "Mode:", mode, "Test Acc:", test_acc)
        loss_per_mode.append(test_loss)
        acc_per_mode.append(test_acc)

    losses_arr.append(loss_per_mode)
    accs_arr.append(acc_per_mode)

Noise Level: none Mode: era
Train-val-test lengths:  5376 1152 1164
Epoch  0 Loss: 2.48660e+00 Accuracy: 0.08333 Epoch Time: 1.59145
Epoch  1 Loss: 2.48596e+00 Accuracy: 0.08333 Epoch Time: 1.54746
Epoch  2 Loss: 2.48566e+00 Accuracy: 0.08333 Epoch Time: 1.50343
Epoch  3 Loss: 2.48548e+00 Accuracy: 0.08333 Epoch Time: 1.56626
Epoch  4 Loss: 2.48532e+00 Accuracy: 0.07812 Epoch Time: 1.54669
Epoch  5 Loss: 2.48518e+00 Accuracy: 0.08333 Epoch Time: 1.43904
Epoch  6 Loss: 2.48494e+00 Accuracy: 0.07943 Epoch Time: 1.60418
Epoch  7 Loss: 2.48271e+00 Accuracy: 0.09877 Epoch Time: 1.54509
Epoch  8 Loss: 2.39782e+00 Accuracy: 0.16778 Epoch Time: 1.49032
Epoch  9 Loss: 2.21412e+00 Accuracy: 0.22526 Epoch Time: 1.53412
Epoch 10 Loss: 2.10641e+00 Accuracy: 0.25502 Epoch Time: 1.46828
Epoch 11 Loss: 2.04188e+00 Accuracy: 0.26451 Epoch Time: 1.58715
Epoch 12 Loss: 1.98598e+00 Accuracy: 0.27679 Epoch Time: 1.55588
Epoch 13 Loss: 1.92521e+00 Accuracy: 0.28330 Epoch Time: 1.52156
Epoch 14 Loss: 1.87457

<Figure size 640x480 with 0 Axes>

In [10]:
# Same experiment as the lr=1e-4 run, with a much smaller learning rate.
# NOTE: plot files and the losses_arr / accs_arr lists are keyed only by mode and noise
# level, so running this cell replaces the results of a run with another learning rate.
# One CSV per noise level; only the noise-free dataset is currently enabled.
df_paths = ['../Datasets/df_syn_train_0_0_.csv',
            # '../Datasets/df_synA_train_shuffled.csv',
            # '../Datasets/df_synA_test_hard_shuffled_sample.csv'
            ]

modes = ['era', 'target_5_val', 'target_10_val']   # label columns, one model each
noise_levels = ['none', 'low', 'high']             # indexed by dataset (same order as df_paths)
epoch_num = [30, 30, 30]                           # indexed by mode
# epoch_num = [50, 10, 10]
batch_sizes = [32, 128, 128]                       # indexed by dataset
# lrs = [1e-4, 1e-3, 5e-3]
learning_rate = 1e-6
n_features = 26                                    # leading feature columns fed to the LSTM

# Test metrics: one inner list per dataset, one entry per mode.
losses_arr = []
accs_arr = []

# Iterate over whatever is enabled in df_paths instead of a hard-coded count.
for i, df_path in enumerate(df_paths):
    df = pd.read_csv(df_path)
    df.sort_values(by=['era', 'day', 'row_num'], inplace=True)
    noise_level = noise_levels[i]

    loss_per_mode = []
    acc_per_mode = []

    for j, label_column in enumerate(modes):
        epochs = epoch_num[j]
        print("Noise Level:", noise_level, "Mode:", label_column)
        loader_train, loader_val, loader_test = make_data_splits_sequential(
            df, mode=label_column, batch_size=batch_sizes[i])

        # Single multi-class classifier: one log-softmax output per distinct label.
        net = LSTMClassifier(n_features, len(df[f'{label_column}'].unique()), lr=learning_rate)

        # `mode` is the run tag used in checkpoint / plot file names and in the log line.
        mode = label_column + '_lstm'
        net, losses, accs, val_losses, val_accL = Train(net, loader_train, mode, noise_level,
                                                        epochs=epochs, verbose=True, device=device,
                                                        val_ds=loader_val, plot_accs=True, plot_losses=True)
        plot_loss_acc(losses, accs, val_losses, val_accL, mode, noise_level)

        ########### Testing code
        test_loss, test_acc = Test(net, loader_test, mode, noise_level, device=device)
        print("Noise Level:", noise_level, "Mode:", mode, "Test Acc:", test_acc)
        loss_per_mode.append(test_loss)
        acc_per_mode.append(test_acc)

    losses_arr.append(loss_per_mode)
    accs_arr.append(acc_per_mode)

Noise Level: none Mode: era
Train-val-test lengths:  5376 1152 1164
Epoch  0 Loss: 2.48633e+00 Accuracy: 0.08333 Epoch Time: 1.13366
Epoch  1 Loss: 2.48627e+00 Accuracy: 0.08333 Epoch Time: 1.02789
Epoch  2 Loss: 2.48621e+00 Accuracy: 0.08333 Epoch Time: 1.11677
Epoch  3 Loss: 2.48615e+00 Accuracy: 0.08333 Epoch Time: 1.09451
Epoch  4 Loss: 2.48608e+00 Accuracy: 0.08333 Epoch Time: 1.10065
Epoch  5 Loss: 2.48604e+00 Accuracy: 0.08333 Epoch Time: 1.04029
Epoch  6 Loss: 2.48598e+00 Accuracy: 0.08333 Epoch Time: 1.04676
Epoch  7 Loss: 2.48591e+00 Accuracy: 0.08333 Epoch Time: 1.02317
Epoch  8 Loss: 2.48586e+00 Accuracy: 0.08333 Epoch Time: 0.95467
Epoch  9 Loss: 2.48580e+00 Accuracy: 0.08333 Epoch Time: 0.86529
Epoch 10 Loss: 2.48575e+00 Accuracy: 0.08333 Epoch Time: 0.89431
Epoch 11 Loss: 2.48570e+00 Accuracy: 0.08333 Epoch Time: 0.93223
Epoch 12 Loss: 2.48563e+00 Accuracy: 0.08333 Epoch Time: 0.88663
Epoch 13 Loss: 2.48558e+00 Accuracy: 0.08333 Epoch Time: 0.91788
Epoch 14 Loss: 2.48552

<Figure size 640x480 with 0 Axes>

In [11]:
# Persist the collected test losses and accuracies as pickles under `prefix`.
for dump_path, payload in ((f'{prefix}/losses_dump.pkl', losses_arr),
                           (f'{prefix}/accs_dump.pkl', accs_arr)):
    with open(dump_path, 'wb') as f:
        pkl.dump(payload, f)