In [4]:
# !gdown https://drive.google.com/file/d/1Hv4RAltBumSfOkRacoX8qrfDYfd_NDss/view?usp=drive_link --fuzzy

Downloading...
From: https://drive.google.com/uc?id=1Hv4RAltBumSfOkRacoX8qrfDYfd_NDss
To: /content/Dataset_AML_Assignment1_Part1.zip
  0% 0.00/11.2M [00:00<?, ?B/s]100% 11.2M/11.2M [00:00<00:00, 134MB/s]


In [5]:
# !unzip Dataset_AML_Assignment1_Part1.zip

Archive:  Dataset_AML_Assignment1_Part1.zip
  inflating: df_syn_train_0_0_.csv   
  inflating: df_synA_test_hard_shuffled_sample.csv  
  inflating: df_synA_train_shuffled.csv  


In [3]:
# from google.colab import drive
# drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch import optim
from torch.utils.data import Dataset, DataLoader, random_split
import torch.nn.functional as F
from IPython import display
from tqdm.notebook import tqdm
import random
import math, time, os
from matplotlib import pyplot as plt

device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
prefix = './data_dump_ensemble'


In [None]:
# NAL
# Ensemble
# Autoencoder
# SubTab
# Meta Learning - clean+dirty

In [2]:
class SinusodialDataset(Dataset):
    def __init__(self, df, mode='era'):
        """ creating label columns of eras and targets """
        self.X = df.iloc[:, :24]
        if mode == 'era':
          self.y = df['era_label']
        else:
          self.y = df[f'{mode}']

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        X = torch.tensor(self.X.iloc[idx].values, dtype=torch.float32)
        y = torch.tensor(int(self.y.iloc[idx]), dtype=torch.long)
        return X, y

In [3]:
def make_data_splits(df, mode, batch_size=32, train_perc=0.7, val_test_perc=0.5):

    def encode(v, class_values):
        return class_values.index(v)

    #adding new era_label column indexed 0, 1,...
    class_values = df['era'].unique().tolist()
    df['era_label'] = df['era'].apply(lambda x: encode(x, class_values))
    df.reset_index(drop=True, inplace=True)

    train_samples = int(len(df)*train_perc)
    val_test_samples = len(df)-train_samples

    data = SinusodialDataset(df, mode=mode)
    data_train, data_test = random_split(data, [train_samples, val_test_samples])

    val_samples = int(len(data_test)*0.5)
    test_samples = len(data_test)-val_samples
    data_val, data_test = random_split(data_test, [val_samples, test_samples])

    print("Train-val-test lengths: ", len(data_train), len(data_val), len(data_test))

    loader_train = DataLoader(data_train, batch_size=batch_size, shuffle=True)
    loader_val = DataLoader(data_val, batch_size=batch_size, shuffle=False)
    loader_test = DataLoader(data_test, batch_size=batch_size, shuffle=False)

    return loader_train, loader_val, loader_test, data

In [4]:
class MLP(nn.Module):
    def __init__(self, dims, task='classification', lr=1e-3, weight_decay=0):
        super(MLP,self).__init__()
        self.dims=dims
        self.task=task
        self.layers=nn.ModuleList()
        for i in range(len(self.dims)-2):
            self.layers.append(nn.Linear(dims[i],dims[i+1]))
            self.layers.append(nn.ReLU())
        self.layers.append(nn.Linear(dims[i+1],dims[i+2]))
        self.layers.append(nn.LogSoftmax(dim=1))
        self.optimizer = optim.Adam(self.parameters(), lr=lr, weight_decay=weight_decay)

    def forward(self,x):
        x = x.float()
        for l in self.layers:
            x = l(x)
        return x

In [5]:
def accuracy(Net, X_test, y_test, verbose=True):
    Net.eval()
    m = X_test.shape[0]
    y_pred = Net(X_test)
    predicted = torch.max(y_pred, 1)[1]
    correct = (predicted == y_test).float().sum().item()
    if verbose:
        print(correct,m)
    accuracy = correct/m
    Net.train()
    return accuracy, correct

In [6]:
def Test(net, loader_test, mode, noise_level, \
         device='cpu', Loss=nn.NLLLoss(reduction='sum')):
    net.eval()
    total_samples = 0
    correct_samples = 0
    loss = 0.0
    for (X, y) in loader_test:
        X=X.to(device)
        y=y.to(device)
        total_samples += y.shape[0]
        _, i_cor_sam = accuracy(net,X,y,verbose=False)
        correct_samples += i_cor_sam
        loss += Loss(net(X), y).cpu().detach().item()
    acc = correct_samples / total_samples
    loss /= total_samples
    return loss, acc

In [18]:
def Train(Net, data, mode, noise_level, epochs=20, lr=5e-2, Loss=nn.NLLLoss(reduction='sum'), verbose=False, device='cpu',
          val_ds=None, plot_accs=False, plot_losses=False):
    model_save_time = time.time()
    losses = []
    accs = []
    val_losses=[]
    val_accL=[]
    Net.to(device)
    for e in range(epochs):
        Net.train()
        step=0
        tot_loss=0.0
        start_time = time.time()
        correct_samples = 0
        total_samples = 0
        for (X,y) in data:
            X=X.to(device)
            y=y.to(device)
            total_samples += y.shape[0]
            y_pred = Net(X)
            loss = Loss(y_pred,y)
            Net.optimizer.zero_grad()
            loss.backward()
            Net.optimizer.step()
            step+=1
            tot_loss+=loss
            if verbose:
                _, i_cor_sam = accuracy(Net,X,y,verbose=False)
                correct_samples += i_cor_sam
        end_time = time.time()
        t = end_time-start_time
        l = tot_loss.item()/total_samples
        losses += [l]
        a = correct_samples/total_samples
        accs += [a]

        if verbose:
            print('Epoch %2d Loss: %2.5e Accuracy: %2.5f Epoch Time: %2.5f' %(e,l,a,t))

        val_loss, val_acc = Test(Net, val_ds, mode, noise_level, device)
        val_losses.append(val_loss)
        val_accL.append(val_acc)

        torch.save(Net.state_dict(), f'{prefix}/net_{noise_level}_{mode}_{str(model_save_time)}.pth')

    return Net, losses, accs, val_losses, val_accL

In [9]:
def plot_loss_acc(losses, accs, val_losses, val_accs, mode, noise_level):

    plt.plot(np.array(accs),color='red', label='Train accuracy')
    plt.plot(np.array(val_accs),color='blue', label='Val accuracy')
    plt.legend()
    plt.savefig(f'{prefix}/acc_{mode}_{noise_level}.png')
    plt.clf()

    plt.plot(np.array(losses),color='red', label='Train loss')
    plt.plot(np.array(val_losses),color='blue', label='Val loss')
    plt.legend()
    plt.savefig(f'{prefix}/loss_{mode}_{noise_level}.png')
    plt.clf()
    return

In [16]:
def checkModel(net, mode, noise_level, loader_train, loader_val, loader_test):
  net, losses, accs, val_losses, val_accL = Train(net, loader_train, mode, noise_level, \
                              epochs=15, verbose=True, device=device, val_ds=loader_val, \
                              plot_accs=True, plot_losses=True)

  plot_loss_acc(losses, accs, val_losses, val_accL, mode, noise_level)
  #Testing code
  test_loss, test_acc = Test(net, loader_test, mode, noise_level, device=device)
  return net, accs[-1]

In [13]:
def TestEnsemble(nets, loader_test, mode, noise_level, \
         device='cpu', Loss=nn.NLLLoss(reduction='sum')):
    total_samples = 0
    correct_samples = 0

    for (X, y) in loader_test:
        X=X.to(device)
        y=y.to(device)
        sample_size = y.shape[0]
        total_samples += sample_size

        y_preds = []
        all_classes = 0

        for net in nets:
          net.eval()
          __y_pred = net(X)
          all_classes = __y_pred.shape[-1]
          predicted = torch.max(__y_pred, 1)[1]
          y_preds.append(predicted.cpu().detach().numpy())


        predicted = []

        for i in range(sample_size):
          voting_array = [0] * all_classes
          for model_num in range(len(nets)):
            voting_array[y_preds[model_num][i]] += 1
          max_val = -1
          max_ind = -1
          for ind in range(len(voting_array)):
            if voting_array[ind] > max_val:
              max_val = voting_array[ind]
              max_ind = ind

          predicted.append(max_ind)

        predicted = torch.tensor(predicted).to(device)
        i_cor_sam = (predicted == y).float().sum().item()
        correct_samples += i_cor_sam

    acc = correct_samples / total_samples
    return acc

In [19]:
df_paths = ['./df_syn_train_0_0_.csv',
            './df_synA_train_shuffled.csv',
            './df_synA_test_hard_shuffled_sample.csv']

noise_levels = ['none', 'low', 'high']
batch_sizes = [32, 128, 128]

losses_arr = []
accs_arr = []

for i in range(0, 3):
  df = pd.read_csv(df_paths[i])

  modes = ['era', 'target_5_val', 'target_10_val']

  loss_per_mode = []
  acc_per_mode = []

  for mode in modes:
    noise_level = noise_levels[i]
    print("Noise Level:", noise_level, "| Mode:", mode)
    loader_train, loader_val, loader_test, data = make_data_splits(df, mode=mode, \
                                                                    batch_size=batch_sizes[i])

    nets = [MLP(dims=[data.X.shape[1], 64, 32, len(data.y.unique())]),
            MLP(dims=[data.X.shape[1], 128, 64, 32, 16, len(data.y.unique())]),
            MLP(dims=[data.X.shape[1], 32, len(data.y.unique())]),
            MLP(dims=[data.X.shape[1], 64, 128, 64, 24, len(data.y.unique())]),
            MLP(dims=[data.X.shape[1], 32, 64, 128, len(data.y.unique())])
            ]

    mode += '_ensemble'
    check_accs = []
    counter = 0
    for net in nets:
      net = net.to(device)
      net, _acc = checkModel(net, mode, noise_level, loader_train, loader_val, loader_test)
      check_accs.append(_acc)
      # print(f'Model {counter} finished - {_acc}%')
      counter += 1

    # for j in range(len(nets)):
      # print(print(f'Model {j} accuracy - {check_accs[j]}%'))

    ensemble_acc = TestEnsemble(nets, loader_test, mode, noise_level, device=device)
    print(f'"Noise Level: {noise_level} | Mode: {mode} | Ensemble model acc: {ensemble_acc}')

    # loss_per_mode.append(test_loss)
    acc_per_mode.append(ensemble_acc)

  # losses_arr.append(loss_per_mode)
  accs_arr.append(acc_per_mode)

Noise Level: none | Mode: era
Train-val-test lengths:  5460 1170 1170
Epoch  0 Loss: 2.25870e+00 Accuracy: 0.27491 Epoch Time: 1.53230
Epoch  1 Loss: 1.43765e+00 Accuracy: 0.51758 Epoch Time: 1.32880
Epoch  2 Loss: 1.01301e+00 Accuracy: 0.67601 Epoch Time: 1.57535
Epoch  3 Loss: 8.29714e-01 Accuracy: 0.73352 Epoch Time: 1.19665
Epoch  4 Loss: 7.32480e-01 Accuracy: 0.76099 Epoch Time: 1.48896
Epoch  5 Loss: 6.67785e-01 Accuracy: 0.77930 Epoch Time: 1.28448
Epoch  6 Loss: 6.25810e-01 Accuracy: 0.78846 Epoch Time: 1.41319
Epoch  7 Loss: 6.01186e-01 Accuracy: 0.79267 Epoch Time: 1.45912
Epoch  8 Loss: 5.67833e-01 Accuracy: 0.80421 Epoch Time: 1.44920
Epoch  9 Loss: 5.45934e-01 Accuracy: 0.81154 Epoch Time: 1.36424
Epoch 10 Loss: 5.28950e-01 Accuracy: 0.81593 Epoch Time: 1.32280
Epoch 11 Loss: 5.10320e-01 Accuracy: 0.82179 Epoch Time: 1.43477
Epoch 12 Loss: 4.92391e-01 Accuracy: 0.82326 Epoch Time: 1.47191
Epoch 13 Loss: 4.73109e-01 Accuracy: 0.83297 Epoch Time: 1.21850
Epoch 14 Loss: 4.612

<Figure size 640x480 with 0 Axes>