Author:Anthony Zalev
Goal: Illustrate catastrophic forgetting using the COVID dataset.

In [None]:
import random
import sqlite3 as sq

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pandasql as ps
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
# NOTE(review): load_boston was removed in scikit-learn 1.2, so this import
# fails on current releases; it appears unused in this notebook.
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler
from torch.utils.data import DataLoader


Set Seed and Set device functions obtained from: https://deeplearning.neuromatch.io/tutorials/W3D4_ContinualLearning/student/W3D4_Tutorial1.html#section-1-1-a-brief-example-of-catastrophic-forgetting


In [None]:
def set_seed(seed=None, seed_torch=True):
    """
    Function that controls randomness. NumPy and random modules must be imported.

    Seeds Python's `random`, NumPy's legacy global generator and (optionally)
    PyTorch, then prints the seed that was used.

    Args:
      seed : Integer
        A non-negative integer that defines the random state. Default is `None`,
        in which case a seed is drawn at random from [0, 2**32).
      seed_torch : Boolean
        If `True` sets the random seed for pytorch tensors, so pytorch module
        must be imported. Default is `True`.

    Returns:
      Nothing.
    """
    if seed is None:
        # np.random.choice returns a NumPy integer. Python >= 3.11 rejects
        # anything but None/int/float/str/bytes/bytearray in random.seed(),
        # so convert to a built-in int before using it.
        seed = int(np.random.choice(2 ** 32))
    random.seed(seed)
    np.random.seed(seed)
    if seed_torch:
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        torch.cuda.manual_seed(seed)
        # Trade cuDNN autotuning for run-to-run reproducibility.
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    print(f'Random seed {seed} has been set.')


# In case that `DataLoader` is used
def seed_worker(worker_id):
    """
    Re-seed NumPy and `random` from PyTorch's initial seed.

    Intended as a `DataLoader` worker-init hook, see
    https://pytorch.org/docs/stable/data.html#data-loading-randomness

    Args:
      worker_id: integer
        ID of the subprocess being seeded (not used by the body).

    Returns:
      Nothing
    """
    # Fold torch's seed into the 32-bit range before reusing it.
    derived_seed = torch.initial_seed() % (1 << 32)
    for reseed in (np.random.seed, random.seed):
        reseed(derived_seed)

In [None]:
def set_device():
    """
    Pick the compute device: CUDA if available, CPU otherwise.

    Prints a short hint on how to toggle the GPU in the notebook runtime.

    Args:
      None

    Returns:
      String, either "cuda" or "cpu".
    """
    if torch.cuda.is_available():
        device = "cuda"
        print("GPU is enabled in this notebook. \n"
              "If you want to disable it, in the menu under `Runtime` -> \n"
              "`Hardware accelerator.` and select `None` from the dropdown menu")
    else:
        device = "cpu"
        print("GPU is not enabled in this notebook. \n"
              "If you want to enable it, in the menu under `Runtime` -> \n"
              "`Hardware accelerator.` and select `GPU` from the dropdown menu")

    return device

In [None]:
# Fix every random generator and choose the compute device once, up front.
SEED = 2021
set_seed(SEED)
DEVICE = set_device()

Domain incremental learning:

This neural net first has to learn the cases/death ratio over time for the period before vaccines were available, and then for the period after they came out.

In [None]:
conn = sq.connect('datasets/{}.sqlite'.format("master")) # open the "master" SQLite database (sqlite3 creates the file if it is missing)

The following model is based on:

https://github.com/christianversloot/machine-learning-articles/blob/main/how-to-create-a-neural-network-for-regression-with-pytorch.md

In [None]:
# Define the neural net.
class MLP(nn.Module):
    """
    Multilayer perceptron for regression: Linear -> ReLU -> Linear -> ReLU -> Linear.

    The original stacked five consecutive ReLUs after the first Linear layer.
    ReLU is idempotent (relu(relu(x)) == relu(x)), so they are collapsed into
    one; outputs and weight initialisation order are unchanged. Note that the
    positions inside `self.layers` (and therefore the state_dict keys of the
    second and third Linear layers) shift as a result.

    Args:
      in_features: number of input columns (default 59).
      hidden_1: width of the first hidden layer (default 512).
      hidden_2: width of the second hidden layer (default 128).
      out_features: number of regression outputs (default 1).
    """
    def __init__(self, in_features=59, hidden_1=512, hidden_2=128, out_features=1):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(in_features, hidden_1),  # input -> hidden 1
            nn.ReLU(),
            nn.Linear(hidden_1, hidden_2),     # hidden 1 -> hidden 2
            nn.ReLU(),
            nn.Linear(hidden_2, out_features)  # hidden 2 -> output
        )

    def forward(self, x):
        '''
          Forward pass
        '''
        return self.layers(x)

In [None]:
# Load every weekly record from the database; the table is not filtered by
# year here (the per-period split is done by later queries on dataset_df).
dataset_df = pd.read_sql_query("SELECT * FROM combined_weekly_encoded_scaled", conn)
# Keep an independent copy of the raw year_week labels. It is taken from the
# frame just loaded instead of issuing a second query for the same column.
year_week = dataset_df[["year_week"]].copy()

In [None]:
# Sanity check on the first label: keep characters 0-3 and everything after
# position 4, i.e. drop the single character at index 4.
"".join([year_week["year_week"][0][:4], year_week["year_week"][0][5:]])

In [None]:
# Transform the year_week label to an int for range selection.
# Stripping every non-digit (rather than slicing out position 4) gives the same
# digits for labels shaped like 'YYYY<sep>WW' and makes the cell idempotent:
# re-running it on already-converted values no longer drops a digit.
# NOTE(review): assumes the only non-digit in each label is the separator at
# index 4 - confirm against the table contents.
dataset_df["year_week"] = (
    dataset_df["year_week"]
    .astype(str)
    .str.replace(r"\D", "", regex=True)
    .astype(int)
)

In [None]:
# Preview the first ten rows after the year_week conversion.
dataset_df.head(n=10)

In [None]:
# First period: rows whose year_week is below 202056 (everything up to the end of 2020).
query = """ SELECT * FROM dataset_df
        WHERE year_week < 202056
"""
# pandasql resolves the table name `dataset_df` from the namespace passed as env.
dataset_2020_df = ps.sqldf(query, env=locals())
dataset_2020_df.head(n=10)

In [None]:
# Second period: rows whose year_week is above 202100 (2021 onwards).
query = """ SELECT * FROM dataset_df
        WHERE year_week > 202100
"""
# pandasql resolves the table name `dataset_df` from the namespace passed as env.
dataset_rest_df = ps.sqldf(query, env=locals())
dataset_rest_df.head(n=10)

In [None]:
# Features / target for the first period. year_week and fips are dropped from
# the inputs, avg_deaths is the regression target. train_test_split is imported
# with the other dependencies at the top of the notebook.
X = dataset_2020_df.drop(columns=['year_week', 'fips', 'avg_deaths']).to_numpy()
Y = dataset_2020_df[['avg_deaths']].to_numpy()
# Hold out 33% for testing; the fixed random_state keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=15)

In [None]:
# Features / target for the second period, built the same way as for the first:
# year_week and fips are excluded from the inputs, avg_deaths is the target.
X_2 = dataset_rest_df.drop(columns=['year_week', 'fips', 'avg_deaths']).to_numpy()
Y_2 = dataset_rest_df.loc[:, ['avg_deaths']].to_numpy()
x_train_2, x_test_2, y_train_2, y_test_2 = train_test_split(
    X_2, Y_2, test_size=0.33, random_state=15
)

In [None]:
# Module-level loss used by train() and test() (they read this global at call time).
# NOTE(review): both training cells rebind loss_function to nn.MSELoss() before
# calling train()/test(), so this L1 loss is never the one in effect - confirm
# which loss is intended.
loss_function = nn.L1Loss()

In [None]:


def train(model, x_train, t_train, optimizer, epoch, device, loss_fn=None, batch_size=256):
    """
    Run one pass over the training data in consecutive mini-batches.

    Fixes over the previous version: regression targets are kept as floats
    (they used to be truncated through `.long()`), the final sample is no
    longer skipped when the data length is one more than a multiple of the
    batch size, and tensors are created on `device` rather than on whatever
    `torch.cuda.is_available()` reports.

    Args:
      model: torch.nn.Module
        Network to train; it must already live on `device`.
      x_train: np.ndarray
        Training inputs, one row per sample.
      t_train: np.ndarray
        Regression targets corresponding to the training data.
      optimizer: torch.optim type
        Optimizer stepping the parameters of `model`.
      epoch: int
        Current epoch number (kept for interface compatibility; unused).
      device: string
        Device on which the batches are created, e.g. "cuda" or "cpu".
      loss_fn: callable or None
        Loss taking (output, target). When `None`, the module-level
        `loss_function` is looked up at call time.
      batch_size: int
        Number of samples per mini-batch. Default is 256.

    Returns:
      float, the loss of the last mini-batch.

    Raises:
      ValueError: if `t_train` is empty.
    """
    criterion = loss_function if loss_fn is None else loss_fn
    n_samples = len(t_train)
    if n_samples == 0:
        raise ValueError("train() needs at least one training sample")

    model.train()

    for start in range(0, n_samples, batch_size):
        end = start + batch_size
        # float32 matches the default dtype of the model parameters.
        x = torch.as_tensor(x_train[start:end], dtype=torch.float32, device=device)
        y = torch.as_tensor(t_train[start:end], dtype=torch.float32, device=device)

        optimizer.zero_grad()

        output = model(x)
        loss = criterion(output, y)
        loss.backward()
        optimizer.step()

    return loss.item()

def test(model, x_test, t_test,device):
    """
    Test function.

    Args:
      model: Net() type
        Instance of the multilayer CNN
      x_test: np.ndarray
        Test data
      t_test: np.ndarray
        Labels corresponding to the test data
      device: string
        CUDA/GPU if available, CPU otherwise

    Returns:
      Nothing
    """
    model.eval()
    correct, test_loss = 0, 0
    target_mean = torch.mean(torch.from_numpy(t_test))
    y = torch.from_numpy(t_test)
    y = y.to(device)
    x = torch.from_numpy(x_test)
    x = x.type(torch.cuda.FloatTensor)
    x = x.to(device)
    output = model(x)
    #print(target_mean)
    #print(output)

    ss_tot = torch.sum((output - target_mean) ** 2)
    ss_res =  torch.sum((y - output) ** 2)
    for start in range(0, len(t_test)-1, 256):
        end = start + 256
        with torch.no_grad():
            x = torch.from_numpy(x_test[start:end])
            if torch.cuda.is_available():
                x = x.type(torch.cuda.FloatTensor)
            else:
                x = x.type(torch.FloatTensor)
            y = torch.from_numpy(t_test[start:end]).long()
            x, y = x.to(device), y.to(device)
            output = model(x)
            test_loss += loss_function(output, y).item()  # Sum up batch loss
            pred = output.max(1, keepdim=True)[1]  # Get the index of the max logit



    test_loss /= len(t_test)
    r2 = 1 - ss_res/ss_tot
    print('Test set: Average loss: {:.4f}, R^2: {:.4f}\n'.format(test_loss, r2))
    return r2, test_loss

In [None]:
# Release cached, currently unused GPU memory before the first training run.
torch.cuda.empty_cache()

In [None]:
  # Initialize the MLP
mlp = MLP().to(DEVICE)
# Define the loss function and optimizer.
# train() and test() read the module-level `loss_function`, so this rebinding
# is what they use from here on.
loss_function = nn.MSELoss()
optimizer = torch.optim.Adam(mlp.parameters(), lr=.01)
nEpochs = 50
# Per-epoch test losses as [loss, epoch, series label] rows for plotting.
loss_origial = []
loss_permutated = []
# Placeholders for the R^2 of the last epoch. The second one used to be
# spelled `permutatedr2`, which never matched the name assigned in the loop.
finalr2 = 0
permutated_r2 = 0
for epoch in range(1, nEpochs+1):
    # Train on the first period only, then evaluate on both periods.
    train(mlp, x_train, y_train, optimizer, epoch, device=DEVICE)
    finalr2, loss_1 = test(mlp, x_test, y_test, device=DEVICE)
    permutated_r2 , loss_2 = test(mlp, x_test_2, y_test_2, device = DEVICE)
    loss_origial.append([loss_1, epoch, 'original'])
    loss_permutated.append([loss_2, epoch, 'next_series'])


In [None]:
# Last-epoch R^2 of the first model on the first-period and second-period test sets.
print(*(score.item() for score in (finalr2, permutated_r2)))

In [None]:
# Combine both loss histories in `loss_origial` for plotting.
# The list is rebuilt from the rows tagged 'original' plus the second series
# instead of extended in place, so re-running this cell does not append the
# second series again.
loss_origial = [row for row in loss_origial if row[2] == 'original'] + loss_permutated
# Show only a short preview rather than every row.
loss_origial[:5]

In [None]:
# Long-format frame: one row per (epoch, series) with its test loss.
lineplot_df = pd.DataFrame(
    data=loss_origial,
    columns=['loss', 'epoch', 'timeseries'],
)
lineplot_df.head(n=10)

In [None]:
# Test-loss curves of the first model, one line per test period.
plt.figure(figsize=(16, 9))
plot = sns.lineplot(data=lineplot_df, x='epoch', y='loss', hue='timeseries')
plot.set_title("Loss Convergence For Test Sets From During and Subsequent Time Series")
# Place the legend outside the axes so it does not cover the curves.
plot.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0)
plt.savefig('MLP_test_set_convergence.png', dpi=300, transparent=True, bbox_inches='tight')
plt.show()

In [None]:
# Release cached, currently unused GPU memory before the second training run.
torch.cuda.empty_cache()

In [None]:
# Train a fresh network on the second period only.
mlp2 = MLP().to(DEVICE)
# Define the loss function and optimizer
loss_function = nn.MSELoss()
# NOTE(review): the first run (mlp) uses lr=.01 while this one uses lr=.001 -
# confirm the difference is intentional, since the two runs are compared.
optimizer = torch.optim.Adam(mlp2.parameters(), lr=.001)
nEpochs = 50

finalr2_train2 = 0
permutated_r2_train2 = 0
for epoch in range(1, nEpochs+1):
    train(mlp2, x_train_2, y_train_2, optimizer, epoch, device=DEVICE)
    # test() returns (r2, loss). Keep only the R^2 tensor: storing the whole
    # tuple made the later `.item()` calls on these names fail.
    finalr2_train2 = test(mlp2, x_test_2, y_test_2, device=DEVICE)[0]
    permutated_r2_train2 = test(mlp2, x_test, y_test, device = DEVICE)[0]

In [None]:
# Show the last-epoch evaluation results of mlp2 on the second-period and first-period test sets.
print(finalr2_train2, permutated_r2_train2)


In [None]:
# R^2 per test period for the model trained on the second period (2021-2022).
df = pd.DataFrame(
    {
        'testYears': ['2020', '2021-2022'],
        'adjustedr2': [permutated_r2_train2.item(), finalr2_train2.item()],
        "trainingData": ['2021-2022', '2021-2022'],
    }
)

In [None]:
# R^2 per test period for the model trained on the first period (2020).
df_2 = pd.DataFrame({
    'testYears': ['2020', '2021-2022'],
    'adjustedr2': [finalr2.item(), permutated_r2.item()],
    "trainingData": ['2020', '2020'],
})
# Plot the frame built in this cell; the call used to pass `df`, the frame
# for the other model.
sns.barplot(x="testYears", y="adjustedr2", data=df_2)

In [None]:
# Stack the results of both models into one frame for the grouped bar plot.
df_3 = pd.concat(objs=[df, df_2])
df_3.head(n=10)

In [None]:
# Period labels used to fix the left-to-right order of the bar groups.
ordering = pd.DataFrame({'testYears': ['2020', '2021-2022']})

# Grouped bars: one group per training period, one bar per test period.
plt.figure(figsize=(16, 9))
plot = sns.barplot(
    data=df_3,
    x="trainingData",
    y="adjustedr2",
    hue='testYears',
    order=ordering['testYears'],
)
plot.set_xlabel("Time Range of Training Data", fontsize=15)
plot.set_ylabel("R^2", fontsize=15)
# Place the legend outside the axes so it does not cover the bars.
plot.legend(title="Test Periods", bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0)
plot.set_title("R^2 for Test Data trained on Different Time Periods")
plt.savefig('catestrophic_forgetting.png', dpi=300, transparent=True, bbox_inches='tight')

## EWC MODEL