<a href="https://colab.research.google.com/github/parimalakettymuthu/MachineLearning-Projects/blob/main/stackExchange_NN_Multilabelclassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Enable IPython's autoreload extension in mode 2 so edits to imported helper
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import sys
if 'google.colab' in str(get_ipython()):
  from google.colab import drive
  drive.mount('/content/drive')

  !pip install torchtext --upgrade --q
  !pip install torchmetrics --q
  !pip install -quiet torch-lr-finder --q
  !pip install wandb --q --upgrade

  basepath = '/content/drive/My Drive/NLP' 
  sys.path.append('/content/drive/My Drive/NLP/custom-functions')

In [None]:
#Importing the required libraries 
import torch
import torch.nn as nn
import torch.nn.functional as F

import torchmetrics
from torchmetrics.classification import MultilabelF1Score, MultilabelHammingDistance
from torchmetrics.functional.classification import multilabel_f1_score, multilabel_hamming_distance

import joblib
#import swifter
import ast
import wandb

from types import SimpleNamespace
from functools import partial
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer as mlb


In [None]:
# Google Drive folders used for reading inputs and saving model-related files.
# Data and saved models share the same MultiLabel_Classification folder.
assignment_root = Path(basepath) / 'assignment7'
embeddings_folder = assignment_root / 'WordEmbeddings'
data_folder = assignment_root / 'MultiLabel_Classification'
model_saving_folder = assignment_root / 'MultiLabel_Classification'

In [None]:
# Load the cleaned multi-label dataset that was persisted with joblib.
data_cleaned = data_folder.joinpath("df_multilabel_hw_cleaned.joblib")
stackExchange_dataset = joblib.load(data_cleaned)

In [None]:
# Features: the cleaned text column as a NumPy array.
X = stackExchange_dataset['cleaned_text'].values
# Raw Tag_Number column. The labels actually used for the split are built
# later from the parsed Tag_Number_list column, not from this array.
y = stackExchange_dataset['Tag_Number'].values

In [None]:
# Install swifter (needed by the `.swifter.apply` call in the next cell); -qq keeps pip output minimal.
!pip install swifter -qq

In [None]:
import swifter
import ast
# Parse each Tag_Number entry from its literal text form into a Python object
# and store the result in a new Tag_Number_list column.
stackExchange_dataset['Tag_Number_list'] = (
    stackExchange_dataset['Tag_Number'].swifter.apply(ast.literal_eval)
)

In [None]:
# Parsed tag collections per row; this is the input to the multi-label binarizer below.
y_final = stackExchange_dataset['Tag_Number_list'].values

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer as mlb
# Keep the fitted binarizer instead of discarding it: its `classes_` attribute
# records which tag each output column stands for, and `inverse_transform`
# is needed to turn predicted indicator rows back into tags.
tag_binarizer = mlb()
# Multi-hot indicator matrix with one column per distinct tag.
y_stackExchange_encoding = tag_binarizer.fit_transform(y_final)

In [None]:
from sklearn.model_selection import train_test_split
# 60/20/20 split: hold out 40% of the rows first, then cut that hold-out in
# half to obtain the validation and test sets. Both splits use the same seed.
(X_sExchange_train, X_valid_test,
 y_sExchange_train, y_valid_test) = train_test_split(
    X, y_stackExchange_encoding, test_size=0.4, random_state=42
)
(X_sExchange_valid, X_sExchange_test,
 y_sExchange_valid, y_sExchange_test) = train_test_split(
    X_valid_test, y_valid_test, test_size=0.5, random_state=42
)

In [None]:
from gensim.models import KeyedVectors
# Load the StackExchange CBOW vectors saved under the embeddings folder.
# NOTE(review): KeyedVectors.load is used here, which presumably expects a file
# written by gensim's own save() despite the .bin suffix — confirm.
pretrained_sExchange_file = str(embeddings_folder.joinpath("model_stackExchange_CBOW.bin"))
sExchange_vectors = KeyedVectors.load(pretrained_sExchange_file)

In [None]:
# Install torch_lr_finder quietly; the Colab setup cell near the top also installs torch-lr-finder.
!pip install torch_lr_finder -qq

In [None]:
from torch_lr_finder import LRFinder
from Trainer_v4 import Trainer
from data_preparation_HW7 import * 

In [None]:
from ff_sequential_model import MLPCustom
#from ff_sequential_model_v1 import MLPCustom

In [None]:
# Build the training dataset, draw a 500-example random subset of it, and
# build the vocabulary from the full training dataset.
import random
# Seed first so the sampled indices below are reproducible
# (assumes Trainer.set_seed also seeds Python's `random` — TODO confirm).
Trainer.set_seed()
sExchange_trainset = CustomDataset(X_sExchange_train, y_sExchange_train)
# 500 distinct indices sampled without replacement from the training set.
se_train_subset_indices = random.sample(range(0, len(sExchange_trainset)), 500)
# NOTE(review): se_train_subset is not referenced again in this notebook; the
# data loader further down is built from the full sExchange_trainset.
se_train_subset = torch.utils.data.Subset(sExchange_trainset, se_train_subset_indices)
# Vocabulary comes from the full training set, with min_freq=2.
se_vocab = get_vocab(sExchange_trainset, min_freq=2)

In [None]:
# Quick inspection: the vocabulary object's type and its entry for the token 'debug'.
type(se_vocab), se_vocab['debug']

In [None]:
# The same vectors are already loaded by an earlier cell. Reading the file a
# second time only costs time and memory, so load from disk here only when the
# earlier cell was skipped and `sExchange_vectors` does not exist yet.
pretrained_sExchange_file = str(embeddings_folder/ "model_stackExchange_CBOW.bin")
if 'sExchange_vectors' not in globals():
  sExchange_vectors = KeyedVectors.load(pretrained_sExchange_file)

In [None]:
# Build an embedding weight matrix for the vocabulary from the pretrained
# vectors; the helper also reports how many words were found / not found.
pretrained_weights, words_found, words_not_found = get_pretrained_weights(
    vocab = se_vocab,
    pretrained_vectors = sExchange_vectors,
    embedding_dim = 300, # 300-d, same value as EMBED_DIM in the hyperparameters cell (the old "updated to 150" note was stale)
)

In [None]:
# Shape of the pretrained weight matrix returned by get_pretrained_weights.
pretrained_weights.shape

In [None]:
# Coverage check: type of the weight object plus the found / not-found values reported by the helper.
type(pretrained_weights), words_found, words_not_found

Initializing all required hyperparameters

In [None]:
# Baseline configuration shared by all runs; individual runs override fields
# on the resulting namespace in a later cell.

# for model
_model_cfg = dict(
    EMBED_DIM=300,
    VOCAB_SIZE=len(se_vocab),
    OUTPUT_DIM=10,
    HIDDEN_SIZES_LIST=[],
    DPROB_LIST=[],
    NON_LINEARITY=nn.SELU(),
    BATCH_NORM=False,
)

# for optimizer
_optimizer_cfg = dict(
    OPTIMIZER="SGD",
    MOMENTUM=0,
    NESTEROV=False,
)

# for training
_training_cfg = dict(
    INITIALIZATION="default",
    EPOCHS=20,
    BATCH_SIZE=32,
    LEARNING_RATE=0.001,
    DATASET="STACKEXCHANGE",
    ARCHITECTURE="emdbag_linear",
)

# Schedulers (note: the string "None", not the None object)
_scheduler_cfg = dict(
    SCHEDULER="None",
)

hyperparameters = SimpleNamespace(
    **_model_cfg,
    **_optimizer_cfg,
    **_training_cfg,
    **_scheduler_cfg,
)

Specifying run name & folder

In [None]:
# Run bookkeeping: project/run names passed to wandb, the logging frequency,
# and a per-run folder created under the model saving folder.
project_name, run_name = "StackExchange NN architecture", "Task3a exp2"
log_frequency = 5
run_folder = model_saving_folder.joinpath(run_name)
run_folder.mkdir(exist_ok=True)

Modifying Hyperparameters for Run

In [None]:
# run 1 - based on default initialization
# Notes: Final Learning rate was set to 1

# run 2 - a single hidden layer of size 200 with a matching DPROB_LIST entry of 0
vars(hyperparameters).update(
    HIDDEN_SIZES_LIST=[200],
    DPROB_LIST=[0],
    LEARNING_RATE=0.001,  # reset initial learning rate
)
# Notes : Final Learning rate was set to 1

# run 3
#hyperparameters.OPTIMIZER = "Adam"
#hyperparameters.LEARNING_RATE = 0.001  # reset initial learning rate
# Notes : Final Learning rate was set to 0.02

# run 4
#hyperparameters.HIDDEN_SIZES_LIST = [200] + [200]
#hyperparameters.DPROB_LIST = [0] + [0]
#hyperparameters.LEARNING_RATE = 0.001  # reset initial learning rate
# Notes : Final Learning rate was set to 0.02

# run 5
#hyperparameters.INITIALIZATION = 'kaiming'
#hyperparameters.NON_LINEARITY = nn.ReLU()
#hyperparameters.LEARNING_RATE = 0.001  # reset initial learning rate
# Notes : Final Learning rate was set to 0.02

# run 6
#hyperparameters.HIDDEN_SIZES_LIST = [400] 
#hyperparameters.DPROB_LIST = [0] 
#hyperparameters.LEARNING_RATE = 0.001  # reset initial learning rate
# # Notes : Final Learning rate was set to 0.02

Configuring the trainer 

In [None]:
# Seed via the Trainer helper before the loaders and model are created below.
Trainer.set_seed()

# Bind the vocabulary into collate_batch so the result can be passed to the
# loaders as their collate_fn.
se_collate_fn = partial(collate_batch, vocab=se_vocab)

In [None]:
# Confirm the collate function is a functools.partial wrapper.
type(se_collate_fn)

In [None]:
# Data loader over the full training set; no validation set is supplied, so
# the second value returned by get_loaders is ignored.
_loader_options = dict(
    batch_size_=hyperparameters.BATCH_SIZE,
    collate_fn=se_collate_fn,
)
se_train_loader, _ = get_loaders(trainset=sExchange_trainset, validset=None, **_loader_options)

# Loss operating on raw logits against the multi-hot label matrix.
se_loss_function = nn.BCEWithLogitsLoss()

# Positional arguments for MLPCustom, in the order its constructor is called.
_model_args = (
    hyperparameters.EMBED_DIM,
    hyperparameters.VOCAB_SIZE,
    hyperparameters.HIDDEN_SIZES_LIST,
    hyperparameters.DPROB_LIST,
    hyperparameters.OUTPUT_DIM,
    hyperparameters.NON_LINEARITY,
    hyperparameters.BATCH_NORM,
)
stackExchange_model = MLPCustom(*_model_args)
# Pretrained-embedding arguments are currently disabled:
#   use_pre_trained_weights = hyperparameters.USE_PRE_TRAINED_WEIGHTS,
#   pretrained_weights = pretrained_weights,
#   freeze_pretrained = hyperparameters.FREEZE_PRETRAINED
def init_weights(m):
  """Apply Kaiming-normal initialization to a single module, in place.

  Intended to be passed to `model.apply(...)`, which calls it once per
  submodule. Only `nn.Linear` layers are touched; every other module type is
  left unchanged.

  Args:
    m: the module currently being visited.
  """
  # isinstance check against the class itself; comparing type(m) with the
  # string "nn.Linear" is never true, so the initialization was never applied.
  if isinstance(m, nn.Linear):
    torch.nn.init.kaiming_normal_(m.weight)
    # The bias belongs to the layer (m.bias) and is absent when bias=False.
    if m.bias is not None:
      torch.nn.init.zeros_(m.bias)

#Apply initialization to all modules
# Only the 'kaiming' setting triggers the custom initializer; any other value
# (e.g. "default") leaves the layers as MLPCustom constructed them.
if hyperparameters.INITIALIZATION == 'kaiming':
  stackExchange_model.apply(init_weights)


#Defining optimizer
def get_optimizer():
  """Create a fresh optimizer for `stackExchange_model` from `hyperparameters`.

  Reads the module-level `hyperparameters` at call time, so calling it again
  after changing e.g. LEARNING_RATE yields an optimizer with the new value.

  Returns:
    torch.optim.SGD when OPTIMIZER is 'SGD' (using MOMENTUM and NESTEROV);
    torch.optim.Adam for any other OPTIMIZER value.
  """
  model_params = stackExchange_model.parameters()
  learning_rate = hyperparameters.LEARNING_RATE

  # Anything that is not exactly 'SGD' falls through to Adam.
  if hyperparameters.OPTIMIZER != 'SGD':
    return torch.optim.Adam(model_params, lr=learning_rate)

  return torch.optim.SGD(
      model_params,
      lr=learning_rate,
      momentum=hyperparameters.MOMENTUM,
      nesterov=hyperparameters.NESTEROV,
  )

# Optimizer for the current hyperparameters, plus the compute device:
# the first CUDA GPU when one is available, otherwise the CPU.
sExchange_optimizer = get_optimizer()
_device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
sExchange_device = torch.device(_device_name)

In [None]:
# Display the optimizer configuration.
sExchange_optimizer

In [None]:
# Display the model architecture.
stackExchange_model

Trainer

In [None]:
# Assemble the trainer from the model, optimizer, loss and device, then
# register the training loader with it.
_trainer_parts = dict(
    model=stackExchange_model,
    optimizer=sExchange_optimizer,
    criterion=se_loss_function,
    device=sExchange_device,
)
sEXchange_trainer = Trainer(**_trainer_parts)
sEXchange_trainer.set_loaders(se_train_loader)

Learning Rate Finder

In [None]:
# Check the type of the object returned by get_loaders.
type(se_train_loader)

In [None]:
# Run the trainer's learning-rate range test on the training loader.
# (Earlier attempt, kept for reference: sEXchange_trainer.lr_finder_range_test(sEXchange_trainer.se_train_loader))
sEXchange_trainer.lr_finder_range_test(se_train_loader)

In [None]:
# Learning rate used for the actual training run; get_optimizer() reads this
# value when the optimizer is rebuilt in the next section.
# NOTE(review): presumably chosen from the range test above; the "run 2" note
# in the run-modification cell records a final learning rate of 1 — confirm
# which value belongs to this run.
hyperparameters.LEARNING_RATE = 0.02

Set trainer based on hyperparameters

In [None]:
#Resetting optimizer
# Rebuild the optimizer so it uses the current hyperparameters.LEARNING_RATE.
sEXchange_trainer.set_optimizer(get_optimizer())

#Setting metric
# The label count is taken from the model's configured output width rather
# than a repeated literal, so the metric cannot drift from the model.
se_train_metric = MultilabelHammingDistance(num_labels=hyperparameters.OUTPUT_DIM)
# Move the metric to the same device as the model before handing it over.
sEXchange_trainer.set_metric(se_train_metric.to(sExchange_device))


In [None]:
# Configure Weights & Biases logging on the trainer, passing the full
# hyperparameter namespace as the run config and enabling batch-level logging.
_wandb_settings = dict(
    project_name=project_name,
    run_name=run_name,
    config=hyperparameters,
    log_batch=True,
    log_frequency=log_frequency,
)
sEXchange_trainer.set_wandb(**_wandb_settings)

In [None]:
# Display the trainer's learning_rates attribute.
sEXchange_trainer.learning_rates

Sanity Check

In [None]:
# Run the trainer's built-in sanity check before training.
# NOTE(review): the model is configured with OUTPUT_DIM=10 and a 10-label
# metric, yet num_classes=2 is passed here — confirm what sanity_check expects
# in the multi-label setting.
sEXchange_trainer.sanity_check(num_classes=2)

In [None]:
# Train for the configured number of epochs in multi-label mode.
sEXchange_trainer.train(num_epochs=hyperparameters.EPOCHS, multilabel=True)

In [None]:
# Plot the history recorded by the trainer during training.
sEXchange_trainer.plot_history()

Finish Run

In [None]:
# Close the active Weights & Biases run.
wandb.finish()