<a href="https://colab.research.google.com/github/parimalakettymuthu/MachineLearning-Projects/blob/main/stackExchange_Final_NN_Multilabel_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
if 'google.colab' in str(get_ipython()):
  from google.colab import drive
  drive.mount('/content/drive')

  !pip install torchtext --upgrade --q
  !pip install torchmetrics --q
  !pip install --quiet torch-lr-finder --q
  !pip install --upgrade wandb --q
  !pip install --gensim --q

  basepath = '/content/drive/My Drive/NLP'
  sys.path.append('/content/drive/My Drive/NLP/custom-functions')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

Usage:   
  pip3 install [options] <requirement specifier> [package-index-options] ...
  pip3 install [options] -r <requirements file> [package-index-options] ...
  pip3 install [options] [-e] <vcs project url> ...
  pip3 install [options] [-e] <local project path> ...
  pip3 install [options] <archive url/path> ...

no such option: --gensim


In [3]:
#Importing the required libraries
import torch
import torch.nn as nn
import torch.nn.functional as F

import torchmetrics
from torchmetrics.classification import MultilabelF1Score, MulticlassHammingDistance
from torchmetrics.functional.classification import multilabel_f1_score,multilabel_hamming_distance

import joblib
import ast
import wandb

from types import SimpleNamespace
from functools import partial
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer as mlb

In [4]:
# Google Drive locations for the pretrained embeddings, the input data and
# the saved model progress; all live under the assignment7 directory.
assignment_root = Path(basepath).joinpath('assignment7')
embeddings_folder = assignment_root.joinpath('WordEmbeddings')
data_folder = assignment_root.joinpath('MultiLabel_Classification')
model_saving_folder = assignment_root.joinpath('NN_MultiLabel_Classification_task3b')


In [5]:
# Read the already-cleaned StackExchange dataframe from the data folder.
# joblib.load unpickles the file, so only load files you produced yourself.
data_cleaned = data_folder.joinpath("df_multilabel_hw_cleaned.joblib")
stackExchange_dataset = joblib.load(data_cleaned)

In [6]:
# Model inputs: one cleaned text per row, as a NumPy array.
X = stackExchange_dataset['cleaned_text'].values
# Raw 'Tag_Number' column. The same column is parsed with ast.literal_eval
# further down, and that parsed version (not `y`) is what the later cells
# in view feed to the label binarizer.
y = stackExchange_dataset['Tag_Number'].values

In [7]:
# swifter provides the `.swifter.apply` accessor used in the next cell
# (installed unpinned, with pip output silenced).
!pip install swifter -qq

In [8]:
import swifter
import ast
# Evaluate each 'Tag_Number' literal into a Python object (expected: the list
# of tag ids for that row) and store the result in a new column.
stackExchange_dataset['Tag_number_list'] = (
    stackExchange_dataset['Tag_Number'].swifter.apply(ast.literal_eval)
)

Pandas Apply:   0%|          | 0/47427 [00:00<?, ?it/s]

In [9]:
# Parsed tag collections as an array; this is the label input handed to the
# multi-label binarizer in the next cell.
y_final = stackExchange_dataset['Tag_number_list'].values

In [10]:
# Turn each row's collection of tag ids into one multi-hot indicator row.
# The fitted binarizer is kept under a name (instead of being thrown away) so
# that `tag_binarizer.classes_` and `tag_binarizer.inverse_transform(...)` are
# available later to map predicted columns back to tag ids.
tag_binarizer = mlb()
y_stackExchange_encoding = tag_binarizer.fit_transform(y_final)

In [11]:
from sklearn.model_selection import train_test_split
# 60 / 20 / 20 split: hold out 40% of the rows first, then cut that hold-out
# in half to obtain the validation and test sets. Both calls share the same
# fixed random_state so the split is repeatable.
X_sExchange_train, X_valid_test, y_sExchange_train, y_valid_test = train_test_split(
    X,
    y_stackExchange_encoding,
    test_size=0.4,
    random_state=42,
)
X_sExchange_valid, X_sExchange_test, y_sExchange_valid, y_sExchange_test = train_test_split(
    X_valid_test,
    y_valid_test,
    test_size=0.5,
    random_state=42,
)

In [12]:
from gensim.models import KeyedVectors
# Load the StackExchange word vectors (CBOW, going by the file name) from the
# embeddings folder.
# NOTE(review): KeyedVectors.load reads gensim's native save() format; if this
# .bin had been written in word2vec binary format it would instead need
# KeyedVectors.load_word2vec_format(..., binary=True) -- confirm how it was saved.
pretrained_sExchange_file = str(embeddings_folder/ "model_stackExchange_CBOW.bin")
sExchange_vectors = KeyedVectors.load(pretrained_sExchange_file)

In [13]:
# NOTE(review): torch-lr-finder is already installed by the Colab setup cell
# near the top of the notebook; this repeat only has an effect outside Colab.
!pip install torch_lr_finder -qq

In [14]:
from torch_lr_finder import LRFinder
from Trainer_v4 import Trainer
from data_preparation_HW7 import *

In [15]:
from ff_sequential_model_v1 import MLPCustom

# Creating Dataset & Vocab

In [17]:
# Wrap every split in a CustomDataset (train, then validation, then test).
se_trainset, se_validset, se_testset = (
    CustomDataset(split_texts, split_labels)
    for split_texts, split_labels in (
        (X_sExchange_train, y_sExchange_train),
        (X_sExchange_valid, y_sExchange_valid),
        (X_sExchange_test, y_sExchange_test),
    )
)

# Vocabulary is built from the training split only, with min_freq=2.
stackExchange_vocab = get_vocab(se_trainset, min_freq=2)

In [20]:
# Build the embedding weight tensor for the vocabulary from the pretrained
# vectors. The helper returns the tensor plus two counts: how many vocabulary
# words were found in the pretrained vectors and how many were not.
# NOTE(review): embedding_dim presumably has to equal the width of the
# pretrained vectors and the model's EMBED_DIM (both 300 here) -- confirm.
pretrained_weights, words_found, words_not_found = get_pretrained_weights(
    vocab=stackExchange_vocab,
    pretrained_vectors=sExchange_vectors,
    embedding_dim = 300,
    )

In [21]:
# Sanity check: dimensions of the embedding weight tensor.
pretrained_weights.shape

torch.Size([90287, 300])

In [22]:
# Sanity check: tensor type and the found / not-found word counts.
# NOTE(review): in the recorded output only 14664 words were found versus
# 75623 not found, i.e. most vocabulary rows have no pretrained vector --
# worth checking tokenisation / casing against the embedding vocabulary.
type(pretrained_weights), words_found, words_not_found

(torch.Tensor, 14664, 75623)

In [24]:
# Baseline experiment configuration. Later cells override individual fields
# per run; everything the model / trainer cell reads has a default here so
# that cell also works when no override cell has been executed.
hyperparameters = SimpleNamespace(
    # for model
    EMBED_DIM=300,                          # must match the pretrained embedding width
    VOCAB_SIZE=len(stackExchange_vocab),
    OUTPUT_DIM=10,                          # number of label columns the model predicts
    HIDDEN_SIZES_LIST=[200],
    DPROB_LIST=[0.0],
    NON_LINEARITY=nn.ReLU(),
    BATCH_NORM=False,

    # pretrained embeddings (off by default; enabled by the run-specific overrides)
    USE_PRE_TRAINED_WEIGHTS=False,
    FREEZE_PRETRAINED=False,

    # for training
    INITIALIZATION ='kaiming',
    EPOCHS=50,
    BATCH_SIZE=128,
    LEARNING_RATE=0.001,
    # Descriptive label only: this notebook trains on the StackExchange data,
    # the previous 'IMDB' value was a leftover from another notebook.
    DATASET='StackExchange',
    ARCHITECTURE='embed_layer-ffn',

    # for optimizer
    OPTIMIZER='AdamW',
    MOMENTUM = 0,
    NESTEROV = False,
    WEIGHT_DECAY = 0.000,

    # gradient clipping
    CLIP_TYPE='norm',
    CLIP_VALUE=2,

    # early stopping
    EARLY_STOP_PATIENCE=5,

    #scheduler (the string 'None' means no scheduler is configured)
    SCHEDULER = 'None'
    )

Specify the run name and folder

In [25]:
# Name this experiment and make sure its output directory exists
# (<model_saving_folder>/<project_name>/<run_name>).
project_name, run_name = 'Regularization_stackExchange_v4', 'exp8'
log_frequency = 5
run_folder = model_saving_folder.joinpath(project_name, run_name)
run_folder.mkdir(parents=True, exist_ok=True)

In [26]:
# Display the resolved output directory for this run.
run_folder

PosixPath('/content/drive/My Drive/NLP/assignment7/NN_MultiLabel_Classification_task3b/Regularization_stackExchange_v4/exp8')

In [27]:
# Cumulative experiment log: each run below keeps the overrides of the runs
# before it, so once this cell has executed only the LAST value assigned to
# each hyperparameter is in effect (i.e. the run 8 configuration).

# run 1 - based on default initialization
# Notes: Final Learning rate was set to 0.01

# run 2 - add dropout
hyperparameters.LEARNING_RATE = 0.01   
hyperparameters.DPROB_LIST=[0.5] 

# run 3 - remove dropout, add weight decay
hyperparameters.WEIGHT_DECAY = 1
hyperparameters.DPROB_LIST=[0] 

# run 4 - dropout still off, weight decay lowered to 0.1
hyperparameters.WEIGHT_DECAY = 0.1

# run 5 - increase batch size to 256
hyperparameters.BATCH_SIZE = 256

# run 6 - one-cycle scheduler (lower base learning rate, weight decay raised to 10, 10 epochs)
hyperparameters.LEARNING_RATE = 0.001   
hyperparameters.WEIGHT_DECAY = 10
hyperparameters.SCHEDULER='OneCyclicLR'
hyperparameters.SCHEDULER_MAX_LR=0.01
hyperparameters.SCHEDULER_DIV_FACTOR=25
hyperparameters.SCHEDULER_FINAL_DIV_FACTOR=1e3
hyperparameters.EPOCHS = 10

# run 7 - use pre-trained weights but freeze the weights
# (presumably keeps the embedding layer fixed during training -- confirm in MLPCustom)
hyperparameters.USE_PRE_TRAINED_WEIGHTS = True
hyperparameters.FREEZE_PRETRAINED = True

# run 8 - Unfreeze the weights
hyperparameters.FREEZE_PRETRAINED = False

Trainer Configuration

In [29]:
# Fix seed value (done first, before the loaders and the model are created)
Trainer.set_seed()

# Pre-bind the vocabulary so the resulting collate function only needs the batch
collate_fn = partial(collate_batch, vocab=stackExchange_vocab)

# Data Loader
train_loader, valid_loader = get_loaders(trainset=se_trainset, validset=se_validset, 
                                         batch_size_=hyperparameters.BATCH_SIZE,
                                         collate_fn=collate_fn)

# binary cross entropy on raw logits (sigmoid is applied inside the loss),
# the usual choice for multi-label targets
loss_function = nn.BCEWithLogitsLoss()

# model
# NOTE(review): the name `model_imdb` is a leftover; the data here is StackExchange.
# NOTE(review): the saved output of this cell is `TypeError: ignored` with no
# traceback, so which call raised is unknown -- verify the MLPCustom,
# get_loaders and Trainer.set_seed signatures against the helper modules.
model_imdb = MLPCustom(hyperparameters.EMBED_DIM,
                       hyperparameters.VOCAB_SIZE,
                       hyperparameters.HIDDEN_SIZES_LIST,
                       hyperparameters.DPROB_LIST,
                       hyperparameters.OUTPUT_DIM,
                       hyperparameters.NON_LINEARITY,
                       hyperparameters.BATCH_NORM,                       
                       use_pre_trained_weights =hyperparameters.USE_PRE_TRAINED_WEIGHTS,
                       pretrained_weights=pretrained_weights, 
                       freeze_pretrained = hyperparameters.FREEZE_PRETRAINED)

def init_weights(m):
    """Kaiming-initialise a linear layer in place; leave every other module untouched.

    Meant to be passed to ``model.apply(init_weights)``, which calls it once
    per sub-module.

    Args:
        m: any ``nn.Module``. Only ``nn.Linear`` instances are modified: the
            weight is drawn with ``kaiming_normal_`` and the bias, when the
            layer has one, is set to zero.
    """
    if not isinstance(m, nn.Linear):
        return
    torch.nn.init.kaiming_normal_(m.weight)
    # Layers created with bias=False have `bias is None`; zeros_ would raise on it.
    if m.bias is not None:
        torch.nn.init.zeros_(m.bias)

# apply initialization recursively  to all modules
# (init_weights only acts on nn.Linear layers, so any other sub-module of the
# model is visited but left as constructed)
if hyperparameters.INITIALIZATION =='kaiming':
    model_imdb.apply(init_weights)

# OPTIMIZERS
def get_optimizer():
    """Build the optimizer named by ``hyperparameters.OPTIMIZER`` for ``model_imdb``.

    ``"SGD"`` yields ``torch.optim.SGD`` configured with the momentum and
    Nesterov settings; every other value falls back to ``torch.optim.AdamW``.
    Both use the configured learning rate and weight decay.

    Returns:
        The constructed ``torch.optim`` optimizer.
    """
    shared_kwargs = dict(
        lr=hyperparameters.LEARNING_RATE,
        weight_decay=hyperparameters.WEIGHT_DECAY,
    )
    trainable = model_imdb.parameters()

    if hyperparameters.OPTIMIZER != "SGD":
        return torch.optim.AdamW(trainable, **shared_kwargs)

    return torch.optim.SGD(
        trainable,
        momentum=hyperparameters.MOMENTUM,
        nesterov=hyperparameters.NESTEROV,
        **shared_kwargs,
    )

# Instantiate the optimizer for the model built above.
optimizer = get_optimizer()
# Use the first CUDA GPU when one is available, otherwise fall back to CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

TypeError: ignored