#<font color = 'pickle'> **Import libraries and setting up folders**

In [None]:
import re
import sys
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
import swifter  # registers the `.swifter` accessor used by the preprocessing cells
import torch
from bs4 import BeautifulSoup
from google.colab import drive
from sklearn.preprocessing import MultiLabelBinarizer
from torch.utils.data import Dataset, DataLoader

# Mount Google Drive so the course folder is reachable from the Colab runtime.
drive.mount('/content/drive')

# Single source of truth for every path used in this notebook.
base_folder = Path('/content/drive/MyDrive/ColabNotebooks/NLPClass/')
data_folder = base_folder/'datasets'
custom_functions = base_folder/'custom-functions'

# Make the course helper modules importable. The guard keeps sys.path from
# growing by one duplicate entry every time this cell is re-run.
if str(custom_functions) not in sys.path:
    sys.path.append(str(custom_functions))
import custom_preprocessor as cp

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


#<font color = 'pickle'> **Task 2**

#<font color = 'pickle'> **Data Preprocessing and dumping**

In [None]:
# Load the raw multilabel homework data; later cells read its 'Body', 'Title' and 'Tag_Number' columns.
df = joblib.load(data_folder/'df_multilabel_hw.joblib')

In [None]:
def basic_clean(text):
    """Strip HTML markup from ``text`` and flatten it onto a single line.

    Parameters
    ----------
    text : str
        Raw post body, possibly containing HTML tags.

    Returns
    -------
    str
        The visible text. When the input contains at least one tag, the text
        of neighbouring elements is joined with a space. Runs of line breaks
        are replaced by one space so that words on adjacent lines are not
        glued together.
    """
    # Parse once and reuse the tree for both the "has tags?" check and extraction.
    soup = BeautifulSoup(text, "html.parser")
    # find() returns the first tag, or None when the input is plain text.
    if soup.find() is not None:
        text = soup.get_text(separator=" ")
    return re.sub(r'[\n\r]+', ' ', text)

In [None]:
# Clean every post body in parallel; requires `import swifter` to have run so
# that the `.swifter` accessor is registered on pandas objects.
df['Basic_Cleaned'] = df['Body'].swifter.apply(basic_clean)

Pandas Apply:   0%|          | 0/47427 [00:00<?, ?it/s]

In [None]:
# !python -m spacy download 'en_core_web_sm'
# Configure the course-provided spaCy preprocessor: lower-case the text and
# drop stop words, punctuation, e-mail addresses and URLs; numbers are kept and
# no stemming is applied.
# NOTE(review): the keyword is spelled `lammetize`; presumably this matches the
# parameter name in custom_preprocessor.SpacyPreprocessor - confirm.
preprocessor = cp.SpacyPreprocessor(model = 'en_core_web_sm', 
                                    batch_size = 1000, 
                                    lammetize=False, 
                                    lower=True, 
                                    remove_stop=True, 
                                    remove_punct=True, 
                                    remove_email=True, 
                                    remove_url=True, 
                                    remove_num=False, 
                                    stemming = False,
                                    add_user_mention_prefix=False, 
                                    remove_hashtag_prefix=False)

In [None]:
# Run the configured preprocessor over the HTML-stripped bodies.
df['Full_cleaned'] = preprocessor.fit_transform(df.Basic_Cleaned.values)

In [None]:
# Model input = raw title followed by the fully cleaned body.
title_and_body = df['Title'] + ' ' + df['Full_cleaned']
df['Text'] = title_and_body
df_final = df.loc[:, ['Text', 'Tag_Number']]
df_final.head()

Unnamed: 0,Text,Tag_Number
0,ASP Query String From DropDown webpage follo...,"[0, 9]"
1,How can I run JavaScript code at server side J...,"[1, 3]"
2,linq to sql throwing an exception row not foun...,"[0, 9]"
3,Running a Python script on a PHP server runnin...,"[2, 7]"
4,some advice on how to write a window.resize fu...,"[3, 5]"


In [None]:
import joblib
import pickle

# Persist the preprocessed frame next to the other datasets (same location the
# load cell reads from) instead of repeating the absolute Drive path.
joblib.dump(df_final, data_folder/'df_final.pkl')

['/content/drive/MyDrive/ColabNotebooks/NLPClass/datasets/df_final.pkl']

#<font color = 'pickle'> **Loading Preprocessed data**

In [None]:
# Reload the cached preprocessed frame so the expensive cleaning cells above can be skipped.
df_final = joblib.load(data_folder/'df_final.pkl')
df_final.head()

Unnamed: 0,Text,Tag_Number
0,ASP Query String From DropDown webpage follo...,"[0, 9]"
1,How can I run JavaScript code at server side J...,"[1, 3]"
2,linq to sql throwing an exception row not foun...,"[0, 9]"
3,Running a Python script on a PHP server runnin...,"[2, 7]"
4,some advice on how to write a window.resize fu...,"[3, 5]"


In [None]:
# Tag_Number holds text such as "[0, 9]"; extract every integer id from it.
# `\d+` keeps multi-digit ids (e.g. "10") intact, whereas `\d` would split
# them into separate single-digit labels.
search = re.compile(r"\d+")
df_final["Tags"] = df_final["Tag_Number"].apply(search.findall)
df_final.head()

Unnamed: 0,Text,Tag_Number,Tags
0,ASP Query String From DropDown webpage follo...,"[0, 9]","[0, 9]"
1,How can I run JavaScript code at server side J...,"[1, 3]","[1, 3]"
2,linq to sql throwing an exception row not foun...,"[0, 9]","[0, 9]"
3,Running a Python script on a PHP server runnin...,"[2, 7]","[2, 7]"
4,some advice on how to write a window.resize fu...,"[3, 5]","[3, 5]"


In [None]:
# Features are the text column; labels become a multi-hot matrix with one column per tag.
mlb = MultiLabelBinarizer()
X = df_final['Text']
y = mlb.fit_transform(df_final['Tags'])
y

array([[1, 0, 0, ..., 0, 0, 1],
       [0, 1, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 1],
       ...,
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 0]])

#<font color = 'pickle'> **Creating Custom Torch Dataset**

In [None]:
from sklearn.model_selection import train_test_split
# 60% train; the remaining 40% is halved into test and validation parts.
X_train, X_rest, y_train, y_rest = train_test_split(
    X, y, test_size=0.4, random_state=42
)
X_test, X_valid, y_test, y_valid = train_test_split(
    X_rest, y_rest, test_size=0.5, random_state=42
)

In [None]:
import torch
class CustomDataset(torch.utils.data.Dataset):
    """Pairs each text with its label row so a DataLoader can index samples."""

    def __init__(self, X, y):
        # Labels are stored as given; texts are materialised as a NumPy array
        # so that indexing is positional whatever index ``X`` carried.
        self.y = y
        self.X = np.array(X)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        # Tensor indices are converted to plain Python values before indexing.
        position = idx.tolist() if torch.is_tensor(idx) else idx
        return (self.X[position], self.y[position])

In [None]:
# Wrap each split in the torch dataset defined above.
train_set, valid_set, test_set = (
    CustomDataset(features, labels)
    for features, labels in (
        (X_train, y_train),
        (X_valid, y_valid),
        (X_test, y_test),
    )
)

# <Font color = 'pickle'>**Create Vocab**

In [None]:
from collections import Counter
# Token frequencies over the training split only (whitespace tokenisation).
counter = Counter()
for sample_idx in range(len(train_set)):
    sample_text, _ = train_set[sample_idx]
    counter.update(str(sample_text).split())

In [None]:
len(counter)

198906

In [None]:
from torchtext.vocab import vocab
# Build the vocabulary from the training-set token counts, passing min_freq=5
# as the frequency cut-off for including a token.
my_vocab = vocab(counter, min_freq=5)
#my_vocab.get_stoi()

In [None]:
# Reserve an entry for out-of-vocabulary tokens and make it the fallback index.
# The membership check keeps this cell re-runnable: inserting a token that is
# already present raises an error.
if '<unk>' not in my_vocab:
    my_vocab.insert_token('<unk>', 0)
my_vocab.set_default_index(my_vocab['<unk>'])

# <Font color = 'pickle'>**Create DataLoader for Embedding**

In [None]:
def text_pipeline(x):
    """Return the vocabulary index of every whitespace-separated token in ``x``."""
    return [my_vocab[token] for token in str(x).split()]


# Element-wise cast to NumPy's default integer type.
vector = np.vectorize(np.int_)


def label_pipeline(y):
    """Apply ``vector`` to one label row."""
    return vector(y)

In [None]:
len(y)

47427

In [None]:
def collate_batch(batch):
    """Collate ``(text, labels)`` samples into EmbeddingBag-ready tensors.

    The input to an embedding layer is the vocabulary index of each token.
    This function is meant to be passed as ``collate_fn`` to a DataLoader: it
    converts every text of the batch to indices, concatenates them into one
    flat tensor and returns the labels alongside. EmbeddingBag additionally
    needs ``offsets``: the starting position of each bag (sequence) inside the
    flat index tensor.

    Parameters
    ----------
    batch : iterable of (text, labels)
        Samples as produced by the dataset.

    Returns
    -------
    tuple of torch.Tensor
        ``(text_list, label_list, offsets)``: the concatenated int64 token
        indices, the int64 label rows stacked one per sample, and the start
        offset of each sample within ``text_list``.
    """
    text_list, label_list, offsets = [], [], [0]
    for _text, _label in batch:
        label_list.append(label_pipeline(_label))
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        text_list.append(processed_text)
        offsets.append(processed_text.size(0))
    # Stack the label rows into one ndarray first: building a tensor directly
    # from a list of ndarrays is slow and makes PyTorch emit a UserWarning.
    label_list = torch.tensor(np.array(label_list), dtype=torch.int64)
    # Drop the last length; the cumulative sum of the rest gives each start position.
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text_list = torch.cat(text_list)
    return text_list, label_list, offsets

In [None]:
# Tiny shuffled loader used only to eyeball what collate_batch produces.
batch_size = 3
check_loader = torch.utils.data.DataLoader(
    valid_set,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_batch,
)

In [None]:
# Print the first collated batch only (labels, flat token indices, bag offsets), then stop.
for text, label, offsets in check_loader:
  print(label, text, offsets)
  break

tensor([[0, 0, 0, 1, 0, 1, 0, 0, 0, 0],
        [0, 0, 0, 1, 0, 1, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 0, 0, 0, 0, 1]]) tensor([    1,    63,    94,   418,     0,    10,   154,   121,  3662,   205,
          162,   418,  2660,   162,   339,     0,  4267,   418,  2660,   338,
           12,   445,  4618,    35,    94,     0,   162,     0,  6220,   418,
         1455,   388,  8492,   110, 16217, 10074,    15,    98,   350,   405,
          354,   718,  1025,  7149,     0,   579,  1796,   335,     0,    89,
          162,     0,  6694,    52,   532,   469,   108,     0,    35,   156,
         5043,   718,     0,   455,   483,  4319,  7149,    15,   335,     0,
         2177,   241,   416,   683,  1545,   420,   179,   110,   357,  8970,
          253,  1850,  1288,   124,  3117,   618,  1657,     0,   680, 12595,
            0,    98,     0,     0,    89,   179,     0,     0,     0,    89,
          179,     0,     0,    89,   179,     0,     0,    89,     0,     0,
         1082,   245,

  app.launch_new_instance()


# <Font color = 'pickle'>**Implementing NN Training**

In [None]:
# Stand-alone EmbeddingBag demo: one embedding row per vocabulary entry, each a
# 5-dimensional vector. The seed is fixed so the weights printed below are repeatable.
import torch.nn as nn
torch.manual_seed(0)
model_eb = nn.EmbeddingBag(len(my_vocab),5)

In [None]:
model_eb.weight

Parameter containing:
tensor([[-1.1258, -1.1524, -0.2506, -0.4339,  0.8487],
        [ 0.6920, -0.3160, -2.1152,  0.3223, -1.2633],
        [ 0.3500,  0.3081,  0.1198,  1.2377,  1.1168],
        ...,
        [ 1.1450,  0.3443,  0.4567, -2.2919, -1.8394],
        [-1.0060, -0.6572, -0.4782,  1.7916,  0.8696],
        [ 0.6533,  1.7955,  0.5710, -0.2695, -0.7298]], requires_grad=True)

## <Font color = 'pickle'>**Model**

In [None]:
from torch import nn
class MLPCustom(nn.Module):
  """EmbeddingBag text encoder followed by a two-hidden-layer MLP.

  Each hidden stage is: Linear -> non-linearity -> BatchNorm1d -> Dropout.
  The output layer returns raw scores (no activation is applied here).

  Parameters
  ----------
  embed_dim : int
      Size of each embedding vector.
  vocab_size : int
      Number of rows in the embedding table.
  hidden_dim1, hidden_dim2 : int
      Widths of the first and second hidden layers.
  output_dim : int
      Number of output scores per sample.
  non_linearity : callable
      Activation applied after each hidden Linear layer.
  dropout : float, optional
      Drop probability used by both dropout layers. Defaults to 0.5, the value
      that was previously hard-coded.
  """

  def __init__(self, embed_dim, vocab_size, hidden_dim1, hidden_dim2, output_dim, non_linearity,
               dropout=0.5):

    super().__init__()
    self.embed_dim = embed_dim
    self.hidden_dim1 = hidden_dim1
    self.hidden_dim2 = hidden_dim2
    self.output_dim = output_dim
    self.vocab_size = vocab_size
    self.non_linearity = non_linearity
    self.dropout = dropout

    # NOTE: sub-modules are created in the same order as before so that
    # state_dict keys and seeded initialisation stay unchanged.

    # embedding layer
    self.embedding = nn.EmbeddingBag(self.vocab_size, self.embed_dim)

    # hidden layer 1 with its dropout and batch norm
    self.hidden_layer1 = nn.Linear(self.embed_dim, self.hidden_dim1)
    self.drop1 = nn.Dropout(p=self.dropout)
    self.batchnorm1 = nn.BatchNorm1d(num_features=self.hidden_dim1)

    # hidden layer 2 with its dropout and batch norm
    self.hidden_layer2 = nn.Linear(self.hidden_dim1, self.hidden_dim2)
    self.drop2 = nn.Dropout(p=self.dropout)
    self.batchnorm2 = nn.BatchNorm1d(num_features=self.hidden_dim2)

    # output layer
    self.output_layer = nn.Linear(self.hidden_dim2, self.output_dim)

  def forward(self, input_ , offsets):
    """Return the output scores for a batch.

    ``input_`` is the flat tensor of token indices for the whole batch and
    ``offsets`` marks where each sample starts inside it.
    """
    embed_out = self.embedding(input_, offsets) # batchsize, embedding_dim

    hout1 = self.non_linearity(self.hidden_layer1(embed_out)) # batchsize, hidden_dim1
    hout1 = self.batchnorm1(hout1)
    hout1 = self.drop1(hout1)

    hout2 = self.non_linearity(self.hidden_layer2(hout1)) # batchsize, hidden_dim2
    hout2 = self.batchnorm2(hout2)
    hout2 = self.drop2(hout2)

    ypred = self.output_layer(hout2)

    return ypred

## <Font color = 'pickle'>**Function for Training  Loops**

In [None]:
!pip install wandb
import wandb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting wandb
  Downloading wandb-0.13.5-py2.py3-none-any.whl (1.9 MB)
[K     |████████████████████████████████| 1.9 MB 15.5 MB/s 
Collecting shortuuid>=0.5.0
  Downloading shortuuid-1.0.11-py3-none-any.whl (10 kB)
Collecting setproctitle
  Downloading setproctitle-1.3.2-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30 kB)
Collecting docker-pycreds>=0.4.0
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting sentry-sdk>=1.0.0
  Downloading sentry_sdk-1.11.1-py2.py3-none-any.whl (168 kB)
[K     |████████████████████████████████| 168 kB 8.3 MB/s 
[?25hCollecting pathtools
  Downloading pathtools-0.1.2.tar.gz (11 kB)
Collecting GitPython>=1.0.0
  Downloading GitPython-3.1.29-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 66.7 MB/s 
[?25hCollecting gitdb<5,>=4.0.1
  Downloading gitdb-

In [None]:
def train(train_loader, model, optimizer, loss_function, log_batch, log_interval, grad_clipping, max_norm):

  """
  Run one training epoch and return the mean batch loss.

  Input:
    train_loader  : iterable yielding (input_, targets, offsets) batches; must support len()
    model         : module called as model(input_, offsets)
    optimizer     : optimizer updating the model parameters
    loss_function : callable(output, float targets) -> scalar loss tensor
    log_batch     : if truthy, batch losses are sent to wandb
    log_interval  : batch-count interval used for batch logging
    grad_clipping : if truthy, gradients are clipped to max_norm (L2 norm)
    max_norm      : maximum gradient norm used when grad_clipping is on
  Output: average of the per-batch losses over the epoch (float)

  Side effects: updates the module-level counters example_ct_train and
  batch_ct_train, which must be initialised before the first call.
  """
  # counters are kept as globals so they keep accumulating across epochs
  global example_ct_train
  global batch_ct_train

  # Use the device the model lives on instead of relying on a notebook-level
  # `device` variable being defined before this function is called.
  device = next(model.parameters()).device

  # Initialize train_loss at the start of the epoch
  running_train_loss = 0

  # put the model in training mode
  model.train()

  # Iterate on batches from the dataset using train_loader
  for input_, targets, offsets in train_loader:

    # move inputs and targets to the model's device
    input_ = input_.to(device)
    targets = targets.to(device)
    offsets = offsets.to(device)

    # Forward pass
    output = model(input_, offsets)
    loss = loss_function(output, targets.float())

    example_ct_train +=  len(targets)
    batch_ct_train += 1

    # set gradients to zero
    optimizer.zero_grad()

    # Backward pass
    loss.backward()

    # Gradient Clipping
    if grad_clipping:
      nn.utils.clip_grad_norm_(model.parameters(), max_norm=max_norm, norm_type=2)

    # Update parameters using their gradient
    optimizer.step()

    # Read the loss once as a Python float; used for both the sum and logging
    batch_loss = loss.item()
    running_train_loss += batch_loss

    # log batch loss
    # NOTE: batch_ct_train has already been incremented above, so with
    # log_interval=25 this fires on batches 24, 49, 74, ...
    if log_batch:
      if ((batch_ct_train + 1) % log_interval) == 0:
        wandb.log({"Train Batch Loss  :": batch_loss})

  # Calculate mean train loss over the batches of this epoch
  train_loss = running_train_loss/len(train_loader)

  return train_loss

## <Font color = 'pickle'>**Function for Validation Loops**

In [None]:
def valid(loader, model, optimizer, loss_function, log_batch, log_interval):

  """
  Evaluate the model on one pass over `loader` and return the mean batch loss.

  Input:
    loader        : iterable yielding (input_, targets, offsets) batches; must support len()
    model         : module called as model(input_, offsets)
    optimizer     : unused; kept so existing callers do not break
    loss_function : callable(output, float targets) -> scalar loss tensor
    log_batch     : if truthy, batch losses are sent to wandb
    log_interval  : batch-count interval used for batch logging
  Output: average of the per-batch losses over `loader` (float)

  Side effects: leaves the model in evaluation mode and updates the
  module-level counters example_ct_valid and batch_ct_valid, which must be
  initialised before the first call.
  """

  # counters are kept as globals so they keep accumulating across epochs
  global example_ct_valid
  global batch_ct_valid

  # Use the device the model lives on instead of relying on a notebook-level
  # `device` variable being defined before this function is called.
  device = next(model.parameters()).device

  # Initialize the loss sum at the start of the pass
  running_valid_loss = 0

  # put the model in evaluation mode
  model.eval()

  with torch.no_grad():
    for input_,targets, offsets in loader:

      # move inputs and targets to the model's device
      input_ = input_.to(device)
      targets = targets.to(device)
      offsets = offsets.to(device)

      # Forward pass
      output = model(input_, offsets)
      loss = loss_function(output,targets.float())

      # count of examples and batches
      example_ct_valid +=  len(targets)
      batch_ct_valid += 1

      # Read the loss once as a Python float; used for both the sum and logging
      batch_loss = loss.item()
      running_valid_loss += batch_loss

      # log batch loss
      # NOTE: batch_ct_valid has already been incremented above, so with
      # log_interval=25 this fires on batches 24, 49, 74, ...
      if log_batch:
        if ((batch_ct_valid + 1) % log_interval) == 0:
          wandb.log({"Valid Batch Loss  :": batch_loss})

    # Mean loss over the batches of the loader that was actually passed in
    # (previously this divided by the length of the global `valid_loader`).
    valid_loss = running_valid_loss/len(loader)

  return valid_loss

## <Font color = 'pickle'>**Function for Model Training**

In [None]:

def train_loop(train_loader, valid_loader, model, loss_function, optimizer, epochs, device, patience, early_stopping,
               file_model):

  '''
  Train for up to `epochs` epochs, checkpointing the best model by validation loss.

  train_loader   : loader producing the training batches
  valid_loader   : loader producing the validation batches
  model          : model to train
  loss_function  : loss function (criterion)
  optimizer      : optimizer like SGD, Adam etc.
  epochs         : maximum number of epochs
  device         : not used inside this function; kept so existing callers do not break
  patience       : with early stopping on, training stops once the number of
                   consecutive non-improving epochs exceeds this value
  early_stopping : if truthy, stop early as described above; otherwise always
                   run all epochs
  file_model     : file the best model weights (state_dict) are saved to, so
                   they can be reloaded without training again

  Batch logging, its interval and gradient clipping are read from wandb.config
  (LOG_BATCH, LOG_INTERVAL, GRAD_CLIPPING, MAX_NORM).

  Returns (train_loss_history, valid_loss_history): one entry per epoch run.
  '''
  # Per-epoch loss histories
  train_loss_history = []
  valid_loss_history = []

  delta = 0
  best_score = None
  # np.inf instead of np.Inf: the capitalised alias was removed in NumPy 2.0
  valid_loss_min = np.inf
  counter_early_stop=0

  from datetime import datetime
  # Iterate for the given number of epochs
  for epoch in range(epochs):
    t0 = datetime.now()

    # One pass over the training data, then one over the validation data
    train_loss = train(train_loader, model, optimizer, loss_function,
                                  wandb.config.LOG_BATCH, wandb.config.LOG_INTERVAL,
                                  wandb.config.GRAD_CLIPPING, wandb.config.MAX_NORM)
    valid_loss = valid(valid_loader, model, optimizer, loss_function,
                                    wandb.config.LOG_BATCH, wandb.config.LOG_INTERVAL)

    dt = datetime.now() - t0

    # Save history of the losses
    train_loss_history.append(train_loss)
    valid_loss_history.append(valid_loss)

    # Higher score == lower validation loss. The first epoch always counts as
    # an improvement; afterwards anything not strictly worse than the best
    # score (plus delta) does.
    score = -valid_loss
    first_epoch = best_score is None
    improved = first_epoch or not (score < best_score + delta)

    if improved:
      # The first checkpoint message is capitalised differently from later
      # ones; both spellings are kept exactly as they were printed before.
      saving = 'Saving Model...' if first_epoch else 'Saving model...'
      print(f'Validation loss has decreased ({valid_loss_min:.6f} --> {valid_loss:.6f}). {saving}')
      torch.save(model.state_dict(), file_model)
      best_score = score
      valid_loss_min = valid_loss
      counter_early_stop = 0

    elif early_stopping:
      counter_early_stop += 1
      print(f'Early stoping counter: {counter_early_stop} out of {patience}')
      # Stops only once the counter exceeds patience. Breaking here skips the
      # epoch-level logging/printing below for this last epoch, as before.
      if counter_early_stop > patience:
        print('Early Stopping')
        break

    else:
      print(f'Validation loss has not decreased ({valid_loss_min:.6f} --> {valid_loss:.6f}). Not Saving Model...')

    # Log the train and valid loss to W&B
    wandb.log({"Train epoch Loss :": train_loss, "Valid epoch Loss :": valid_loss })

    # Print the epoch summary
    print(f'Epoch : {epoch+1} / {epochs}')
    print(f'Time to complete {epoch+1} is {dt}')
    print(f'Train Loss: {train_loss : .4f}')
    print(f'Valid Loss: {valid_loss : .4f}')
    print()
    torch.cuda.empty_cache()

  return train_loss_history, valid_loss_history

# <Font color = 'pickle'>**Meta Data**

In [None]:
from types import SimpleNamespace
import torch.nn.functional as F

# All tunable settings for the EmbeddingBag + MLP experiment in one place.
hyperparameters = SimpleNamespace(
    # model
    EMBED_DIM = 400,
    VOCAB_SIZE = len(my_vocab),
    OUTPUT_DIM = 10,
    HIDDEN_DIM1 = 200,
    HIDDEN_DIM2 = 100,
    NON_LINEARITY= F.relu,
    EPOCHS = 40,

    # optimisation
    BATCH_SIZE = 256,
    LEARNING_RATE = 0.01,
    DATASET="Multilabel",
    # was misspelled `ARCHITECTUREe`
    ARCHITECTURE="HW6B_embedbag_2hiddenlayers",
    # batch-level logging
    LOG_INTERVAL = 25,
    LOG_BATCH = True,
    # where the best checkpoint is written
    FILE_MODEL = data_folder/'HW6B.pt',
    GRAD_CLIPPING = False,
    MAX_NORM = 0,
    MOMENTUM = 0,
    PATIENCE = 5,
    EARLY_STOPPING = True,
    # SCHEDULER_FACTOR = 0,
    # SCHEDULER_PATIENCE = 0,
    WEIGHT_DECAY = 0
    )

# <Font color = 'pickle'>**Data Loaders, Loss Function, Optimizer**

In [None]:
# Start a W&B run named 'EmbedBagNN' inside the 'NLP_HW6B' project.
import random
wandb.init(name = 'EmbedBagNN', project = 'NLP_HW6B')

0,1
Train Batch Loss :,█▅▅▄▃▃▃▃▂▃▂▃▃▂▂▂▂▂▁▂▂▂▁▂▁▁▂▁▁▂▁▁▁▁▁▁▁▁▁▁
Train epoch Loss :,█▄▃▂▂▁▁▁
Valid Batch Loss :,▇▅▄▃▁▂▃▄▆▆▇▅█
Valid epoch Loss :,█▃▁▁▂▄▄▆

0,1
Train Batch Loss :,0.04665
Train epoch Loss :,0.04661
Valid Batch Loss :,0.15575
Valid epoch Loss :,0.1402


In [None]:
# Make the hyperparameters reachable as wandb.config.<NAME> in the cells below.
# NOTE(review): this rebinds `wandb.config` to the plain SimpleNamespace rather
# than updating the run's config object, so the values are presumably not
# uploaded with the run - confirm whether wandb.init(config=...) was intended.
wandb.config = hyperparameters
wandb.config

namespace(ARCHITECTUREe='HW6B_embedbag_2hiddenlayers', BATCH_SIZE=256, DATASET='Multilabel', EARLY_STOPPING=True, EMBED_DIM=400, EPOCHS=40, FILE_MODEL=PosixPath('/content/drive/MyDrive/ColabNotebooks/NLPClass/datasets/HW6B.pt'), GRAD_CLIPPING=False, HIDDEN_DIM1=200, HIDDEN_DIM2=100, LEARNING_RATE=0.01, LOG_BATCH=True, LOG_INTERVAL=25, MAX_NORM=0, MOMENTUM=0, NON_LINEARITY=<function relu at 0x7f5796efa680>, OUTPUT_DIM=10, PATIENCE=5, VOCAB_SIZE=20808, WEIGHT_DECAY=0)

In [None]:
# Fix seed values for Python, NumPy and PyTorch (CPU and CUDA)
SEED = 1234
import random
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Data loaders: only the training loader shuffles; all use collate_batch
train_loader = torch.utils.data.DataLoader(train_set, batch_size=wandb.config.BATCH_SIZE, shuffle = True, collate_fn=collate_batch, num_workers = 4)
valid_loader = torch.utils.data.DataLoader(valid_set, batch_size=wandb.config.BATCH_SIZE, shuffle = False, collate_fn=collate_batch, num_workers = 4)
test_loader = torch.utils.data.DataLoader(test_set, batch_size=wandb.config.BATCH_SIZE, shuffle = False, collate_fn=collate_batch, num_workers = 4)

# binary cross-entropy on raw scores (the model applies no output activation)
loss_function = nn.BCEWithLogitsLoss()

# use the first GPU when available, otherwise the CPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
wandb.config.DEVICE = device

# model 
model = MLPCustom(wandb.config.EMBED_DIM, 
                  wandb.config.VOCAB_SIZE,
                  wandb.config.HIDDEN_DIM1, 
                  wandb.config.HIDDEN_DIM2,
                  wandb.config.OUTPUT_DIM, 
                  wandb.config.NON_LINEARITY)

model.to(wandb.config.DEVICE)

def init_weights(m):
  """Initialise Linear layers: Kaiming-normal weights and zero biases.

  Intended for ``model.apply``; modules that are not ``nn.Linear`` are left
  untouched.
  """
  # isinstance (rather than an exact type comparison) also covers Linear subclasses
  if isinstance(m, nn.Linear):
      torch.nn.init.kaiming_normal_(m.weight)
      torch.nn.init.zeros_(m.bias)
        
# apply the initialization recursively to all sub-modules
model.apply(init_weights)

# Adam optimizer over all model parameters
optimizer = torch.optim.Adam(model.parameters(), 
                             lr = wandb.config.LEARNING_RATE, 
                             weight_decay=wandb.config.WEIGHT_DECAY)

# keep a handle on the optimizer in the config namespace
wandb.config.OPTIMIZER = optimizer

# scheduler = ReduceLROnPlateau(optimizer, mode='min', factor= wandb.config.scheduler_factor, 
#                              patience=wandb.config.scheduler_patience, verbose=True)

#scheduler = StepLR(optimizer, gamma=0.4,step_size=1, verbose=True)

  cpuset_checked))


In [None]:
wandb.config

namespace(ARCHITECTUREe='HW6B_embedbag_2hiddenlayers', BATCH_SIZE=256, DATASET='Multilabel', DEVICE=device(type='cpu'), EARLY_STOPPING=True, EMBED_DIM=400, EPOCHS=40, FILE_MODEL=PosixPath('/content/drive/MyDrive/ColabNotebooks/NLPClass/datasets/HW6B.pt'), GRAD_CLIPPING=False, HIDDEN_DIM1=200, HIDDEN_DIM2=100, LEARNING_RATE=0.01, LOG_BATCH=True, LOG_INTERVAL=25, MAX_NORM=0, MOMENTUM=0, NON_LINEARITY=<function relu at 0x7f5796efa680>, OPTIMIZER=Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    eps: 1e-08
    foreach: None
    lr: 0.01
    maximize: False
    weight_decay: 0
), OUTPUT_DIM=10, PATIENCE=5, VOCAB_SIZE=20808, WEIGHT_DECAY=0)

# <Font color = 'pickle'>**Sanity Check**

In [None]:
# Sanity check: loss of the freshly initialised model on the first training
# batch only. For binary cross-entropy, predicting 0.5 for every label gives
# ln(2), which is the reference value printed below.
# The loop variables (input_, targets, offsets) are left in scope and are
# inspected by the following cells.
for input_, targets, offsets in train_loader:
  
  # move inputs and targets to the selected device
  input_ = input_.to(device)
  targets = targets.to(device)
  offsets = offsets.to(device)
  # evaluation mode for this check
  model.eval()
  # Forward pass
  output = model(input_, offsets)
  loss = loss_function(output, targets.float())
  print(f'Actual loss: {loss}')
  break

print(f'Expected Theoretical loss: {np.log(2)}')

Actual loss: 0.7073890566825867
Expected Theoretical loss: 0.6931471805599453


In [None]:
offsets[-1]

tensor(17825)

In [None]:
len(input_)

17855

In [None]:
len(targets)

256

In [None]:
model

MLPCustom(
  (embedding): EmbeddingBag(20808, 300, mode=mean)
  (hidden_layer1): Linear(in_features=300, out_features=200, bias=True)
  (drop1): Dropout(p=0.5, inplace=False)
  (batchnorm1): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (hidden_layer2): Linear(in_features=200, out_features=100, bias=True)
  (drop2): Dropout(p=0.5, inplace=False)
  (batchnorm2): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (output_layer): Linear(in_features=100, out_features=10, bias=True)
)

# <Font color = 'pickle'>**Training Model**

In [None]:
# Register the model with W&B (log='all', log_freq=25, graph logging enabled).
wandb.watch(model, log = 'all', log_freq=25, log_graph=True)

[34m[1mwandb[0m: logging graph, to disable use `wandb.watch(log_graph=False)`


[<wandb.wandb_torch.TorchGraph at 0x7f573225dc50>]

In [None]:
# Reset the global example/batch counters that train() and valid() update.
example_ct_train = batch_ct_train = example_ct_valid = batch_ct_valid = 0

train_loss_history, valid_loss_history = train_loop(
    train_loader=train_loader,
    valid_loader=valid_loader,
    model=model,
    loss_function=loss_function,
    optimizer=optimizer,
    epochs=wandb.config.EPOCHS,
    device=wandb.config.DEVICE,
    patience=wandb.config.PATIENCE,
    early_stopping=wandb.config.EARLY_STOPPING,
    file_model=wandb.config.FILE_MODEL,
)

Validation loss has decreased (inf --> 0.144952). Saving Model...
Epoch : 1 / 40
Time to complete 1 is 0:00:16.118319
Train Loss:  0.2771
Valid Loss:  0.1450

Validation loss has decreased (0.144952 --> 0.120050). Saving model...
Epoch : 2 / 40
Time to complete 2 is 0:00:15.899438
Train Loss:  0.1291
Valid Loss:  0.1201

Validation loss has decreased (0.120050 --> 0.119413). Saving model...
Epoch : 3 / 40
Time to complete 3 is 0:00:17.133638
Train Loss:  0.1003
Valid Loss:  0.1194

Early stoping counter: 1 out of 5
Epoch : 4 / 40
Time to complete 4 is 0:00:15.096427
Train Loss:  0.0813
Valid Loss:  0.1240

Early stoping counter: 2 out of 5
Epoch : 5 / 40
Time to complete 5 is 0:00:16.317839
Train Loss:  0.0688
Valid Loss:  0.1199

Early stoping counter: 3 out of 5
Epoch : 6 / 40
Time to complete 6 is 0:00:16.167395
Train Loss:  0.0598
Valid Loss:  0.1284

Early stoping counter: 4 out of 5
Epoch : 7 / 40
Time to complete 7 is 0:00:15.222330
Train Loss:  0.0528
Valid Loss:  0.1350

Early

# <Font color = 'pickle'>**Get Predictions**

In [None]:
# Rebuild the architecture and restore the best checkpoint saved during training.
model_nn = MLPCustom(wandb.config.EMBED_DIM, wandb.config.VOCAB_SIZE, wandb.config.HIDDEN_DIM1, wandb.config.HIDDEN_DIM2, 
                  wandb.config.OUTPUT_DIM, wandb.config.NON_LINEARITY)

model_nn.to(device)
# map_location lets a checkpoint written on a GPU runtime be restored on a
# CPU-only runtime (and vice versa) instead of failing while deserialising.
model_nn.load_state_dict(torch.load(wandb.config.FILE_MODEL, map_location=device))

<All keys matched successfully>

In [None]:
def get_pred(data_loader, model,device):

  """
  Collect binarised multilabel predictions and the matching targets.

  Input:
    data_loader : iterable yielding (input_, targets, offsets) batches
    model       : module called as model(input_, offsets), returning raw scores
    device      : device the batches are moved to
  Output: tuple (predictions, y) of tensors on `device`, with one row per
    example in loader order. predictions holds 0/1 values: 1 where
    sigmoid(score) is greater than 0.5, otherwise 0.

  The model is switched to evaluation mode while predicting, so dropout and
  batch-norm behave deterministically, and its previous mode is restored
  afterwards.
  """

  pred_batches = []
  target_batches = []

  was_training = model.training
  model.eval()
  try:
    # Iterate over batches without tracking gradients
    with torch.no_grad():
      for input_, targets, offsets in data_loader:

        # move inputs and targets to the requested device
        input_ = input_.to(device)
        targets = targets.to(device)
        offsets = offsets.to(device)

        # Raw scores -> per-label probabilities
        output = model(input_, offsets)
        probabilities = torch.sigmoid(output)

        # A single comparison binarises every entry; two separate masked
        # assignments would leave a probability of exactly 0.5 unchanged.
        pred_batches.append((probabilities > 0.5).to(probabilities.dtype))
        target_batches.append(targets)
  finally:
    model.train(was_training)

  # Concatenate once at the end. The leading empty float tensor keeps the
  # result defined (and empty) when the loader yields no batches.
  empty = torch.Tensor().to(device)
  predictions = torch.cat([empty, *pred_batches])
  y = torch.cat([empty, *target_batches])

  # Return tuple containing predictions and targets
  return predictions, y

In [None]:
# Predictions of the restored model on the training loader.
# NOTE(review): this rebinds y_train (previously the label matrix returned by
# train_test_split) to the tensor of targets returned by get_pred.
pred_train, y_train = get_pred(train_loader, model_nn, device)

# <Font color = 'pickle'>**Metric Reasoning**

I chose the weighted F1 score as the metric because it combines precision and recall for each label and weights each label's F1 by its support, which makes it more robust than plain accuracy when the labels are imbalanced.

In [None]:
from sklearn.metrics import f1_score
# scikit-learn works on host-side arrays, so move the tensors off the
# accelerator before scoring; passing CUDA tensors directly fails.
print("Weighted F1 score: {:.2f}".format(f1_score(y_true= y_train.cpu().numpy(), y_pred= pred_train.cpu().numpy(), average= 'weighted')))

Weighted F1 score: 0.93


#<font color = 'pickle'> **Task 3**

#<font color = 'pickle'> **Embeddings with Gensim**

In [None]:
# Load the raw posts used for training word embeddings (rebinds `df` from Task 2).
df = joblib.load(data_folder/'df_raw_small_hw.joblib')

In [None]:
def basic_clean(text):
    """Strip HTML markup from ``text`` and flatten it onto a single line.

    Parameters
    ----------
    text : str
        Raw post body, possibly containing HTML tags.

    Returns
    -------
    str
        The visible text. When the input contains at least one tag, the text
        of neighbouring elements is joined with a space. Runs of line breaks
        are replaced by one space so that words on adjacent lines are not
        glued together.
    """
    # Parse once and reuse the tree for both the "has tags?" check and extraction.
    soup = BeautifulSoup(text, "html.parser")
    # find() returns the first tag, or None when the input is plain text.
    if soup.find() is not None:
        text = soup.get_text(separator=" ")
    return re.sub(r'[\n\r]+', ' ', text)

In [None]:
# Clean every post body in parallel; requires `import swifter` to have run so
# that the `.swifter` accessor is registered on pandas objects.
df['Basic_Cleaned'] = df['Body'].swifter.apply(basic_clean)

Pandas Apply:   0%|          | 0/438813 [00:00<?, ?it/s]

In [None]:
import joblib
import pickle
#!pip install swifter
import swifter

# Cache the HTML-stripped frame next to the other datasets (same location the
# load cell reads from) instead of repeating the absolute Drive path.
joblib.dump(df, data_folder/'df_raw_basic.pkl')

['/content/drive/MyDrive/ColabNotebooks/NLPClass/datasets/df_raw_basic.pkl']

In [None]:
# Reload the cached HTML-stripped frame so the cleaning cell above can be skipped.
df = joblib.load(data_folder/'df_raw_basic.pkl')

In [None]:
df.head()

Unnamed: 0.1,Unnamed: 0,Id,Title,Body,Basic_Cleaned
3595496,1220704,7254900,Identify un-activated servers,<p>I've been on a bit of a roll this week and ...,I've been on a bit of a roll this week and hav...
1929804,4905250,4905251,How to abort the loading of an external HTML r...,"<p>I'm writing a JavaScript widget, which is i...","I'm writing a JavaScript widget, which is inte..."
2513704,138912,6173108,Xcode: how do you make it to where clicking on...,<p>I know how to make a button in Xcode with I...,I know how to make a button in Xcode with Inte...
3794169,1419377,7453573,Common WQL Monitoring Queries,<p>What WQL queries would you use for monitori...,What WQL queries would you use for monitoring ...
4341677,1966885,8001081,Loading Native Managed and C++ DLL within IIS ...,<p>I have WCF service developed in C# for with...,I have WCF service developed in C# for with .N...


In [None]:
# Lower-case every cleaned body.
df['Basic_Cleaned'] = df['Basic_Cleaned'].swifter.apply(str.lower)

Pandas Apply:   0%|          | 0/438813 [00:00<?, ?it/s]

In [None]:
# Remove URLs. The pattern is compiled once instead of once per row, and now
# requires "://" after the scheme so that ordinary words starting with "http"
# (e.g. "httpclient", "httprequest") are no longer deleted.
url_pattern = re.compile(r"https?://[A-Za-z0-9:/._\-]+")
df['Basic_Cleaned'] = df['Basic_Cleaned'].swifter.apply(lambda x: url_pattern.sub("", x))

Pandas Apply:   0%|          | 0/438813 [00:00<?, ?it/s]

In [None]:
import string
df['Basic_Cleaned'] = df['Basic_Cleaned'].swifter.apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))

Pandas Apply:   0%|          | 0/438813 [00:00<?, ?it/s]

In [None]:
df['Text'] = df['Title'] + ' ' + df['Basic_Cleaned']
df_final = df[['Text']]
df_final.head()

Unnamed: 0,Text
3595496,Identify un-activated servers ive been on a bi...
1929804,How to abort the loading of an external HTML r...
2513704,Xcode: how do you make it to where clicking on...
3794169,Common WQL Monitoring Queries what wql queries...
4341677,Loading Native Managed and C++ DLL within IIS ...


In [None]:
joblib.dump(df_final, '/content/drive/MyDrive/ColabNotebooks/NLPClass/datasets/df_final_raw_text.pkl')

['/content/drive/MyDrive/ColabNotebooks/NLPClass/datasets/df_final_raw_text.pkl']

In [None]:
df_final = joblib.load(data_folder/'df_final_raw_text.pkl')

In [None]:
# Fixed seed so the 5% sample (and anything trained on it) is reproducible
# across kernel restarts.
df_sample = df_final.sample(frac=0.05, random_state=42)

In [None]:
import gensim
gensim.__version__

'3.6.0'

In [None]:
from gensim.models.fasttext import FastText
# gensim expects an iterable of token lists. Passing the raw strings makes it
# iterate each string character by character, so the vocabulary would consist
# of single characters instead of words. Split on whitespace first.
tokenized_sample = [text.split() for text in df_sample.Text]
model_raw_txt = FastText(tokenized_sample, epochs=10, vector_size=150, window=10, min_count=5, workers =8, min_n=3, max_n=6)

In [None]:
model_raw_txt.wv.save('/content/drive/MyDrive/ColabNotebooks/NLPClass/WordEmbeddings/model_rawtext_subword.bin')

In [None]:
from gensim.models import KeyedVectors
raw_text_fasttext = KeyedVectors.load('/content/drive/MyDrive/ColabNotebooks/NLPClass/WordEmbeddings/model_rawtext_subword.bin')

#<font color = 'pickle'> **Task 4**

#<font color = 'pickle'> **Modeling a classifier with pretrained embeddings**

In [None]:
from google.colab import drive
from pathlib import Path
import joblib
import pandas as pd
import numpy as np 
import sys
from bs4 import BeautifulSoup
import re
from sklearn.preprocessing import MultiLabelBinarizer
import torch
from torch.utils.data import Dataset, DataLoader
%matplotlib inline
import re
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import learning_curve
from sklearn.metrics import plot_confusion_matrix 
import joblib

drive.mount('/content/drive')
sys.path.append('/content/drive/MyDrive/ColabNotebooks/NLPClass/custom-functions')
import custom_preprocessor as cp

base_folder = Path('/content/drive/MyDrive/ColabNotebooks/NLPClass/')
data_folder = base_folder/'datasets'
custom_functions = base_folder/'custom-functions'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from collections import Counter
import numpy as np

class GensimVectorizer(BaseEstimator,TransformerMixin):
  """Sentence vectorizer that averages pretrained word vectors.

  `fit` collects the vector of every whitespace-separated token seen in the
  training sentences; `transform` maps each sentence to the mean of the
  vectors of its tokens.

  Parameters:
    pretrained_vectors: object exposing `vector_size` and
      `get_vector(token, norm=True)`, raising KeyError for unknown tokens.
    unk_norm_init: when true, tokens without a stored vector contribute a
      random standard-normal vector instead of being skipped.

  Attributes set by `fit`:
    pretrained_vectors_subset: dict mapping token -> vector.
    words_not_in_pretrained: tokens for which no vector could be looked up.
    count_missing: number of such tokens.
    percent_missing: count_missing divided by the number of distinct tokens
      (a fraction in [0, 1]; 0 when there are no tokens).
  """
  # NOTE(review): this seeds NumPy's *global* RNG once, when the class body is
  # executed. Kept for backward compatibility with existing runs.
  np.random.seed(0)

  def __init__(self,pretrained_vectors,unk_norm_init=False):
    # load in pre-trained word vectors
    self.pretrained_vectors= pretrained_vectors
    self.vec_size= self.pretrained_vectors.vector_size
    self.unk_norm_init = unk_norm_init
    self.pretrained_vectors_subset = {}
    self.words_not_in_pretrained = []
    self.count_missing = 0
    self.percent_missing = 0


  def fit(self, X,y=None):
    '''
    Gets the subset of pretrained vectors which are present in vocab
    X :  training sentences (iterable of strings)
    Returns self.
    '''
    # Start from a clean slate so that calling fit() again does not keep
    # tokens (or duplicate "missing" entries) from a previous fit.
    self.pretrained_vectors_subset = {}
    self.words_not_in_pretrained = []

    counter = Counter()
    for sent in X:
        counter.update(sent.split())

    for token in counter:
        try:
            self.pretrained_vectors_subset[token] = self.pretrained_vectors.get_vector(token, norm=True)
        except KeyError:
            # Only "token unknown" is expected here; any other error (for
            # example an incompatible get_vector signature) must surface
            # instead of silently marking every token as missing.
            self.words_not_in_pretrained.append(token)

    ### save so that you can access this after you fit the vectorizer
    self.count_missing = len(self.words_not_in_pretrained)
    # Guard against an empty vocabulary (no sentences / only blank sentences).
    self.percent_missing = self.count_missing / len(counter) if counter else 0
    return self

  def transform(self,X,y=None):
    '''
    Returns an array of shape (len(X), vec_size) with one averaged vector per
    sentence. Sentences without any contributing token stay all-zero.
    '''
    X_vector = np.zeros((len(X), self.vec_size))

    for i, sent in enumerate(X):
        sent_vector = np.zeros(self.vec_size)
        n = 0
        for word in sent.split():
            if word in self.pretrained_vectors_subset:
                sent_vector += self.pretrained_vectors_subset[word]
                n += 1
            elif self.unk_norm_init:
                # A fresh random vector is drawn for every occurrence of an
                # unknown token.
                sent_vector += np.random.normal(size=self.vec_size)
                n += 1
        if n > 0:
            X_vector[i] = sent_vector / n
    return X_vector

In [None]:
# Load the preprocessed frame and build the multi-label target matrix.
df_final = joblib.load(data_folder/'df_final.pkl')
# Every individual digit character in Tag_Number becomes one label, so this
# only distinguishes single-digit tag ids.
# NOTE(review): assumes Tag_Number is a string of single-digit ids — confirm.
search=re.compile(r"\d")
df_final["Tags"] = df_final["Tag_Number"].apply(lambda x: re.findall(search,x))
X = df_final['Text']
# One indicator column per distinct digit found above.
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df_final['Tags'])
y

array([[1, 0, 0, ..., 0, 0, 1],
       [0, 1, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 1],
       ...,
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
from gensim.models.fasttext import FastText
from gensim.models import KeyedVectors
pretrained_vectors = KeyedVectors.load('/content/drive/MyDrive/ColabNotebooks/NLPClass/WordEmbeddings/model_rawtext_subword.bin')

In [None]:
len(X_train)

33198

In [None]:
import random
# The step name must match the prefix used in the parameter grid below
# ("classifier__..."); otherwise GridSearchCV rejects the grid as an invalid
# parameter for the pipeline.
pipeline = Pipeline([
                ('vectorizer',GensimVectorizer(pretrained_vectors)),
                ('classifier',  MultiOutputClassifier(KNeighborsClassifier())),
            ])


# now we create the grid with all the parameters that we would like to test
# <step>__<attribute of MultiOutputClassifier>__<parameter of the wrapped KNN>

param_grid_1 = {
    'classifier__estimator__n_neighbors': [1, 2, 3],
}

# now we set up the grid search with cross-validation

grid_classifier_1 = GridSearchCV(pipeline, param_grid_1,
                           cv=10, return_train_score= True, n_jobs=-1 )

# not enough ram, so subsetting: only the first 3000 training rows are used for the search
X_train_sub = X_train.head(3000)
y_train_sub = y_train[0:3000]

In [None]:
# Run the grid search on the 3000-row subset only.
grid_classifier_1.fit(X_train_sub,y_train_sub)

print(grid_classifier_1.best_params_)

# score on the FULL training split (X_train), of which only the first 3000
# rows were used for fitting, so this is not a pure "train score"
print(grid_classifier_1.score(X_train, y_train))

# mean cross-validation score of the best parameter setting
print(grid_classifier_1.best_score_)

{'classifier__estimator__n_neighbors': 3}
0.64739543058
0.50146234247


In [None]:
from sklearn.metrics import f1_score

pred_test = grid_classifier_1.predict(X_test)
f1_score_test = f1_score(y_test, pred_test, average = 'micro')

print('f1_score_test', f1_score_test)

f1_score_test 0.6435597126598074


#<font color = 'pickle'> **Task 5**

#<font color = 'pickle'> **Load pretrained embeddings**

In [None]:
pretrained_vectors = KeyedVectors.load('/content/drive/MyDrive/ColabNotebooks/NLPClass/WordEmbeddings/model_stackexchange_cbow.bin')

In [None]:
# Build the initial embedding matrix: one row per vocabulary entry, in the
# order returned by my_vocab.get_itos().
# The width is taken from the loaded vectors instead of being hard-coded, so
# the matrix always matches whatever embedding file was loaded.
embedding_dim = pretrained_vectors.vector_size
pretrained_weights = np.zeros((len(my_vocab), embedding_dim))
words_found = 0
words_not_found = 0

for i, word in enumerate(my_vocab.get_itos()):
    try:
        pretrained_weights[i] = pretrained_vectors.get_vector(word, norm=True)
        words_found += 1
    except KeyError:
        # No pretrained vector for this word: start from a random
        # standard-normal row instead.
        words_not_found += 1
        pretrained_weights[i] = np.random.normal(size=(embedding_dim, ))

In [None]:
words_found

11236


In [None]:
words_not_found

97165


#<font color = 'pickle'> **Metadata**

In [None]:
from types import SimpleNamespace
import torch.nn.functional as F

# Run configuration for Task 5; handed to wandb.init(config=...) in the next cell.
hyperparameters = SimpleNamespace(
    # NOTE(review): the pretrained embedding matrix built in this notebook has
    # 300-dimensional rows; confirm that 400 is intended here.
    EMBED_DIM = 400,
    VOCAB_SIZE = len(my_vocab),
    OUTPUT_DIM = 10,
    HIDDEN_DIM1 = 200,
    HIDDEN_DIM2 = 100,
    NON_LINEARITY= F.relu,
    EPOCHS = 40,

    BATCH_SIZE = 256,
    LEARNING_RATE = 0.01,
    DATASET="Multilabel",
    # key was misspelled "ARCHITECTUREe"
    ARCHITECTURE="HW6B_embedbag_2hiddenlayers",
    LOG_INTERVAL = 25,
    LOG_BATCH = True,
    FILE_MODEL = data_folder/'HW6B.pt',
    GRAD_CLIPPING = False,
    MAX_NORM = 0,
    MOMENTUM = 0,
    PATIENCE = 5,
    EARLY_STOPPING = True,
    # SCHEDULER_FACTOR = 0,
    # SCHEDULER_PATIENCE = 0,
    WEIGHT_DECAY = 0
    )

In [None]:
wandb.init(name = 'Task5', project = 'HW6', config = hyperparameters)

ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


#<font color = 'pickle'> **Data Loader, Loss function, Optimizer**

In [None]:
# Fix seed value for Python, NumPy and PyTorch (CPU and CUDA) so runs are repeatable
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Data loaders: only the training loader shuffles; all three use collate_batch
train_loader = torch.utils.data.DataLoader(train_set, batch_size=wandb.config.BATCH_SIZE, shuffle = True, collate_fn=collate_batch, num_workers = 4)
valid_loader = torch.utils.data.DataLoader(valid_set, batch_size=wandb.config.BATCH_SIZE, shuffle = False, collate_fn=collate_batch, num_workers = 4)
test_loader = torch.utils.data.DataLoader(test_set, batch_size=wandb.config.BATCH_SIZE, shuffle = False, collate_fn=collate_batch, num_workers = 4)

# binary cross-entropy on raw logits (the sigmoid is applied inside the loss),
# as used for multi-label targets
loss_function = nn.BCEWithLogitsLoss()

# use the first GPU when available, otherwise the CPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
wandb.config.DEVICE = device

# model, built from the Task 5 hyperparameters stored in the wandb config
model = MLPCustom(wandb.config.EMBED_DIM, 
                  wandb.config.VOCAB_SIZE,
                  wandb.config.HIDDEN_DIM1, 
                  wandb.config.HIDDEN_DIM2,
                  wandb.config.OUTPUT_DIM, 
                  wandb.config.NON_LINEARITY)

model.to(wandb.config.DEVICE)

def init_weights(m):
  """Kaiming-normal initialise an ``nn.Linear`` weight and zero its bias.

  Intended to be passed to ``Module.apply``. Modules whose type is not exactly
  ``nn.Linear`` are left untouched.

  Args:
    m: the module visited by ``Module.apply``.
  """
  if type(m) is nn.Linear:
      torch.nn.init.kaiming_normal_(m.weight)
      # bias is None for nn.Linear(..., bias=False); zeroing it would raise
      if m.bias is not None:
          torch.nn.init.zeros_(m.bias)
        
# apply init_weights recursively to every sub-module of the model
model.apply(init_weights)

# Initialize the Adam optimizer (learning rate and weight decay come from the wandb config)
optimizer = torch.optim.Adam(model.parameters(), 
                             lr = wandb.config.LEARNING_RATE, 
                             weight_decay=wandb.config.WEIGHT_DECAY)

# keep a reference to the optimizer object in the run config
wandb.config.OPTIMIZER = optimizer

# No learning-rate scheduler is active; the two alternatives below are disabled.
# scheduler = ReduceLROnPlateau(optimizer, mode='min', factor= wandb.config.scheduler_factor, 
#                              patience=wandb.config.scheduler_patience, verbose=True)

#scheduler = StepLR(optimizer, gamma=0.4,step_size=1, verbose=True)

In [None]:
# Sanity check: loss of the freshly initialised model on a single batch.
model.eval()
# No gradients are needed for a one-off forward pass.
with torch.no_grad():
  for input_, targets, offsets in train_loader:

    # move inputs and outputs to GPUs
    input_ = input_.to(device)
    targets = targets.to(device)
    offsets = offsets.to(device)
    # Forward pass
    output = model(input_, offsets)
    loss = loss_function(output, targets.float())
    print(f'Actual loss: {loss}')
    break

# The loss is a binary cross-entropy per label (BCEWithLogitsLoss). A model
# that outputs logits near zero predicts 0.5 for every label, which costs
# ln(2) ~= 0.693 per label. That is the reference value, not ln(#classes).
print(f'Expected Theoretical loss: {np.log(2)}')

Actual loss: 0.6966218948364258
Expected Theoretical loss: 1.602485391974842


#<font color = 'pickle'> **Training**

In [None]:
# Fix seed value

SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Counters reset before training.
# NOTE(review): presumably read as globals by train_loop — confirm.
example_ct_train, batch_ct_train, example_ct_valid, batch_ct_valid = 0, 0, 0, 0

# The Task 5 configuration uses UPPER-CASE keys (EPOCHS, DEVICE, PATIENCE, ...),
# so the same spelling must be used when reading them back from wandb.config.
# NOTE(review): four histories are unpacked here, while the Task 6 cell unpacks
# two from the same train_loop — confirm which return shape is current.
train_loss_history, train_acc_history, valid_loss_history, valid_acc_history = train_loop(
    train_loader, valid_loader, model, loss_function, optimizer,
    wandb.config.EPOCHS, wandb.config.DEVICE,
    wandb.config.PATIENCE, wandb.config.EARLY_STOPPING,
    wandb.config.FILE_MODEL)

Validation loss has decreased (inf --> 0.719485). Saving Model...
Epoch : 1 / 40
Time to complete 1 is 0:00:13.149641
Train Loss:  0.6649
Valid Loss:  0.7195

Validation loss has decreased (0.719485 --> 0.709835). Saving model...
Epoch : 2 / 40
Time to complete 2 is 0:00:12.784032
Train Loss:  0.5723
Valid Loss:  0.7098

Early stoping counter: 1 out of 5
Epoch : 3 / 40
Time to complete 3 is 0:00:11.980263
Train Loss:  0.5716
Valid Loss:  0.7086

Early stoping counter: 2 out of 5
Epoch : 4 / 40
Time to complete 4 is 0:00:12.192442
Train Loss:  0.5662
Valid Loss:  0.7040

Early stoping counter: 3 out of 5
Epoch : 5 / 40
Time to complete 5 is 0:00:12.296284
Train Loss:  0.5631
Valid Loss:  0.7028

Early stoping counter: 4 out of 5
Epoch : 6 / 40
Time to complete 6 is 0:00:15.359949
Train Loss:  0.5630
Valid Loss:  0.7116

Early stoping counter: 5 out of 5
Epoch : 7 / 40
Time to complete 7 is 0:00:13.410169
Train Loss:  0.5593
Valid Loss:  0.7048

Early stoping counter: 6 out of 5
Early St

In [None]:
# Rebuild the network with the same constructor arguments that were used for
# the trained `model`, so that the checkpoint's parameter names and shapes
# match, then load the weights stored at FILE_MODEL.
model_nn = MLPCustom(wandb.config.EMBED_DIM,
                     wandb.config.VOCAB_SIZE,
                     wandb.config.HIDDEN_DIM1,
                     wandb.config.HIDDEN_DIM2,
                     wandb.config.OUTPUT_DIM,
                     wandb.config.NON_LINEARITY)
model_nn.to(device)
# map_location lets a checkpoint written on a GPU load on a CPU-only runtime.
model_nn.load_state_dict(torch.load(wandb.config.FILE_MODEL, map_location=device))

<All keys matched successfully>

In [None]:
def get_pred(data_loader, model,device):

  """
  Collect thresholded multi-label predictions and the matching targets.

  The model is switched to evaluation mode for the duration of the call (and
  put back into its previous mode afterwards), every batch is run without
  gradient tracking, the logits are passed through a sigmoid and labels whose
  probability is strictly greater than 0.5 are set to 1, all others to 0.

  Input:
    data_loader: iterable of batches. Two-element batches are read as
      (input_, targets) and the model is called as model(input_).
      Three-element batches are read as (input_, targets, offsets) and the
      model is called as model(input_, offsets).
      NOTE(review): the three-element order follows the Task 5 sanity-check
      loop in this notebook — confirm against collate_batch.
    model: torch.nn.Module producing one logit per label.
    device: device on which the forward passes run and the results are stored.
  Output:
    Tuple (predictions, y): 0/1 predictions and the actual labels for the
    whole data set, both on `device`.
  """

  # Tensors that accumulate predicted and actual labels, kept on `device`
  predictions = torch.Tensor().to(device)
  y = torch.Tensor().to(device)

  # Dropout / batch-norm must be in inference mode while predicting; remember
  # the caller's mode so it can be restored.
  was_training = model.training
  model.eval()
  try:
    # Iterate over batches from data iterator
    with torch.no_grad():
      for batch in data_loader:
        if len(batch) == 3:
          input_, targets, offsets = batch
          output = model(input_.to(device), offsets.to(device))
        else:
          input_, targets = batch
          output = model(input_.to(device))

        targets = targets.to(device)

        # Binarise in one step. The previous two masked assignments left a
        # probability of exactly 0.5 unchanged; it now maps to 0.
        batch_pred = (torch.sigmoid(output) > 0.5).to(output.dtype)

        # Add the predicted and the actual labels to the accumulators
        predictions = torch.cat((predictions, batch_pred))
        y = torch.cat((y, targets))
  finally:
    model.train(was_training)

  # Return tuple containing predictions and targets
  return predictions, y

In [None]:
# NOTE(review): despite the *_train names, these are the predictions and labels
# for test_loader. The assignment also rebinds y_train, which earlier held the
# label matrix returned by train_test_split.
pred_train, y_train = get_pred(test_loader, model_nn, device)

#<font color = 'pickle'> **F1 score**

In [None]:
from sklearn.metrics import f1_score
# get_pred returns tensors that live on `device`; scikit-learn needs host-side
# arrays, so move them to the CPU first (a no-op when already on the CPU).
print("Weighted F1 score: {:.2f}".format(f1_score(y_true= y_train.cpu().numpy(), y_pred= pred_train.cpu().numpy(), average= 'weighted')))

Weighted F1 score: 0.32


#<font color = 'pickle'> **Task 6**

#<font color = 'pickle'> **Model**

In [None]:
# Define custom model using nn.Module()
class MLPCustom_(nn.Module):
  """Bag-of-embeddings MLP with a configurable stack of hidden layers.

  An EmbeddingBag initialised from `pretrained_weights` (and fine-tuned, since
  freeze=False) pools the tokens of each example into one vector, which is
  passed through len(h_sizes_list) - 1 blocks of
  Linear -> non_linearity -> (optional BatchNorm1d) -> Dropout, followed by a
  final Linear layer that returns raw scores of size `output_dim`.

  Args:
    vocab_size: stored on the module; the embedding table's size is taken
      from `pretrained_weights` itself.
    h_sizes_list: layer widths; h_sizes_list[0] is the input width of the
      first hidden layer and therefore has to equal the embedding dimension.
    dprobs_list: dropout probability for each hidden block (entry k is used
      for block k).
    batchnorm_binary: whether to add BatchNorm1d after each activation.
    output_dim: number of outputs.
    non_linearity: callable applied after every hidden Linear layer.
    pretrained_weights: 2-D float tensor with the initial embedding weights.
    task: stored on the module, not used by the forward pass.
  """
  def __init__(self, vocab_size, h_sizes_list, dprobs_list, batchnorm_binary, output_dim, non_linearity, pretrained_weights, task):

    super().__init__()

    self.h_sizes_list = h_sizes_list

    self.dprobs_list = dprobs_list
    self.batchnorm_binary = batchnorm_binary

    self.non_linearity = non_linearity
    self.output_dim = output_dim
    self.vocab_size = vocab_size
    self.pretrained_weights = pretrained_weights
    self.task = task

    # Initialize hidden layers

    self.hidden = nn.ModuleList()
    self.dropout = nn.ModuleList()
    self.batchnorm = nn.ModuleList()

    # from_pretrained is a classmethod: call it on the class directly rather
    # than on a throwaway, randomly initialised vocab_size x dim instance.
    # This avoids one full-size allocation and does not consume random
    # numbers for weights that were immediately discarded.
    self.embedding = nn.EmbeddingBag.from_pretrained(pretrained_weights,
                                                     freeze = False)

    for k in range(len(h_sizes_list)-1):
      self.hidden.append(nn.Linear(self.h_sizes_list[k], h_sizes_list[k+1]))
      self.dropout.append(nn.Dropout(p=dprobs_list[k]))

      if self.batchnorm_binary:
        # NOTE(review): in PyTorch, momentum is the weight given to the NEW
        # batch statistic (library default 0.1); confirm 0.9 is intended.
        self.batchnorm.append(nn.BatchNorm1d(self.h_sizes_list[k+1], momentum=0.9))

    self.output_layer = nn.Linear(self.h_sizes_list[-1], output_dim)

  def forward(self, input, offsets):
    """Return the raw output scores for a batch.

    `input` and `offsets` are handed unchanged to the EmbeddingBag.
    """
    x = self.embedding(input, offsets)
    for  k in range(len(self.h_sizes_list)-1):
      x =  self.non_linearity(self.hidden[k](x))
      if self.batchnorm_binary:
        x = self.batchnorm[k](x)
      x= self.dropout[k](x)

    x = self.output_layer(x)

    return x

#<font color = 'pickle'> **Metadata**

In [None]:
# Run configuration for Task 6; handed to wandb.init(config=...) below.
# Keys are lower-case here, unlike the upper-case Task 5 configuration.
hyperparameters = dict(
    # layer widths, evaluates to [300, 200]
    # NOTE(review): presumably passed to MLPCustom_ as h_sizes_list, where the
    # first entry must equal the embedding dimension — confirm.
    h_sizes_list = [300] + [200],
    # dropout probabilities, evaluates to [0, 0]
    dprobs_list = [0] + [0],
    batchnorm_binary = False,
    task = 6,
    vocab_size = len(my_vocab),
    output_dim = 10,
    epochs = 40,
    batch_size = 256,
    learning_rate = 0.01,
    dataset="Stack Overflow",
    architecture="MLP",
    log_interval = 25,
    log_batch = True,
    file_model = data_folder/'Task6.pt',
    grad_clipping = True,
    max_norm = 1,
    momentum = 0,
    patience = 5,
    early_stopping = True,
    scheduler_factor = 0.5,
    scheduler_patience = 0,
    weight_decay = 0.0005
   )

# use the first GPU when available, otherwise the CPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
non_linearity = F.relu
# float32 tensor copy of the NumPy embedding matrix built for Task 5
pretrained_weights_tensor = torch.tensor(pretrained_weights).float()

In [None]:
pretrained_weights_tensor.shape

torch.Size([20808, 300])

In [None]:
wandb.init(name = 'Task6', project = 'HW6', config = hyperparameters)

In [None]:
# Fix seed value

SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Sanity check: loss of the current `model` on a single batch.
# NOTE(review): batches are unpacked here as (targets, input, offsets), whereas
# the Task 5 check unpacks (input_, targets, offsets) — confirm which order
# collate_batch actually produces.
model.eval()
# No gradients are needed for a one-off forward pass.
with torch.no_grad():
  # `input_` is used as the loop variable so the builtin input() is not shadowed
  for targets, input_, offsets in train_loader:

    # move inputs and outputs to GPUs
    input_ = input_.to(device)
    targets = targets.to(device)
    offsets = offsets.to(device)
    # Forward pass
    output = model(input_, offsets)
    loss = loss_function(output, targets.float())
    print(f'Actual loss: {loss}')
    break

# For a binary cross-entropy on logits, an uninformed model (probability 0.5
# for every label) scores ln(2) ~= 0.693 per label. That is the reference
# value, not ln(#classes).
print(f'Expected Theoretical loss: {np.log(2)}')

Actual loss: 0.6966218948364258
Expected Theoretical loss: 2.212785092994046


In [None]:
# Fix seed value (a different seed from the one used for the loss sanity check)

SEED = 2345
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Counters reset before training.
# NOTE(review): presumably read as globals by train_loop — confirm.
example_ct_train, batch_ct_train, example_ct_valid, batch_ct_valid = 0, 0, 0, 0
# NOTE(review): the Task 6 hyperparameters dict has no "device" key, so
# wandb.config.device must be set elsewhere for this call to work — confirm.
# NOTE(review): `model`, `loss_function` and `optimizer` are whatever these
# names are currently bound to; no Task 6 model or optimizer is created in the
# cells shown before this one.
train_loss_history, valid_loss_history = train_loop(train_loader, valid_loader, model, loss_function, optimizer, 
                                                                                          wandb.config.epochs, wandb.config.device,
                                                                                          wandb.config.patience, wandb.config.early_stopping,
                                                                                          wandb.config.file_model)


Validation loss has decreased (inf --> 0.227600). Saving Model...
Epoch : 1 / 40
Time to complete 1 is 0:00:13.149641
Train Loss:  0.2850
Valid Loss:  0.2276

Validation loss has decreased (0.719485 --> 0.709835). Saving model...
Epoch : 2 / 40
Time to complete 2 is 0:00:12.784032
Train Loss:  0.2218
Valid Loss:  0.2248

Early stoping counter: 1 out of 5
Epoch : 3 / 40
Time to complete 3 is 0:00:11.980263
Train Loss:  0.2116
Valid Loss:  0.2208

Early stoping counter: 2 out of 5
Epoch : 4 / 40
Time to complete 4 is 0:00:12.192442
Train Loss:  0.2107
Valid Loss:  0.2201

Early stoping counter: 3 out of 5
Epoch : 5 / 40
Time to complete 5 is 0:00:12.296284
Train Loss:  0.2216
Valid Loss:  0.2263

Early stoping counter: 4 out of 5
Epoch : 6 / 40
Time to complete 6 is 0:00:15.359949
Train Loss:  0.2206
Valid Loss:  0.2247

Early stoping counter: 5 out of 5
Early Stopping




In [None]:
# Predictions and labels over the training loader.
# NOTE(review): model_nn is the network that was restored from the Task 5
# checkpoint; confirm that it, and not the Task 6 model, is meant here.
pred_train, y_train = get_pred(train_loader, model_nn, device)

#<font color = 'pickle'> **F1 Score**

In [None]:
from sklearn.metrics import f1_score
# get_pred returns tensors that live on `device`; scikit-learn needs host-side
# arrays, so move them to the CPU first (a no-op when already on the CPU).
print("Weighted F1 score: {:.2f}".format(f1_score(y_true= y_train.cpu().numpy(), y_pred= pred_train.cpu().numpy(), average= 'weighted')))

Weighted F1 score: 0.34
