## Import Packages

In [1]:
from __future__ import division
import os, sys, re, json, time, datetime, shutil
import itertools, collections
from importlib import reload

import random 
import numpy as np
import pandas as pd
import os
import sys
import matplotlib.pyplot as plt
from numpy.linalg import *
np.random.seed(42)  # don't change this line
from sklearn.metrics import f1_score

import base64

# NLTK, NumPy, and Pandas.
import nltk
nltk.download('punkt')
from nltk.tree import Tree
from numpy import random as rd
from nltk.tokenize import word_tokenize
import random

import collections
import re
import time
import itertools
from collections import defaultdict, Counter

import glob
from argparse import ArgumentParser

#Pytorch
import torch
import torch.nn as nn
import torch.nn.functional as F

import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## Load [Datasets](https://huggingface.co/datasets/SetFit/amazon_massive_intent_en-US)

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
df_train = pd.read_json("/content/drive/Shareddrives/CIS-5300_Final-Project/train.jsonl", lines=True)
df_train.shape

(11514, 4)

In [4]:
df_validation = pd.read_json("/content/drive/Shareddrives/CIS-5300_Final-Project/validation.jsonl", lines=True)
df_validation.shape

(2033, 4)

In [5]:
df_test = pd.read_json("/content/drive/Shareddrives/CIS-5300_Final-Project/test.jsonl", lines=True)
df_test.shape

(2974, 4)

In [6]:
# use prefix of (child) label text as parent label
df_train["parent_label_text"] = df_train["label_text"].apply(lambda x: x.split('_')[0])
# factorize to get integer value for each parent label
df_train["parent_label"] = pd.factorize(df_train["parent_label_text"])[0]

a = df_train.groupby(['parent_label'])['label'].rank(method='dense')
df_train['mapper_idx'] = (a-1).astype(int)

In [7]:
# create dictionary indexer for parent class text/label
parent_label_idx = dict(zip(df_train["parent_label_text"],df_train["parent_label"]))

In [8]:
# get parent class text/label for validation data
df_validation["parent_label_text"] = df_validation["label_text"].apply(lambda x: x.split('_')[0])
df_validation["parent_label"] = df_validation["parent_label_text"].apply(lambda x: parent_label_idx[x])

a = df_validation.groupby(['parent_label'])['label'].rank(method='dense')
df_validation['mapper_idx'] = (a-1).astype(int)

In [9]:
# get parent class text/label for validation data
df_test["parent_label_text"] = df_test["label_text"].apply(lambda x: x.split('_')[0])
df_test["parent_label"] = df_test["parent_label_text"].apply(lambda x: parent_label_idx[x])

a = df_test.groupby(['parent_label'])['label'].rank(method='dense')
df_test['mapper_idx'] = (a-1).astype(int)

## Download [Glove Embeddings](https://nlp.stanford.edu/projects/glove/) 


In [10]:
#this takes about 10 minutes to run
#!wget -nc https://downloads.cs.stanford.edu/nlp/data/glove.840B.300d.zip
!unzip /content/drive/Shareddrives/CIS-5300_Final-Project/glove.840B.300d.zip
!ls -lat

Archive:  /content/drive/Shareddrives/CIS-5300_Final-Project/glove.840B.300d.zip
  inflating: glove.840B.300d.txt     
total 5513928
drwxr-xr-x 1 root root       4096 Dec 22 16:04 .
drwx------ 6 root root       4096 Dec 22 16:04 drive
drwxr-xr-x 1 root root       4096 Dec 22 16:01 ..
drwxr-xr-x 1 root root       4096 Dec 20 20:19 sample_data
drwxr-xr-x 4 root root       4096 Dec 20 20:18 .config
-rw-rw-r-- 1 root root 5646236541 Oct 24  2015 glove.840B.300d.txt


In [11]:
glove_file = "glove.840B.300d.txt"

## Preprocess

In [12]:
df_all = pd.concat([df_train, df_validation, df_test]).reset_index()

tokenized_data = [word_tokenize(df_all['text'][i]) for i in range(len(df_all['text']))]

vocab = {word for sentence in tokenized_data for word in sentence}
vocab.add('<PAD>')

word_to_idx = { w : i for i, w in enumerate(vocab) }

In [13]:
def pre_process(data, word_to_idx):
  tokenized_data = [word_tokenize(data['text'][i]) for i in range(len(data['text']))]

  lens = np.array([len(sentence) for sentence in tokenized_data])
  
  tokens = [word_to_idx[word] for sentence in tokenized_data for word in sentence]

  padded_tokens = np.full([len(tokenized_data), max(lens)], word_to_idx['<PAD>'])
  for i in range(len(tokenized_data)):
    for j in range(len(tokenized_data[i])):
      padded_tokens[i][j] = word_to_idx[tokenized_data[i][j]]

  labels = np.array(data['mapper_idx'])
  parent_labels = np.array(data['parent_label'])
    
  return padded_tokens, lens, labels, parent_labels

In [14]:
padded_tokens, lens, labels, parent_labels = pre_process(df_train, word_to_idx)

## Get Glove Embeddings

In [15]:
#takes about 1 minute to read through the whole file and find the words we need. 
def get_glove_mapping(vocab, file):
    """
    Gets the mapping of words from the vocabulary to pretrained embeddings
    
    INPUT:
    vocab       - set of vocabulary words
    file        - file with pretrained embeddings

    OUTPUT:
    glove_map   - mapping of words in the vocabulary to the pretrained embedding
    
    """
    
    glove_map = {}
    with open(file,'rb') as fi:
        for l in fi:
            try:
                #### STUDENT CODE HERE ####
                emd_lst = l.decode().split(' ')
                word = emd_lst.pop(0)
                emd_lst = [float(n) for n in emd_lst]

                if word in vocab:
                  glove_map[word] = np.array(emd_lst)

                #### STUDENT CODE ENDS HERE ####
            except:
                #some lines have urls, we don't need them.
                pass
    return glove_map

In [16]:
glove_map = get_glove_mapping(vocab,glove_file)

## Get Embedding Matrix

In [17]:
def get_dimensions():
    d_out =  60 #number of outputs
    n_embed =  len(vocab) #size of the dictionary of embeddings
    d_embed =  300 # the size of each embedding vector
    return d_out, n_embed, d_embed
d_out,n_embed,d_embed = get_dimensions()

In [18]:
def get_embedding_matrix(n_embed, d_embed, glove_map):
    """
    Initialize the weight matrix
    
    INPUT:
    n_embed         - size of the dictionary of embeddings
    d_embed         - the size of each embedding vector

    OUTPUT:
    embedding_matrix  - matrix of mapping from word id to embedding 
    """
    #### STUDENT CODE HERE ####
    train_words = vocab
    embedding_matrix = np.full((n_embed, d_embed), np.random.normal())

    for i, word in enumerate(train_words):

        if word in glove_map.keys():

            embedding_matrix[i] = glove_map[word]
    
    #### STUDENT CODE ENDS HERE ####
    return embedding_matrix

In [19]:
embedding_matrix = get_embedding_matrix(n_embed, d_embed, glove_map)
embedding_data = (embedding_matrix.shape, embedding_matrix[:155])

## Define Embedding Layer

In [20]:
def create_emb_layer(embedding_matrix, non_trainable=False):
    """
    Create the embedding layer
    
    INPUT:
    embedding_matrix  - matrix of mapping from word id to embedding
    non_trainable   - Flag for whether the weight matrix should be trained. 
                      If it is set to True, don't update the gradients

    OUTPUT:
    emb_layer       - embedding layer 
    
    """
    #### STUDENT CODE HERE ####

    emb_layer = nn.Embedding.from_pretrained(torch.Tensor(embedding_matrix), padding_idx=140)

    #### STUDENT CODE ENDS HERE ####

    return emb_layer

## Define Dataloader 

In [21]:
class SSTpytorchDataset(Dataset):
    def __init__(self, dataset, tokens, word_to_idx, word_dropout = 0.3, split='train'):
        super(SSTpytorchDataset, self).__init__()
        assert split in ['train', 'test', 'dev'], "Error!"
        self.ds = dataset
        self.split = split
        self.word_to_idx = word_to_idx
        #self.word_dropout = word_dropout
        self.data_x, self.data_ns, self.data_y, self.data_y_parent = pre_process(dataset, self.word_to_idx)

    def __len__(self):
        return self.data_x.shape[0]
    
    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        return self.data_x[idx], self.data_ns[idx], self.data_y[idx], self.data_y_parent[idx]
        

## Define Train & Evaluate Functions

In [22]:
def train(model, word_to_idx, lr = .005, drop_out = 0, word_dropout = .3, batch_size = 16,
          weight_decay = 1e-5, model_type= "LSTM", root_model = False, model_label = 0):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    #Take subset of data accroding to parent class label
    if root_model == False:
        df_train_label = df_train[df_train['parent_label'] == model_label].reset_index(drop=True)
        df_validation_label = df_validation[df_validation['parent_label'] == model_label].reset_index(drop=True)
        #df_test_label = df_test[df_test['parent_label'] == model_label]
    else:
        df_train_label = df_train
        df_validation_label = df_validation
        #df_test_label = df_test

    trainset = SSTpytorchDataset(df_train_label, word_dropout, word_to_idx, 'train')
    #testset = SSTpytorchDataset(df_test_label, word_dropout, word_to_idx, 'test')
    devset = SSTpytorchDataset(df_validation_label, word_dropout, word_to_idx, 'dev')

    train_iter = DataLoader(trainset, batch_size, shuffle=True, num_workers=0)
    #test_iter = DataLoader(testset, batch_size, shuffle=False, num_workers=0)
    dev_iter = DataLoader(devset, batch_size, shuffle=False, num_workers=0)
    
    model = model
    model.to(device)

    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay = weight_decay)
    
    acc, val_loss = evaluate(dev_iter, model, device, model_type, root_model=root_model)
    best_acc = acc

    print(
        'epoch |   %        |  loss  |  avg   |val loss|   acc   |  best  | time | save |')
    print(
        'val   |            |        |        | {:.4f} | {:.4f} | {:.4f} |      |      |'.format(
            val_loss, acc, best_acc))

    iterations = 0
    last_val_iter = 0
    train_loss = 0
    start = time.time()
    _save_ckp = ''
    for epoch in range(epochs):
        
        n_correct, n_total, train_loss = 0, 0, 0
        last_val_iter = 0
        
        for batch_idx, batch in enumerate(train_iter):
            # switch model to training mode, clear gradient accumulators
            
            model.train()
            optimizer.zero_grad()
            iterations += 1

            data, lens, child_label, parent_label = batch
            data = data.to(device)
            #label = label.to(device).long()

            answer = model(data, lens)

            #Check if root model
            if root_model:
              loss = criterion(answer, parent_label)
            else:
              loss = criterion(answer, child_label)

            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            print('\r {:4d} | {:4d}/{} | {:.4f} | {:.4f} |'.format(
                epoch, batch_size * (batch_idx + 1), len(trainset), loss.item(),
                       train_loss / (iterations - last_val_iter)), end='')

            if iterations > 0 and iterations % dev_every == 0:
                acc, val_loss= evaluate(dev_iter, model, device, model_type, root_model=root_model)
                if acc > best_acc:
                    best_acc = acc
                    torch.save(model.state_dict(), save_path)
                    _save_ckp = '*'

                print(
                    ' {:.4f} | {:.4f} | {:.4f} | {:.2f} | {:4s} |'.format(
                        val_loss, acc, best_acc, (time.time() - start) / 60,
                        _save_ckp))

                train_loss = 0
                last_val_iter = iterations
    #model.load_state_dict(torch.load(save_path)) #this will be the best model
    test_y_pred = evaluate(dev_iter, model, device, model_type, "test", root_model)
    print("\nValidation Accuracy : ", evaluate(dev_iter,model, device, model_type, root_model=root_model))
    return best_acc, test_y_pred


In [23]:
def evaluate(loader, model, device, model_type = "LSTM", split = "dev", root_model = False):
    model.eval()
    n_correct, n = 0, 0
    losses = []
    y_pred = []
    labels = []
    with torch.no_grad():
        for batch_idx, batch in enumerate(loader):
            data, lens, label, parent_label = batch
            data = data.to(device)
            
            answer = model(data, lens)
            if split != "test":
                #Check if root model
                if root_model:
                  n_correct += (torch.max(answer, 1)[1].view(parent_label.size()) == parent_label).sum().item()
                  n += answer.shape[0]
                  loss = criterion(answer, parent_label)
                else:
                  n_correct += (torch.max(answer, 1)[1].view(label.size()) == label).sum().item()
                  n += answer.shape[0]
                  loss = criterion(answer, label)

                losses.append(loss.data.cpu().numpy())
            else:
                if root_model:
                  y_pred.extend(torch.max(answer, 1)[1].view(parent_label.size()).tolist())
                else:
                  y_pred.extend(torch.max(answer, 1)[1].view(label.size()).tolist())
                labels.extend(label.tolist())
    if split != "test":
        acc = 100. * n_correct / n
        loss = np.mean(losses)
        return acc, loss
    else:
        return y_pred


## Define LSTM Network

In [24]:
import random as random
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.nn import LSTM, GRU

class LSTM_Classifier(nn.Module):

    def __init__(self,
                 n_embed=20000,
                 d_embed=300,
                 d_hidden=150,
                 d_out=60,
                 embeddings=None,
                 nl = 2,
                 bidirectional = True,
                 gru = False
                 ):
        super(LSTM_Classifier, self).__init__()

        self.d_hidden = d_hidden
        self.bidrectional = bidirectional
        self.num_layers = nl

        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.embed = create_emb_layer(embedding_matrix,False)
        #### STUDENT CODE STARTS HERE ####
        self.lstm = nn.LSTM(d_embed, self.d_hidden, self.num_layers, batch_first=True,
                              bidirectional=bidirectional)
        
        if bidirectional:
          self.fc_out = nn.Linear(self.d_hidden*2, d_out)
        else:
          self.fc_out = nn.Linear(self.d_hidden, d_out)
        self.dropout = nn.Dropout(p=0.2)
        #### STUDENT CODE ENDS HERE ####

    def forward(self, text, seq_lengths):

        # batch_size = text.size()[0]

        #### STUDENT CODE STARTS HERE ####
        #Fill the missing pieces for the forward pass
        x = text
        #print(x)
        x = self.embed(x)
        x = pack_padded_sequence(x, seq_lengths, batch_first=True, enforce_sorted=False)
        
        x, _ = self.lstm(x)
        x, _ = pad_packed_sequence(x, batch_first=True)

        #x = self.dropout(x)
        x = self.fc_out(x)
        x, _ = torch.max(x, 1)
        #### STUDENT CODE ENDS HERE ####

        return x

## Training Loop

In [25]:
#Train the root model
torch.manual_seed(1234)
criterion = nn.CrossEntropyLoss()
batch_size = 128
epochs = 10
dev_every = 100
lr = 0.01
save_path = "best_model"
drop_out = 0
word_dropout = 0
weight_decay = 0

d_out = len(parent_label_idx)

print('Training Root Model')
print('-------------------')
root_model = LSTM_Classifier(n_embed=n_embed, d_embed=d_embed, d_hidden=150, d_out=d_out, bidirectional=True)
dev_value, test_y_pred = train(root_model, word_to_idx, lr, drop_out, word_dropout, 
                               batch_size, weight_decay, "LSTM", root_model=True)
print('-------------------')
print('\n')

#List of child models
child_models = []
batch_size = 64
epochs = 20

for label_text, parent_label in parent_label_idx.items():
    
    print('Training Child Model for Class:', label_text, parent_label)
    print('-------------------')

    d_out = len(df_train[df_train['parent_label'] == parent_label]['label_text'].value_counts())

    child_model = LSTM_Classifier(n_embed=n_embed, d_embed=d_embed, d_hidden=150, d_out=d_out, bidirectional=True)
    dev_value, test_y_pred = train(child_model, word_to_idx, lr, drop_out, word_dropout, 
                                    batch_size, weight_decay, "LSTM", root_model=False, model_label=parent_label)
    print('-------------------')
    print('\n')
    
    child_models.append(child_model)

Training Root Model
-------------------
epoch |   %        |  loss  |  avg   |val loss|   acc   |  best  | time | save |
val   |            |        |        | 2.8885 | 3.9843 | 3.9843 |      |      |
    1 | 1280/11514 | 0.3070 | 0.0319 | 0.4087 | 88.1456 | 88.1456 | 0.52 | *    |
    2 | 2560/11514 | 0.2167 | 0.0205 | 0.4059 | 88.5391 | 88.5391 | 1.11 | *    |
    3 | 3840/11514 | 0.1203 | 0.0144 | 0.4165 | 89.5229 | 89.5229 | 1.66 | *    |
    4 | 5120/11514 | 0.1909 | 0.0112 | 0.4452 | 89.2277 | 89.5229 | 2.20 | *    |
    5 | 6400/11514 | 0.1121 | 0.0065 | 0.4550 | 90.0148 | 90.0148 | 2.77 | *    |
    6 | 7680/11514 | 0.0312 | 0.0055 | 0.5041 | 89.8180 | 90.0148 | 3.30 | *    |
    7 | 8960/11514 | 0.0178 | 0.0038 | 0.5392 | 90.1131 | 90.1131 | 3.84 | *    |
    8 | 10240/11514 | 0.0693 | 0.0032 | 0.5033 | 90.4575 | 90.4575 | 4.38 | *    |
    9 | 11520/11514 | 0.0513 | 0.0029 | 0.5322 | 89.2277 | 90.4575 | 4.92 | *    |

Validation Accuracy :  (89.22774225282834, 0.5321578)
----

In [26]:
# from IPython.display import clear_output

# #evaluate
# padded_tokens, lens, labels, parent_labels = pre_process(df_validation, word_to_idx)

# parent_preds = []
# child_preds = []
# n_correct = 0
# n = 0

# for sequence, len, child_label, parent_label in zip(padded_tokens, lens, labels, parent_labels):

#     clear_output()
#     print(str(n+1) + '/' + str(lens.shape[0]) + ' testing examples done')
    
#     pred = root_model(torch.Tensor([sequence]).long(), [len])
#     parent_idx = torch.argmax(pred).item()
#     parent_preds.append(parent_idx)
    
#     child_model = child_models[parent_idx]

#     child_pred = child_model(torch.Tensor([sequence]).long(), [len])
#     child_idx = torch.argmax(child_pred).item()
#     child_preds.append(child_idx)

#     if parent_idx == parent_label and child_idx == child_label:

#         n_correct += 1
    
#     n += 1

# print('Accuracy:', 100*n_correct/n)   

In [27]:
from IPython.display import clear_output

#evaluate
padded_tokens, lens, labels, parent_labels = pre_process(df_test, word_to_idx)

parent_preds = []
child_preds = []
n_correct = 0
n = 0

for sequence, len, child_label, parent_label in zip(padded_tokens, lens, labels, parent_labels):

    clear_output()
    print(str(n+1) + '/' + str(lens.shape[0]) + ' testing examples done')
    
    pred = root_model(torch.Tensor([sequence]).long(), [len])
    parent_idx = torch.argmax(pred).item()
    parent_preds.append(parent_idx)
    
    child_model = child_models[parent_idx]

    child_pred = child_model(torch.Tensor([sequence]).long(), [len])
    child_idx = torch.argmax(child_pred).item()
    child_preds.append(child_idx)

    if parent_idx == parent_label and child_idx == child_label:

        n_correct += 1
    
    n += 1

print('Test Accuracy:', 100*n_correct/n)  

2974/2974 testing examples done
Test Accuracy: 84.39811701412239


In [28]:
f1_score(labels, child_preds, average="weighted")

0.8672464437647619

In [29]:
predictions_dict = {'Gold Label': labels, 'Predicted Label': child_preds}
predictions = pd.DataFrame(predictions_dict)
predictions

Unnamed: 0,Gold Label,Predicted Label
0,1,1
1,3,3
2,0,2
3,7,0
4,6,6
...,...,...
2969,2,3
2970,3,3
2971,3,3
2972,3,3


In [31]:
df_test[['label', 'parent_label', 'mapper_idx']]

Unnamed: 0,label,parent_label,mapper_idx
0,48,0,1
1,46,1,3
2,1,2,0
3,41,2,7
4,40,2,6
...,...,...,...
2969,33,15,2
2970,44,15,3
2971,44,15,3
2972,44,15,3


In [30]:
predictions.to_csv('extension2_predictions.csv')