## Import Packages

In [1]:
from __future__ import division
import os, sys, re, json, time, datetime, shutil
import itertools, collections
from importlib import reload

import random 
import numpy as np
import pandas as pd
import os
import sys
import matplotlib.pyplot as plt
from numpy.linalg import *
np.random.seed(42)  # don't change this line

import base64

# NLTK, NumPy, and Pandas.
import nltk
nltk.download('punkt')
from nltk.tree import Tree
from numpy import random as rd
from nltk.tokenize import word_tokenize
import random

import collections
import re
import time
import itertools
from collections import defaultdict, Counter

import glob
from argparse import ArgumentParser

#Pytorch
import torch
import torch.nn as nn
import torch.nn.functional as F

import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## Load [Datasets](https://huggingface.co/datasets/SetFit/amazon_massive_intent_en-US)

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
df_train = pd.read_json("/content/drive/Shareddrives/CIS-5300_Final-Project/train.jsonl", lines=True)
# df_train.shape

In [9]:
df_validation = pd.read_json("/content/drive/Shareddrives/CIS-5300_Final-Project/validation.jsonl", lines=True)
# df_validation.shape

In [10]:
df_test = pd.read_json("/content/drive/Shareddrives/CIS-5300_Final-Project/test.jsonl", lines=True)
# df_test.shape

In [11]:
# use prefix of (child) label text as parent label
df_train["parent_label_text"] = df_train["label_text"].apply(lambda x: x.split('_')[0])
# factorize to get integer value for each parent label
df_train["parent_label"] = pd.factorize(df_train["parent_label_text"])[0]

In [12]:
# create dictionary indexer for parent class text/label
parent_label_idx = dict(zip(df_train["parent_label_text"],df_train["parent_label"]))

In [13]:
# get parent class text/label for validation data
df_validation["parent_label_text"] = df_validation["label_text"].apply(lambda x: x.split('_')[0])
df_validation["parent_label"] = df_validation["parent_label_text"].apply(lambda x: parent_label_idx[x])

In [14]:
# get parent class text/label for test data
df_test["parent_label_text"] = df_test["label_text"].apply(lambda x: x.split('_')[0])
df_test["parent_label"] = df_test["parent_label_text"].apply(lambda x: parent_label_idx[x])

## Preprocessing

In [41]:
df_all = pd.concat([df_train, df_validation, df_test]).reset_index()

tokenized_data = [word_tokenize(df_all['text'][i]) for i in range(len(df_all['text']))]

vocab = {word for sentence in tokenized_data for word in sentence}
vocab.add('<PAD>')

word_to_idx = { w : i for i, w in enumerate(vocab) }
idx_to_word = { i : w for w, i in word_to_idx.items() }

In [18]:
def pre_process(data, word_to_idx):
  tokenized_data = [word_tokenize(data['text'][i]) for i in range(len(data))]

  lens = np.array([len(sentence) for sentence in tokenized_data])
  
  tokens = [word_to_idx[word] for sentence in tokenized_data for word in sentence]

  # add one extra <PAD> token at the end of each sequence
  padded_tokens = np.full([len(tokenized_data), max(lens) + 1], word_to_idx['<PAD>'])
  for i in range(len(tokenized_data)):
    for j in range(len(tokenized_data[i])):
      padded_tokens[i][j] = word_to_idx[tokenized_data[i][j]]

  labels = np.array(data['label'])
  parent_labels = np.array(data['parent_label'])
    
  return padded_tokens, lens, labels, parent_labels

In [19]:
glove_file = "glove.840B.300d.txt"

# Glove Embeddings

## Download [Glove Embeddings](https://nlp.stanford.edu/projects/glove/) 


In [20]:
#this takes about 10 minutes to run
#!wget -nc https://downloads.cs.stanford.edu/nlp/data/glove.840B.300d.zip
!unzip /content/drive/Shareddrives/CIS-5300_Final-Project/glove.840B.300d.zip
!ls -lat

Archive:  /content/drive/Shareddrives/CIS-5300_Final-Project/glove.840B.300d.zip
  inflating: glove.840B.300d.txt     
total 5513928
drwxr-xr-x 1 root root       4096 Dec 22 20:25 .
drwx------ 6 root root       4096 Dec 22 20:20 drive
drwxr-xr-x 1 root root       4096 Dec 22 20:16 ..
drwxr-xr-x 1 root root       4096 Dec 20 20:19 sample_data
drwxr-xr-x 4 root root       4096 Dec 20 20:18 .config
-rw-rw-r-- 1 root root 5646236541 Oct 24  2015 glove.840B.300d.txt


## Get Glove Embeddings

In [21]:
def get_glove_mapping(vocab, file):
    """
    Gets the mapping of words from the vocabulary to pretrained embeddings
    
    INPUT:
    vocab       - set of vocabulary words
    file        - file with pretrained embeddings

    OUTPUT:
    glove_map   - mapping of words in the vocabulary to the pretrained embedding
    
    """
    
    glove_map = {}
    with open(file,'rb') as fi:
        for l in fi:
            try:
                emd_lst = l.decode().split(' ')
                word = emd_lst.pop(0)
                emd_lst = [float(n) for n in emd_lst]

                if word in vocab:
                  glove_map[word] = np.array(emd_lst)
            except:
                #some lines have urls, we don't need them.
                pass
    return glove_map

In [22]:
glove_map = get_glove_mapping(vocab,glove_file)

## Get Embedding Matrix

In [23]:
def get_dimensions():
    d_out =  60 #number of outputs
    n_embed =  len(vocab) #size of the dictionary of embeddings
    d_embed =  300 # the size of each embedding vector
    return d_out, n_embed, d_embed
d_out,n_embed,d_embed = get_dimensions()

In [24]:
def get_embedding_matrix(n_embed, d_embed, glove_map):
    """
    Initialize the weight matrix
    
    INPUT:
    n_embed         - size of the dictionary of embeddings
    d_embed         - the size of each embedding vector

    OUTPUT:
    embedding_matrix  - matrix of mapping from word id to embedding 
    """
    train_words = vocab
    embedding_matrix = np.full((n_embed, d_embed), np.random.normal())

    for i, word in enumerate(train_words):

        if word in glove_map.keys():

            embedding_matrix[i] = glove_map[word]
    
    return embedding_matrix

In [25]:
embedding_matrix = get_embedding_matrix(n_embed, d_embed, glove_map)
embedding_data = (embedding_matrix.shape, embedding_matrix[:155])

# Define Dataloader 

In [26]:
class SSTpytorchDataset(Dataset):
    def __init__(self, dataset, tokens, word_to_idx, word_dropout = 0.3, split='train'):
        super(SSTpytorchDataset, self).__init__()
        assert split in ['train', 'test', 'dev'], "Error!"
        self.ds = dataset
        self.split = split
        self.word_to_idx = word_to_idx
        #self.word_dropout = word_dropout
        self.data_x, self.data_ns, self.data_y, self.data_y_parent = pre_process(dataset, self.word_to_idx)

    def __len__(self):
        return self.data_x.shape[0]
    
    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        return self.data_x[idx], self.data_ns[idx], self.data_y[idx], self.data_y_parent[idx]
        

# Modeling

## Define Embedding Layer

In [27]:
def create_emb_layer(embedding_matrix, non_trainable=False):
    """
    Create the embedding layer
    
    INPUT:
    embedding_matrix  - matrix of mapping from word id to embedding
    non_trainable   - Flag for whether the weight matrix should be trained. 
                      If it is set to True, don't update the gradients

    OUTPUT:
    emb_layer       - embedding layer 
    
    """

    emb_layer = nn.Embedding.from_pretrained(torch.Tensor(embedding_matrix), padding_idx=word_to_idx['<PAD>'])

    return emb_layer

## Define Train & Evaluate Functions

In [28]:
def train(model, word_to_idx, lr = .005, drop_out = 0, word_dropout = .3, batch_size = 16, weight_decay = 1e-5, model_type= "LSTM"):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    trainset = SSTpytorchDataset(df_train, word_dropout, word_to_idx, 'train')
    testset = SSTpytorchDataset(df_test, word_dropout, word_to_idx, 'test')
    devset = SSTpytorchDataset(df_validation, word_dropout, word_to_idx, 'dev')

    train_iter = DataLoader(trainset, batch_size, shuffle=True, num_workers=0)
    test_iter = DataLoader(testset, batch_size, shuffle=False, num_workers=0)
    dev_iter = DataLoader(devset, batch_size, shuffle=False, num_workers=0)
    
    model = model
    model.to(device)

    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay = weight_decay)
    acc, val_loss, _ = evaluate(dev_iter, model, device, model_type)
    best_acc = acc

    print(
        'epoch |   %        |  loss  |  avg   |val loss|   acc   |  best  | time | save |')
    print(
        'val   |            |        |        | {:.4f} | {:.4f} | {:.4f} |      |      |'.format(
            val_loss, acc, best_acc))

    iterations = 0
    last_val_iter = 0
    train_loss = 0
    start = time.time()
    _save_ckp = ''
    for epoch in range(epochs):
        
        n_correct, n_total, train_loss = 0, 0, 0
        last_val_iter = 0
        
        for batch_idx, batch in enumerate(train_iter):
            # switch model to training mode, clear gradient accumulators
            
            model.train()
            optimizer.zero_grad()
            iterations += 1

            data, lens, child_label, parent_label = batch
            data = data.to(device)

            # get prediction for parent and child class labels
            parent, child = model(data, lens)

            # separate loss computation for parent/child class label
            loss_parent = criterion(parent, parent_label)
            loss_child = criterion(child, child_label)

            # sum loss for parent and child class labels to optimize over both predictions
            (loss_parent+loss_child).backward()
            optimizer.step()

            loss = loss_parent.item() + loss_child.item()

            train_loss += loss
            print('\r {:4d} | {:4d}/{} | {:.4f} | {:.4f} |'.format(
                epoch, batch_size * (batch_idx + 1), len(trainset), loss,
                       train_loss / (iterations - last_val_iter)), end='')

            if iterations > 0 and iterations % dev_every == 0:
                acc, val_loss, _ = evaluate(dev_iter, model, device, model_type)
                if acc > best_acc:
                    best_acc = acc
                    torch.save(model.state_dict(), save_path)
                    _save_ckp = '*'

                print(
                    ' {:.4f} | {:.4f} | {:.4f} | {:.2f} | {:4s} |'.format(
                        val_loss, acc, best_acc, (time.time() - start) / 60,
                        _save_ckp))

                train_loss = 0
                last_val_iter = iterations
    model.load_state_dict(torch.load(save_path)) #this will be the best model
    # test_y_pred = evaluate(dev_iter, model, device, model_type, "test")
    test_y_pred, test_labels, test_acc, test_f1 = evaluate(test_iter, model, device, model_type, "test")
    print("\nValidation Accuracy : ", evaluate(dev_iter,model, device, model_type))
    return best_acc, test_y_pred, test_labels, test_acc, test_f1


In [29]:
from sklearn.metrics import f1_score

In [30]:
def evaluate(loader, model, device, model_type = "LSTM", split = "dev"):
    model.eval()
    n_correct, n = 0, 0
    losses = []
    y_pred = []
    labels = []
    with torch.no_grad():
        for batch_idx, batch in enumerate(loader):
            data, lens, label, _ = batch
            data = data.to(device)
            label = label.to(device).long()
            
            parent, answer = model(data, lens)
            if split != "test":
                n_correct += (torch.max(answer, 1)[1].view(label.size()) == label).sum().item()
                n += answer.shape[0]
                loss = criterion(answer, label)
                losses.append(loss.data.cpu().numpy())
                y_pred.extend(torch.max(answer, 1)[1].view(label.size()).tolist())
                labels.extend(label.tolist())
            else:
                y_pred.extend(torch.max(answer, 1)[1].view(label.size()).tolist())
                labels.extend(label.tolist())
    if split != "test":
        acc = 100. * n_correct / n
        loss = np.mean(losses)
        f1 = f1_score(labels, y_pred, average="weighted")
        return acc, loss, f1
    else:
        n_correct = 0
        n = len(labels)
        for i in range(n):
          if y_pred[i] == labels[i]:
            n_correct += 1
        acc = n_correct / n
        f1 = f1_score(labels, y_pred, average="weighted")
        return y_pred, labels, acc, f1


## Define LSTM Network

In [31]:
#@title
import random as random
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.nn import LSTM, GRU

class LSTM_Classifier(nn.Module):

    def __init__(self,
                 n_embed=20000,
                 d_embed=300,
                 d_hidden=150,
                 d_out=60,
                 embeddings=None,
                 nl = 1,
                 bidirectional = True,
                 gru = False
                 ):
        super(LSTM_Classifier, self).__init__()

        self.d_hidden = d_hidden
        self.bidrectional = bidirectional
        self.num_layers = nl

        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.embed = create_emb_layer(embedding_matrix,False)

        self.lstm = nn.LSTM(d_embed, self.d_hidden, self.num_layers, batch_first=True,
                              bidirectional=bidirectional)
        
        self.fc_out_parent = nn.Linear(self.d_hidden*2, 18)
        self.fc_out = nn.Linear(self.d_hidden*2, d_out)
        self.dropout = nn.Dropout(p=0.2)

    def forward(self, text, seq_lengths):

        # parent class label

        # do not include final pad token
        x_parent = self.embed(text[:,:-1])
        x_parent = pack_padded_sequence(x_parent, seq_lengths, batch_first=True, enforce_sorted=False)    
        x_parent, hidden = self.lstm(x_parent)
        x_parent, _ = pad_packed_sequence(x_parent, batch_first=True)
        x_parent = self.fc_out_parent(x_parent)
        x_parent, _ = torch.max(x_parent, 1)

        # child class label

        # only include final pad token
        x = self.embed(text[:,-1])
        # input final hidden state from parent class label lstm
        x, _ = self.lstm(x.unsqueeze(1), hidden)
        x = self.fc_out(x)
        x, _ = torch.max(x, 1)

        return x_parent, x

# Training Loop

In [32]:
torch.manual_seed(1234)
criterion = nn.CrossEntropyLoss()
batch_size = 128
epochs = 10
dev_every = 100
lr = 0.01
save_path = "best_model"
drop_out = 0
word_dropout = 0
weight_decay = 0

model = LSTM_Classifier(n_embed=n_embed, d_embed=d_embed, d_hidden=150, d_out=d_out, bidirectional=True)
# dev_value, test_y_pred = train(model, word_to_idx, lr, drop_out, word_dropout, batch_size, weight_decay, "LSTM") 
dev_value, test_y_pred, test_labels, test_acc, test_f1 = train(model, word_to_idx, lr, drop_out, word_dropout, batch_size, weight_decay, "LSTM") 

epoch |   %        |  loss  |  avg   |val loss|   acc   |  best  | time | save |
val   |            |        |        | 4.0812 | 1.8692 | 1.8692 |      |      |
    1 | 1280/11514 | 0.6999 | 0.0754 | 0.6453 | 81.9970 | 81.9970 | 0.32 | *    |
    2 | 2560/11514 | 0.6008 | 0.0413 | 0.5620 | 85.2435 | 85.2435 | 0.64 | *    |
    3 | 3840/11514 | 0.3301 | 0.0264 | 0.5670 | 85.8829 | 85.8829 | 0.95 | *    |
    4 | 5120/11514 | 0.1773 | 0.0169 | 0.5910 | 85.7354 | 85.8829 | 1.26 | *    |
    5 | 6400/11514 | 0.0539 | 0.0084 | 0.6301 | 86.5716 | 86.5716 | 1.57 | *    |
    6 | 7680/11514 | 0.0316 | 0.0049 | 0.6493 | 86.8175 | 86.8175 | 1.91 | *    |
    7 | 8960/11514 | 0.0331 | 0.0035 | 0.6752 | 86.9159 | 86.9159 | 2.22 | *    |
    8 | 10240/11514 | 0.0812 | 0.0025 | 0.6951 | 86.9159 | 86.9159 | 2.53 | *    |
    9 | 11520/11514 | 0.1399 | 0.0024 | 0.7256 | 86.9651 | 86.9651 | 2.86 | *    |

Validation Accuracy :  (86.96507624200689, 0.72558653, 0.8681029429320444)


# Output from Test Dataset

In [33]:
test_acc

0.8523873570948218

In [34]:
test_f1

0.8503276995847924

In [35]:
output_df = pd.DataFrame(list(zip(test_y_pred,
                             test_labels)))

output_df = output_df.reset_index()

output_df = output_df.rename(columns={output_df.columns[0]: 'id',
                                      output_df.columns[1]: 'test_pred',
                                      output_df.columns[2]: 'test_label'})

output_df.to_csv('predictions.csv', index=False)

In [36]:
def evaluate(pred, gold):
    """

    Evaluate performance using accuracy metric

    INPUT:
    pred    – list of predicted classes from model
    gold    - list of corresponding "gold" classes/labels from data

    OUTPUT:
    acc     - accuracy percentage (float)

    """

    # counter for correct predictions
    n_correct = 0

    # total labels
    n = len(gold)

    # loop through labels and check correctness
    for i in range(n):
        if pred[i] == gold[i]:
            n_correct += 1

    # compute accuracy
    acc = 100. * n_correct / n

    return acc

In [37]:
predictions_df = pd.read_csv('predictions.csv')

pred = list(predictions_df['test_pred'])
gold = list(predictions_df['test_label'])

evaluate(pred,gold)

85.23873570948219

# Error Analysis

In [38]:
devset = SSTpytorchDataset(df_validation, word_dropout, word_to_idx, 'dev')
dev_iter = DataLoader(devset, df_validation.shape[0], shuffle=False, num_workers=0)

In [39]:
model = LSTM_Classifier(n_embed=n_embed, d_embed=d_embed, d_hidden=150, d_out=d_out, bidirectional=True)
model.load_state_dict(torch.load("/content/best_model"))
error_analysis = pd.DataFrame({"text": [np.nan], "label":[np.nan], "pred": [np.nan]})
with torch.no_grad():
    for batch_idx, batch in enumerate(dev_iter):
        data, lens, label, _ = batch 
        parent, answer = model(data, lens)
        predictions = torch.max(answer, 1)[1].view(label.size()).tolist()
        error_analysis_temp = pd.DataFrame({"text": pd.Series(data.tolist()), "label":label, "pred": predictions})

        error_analysis = pd.concat([error_analysis, error_analysis_temp])
error_analysis = error_analysis[1:]

In [42]:
def convert(lst):
  return " ".join([idx_to_word[i] for i in lst if i != word_to_idx['<PAD>']])
error_analysis["sentence"] = error_analysis["text"].apply(convert)
error_analysis.head()

Unnamed: 0,text,label,pred,sentence
0,"[95, 5370, 2097, 3512, 3525, 302, 302, 302, 30...",40.0,40.0,turn the lights off please
1,"[628, 5370, 2097, 2121, 5370, 192, 302, 302, 3...",31.0,31.0,dim the lights in the hall
2,"[3163, 4227, 1684, 3801, 302, 302, 302, 302, 3...",31.0,18.0,make a room darker
3,"[4530, 5370, 6019, 302, 302, 302, 302, 302, 30...",34.0,34.0,clean the flat
4,"[5632, 5907, 3955, 4334, 5907, 444, 5175, 650,...",34.0,34.0,cleaning is good dust is so bad do now your ma...


In [43]:
child_label_mapper = df_validation[['label', 'label_text']].drop_duplicates().sort_values(by = 'label').reset_index(drop = True)
child_label_mapper

Unnamed: 0,label,label_text
0,0,datetime_query
1,1,iot_hue_lightchange
2,2,transport_ticket
3,3,takeaway_query
4,4,qa_stock
5,5,general_greet
6,6,recommendation_events
7,7,music_dislikeness
8,8,iot_wemo_off
9,9,cooking_recipe


In [45]:
error_analysis_complete = error_analysis.merge(child_label_mapper, on = "label")
error_analysis_complete[error_analysis_complete["label"] != error_analysis_complete["pred"]]["label_text"].value_counts()

general_quirky              57
qa_factoid                  24
calendar_set                15
calendar_query              15
play_music                  10
news_query                   9
recommendation_events        8
social_post                  8
music_query                  7
qa_definition                6
lists_query                  5
email_query                  5
takeaway_order               5
email_sendemail              5
play_audiobook               5
lists_remove                 5
transport_query              5
datetime_query               5
social_query                 5
play_radio                   4
recommendation_locations     4
qa_maths                     4
play_podcasts                4
play_game                    3
qa_stock                     3
email_querycontact           3
weather_query                3
alarm_set                    3
alarm_query                  3
iot_hue_lightdim             2
audio_volume_mute            2
takeaway_query               2
music_li

In [47]:
error_analysis_complete[error_analysis_complete.label_text == "general_quirky"]

Unnamed: 0,text,label,pred,sentence,label_text
53,"[1101, 1334, 5370, 4737, 3703, 5802, 6059, 399...",12.0,12.0,give me the status on my available memory,general_quirky
54,"[1863, 3779, 723, 962, 1945, 650, 302, 302, 30...",12.0,12.0,what things ca n't you do,general_quirky
55,"[5488, 5552, 302, 302, 302, 302, 302, 302, 302...",12.0,46.0,night time,general_quirky
56,"[1213, 2900, 1561, 1775, 5370, 1352, 302, 302,...",12.0,12.0,thanks to ally for the photos,general_quirky
57,"[4006, 1418, 4227, 5533, 611, 302, 302, 302, 3...",12.0,26.0,meaning of a particular word,general_quirky
...,...,...,...,...,...
153,"[4207, 923, 800, 3469, 2121, 2562, 4227, 4941,...",12.0,12.0,how people are thinking in such a manner that ...,general_quirky
154,"[2696, 1334, 4207, 5370, 2201, 2935, 302, 302,...",12.0,12.0,tell me how the world began,general_quirky
155,"[5266, 5564, 4227, 1143, 302, 302, 302, 302, 3...",12.0,26.0,i need a manger,general_quirky
156,"[1830, 5370, 5371, 302, 302, 302, 302, 302, 30...",12.0,22.0,address the situation,general_quirky


In [46]:
error_analysis_complete[error_analysis_complete.label_text == "qa_factoid"]

Unnamed: 0,text,label,pred,sentence,label_text
1060,"[5192, 5907, 1796, 302, 302, 302, 302, 302, 30...",49.0,49.0,when is sunset,qa_factoid
1061,"[4516, 5907, 5370, 4085, 2500, 1720, 302, 302,...",49.0,49.0,where is the convention center located,qa_factoid
1062,"[5425, 5907, 3275, 1418, 76, 302, 302, 302, 30...",49.0,49.0,who is president of poland,qa_factoid
1063,"[4207, 5424, 5907, 4227, 4591, 302, 302, 302, ...",49.0,26.0,how tall is a giraffe,qa_factoid
1064,"[5049, 5370, 5731, 1418, 232, 2428, 302, 302, ...",49.0,12.0,calculate the resistance of this resistor,qa_factoid
...,...,...,...,...,...
1145,"[4207, 1656, 5907, 4966, 302, 302, 302, 302, 3...",49.0,49.0,how big is japan,qa_factoid
1146,"[4207, 34, 394, 4952, 562, 5192, 4302, 5097, 1...",49.0,49.0,how old was albert einstein when he came up wi...,qa_factoid
1147,"[5049, 4227, 3651, 760, 1418, 4069, 2322, 6118...",49.0,39.0,calculate a close integral of exponential func...,qa_factoid
1148,"[2696, 1334, 4207, 5669, 5907, 4995, 2900, 955...",49.0,49.0,tell me how brexit is going to affect e. u. ci...,qa_factoid
