### Import Libraries

In [314]:
import gensim.downloader
import pandas as pd
import numpy as np
import torch
import nltk
import string
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn

### Download pretrained word embeddings

In [177]:
# Load the pretrained 'word2vec-google-news-300' vectors through gensim's downloader API.
# NOTE(review): presumably downloads the model on first use and then reads a local cache —
# confirm; expect this cell to be slow and memory-hungry.
word2vec = gensim.downloader.load('word2vec-google-news-300')

In [298]:
# Vocabulary of the pretrained model, in the model's own index order
vocab = list(word2vec.index_to_key)
# One entry per known word
vocab_size = len(vocab)
# Width of each embedding vector (the loaded model is the 300-dimensional Google News one)
embedding_dim = 300

### Import Dataset

In [276]:
# Read both splits and discard the fine-grained label column straight away;
# only the coarse label is used from here on.
train_df = pd.read_csv('dataset part 2/train.csv').drop(columns=['label-fine'])
test_df = pd.read_csv("dataset part 2/test.csv").drop(columns=['label-fine'])

In [277]:
# Preview the training frame (coarse label + raw question text) after 'label-fine' was dropped
train_df

Unnamed: 0,label-coarse,text
0,0,How did serfdom develop in and then leave Russ...
1,1,What films featured the character Popeye Doyle ?
2,0,How can I find a list of celebrities ' real na...
3,1,What fowl grabs the spotlight after the Chines...
4,2,What is the full form of .com ?
...,...,...
5447,1,What 's the shape of a camel 's spine ?
5448,1,What type of currency is used in China ?
5449,4,What is the temperature today ?
5450,4,What is the temperature for cooking ?


Form a development set by randomly sampling 500 examples from the original training data, then remove those examples from the training set.

In [278]:
# Hold out 500 training rows as the development set; the fixed seed keeps the split reproducible
development_df = train_df.sample(n=500, random_state=0)
# Whatever was sampled must no longer appear in the training split
train_df = train_df.drop(index=development_df.index)

In [279]:
# Preview the held-out development rows (they keep their original row labels from train_df)
development_df

Unnamed: 0,label-coarse,text
2755,4,How many trees go into paper making in a year ?
3326,1,What concerts are held in New York this week ?
2204,5,Where did the sport of caber-tossing originate ?
2888,3,What kind of people took part in Shays ' Rebel...
2812,0,How is cologne made ?
...,...,...
5335,5,Where was George Washington born ?
1275,4,In what year did the Bounty mutiny happen ?
4508,0,What is the difference between khaki and chino ?
402,4,How many types of dogs ' tails are there - three


Randomly select 4 of the 6 coarse labels and combine the remaining 2 into a single class, 'OTHERS'. Update the label of every sentence in the data accordingly.

In [280]:
# The two coarse labels that are merged into the single OTHERS class
combined_1 = 4
combined_2 = 5

# Relabel every row carrying label `combined_2` as `combined_1`, so that
# `combined_1` becomes the OTHERS label in all three splits.
for split_df in (train_df, test_df, development_df):
    split_df.loc[split_df['label-coarse'] == combined_2, 'label-coarse'] = combined_1

### Tokenize and remove punctuation

In [281]:
from nltk.tokenize import word_tokenize

def tokenize_and_remove_punctuation(text):
    """Strip punctuation from ``text`` and split what is left into word tokens.

    Every character listed in ``string.punctuation`` is deleted outright (not
    replaced by a space) before the text is handed to ``word_tokenize``.

    Args:
        text: The raw sentence as a string.

    Returns:
        The list of tokens produced by ``word_tokenize`` on the cleaned text.
    """
    # Map each punctuation character to None so that str.translate drops it
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    return word_tokenize(text.translate(deletion_table))

In [282]:
# Replace each raw sentence with its punctuation-free token list, split by split
train_df['text'] = train_df['text'].apply(tokenize_and_remove_punctuation)
test_df['text'] = test_df['text'].apply(tokenize_and_remove_punctuation)
development_df['text'] = development_df['text'].apply(tokenize_and_remove_punctuation)

### Padding of Data for LSTM

In [283]:
def pad_tokens(tokens, max_length, padding_value='<pad>', truncate=True):
    """Return a copy of ``tokens`` that is exactly ``max_length`` items long.

    Shorter sequences are right-padded with ``padding_value``. Longer sequences
    are cut to their first ``max_length`` items, so that every sequence can be
    stacked into one fixed-width batch even when ``max_length`` was derived from
    a different split than the one being padded.

    The input list is never modified; a new list is always returned.

    Args:
        tokens: Sequence of tokens to pad.
        max_length: Target length of the returned list.
        padding_value: Filler token appended to short sequences.
        truncate: When True (default), sequences longer than ``max_length`` are
            cut down to ``max_length``. Pass False to return them at full length.

    Returns:
        A new list holding the padded (and, if requested, truncated) tokens.
    """
    # Copy first so the caller's list (e.g. a DataFrame cell) is left untouched
    padded = list(tokens)
    if truncate:
        padded = padded[:max_length]
    # A non-positive repeat count yields an empty list, so no guard is needed
    padded.extend([padding_value] * (max_length - len(padded)))
    return padded

In [284]:
# Longest tokenised sentence in the training split; used as the padding target
max_length = max(len(tokens) for tokens in train_df['text'])

In [285]:
# Pad the token lists of every split towards the length of the longest training sentence
train_df['text'] = train_df['text'].apply(lambda tokens: pad_tokens(tokens, max_length=max_length))
test_df['text'] = test_df['text'].apply(lambda tokens: pad_tokens(tokens, max_length=max_length))
development_df['text'] = development_df['text'].apply(lambda tokens: pad_tokens(tokens, max_length=max_length))

### Using Torch

In [309]:
# Pretrained embedding weights; LSTMTextClassifier reads this module-level name
# when it builds its embedding layer.
# NOTE(review): presumably a NumPy array (gensim `KeyedVectors.vectors`), whereas
# nn.Embedding.from_pretrained expects a torch tensor — confirm and convert if needed.
embedding_matrix = word2vec.vectors

In [315]:
class LSTMTextClassifier(nn.Module):
    """LSTM sentence classifier on top of frozen pretrained word embeddings.

    Pipeline: token indices -> frozen embedding lookup -> (bi)LSTM -> final
    hidden state of the last LSTM layer -> linear layer -> softmax.

    ``forward`` returns class probabilities, not logits.
    NOTE(review): if the training loop uses ``nn.CrossEntropyLoss`` (which
    applies log-softmax itself), the softmax here would be applied twice —
    confirm against the training code.

    Args:
        vocab_size: Number of words in the vocabulary. Not used directly (the
            size of the table comes from the pretrained weights); kept so that
            existing call sites keep working.
        embedding_dim: Width of one embedding vector; used as the LSTM input size.
        hidden_dim: Hidden size of the LSTM, per direction.
        output_dim: Number of output classes.
        num_layers: Number of stacked LSTM layers.
        bidirectional: Whether the LSTM runs in both directions.
        pretrained_embeddings: Optional 2-D array or tensor of shape
            (num_embeddings, embedding_dim). Defaults to the module-level
            ``embedding_matrix``.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, num_layers, bidirectional,
                 pretrained_embeddings=None):
        super().__init__()

        # Fall back to the notebook-level matrix so existing callers are unaffected
        if pretrained_embeddings is None:
            pretrained_embeddings = embedding_matrix

        # from_pretrained requires a torch tensor; as_tensor accepts NumPy arrays
        # and tensors alike and casts to the float32 dtype the LSTM weights use.
        weights = torch.as_tensor(pretrained_embeddings, dtype=torch.float32)

        # Embedding layer with pretrained word vectors, kept frozen during training
        self.embedding = nn.Embedding.from_pretrained(weights, freeze=True)

        # LSTM layer (inputs are batch-first: the batch dimension comes first)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers,
                            bidirectional=bidirectional, batch_first=True)

        # Softmax layer; a bidirectional LSTM contributes one hidden state per direction
        classifier_in = hidden_dim * 2 if bidirectional else hidden_dim
        self.softmax_layer = nn.Sequential(nn.Linear(classifier_in, output_dim),
                                           nn.Softmax(dim=1))

    def forward(self, text):
        """Classify a batch of index-encoded sentences.

        Args:
            text: Integer tensor of token indices with the batch dimension first.

        Returns:
            Tensor with one row per sentence and ``output_dim`` columns; each row
            is a softmax distribution over the classes.
        """
        # Embed the input
        embedded = self.embedding(text)
        # Pass through the LSTM layer; only the final hidden states are needed
        _, (hidden, _) = self.lstm(embedded)
        # Use the final hidden state of the last layer as the sentence representation
        if self.lstm.bidirectional:
            # The last two entries are the last layer's two directions
            hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            hidden = hidden[-1, :, :]

        # Pass through the fully connected layer followed by softmax
        return self.softmax_layer(hidden)

### NEXT STEP: ENCODE TEXT IN DATAFRAME TO INDICES