In [85]:
import tqdm
import time
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
from torch import nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torchtext
from torchtext.data import get_tokenizer
from torch.nn.utils.rnn import pad_sequence

import gensim
import gensim.downloader
from gensim.models import Word2Vec

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [86]:
#Process data
# One seed drives both the dev split and the label sampling so that
# Restart & Run All reproduces the same classes and the same split.
SEED = 1  # the original passed random_state=True, which sklearn treats as the integer 1
NUM_KEPT_LABELS = 4

train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
# Hold out 500 rows of the training file as the development set.
train_data, development_data = train_test_split(train_data, test_size=500, random_state=SEED)

# DataFrame.drop returns a new frame; the result has to be assigned for the
# fine-grained label column to actually be removed.
train_data = train_data.drop('label-fine', axis=1)
test_data = test_data.drop('label-fine', axis=1)
development_data = development_data.drop('label-fine', axis=1)

labels = [0, 1, 2, 3, 4, 5]
# Keep NUM_KEPT_LABELS coarse labels (sorted); a private Random instance makes the
# choice repeatable without touching the global `random` state.
selected_labels = sorted(random.Random(SEED).sample(labels, NUM_KEPT_LABELS))
# Every label that was not selected collapses into one extra catch-all class,
# numbered right after the kept ones (4 when four labels are kept).
other_class = len(selected_labels)
label_to_class = {label: position for position, label in enumerate(selected_labels)}


def remap_coarse_label(label):
    """Return the position of `label` in `selected_labels`, or the catch-all class id."""
    return label_to_class.get(label, other_class)


train_data['label-coarse'] = train_data['label-coarse'].apply(remap_coarse_label)
test_data['label-coarse'] = test_data['label-coarse'].apply(remap_coarse_label)
development_data['label-coarse'] = development_data['label-coarse'].apply(remap_coarse_label)


In [3]:
# Load the 'word2vec-google-news-300' model through gensim's downloader API.
# NOTE(review): `google_news` is not referenced by any other cell visible in this
# notebook; the later cells train a separate Word2Vec model on the training text.
google_news = gensim.downloader.load('word2vec-google-news-300')

In [87]:
#Model
class TextClassifier(nn.Module):
    """Sequence classifier: embedding lookup -> LSTM -> linear layer over the final hidden state."""

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes):
        """Build the three sub-modules.

        Args:
            vocab_size: number of rows in the embedding table.
            embedding_dim: size of each embedding vector (LSTM input size).
            hidden_dim: LSTM hidden state size.
            num_classes: number of output logits.
        """
        super().__init__()
        # Sub-modules are created in the order embedding, rnn, fc.
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.rnn = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=num_classes)

    def forward(self, x):
        """Return one row of `num_classes` logits per input sequence.

        `x` holds integer indices into the embedding table; because the LSTM is
        built with batch_first=True, the batch is the leading dimension.
        """
        lstm_state = self.rnn(self.embedding(x))[1]
        # lstm_state is (h_n, c_n); h_n[-1] is the last layer's final hidden state.
        final_hidden = lstm_state[0][-1]
        return self.fc(final_hidden)

In [129]:
#Model parameters (Can be changed to maybe improve accuracy)
hidden_dim = 512  # LSTM hidden state size passed to TextClassifier
num_classes = 5  # four sampled coarse labels plus the catch-all class (id 4) used for the rest
learning_rate = 0.001  # learning rate given to the Adam optimizer
batch_size = 64  # read as a global by intialise_loaders for all three DataLoaders
num_epochs = 50  # upper bound on epochs; the training loop may stop earlier


In [130]:
#Perform tokenization
tokenizer = get_tokenizer("basic_english")
# One token list per row of the 'text' column, for each split.
train_sequences = [tokenizer(text) for text in train_data['text']]
development_sequences = [tokenizer(text) for text in development_data['text']]
test_sequences = [tokenizer(text) for text in test_data['text']]

y_train = train_data['label-coarse'].tolist()
y_val = development_data['label-coarse'].tolist()
y_test = test_data['label-coarse'].tolist()
#Training the word2vec model(Replace with trained word2vec model from part 1)
# NOTE: despite its name, this model is trained here on the training sequences only.
google_news_model = Word2Vec(train_sequences, vector_size=100, window=5, min_count=1)
embedding_dim = google_news_model.wv.vector_size
# Vocabulary size is the number of keys in the KeyedVectors, not
# `corpus_total_words` (the raw token count of the corpus, which is far larger).
# One extra slot is reserved so index `vocab_size - 1` is a dedicated row for
# out-of-vocabulary words and never collides with a real word.
vocab_size = len(google_news_model.wv) + 1

In [135]:
def encode_sequences(token_sequences, key_to_index, unk_index, max_len, pad_index=0):
    """Turn tokenised texts into fixed-length lists of vocabulary indices.

    Args:
        token_sequences: iterable of token lists.
        key_to_index: mapping from token to integer index.
        unk_index: index used for tokens missing from `key_to_index`.
        max_len: every output list is truncated or right-padded to this length.
        pad_index: value appended to sequences shorter than `max_len`.

    Returns:
        A list with one list of `max_len` ints per input sequence.
    """
    encoded = []
    for tokens in token_sequences:
        indices = [key_to_index.get(token, unk_index) for token in tokens][:max_len]
        indices += [pad_index] * (max_len - len(indices))
        encoded.append(indices)
    return encoded


word_vectors = torch.FloatTensor(google_news_model.wv.vectors)
# Index assigned to out-of-vocabulary tokens when the texts are encoded below.
unk_index = vocab_size - 1
# Create an embedding layer
embedding_layer = nn.Embedding(google_news_model.corpus_total_words+1,google_news_model.vector_size)
# Copy the trained Word2Vec vectors into the leading rows; without this the layer
# keeps its random initialisation and `word_vectors` is never used.
embedding_layer.weight.data[:word_vectors.size(0)] = word_vectors
# Random vector for unknown words, sized from the model instead of a literal 100,
# and stored at the same row that OOV tokens are mapped to.
unk_token = np.random.uniform(-1,1,google_news_model.vector_size)
embedding_layer.weight.data[unk_index] = torch.FloatTensor(unk_token)
# NOTE(review): no cell visible in this notebook passes `embedding_layer` to the
# classifier, which builds its own nn.Embedding.

max_size = 20
# Truncate / right-pad every sequence to `max_size` so the index lists can be
# stacked into a single rectangular tensor.
# NOTE(review): the pad value 0 is presumably also the index of a real vocabulary
# word, so padding is not distinguishable from that word - confirm if that matters.
word_to_index = google_news_model.wv.key_to_index
X_train = encode_sequences(train_sequences, word_to_index, unk_index, max_size)
X_val = encode_sequences(development_sequences, word_to_index, unk_index, max_size)
X_test = encode_sequences(test_sequences, word_to_index, unk_index, max_size)


In [136]:
def intialise_loaders(X_train, y_train,X_dev,y_dev, X_test, y_test):
    """Wrap the train, development and test splits in DataLoaders.

    Each (X, y) pair is converted to a pair of LongTensors and bundled into a
    TensorDataset. All loaders use the module-level `batch_size`; only the
    training loader shuffles.

    Returns:
        (train_dataloader, dev_dataloader, test_dataloader)
    """
    def _to_dataset(features, targets):
        # Features and targets are both stored as int64 tensors.
        return torch.utils.data.TensorDataset(torch.LongTensor(features), torch.LongTensor(targets))

    train_set = _to_dataset(X_train, y_train)
    dev_set = _to_dataset(X_dev, y_dev)
    test_set = _to_dataset(X_test, y_test)

    make_loader = torch.utils.data.DataLoader
    return (
        make_loader(train_set, batch_size=batch_size, shuffle=True),
        make_loader(dev_set, batch_size=batch_size),
        make_loader(test_set, batch_size=batch_size),
    )

# Build the three loaders from the padded index lists (X_*) and the remapped coarse labels (y_*).
train_dataloader,dev_dataloader, test_dataloader = intialise_loaders(X_train, y_train,X_val,y_val, X_test, y_test)

In [137]:
#Initialize the model
model = TextClassifier(vocab_size, embedding_dim, hidden_dim, num_classes)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
loss_function = torch.nn.CrossEntropyLoss()
# Per-epoch history (plain Python floats).
train_accuracies = []
dev_accuracies = []
train_losses = []
dev_losses = []
# Early stopping: stop once dev accuracy has failed to beat its best value for
# `patience` consecutive epochs.
patience = 3
best_accuracy = 0
# Initialised up front: the loop's else-branch increments it, which raised a
# NameError whenever the very first epoch did not improve on best_accuracy.
no_improvement = 0

# Training loop
for epoch in range(num_epochs):
    correct_train = 0
    total_train = 0

    model.train()
    running_loss = 0.0
    for inputs, labels in train_dataloader:
        optimizer.zero_grad()
        outputs = model(inputs)
        # Reuse the single criterion instead of constructing a new one per batch.
        loss = loss_function(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        correct_train += (predicted == labels).sum().item()
        total_train += labels.size(0)
    # Losses are averaged over the number of batches, not the number of samples.
    average_loss = running_loss / len(train_dataloader)
    train_losses.append(average_loss)
    train_accuracy = correct_train / total_train
    train_accuracies.append(train_accuracy)
    print(f"Epoch [{epoch + 1}/{num_epochs}] Loss: {average_loss:.4f} Train Accuracy: {train_accuracy:.4f}")

    model.eval()
    correct_dev = 0
    total_dev = 0
    with torch.no_grad():
        val_loss = 0.0
        for inputs, labels in dev_dataloader:
            outputs = model(inputs)
            # .item() keeps val_loss a float so dev_losses holds numbers, not 0-dim tensors.
            val_loss += loss_function(outputs, labels).item()
            _, predicted = torch.max(outputs, 1)
            correct_dev += (predicted == labels).sum().item()
            total_dev += labels.size(0)
    average_loss2 = val_loss / len(dev_dataloader)
    dev_losses.append(average_loss2)
    dev_accuracy = correct_dev / total_dev
    dev_accuracies.append(dev_accuracy)
    print(f"Epoch [{epoch + 1}/{num_epochs}] Loss: {average_loss2:.4f} Development Accuracy: {dev_accuracy:.4f}")

    if dev_accuracy > best_accuracy:
        best_accuracy = dev_accuracy
        no_improvement = 0
    else:
        no_improvement += 1

    if no_improvement >= patience:
        print(f"Early stopping after {epoch + 1} epochs with no improvement.")
        break

Epoch [1/50] Loss: 1.5947 Train Accuracy: 0.2347
Epoch [1/50] Loss: 1.5284 Development Accuracy: 0.3660
Epoch [2/50] Loss: 1.4040 Train Accuracy: 0.3924
Epoch [2/50] Loss: 1.1720 Development Accuracy: 0.5200
Epoch [3/50] Loss: 1.1092 Train Accuracy: 0.4994
Epoch [3/50] Loss: 1.0551 Development Accuracy: 0.5360
Epoch [4/50] Loss: 1.0191 Train Accuracy: 0.5380
Epoch [4/50] Loss: 1.0486 Development Accuracy: 0.5620
Epoch [5/50] Loss: 0.8609 Train Accuracy: 0.6254
Epoch [5/50] Loss: 0.9649 Development Accuracy: 0.6020
Epoch [6/50] Loss: 0.6912 Train Accuracy: 0.7405
Epoch [6/50] Loss: 0.8278 Development Accuracy: 0.7260
Epoch [7/50] Loss: 0.5176 Train Accuracy: 0.8213
Epoch [7/50] Loss: 0.7709 Development Accuracy: 0.7440
Epoch [8/50] Loss: 0.3617 Train Accuracy: 0.8772
Epoch [8/50] Loss: 0.7011 Development Accuracy: 0.7800
Epoch [9/50] Loss: 0.2666 Train Accuracy: 0.9124
Epoch [9/50] Loss: 0.7230 Development Accuracy: 0.7800
Epoch [10/50] Loss: 0.1885 Train Accuracy: 0.9412
Epoch [10/50] 

In [138]:
#Test the model on test data
correct_test = 0
total_test = 0
model.eval()
with torch.no_grad():
    for inputs, labels in test_dataloader:
        outputs = model(inputs)
        # The original also did `val_loss += ...` here: the value was never read, and it
        # made this cell depend on (and overwrite) a variable leaked from the training cell.
        _, predicted = torch.max(outputs, 1)
        correct_test += (predicted == labels).sum().item()
        total_test += labels.size(0)
# Fraction of test examples whose highest-scoring class equals the label.
accuracy = correct_test / total_test
accuracy

0.83

tensor([3, 0, 2, 0, 1, 1, 2, 0, 2, 4, 4, 3, 2, 4, 0, 4, 4, 1, 4, 2, 4, 2, 4, 0,
        3, 1, 4, 4, 3, 4, 4, 4, 4, 4, 3, 1, 2, 0, 4, 4, 0, 2, 0, 0, 4, 2, 4, 2,
        2, 3, 1, 0, 1, 2, 3, 3, 2, 4, 2, 2, 1, 3, 2, 3, 4, 2, 4, 2, 0, 4, 4, 2,
        2, 4, 4, 2, 0, 0, 2, 3, 4, 1, 2, 2, 2, 1, 2, 1, 2, 1, 0, 4, 2, 4, 4, 0,
        3, 0, 4, 1, 2, 0, 4, 0, 4, 4, 0, 4, 4, 3, 0, 4, 0, 0, 2, 0, 4, 4, 0, 2,
        3, 4, 4, 4, 1, 1, 4, 1, 2, 2, 2, 1, 2, 4, 1, 4, 0, 2, 4, 4, 4, 2, 2, 3,
        0, 0, 2, 4, 2, 3, 4, 3, 2, 4, 1, 0, 3, 4, 0, 4, 4, 4, 1, 1, 2, 2, 0, 4,
        3, 1, 4, 3, 4, 1, 0, 0, 3, 1, 4, 4, 4, 2, 4, 0, 1, 0, 3, 3, 1, 0, 2, 1,
        2, 1, 0, 0, 0, 2, 4, 2, 0, 3, 4, 4, 0, 0, 3, 4, 3, 0, 4, 1, 4, 2, 4, 4,
        0, 2, 0, 0, 4, 3, 0, 3, 1, 0, 0, 2, 3, 2, 3, 2, 4, 2, 4, 4, 2, 2, 2, 1,
        0, 2, 2, 4])


tensor([2, 0, 4, 2, 1, 1, 2, 0, 2, 4, 4, 3, 2, 4, 0, 4, 4, 1, 4, 4, 4, 2, 4, 0,
        3, 1, 4, 4, 3, 4, 4, 4, 4, 4, 3, 0, 2, 0, 4, 4, 0, 2, 0, 0, 4, 2, 4, 2,
        2, 3, 1, 3, 1, 2, 3, 3, 2, 4, 3, 4, 1, 3, 0, 3, 4, 2, 4, 0, 2, 4, 4, 2,
        2, 4, 4, 2, 0, 0, 2, 3, 4, 0, 4, 2, 2, 0, 2, 1, 2, 1, 0, 4, 2, 4, 2, 3,
        0, 0, 4, 1, 2, 2, 4, 0, 4, 4, 0, 2, 4, 2, 0, 0, 4, 3, 2, 0, 4, 4, 0, 2,
        3, 4, 4, 4, 1, 1, 0, 1, 2, 2, 4, 1, 4, 4, 1, 4, 0, 2, 4, 4, 4, 2, 2, 3,
        1, 0, 2, 4, 2, 3, 4, 3, 2, 4, 1, 0, 3, 4, 0, 4, 4, 4, 1, 1, 2, 3, 0, 4,
        3, 0, 4, 3, 4, 1, 4, 0, 3, 1, 4, 4, 4, 2, 4, 0, 1, 0, 3, 0, 1, 0, 3, 1,
        2, 0, 0, 0, 0, 2, 2, 2, 0, 3, 4, 4, 1, 0, 3, 2, 3, 0, 4, 1, 4, 4, 4, 4,
        0, 2, 0, 0, 4, 3, 0, 3, 0, 0, 0, 4, 3, 4, 3, 2, 0, 0, 4, 4, 0, 2, 2, 1,
        0, 2, 0, 4])


[2, 3, 1, 4, 2, 2, 1, 0, 4, 4, 3, 1, 2, 1, 2, 2, 0, 1, 4, 2, 1, 4, 3, 4, 4, 1, 4, 3, 3, 3, 2, 3, 4, 2, 2, 2, 3, 2, 2, 2, 0, 4, 4, 4, 0, 0, 2, 4, 2, 1, 4, 1, 1, 4, 4, 4, 2, 3, 3, 2, 3, 1, 3, 0, 3, 1, 0, 4, 2, 3, 2, 2, 3, 1, 3, 4, 0, 2, 2, 1, 4, 2, 1, 2, 2, 4, 4, 1, 4, 3, 0, 4, 2, 2, 1, 3, 2, 4, 4, 0, 4, 1, 4, 1, 3, 4, 2, 0, 2, 3, 0, 4, 2, 1, 0, 1, 2, 0, 2, 3, 2, 2, 4, 1, 2, 1, 2, 2, 4, 2, 2, 1, 4, 0, 4, 0, 3, 1, 1, 4, 4, 0, 3, 0, 4, 0, 1, 0, 4, 4, 0, 1, 4, 2, 1, 3, 3, 4, 3, 3, 4, 4, 2, 3, 0, 3, 2, 4, 3, 1, 2, 2, 2, 4, 3, 3, 3, 2, 0, 2, 3, 4, 3, 4, 0, 1, 1, 4, 3, 0, 0, 2, 2, 2, 4, 3, 0, 1, 0, 0, 0, 3, 1, 0, 2, 3, 1, 2, 4, 4, 2, 2, 3, 3, 4, 0, 4, 0, 2, 4, 4, 2, 4, 0, 3, 0, 2, 4, 3, 1, 2, 1, 4, 2, 0, 4, 3, 4, 3, 2, 3, 4, 4, 4, 4, 3, 4, 4, 4, 0, 1, 3, 0, 0, 4, 3, 2, 0, 4, 2, 1, 1, 2, 0, 2, 4, 4, 3, 2, 4, 0, 4, 4, 1, 4, 4, 4, 2, 4, 0, 3, 1, 4, 4, 3, 4, 4, 4, 4, 4, 3, 0, 2, 0, 4, 4, 0, 2, 0, 0, 4, 2, 4, 2, 2, 3, 1, 3, 1, 2, 3, 3, 2, 4, 3, 4, 1, 3, 0, 3, 4, 2, 4, 0, 2, 4, 4, 2, 2, 4, 4, 2, 0, 