# Imports

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
import csv
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import f1_score, confusion_matrix



In [None]:
# Pick the compute backend: CUDA when available, otherwise Apple MPS,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

In [None]:
# Fetch the NLTK resources used below: the stop-word list and the Punkt
# tokenizer data that word_tokenize relies on.
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases load the Punkt data from the separate 'punkt_tab'
# package; requesting it as well is harmless on older releases.
nltk.download('punkt_tab')

In [None]:
# Build the English stop-word set. The corpus reader is reached through the
# package path because this assignment rebinds the name `stopwords` to a set:
# with `stopwords.words(...)` a second run of this cell raises AttributeError.
stopwords = set(nltk.corpus.stopwords.words('english'))

# Classes and Functions

In [None]:
# Patterns for HTML-escaped markup that should be dropped from the text.
# They are compiled case-insensitively because preprocess_data lower-cases
# the text before applying them; a case-sensitive '&lt;A' would never match.
hyperlink_regex = re.compile(r'&lt;A.*?&gt;.*?&lt;/A&gt;', re.IGNORECASE)
font_regex = re.compile(r'&lt;font.*?/font&gt;', re.IGNORECASE)
img_regex = re.compile(r'&lt;img.*?&gt;', re.IGNORECASE)
nobr_regex = re.compile(r'&lt;nobr&gt;.*?&lt;/nobr&gt;', re.IGNORECASE)

In [None]:
def remove_stopwords(text):
    """Return *text* without the tokens contained in the global ``stopwords``.

    The text is split on whitespace and the surviving tokens are joined back
    together with single spaces.
    """
    kept = (token for token in text.split() if token not in stopwords)
    return ' '.join(kept)

In [None]:
def read_dataset(filename, rmst=False):
    """Load a CSV dataset and return its cleaned rows.

    The header row and the first column of every record are discarded; the
    remaining columns are handed to ``preprocess_data``.

    Args:
        filename: Path of the CSV file to read.
        rmst: Forwarded to ``preprocess_data``; when true, stop words are
            removed from the text column.

    Returns:
        The list returned by ``preprocess_data``.
    """
    # newline='' leaves newline handling to the csv module, as its
    # documentation requires, so quoted fields with line breaks parse correctly.
    with open(filename, 'r', newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header row (no-op on an empty file)
        rows = [row[1:] for row in reader]  # drop the first column

    return preprocess_data(rows, rmst)

def preprocess_data(data, rmst=False):
    """Clean the text column (index 0) of every row in *data*, in place.

    Steps, in order: lower-case the text; replace a fixed list of entity
    fragments, backslashes and escaped HTML tags; strip the markup matched by
    the module-level regexes; delete quote characters; turn control
    characters and every remaining non-alphanumeric character into a space;
    optionally remove stop words.

    Args:
        data: List of mutable rows whose first element is the raw text.
        rmst: When true, pass the cleaned text through ``remove_stopwords``.

    Returns:
        The same ``data`` list, with each row's first element replaced.
    """
    # Literal replacements, applied in this order. "\\$ " must come before
    # the bare backslash, which would otherwise consume it.
    literal_replacements = (
        # Currency, dash and non-breaking-space fragments, and backslashes.
        ("#36;", " "),
        ("\\$ ", " "),
        ("#151;", " "),
        ("#160;", " "),
        ("\\", " "),
        # Apostrophe fragments are deleted so "it#39;s" becomes "its".
        ("#39;", ""),
        ("#8217;", ""),
        # Double-quote fragments are deleted as well.
        ("#147;", ""),
        ("#148;", ""),
        ("quot;", ""),
        # Escaped HTML tags. Backslashes are already gone at this point, so
        # the tags are matched without a leading backslash.
        ('&lt;br&gt;', ' '),
        ('&lt;br/&gt;', ' '),
        ('&lt;b&gt;...&lt;/b&gt;', ' '),
        ('&lt;strong&gt;', ' '),
        ('&lt;/strong&gt;', ' '),
        ('&lt;cite&gt;is&lt;/cite&gt;', ' '),
        ('&lt;p&gt;', ' '),
        ('&lt;/p&gt;', ' '),
        ('&lt;hpq.n&gt;', ' '),
        ('&lt;i&gt;', ' '),
        ('&lt;/i&gt;', ' '),
    )
    markup_patterns = (hyperlink_regex, font_regex, img_regex, nobr_regex)

    for row in data:
        text = row[0].lower()

        for old, new in literal_replacements:
            text = text.replace(old, new)

        # Remove the markup matched by the regexes defined at module level.
        for pattern in markup_patterns:
            text = pattern.sub('', text)

        # Delete straight quote characters.
        text = text.replace("'", "").replace('"', "")

        # Real control characters only separate words, so they become spaces
        # rather than stray letters.
        for control_char in ("\n", "\r", "\t", "\b", "\f"):
            text = text.replace(control_char, " ")

        # Everything that is neither alphanumeric nor whitespace becomes a space.
        text = ''.join(c if c.isalnum() or c.isspace() else ' ' for c in text)

        if rmst:
            text = remove_stopwords(text)

        row[0] = text

    return data

In [None]:
def create_vocab(dataset):
    """Tokenise the text column of *dataset* and collect its distinct tokens.

    Args:
        dataset: Sequence of rows whose first element is either a sentence
            string (tokenised here with ``word_tokenize``) or an already
            tokenised list of strings.

    Returns:
        A ``(vocab, tokenized_sentences)`` tuple: ``vocab`` lists every
        distinct token in order of first appearance, and
        ``tokenized_sentences`` holds one token list per row. Both are empty
        for an empty dataset.
    """
    if len(dataset) == 0:
        return [], []

    sentences = [row[0] for row in dataset]

    # Only tokenise when the rows hold raw strings rather than token lists.
    if isinstance(sentences[0], list):
        tokens = sentences
    else:
        tokens = [word_tokenize(sentence) for sentence in sentences]

    # dict.fromkeys de-duplicates while keeping first-seen order.
    vocab = list(dict.fromkeys(
        token for token_list in tokens for token in token_list
    ))

    # Every token is in the vocabulary by construction, so each sentence is
    # returned unfiltered, as a fresh list.
    tokenized_sentences = [list(token_list) for token_list in tokens]

    return vocab, tokenized_sentences

In [None]:
def average_embeddings(sentence, embeddings, vector_size):
    """Return the mean embedding of the words of *sentence* as float32.

    Words missing from *embeddings* are skipped. When no word has an
    embedding, a zero vector of length *vector_size* is returned.
    """
    known_vectors = [embeddings[word] for word in sentence if word in embeddings]

    if not known_vectors:
        return np.zeros(vector_size, dtype=np.float32)

    # Accumulate in NumPy's default float64 before the final float32 cast.
    total = np.zeros(vector_size)
    for vector in known_vectors:
        total += vector

    return (total / len(known_vectors)).astype(np.float32)

In [None]:
# Data class
class Data(Dataset):
    """Dataset pairing float32 feature tensors with int64 class labels.

    Args:
        X: Array-like of numeric features; the first axis indexes samples.
        y: Array-like of labels, given as integers or as numeric strings
            (which is how they arrive when read from a CSV file).
    """

    def __init__(self, X, y):
        self.X = torch.from_numpy(np.asarray(X).astype(np.float32))
        # torch.from_numpy rejects string arrays, so labels are cast to
        # int64 in NumPy first.
        self.y = torch.from_numpy(np.asarray(y).astype(np.int64))

        self.len = self.X.shape[0]

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair at *index*."""
        return self.X[index], self.y[index]

    def __len__(self):
        """Return the number of samples."""
        return self.len

In [None]:
# Neural Network class
class classifier_nn(nn.Module):
    """Fully connected classifier with sigmoid-activated hidden layers.

    Args:
        input_size: Number of input features.
        hidden_sizes: Iterable with the width of each hidden layer, in order.
        num_classes: Width of the output layer.
    """

    def __init__(self, input_size, hidden_sizes, num_classes):
        super().__init__()

        # Consecutive pairs of widths give each hidden layer's (in, out) sizes.
        widths = [input_size, *hidden_sizes]
        self.layers = nn.ModuleList(
            [nn.Linear(n_in, n_out) for n_in, n_out in zip(widths, widths[1:])]
        )
        self.output_layer = nn.Linear(widths[-1], num_classes)

    def forward(self, x):
        """Apply every hidden layer followed by a sigmoid, then the output layer.

        The result of the output layer is returned without further activation.
        """
        hidden = x
        for linear in self.layers:
            hidden = torch.sigmoid(linear(hidden))
        return self.output_layer(hidden)

def train_nn(model, train_loader, epochs, lr):
    """Train *model* with SGD and cross-entropy loss.

    Args:
        model: Module whose output is fed to ``nn.CrossEntropyLoss``.
        train_loader: Iterable yielding ``(inputs, labels)`` batches.
        epochs: Number of passes over *train_loader*.
        lr: Learning rate for ``optim.SGD``.

    Every tenth epoch (starting with epoch 0) the summed batch loss of that
    epoch is printed. Batches are moved to the global ``device``.
    """
    loss_fn = nn.CrossEntropyLoss()
    sgd = optim.SGD(model.parameters(), lr=lr)
    model.train()

    for epoch_idx in range(epochs):
        running_loss = 0
        for batch_inputs, batch_labels in train_loader:
            batch_inputs = batch_inputs.to(device)
            batch_labels = batch_labels.to(device)

            sgd.zero_grad()
            batch_loss = loss_fn(model(batch_inputs), batch_labels)
            batch_loss.backward()
            sgd.step()

            running_loss += batch_loss.item()

        if not epoch_idx % 10:
            print(f'Epoch {epoch_idx}: Loss: {running_loss}')

def get_predictions(model, data_loader):
    """Run *model* over *data_loader* and collect predictions and labels.

    The model is switched to evaluation mode and gradients are disabled.
    For each batch the predicted class is the index of the largest output
    along dimension 1.

    Returns:
        ``(all_preds, all_labels)``: two flat lists of NumPy scalars in
        loader order.
    """
    model.eval()
    all_preds, all_labels = [], []

    with torch.no_grad():
        for inputs, labels in data_loader:
            outputs = model(inputs.to(device))
            predicted = outputs.data.max(1)[1]
            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(labels.to(device).cpu().numpy())

    return all_preds, all_labels

def get_perf(preds, labels):
    """Return ``(accuracy, f1)`` for the given predictions and true labels.

    Accuracy is the fraction of positions where *preds* equals *labels*;
    the F1 score comes from ``f1_score`` with ``average='weighted'``.
    """
    pred_arr = np.array(preds)
    label_arr = np.array(labels)
    accuracy = np.mean(pred_arr == label_arr)

    return accuracy, f1_score(labels, preds, average='weighted')

def plot_conf_matrix(preds, labels):
    """Print the confusion matrix of *labels* (true) against *preds*."""
    print(confusion_matrix(labels, preds))

def save_predictions(preds, og_filename, filename):
    """Write the original texts with their predicted labels to a CSV file.

    Args:
        preds: Sequence of predicted labels, one per data row of
            *og_filename*, in the same order.
        og_filename: CSV file whose second column holds the original text;
            its first row is treated as a header and skipped.
        filename: Destination CSV, written with the header
            ``index,text,label``.

    Raises:
        IndexError: If *preds* has fewer entries than *og_filename* has rows.
    """
    with open(og_filename, 'r', newline='') as src:
        reader = csv.reader(src)
        next(reader, None)  # skip the header row
        texts = [row[1] for row in reader]

    # The output is opened once and written with csv.writer, so texts that
    # contain commas, quotes or line breaks are quoted and stay one field.
    with open(filename, 'w', newline='') as dst:
        writer = csv.writer(dst, lineterminator='\n')
        writer.writerow(['index', 'text', 'label'])
        writer.writerows(
            (i, text, preds[i]) for i, text in enumerate(texts)
        )

In [None]:
# Codes for RNN part
def get_embedding_vectors(sentences, vector_size=100, max_seq_len=50):
    """Embed each sentence as the mean of its Word2Vec token vectors.

    Each sentence is cut to its first *max_seq_len* whitespace-separated
    words and tokenised with ``word_tokenize``. A new Word2Vec model is then
    fitted on exactly these token lists.

    Returns:
        ``(embedded_vectors, vocab)``: an array with one averaged vector per
        sentence, and the fitted model's ``wv`` keyed vectors.
    """
    clipped = [' '.join(sentence.split()[:max_seq_len]) for sentence in sentences]
    tokenized_sentences = [word_tokenize(sentence) for sentence in clipped]

    model = Word2Vec(tokenized_sentences, vector_size=vector_size, window=5, min_count=1, workers=4)

    # The model's keyed vectors are both the lookup table and the returned vocabulary.
    vocab = model.wv
    embedded_vectors = [
        average_embeddings(tokens, vocab, vector_size)
        for tokens in tokenized_sentences
    ]

    return np.array(embedded_vectors), vocab

def collate_fn(batch, max_sequence=None):
    """Collate ``(sequence, label)`` samples into a zero-padded batch.

    Args:
        batch: Iterable of ``(tensor, label)`` pairs.
        max_sequence: Accepted for backward compatibility and ignored. It
            defaults to ``None`` so the function can be passed directly as a
            ``DataLoader`` ``collate_fn``, which is called with the batch only.

    Returns:
        ``(padded_sentences, labels)``: the sequences padded with zeros to
        the longest one in the batch (batch first), and a label tensor.
    """
    sentences, labels = zip(*batch)

    padded_sentences = pad_sequence(list(sentences), batch_first=True)

    return padded_sentences, torch.tensor(labels)

def prepare_data(embedded_vectors, labels, batch_size):
    """Wrap the vectors and labels in a shuffling ``DataLoader``.

    The arrays are wrapped in a ``Data`` dataset and batched with
    ``collate_fn``.
    """
    return DataLoader(
        Data(embedded_vectors, labels),
        batch_size=batch_size,
        shuffle=True,
        collate_fn=collate_fn,
    )


In [None]:
# class Classifier_RNN(nn.Module):
#     def __init__(self, vocab_size, embedding_dim, hidden_size, num_layers, num_classes, bidirectional=True):
#         super(Classifier_RNN, self).__init__()
#         self.embedding = nn.Embedding(vocab_size, embedding_dim)
#         self.rnn = nn.RNN(embedding_dim, hidden_size, num_layers, batch_first=True, bidirectional=bidirectional)
#         self.fc = nn.Linear(hidden_size*2 if bidirectional else hidden_size, num_classes)
#         self.dropout = nn.Dropout(0.5)

#     def forward(self, x):
#         x = self.embedding(x)
#         x, _ = self.rnn(x)
#         x = self.dropout(x)
#         x = torch.mean(x, 1)
#         x = self.fc(x)
#         return x

# class Classifier_LSTM(nn.Module):
#     def __init__(self, vocab_size, embedding_dim, hidden_size, num_layers, num_classes, bidirectional=True):
#         super(Classifier_LSTM, self).__init__()
#         self.embedding = nn.Embedding(vocab_size, embedding_dim)
#         self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers, batch_first=True, bidirectional=bidirectional)
#         self.fc = nn.Linear(hidden_size * 2 if bidirectional else hidden_size, num_classes)
#         self.dropout = nn.Dropout(0.5)

#     def forward(self, x):
#         x = self.embedding(x)
#         x, _ = self.lstm(x)
#         x = self.dropout(x)
#         x = torch.mean(x, 1)
#         x = self.fc(x)
#         return x

# Main

## Data files

In [None]:
# Paths of the training and test CSV files, relative to the working directory.
train_file = './train.csv'
test_file = './test.csv'

## Task 1

### Data Preparation

In [None]:
# Load and clean both splits, with stop-word removal enabled.
train_data_task_1 = read_dataset(train_file, rmst=True)
test_data_task_1 = read_dataset(test_file, rmst=True)

# Hold out 10% of the training rows for validation.
# NOTE(review): no random_state is passed, so the split changes between runs.
train_data_task_1, val_data_task_1 = train_test_split(train_data_task_1, test_size=0.1)

In [None]:
# Tokenise the training sentences and collect their distinct tokens.
vocabulary_1, tokenized_sentences = create_vocab(train_data_task_1)

### Word2Vec

In [None]:
vec_size = 100  # dimensionality of the Word2Vec vectors
output_dim = 4  # width of the classifier's output layer (number of classes)
epoch_sizes = [10, 20, 40]  # NOTE(review): not referenced in the visible cells — confirm it is still needed

In [None]:
# Train Word2Vec on the tokenised training sentences only (min_count=1 keeps
# every token), then save the model as 'word2vec_model' in the working directory.
word2vec_model = Word2Vec(tokenized_sentences, vector_size=vec_size, window=5, min_count=1, workers=4)
word2vec_model.save('word2vec_model')

In [None]:
# Map every vocabulary word to its trained Word2Vec vector.
embeddings = {word: word2vec_model.wv[word] for word in vocabulary_1}

In [None]:
# Training split: labels are the second column, features are the mean
# Word2Vec vector of each sentence's tokens.
y_train = [row[1] for row in train_data_task_1]
X_train = [
    average_embeddings(word_tokenize(row[0]), embeddings, vec_size)
    for row in train_data_task_1
]

In [None]:
# Validation split: labels are the second column, features are the mean
# Word2Vec vector of each sentence's tokens.
y_val = [row[1] for row in val_data_task_1]
X_val = [
    average_embeddings(word_tokenize(row[0]), embeddings, vec_size)
    for row in val_data_task_1
]

In [None]:
# Test split: labels are the second column, features are the mean
# Word2Vec vector of each sentence's tokens.
y_test = [row[1] for row in test_data_task_1]
X_test = [
    average_embeddings(word_tokenize(row[0]), embeddings, vec_size)
    for row in test_data_task_1
]

### Neural Network

In [None]:
# Wrap the training features and labels in a Dataset and batch them (32 per batch).
# NOTE(review): shuffle=False means batches arrive in the same order every epoch — confirm this is intended for training.
train_data = Data(np.array(X_train), np.array(y_train))
train_loader = DataLoader(train_data, batch_size=32, shuffle=False)

In [None]:
# Wrap the validation features and labels in a Dataset and batch them in order (32 per batch).
val_data = Data(np.array(X_val), np.array(y_val))
val_loader = DataLoader(val_data, batch_size=32, shuffle=False)

In [None]:
# Wrap the test features and labels in a Dataset and batch them in order (32 per batch).
test_data = Data(np.array(X_test), np.array(y_test))
test_loader = DataLoader(test_data, batch_size=32, shuffle=False)

Model 1:
* 1 hidden layer - 64 size

In [None]:
# Model 1: one hidden layer of 64 units on top of the averaged Word2Vec features.
input_size = vec_size
hidden_sizes = [64]

model_1 = classifier_nn(input_size, hidden_sizes, output_dim).to(device)

# Train for 100 epochs with learning rate 0.01.
train_nn(model_1, train_loader, 100, 0.01)

In [None]:
# Evaluate model 1 on the validation set: accuracy and weighted F1.
preds_1, labels_1 = get_predictions(model_1, val_loader)

accuracy_1, f1_1 = get_perf(preds_1, labels_1)

print(f'Accuracy: {accuracy_1}, F1: {f1_1}')

In [None]:
# Recompute and print the validation metrics from preds_1 / labels_1.
# NOTE(review): this repeats the computation at the end of the previous cell.
accuracy_1, f1_1 = get_perf(preds_1, labels_1)

print(f'Accuracy: {accuracy_1}, F1: {f1_1}')

Model 2:
* 1 hidden layer - 128 size

In [None]:
# Model 2: one hidden layer of 128 units.
input_size = vec_size
hidden_sizes = [128]

model_2 = classifier_nn(input_size, hidden_sizes, output_dim).to(device)

# Train for 100 epochs with learning rate 0.01.
train_nn(model_2, train_loader, 100, 0.01)

In [None]:
# Evaluate model 2 on the validation set: accuracy and weighted F1.
preds_2, labels_2 = get_predictions(model_2, val_loader)

accuracy_2, f1_2 = get_perf(preds_2, labels_2)

print(f'Accuracy: {accuracy_2}, F1: {f1_2}')

Model 3:

* 2 hidden layers - sizes 64, 32

In [None]:
# Model 3: two hidden layers of 64 and 32 units.
input_size = vec_size
hidden_sizes = [64, 32]

model_3 = classifier_nn(input_size, hidden_sizes, output_dim).to(device)

# Train for 100 epochs with learning rate 0.01.
train_nn(model_3, train_loader, 100, 0.01)

In [None]:
# Evaluate model 3 on the validation set: accuracy and weighted F1.
preds_3, labels_3 = get_predictions(model_3, val_loader)

accuracy_3, f1_3 = get_perf(preds_3, labels_3)

print(f'Accuracy: {accuracy_3}, F1: {f1_3}')

In [None]:
# Select the model with the highest validation F1 instead of hard-coding one,
# so the choice follows the scores computed above. max() returns the first of
# equal candidates, so model_1 wins ties.
candidates = [(f1_1, model_1), (f1_2, model_2), (f1_3, model_3)]
best_model = max(candidates, key=lambda scored: scored[0])[1]

In [None]:
# Run the selected model over the test set and collect predictions and true labels.
preds, labels = get_predictions(best_model, test_loader)

In [None]:
# Preview the first five predictions next to the first five true labels.
print(preds[0:5])
print(labels[0:5])

In [None]:
# Accuracy and weighted F1 of the selected model on the test set.
accuracy, f1 = get_perf(preds, labels)

print(f'Accuracy on test set: {accuracy}')
print(f'F1 score on test set: {f1}')

In [None]:
# Print the test-set confusion matrix (true labels against predictions).
plot_conf_matrix(preds, labels)

In [None]:
# Save the predictions to a csv file named "w2v_test.csv". The source texts
# are read from test_file, the same path the test split was loaded from.
save_predictions(preds, test_file, "w2v_test.csv")

### Recurrent Neural Networks

In [None]:
max_sequence_length = 50  # sentences are cut to this many words before embedding
batch_size = 32  # batch size for the loaders built in this section

In [None]:
# Reload the training data (stop words removed) and hold out 10% for validation.
# NOTE(review): no random_state is passed, so this split differs from the Task 1 split and between runs.
train_data_rnn = read_dataset(train_file, rmst=True)
train_data_rnn, val_data_rnn = train_test_split(train_data_rnn, test_size=0.1)  

In [None]:
sentences = [row[0] for row in train_data_rnn]
labels = np.array([row[1] for row in train_data_rnn])

# Build the training embeddings and loader the same way the validation and
# test cells do. Later cells read `vocab` and `train_data_loader`.
train_embeddings, vocab = get_embedding_vectors(sentences, vector_size=100, max_seq_len=max_sequence_length)

train_data_loader = prepare_data(train_embeddings, labels, batch_size)



In [None]:
# Text and labels of the validation split.
val_sentences = [row[0] for row in val_data_rnn]
val_labels = np.array([row[1] for row in val_data_rnn])

# NOTE(review): get_embedding_vectors fits a new Word2Vec model on the sentences
# it receives, so these vectors come from a model trained on the validation text
# itself — confirm this is intended.
val_embeddings, _ = get_embedding_vectors(val_sentences, vector_size=100, max_seq_len=max_sequence_length)

val_data_loader = prepare_data(val_embeddings, val_labels, batch_size)

In [None]:
# Load the test data (stop words removed).
test_data_rnn = read_dataset(test_file, rmst=True)

# These assignments rebind `sentences` and `labels` to the test split.
sentences = [row[0] for row in test_data_rnn]
labels = np.array([row[1] for row in test_data_rnn])

# NOTE(review): get_embedding_vectors fits a new Word2Vec model on the sentences
# it receives, so these vectors come from a model trained on the test text
# itself — confirm this is intended.
embeddings_rnn, _ = get_embedding_vectors(sentences, vector_size=100, max_seq_len=max_sequence_length)

test_data_loader = prepare_data(embeddings_rnn, labels, batch_size)

In [None]:
# NOTE(review): `vocab` must already be defined by an earlier cell (presumably
# the keyed vectors returned by get_embedding_vectors) — confirm before running.
vocab_size = len(vocab)
embedding_dim = 100  # embedding width handed to the RNN classifier

#### Training RNNs

In [None]:
# Model 1
# NOTE(review): this cell rebinds `hidden_sizes` and `model_1` from the Task 1
# section. It needs `Classifier_RNN`, whose definition is commented out in the
# visible cells, and a `train_data_loader` built by an earlier cell.
hidden_sizes = 64
num_layers = 1
num_classes = 4

model_1 = Classifier_RNN(vocab_size, embedding_dim, hidden_sizes, num_layers, num_classes).to(device)

# Train for 100 epochs with learning rate 0.01.
train_nn(model_1, train_data_loader, 100, 0.01)


In [None]:
# Evaluate the RNN model on the validation loader: accuracy and weighted F1.
# These assignments rebind preds_1 / labels_1 / accuracy_1 / f1_1 from the Task 1 section.
preds_1, labels_1 = get_predictions(model_1, val_data_loader)

accuracy_1, f1_1 = get_perf(preds_1, labels_1)

print(f'Accuracy: {accuracy_1}, F1: {f1_1}')