#Loading datasets and other preliminaries

In [None]:
# Download the training data from Google Drive with gdown.
# The recorded output below shows it is saved as /content/quora_questions_train.csv.
!gdown "https://drive.google.com/uc?export=download&id=199EtucyfnhZOIHq_jfGjODTZm3Scz5yC"

Downloading...
From: https://drive.google.com/uc?export=download&id=199EtucyfnhZOIHq_jfGjODTZm3Scz5yC
To: /content/quora_questions_train.csv
100% 124M/124M [00:02<00:00, 55.4MB/s]


In [None]:
# Download the test data from Google Drive with gdown.
# The recorded output below shows it is saved as /content/quora_questions_test.csv.
!gdown "https://drive.google.com/uc?export=download&id=1hL9DZvEY-HCOh3FXVtUIuS1CyM8zhRZn"

Downloading...
From: https://drive.google.com/uc?export=download&id=1hL9DZvEY-HCOh3FXVtUIuS1CyM8zhRZn
To: /content/quora_questions_test.csv
100% 35.0M/35.0M [00:00<00:00, 82.5MB/s]


In [None]:
# Mount Google Drive into the colab at /content/drive.
# Later cells write model checkpoints and the submission CSV, and read a saved
# model, using paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Imports
import torch
import torch.nn as nn
import torch.optim as optim

from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np

import os
import time
from tqdm.notebook import tqdm

#Setting up the GPU

In [None]:
# Choose what device to run computations on: the GPU when CUDA is available,
# otherwise the CPU.

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# To force CPU execution instead, replace the line above with:
#DEVICE = torch.device('cpu')

In [None]:
DEVICE

#Network Architecture

In [None]:
class LSTMNetwork(torch.nn.Module):
    """Binary classifier: an LSTM followed by a linear layer and a sigmoid.

    The LSTM is created with ``batch_first = False``, so ``forward`` treats the
    first axis of the LSTM output as the time axis.
    """

    def __init__(self, input_dim, hidden_dim, layers, sequence_length):
        """
        Args:
            input_dim:       Size of each input vector fed to the LSTM.
            hidden_dim:      Size of the LSTM hidden state.
            layers:          Number of stacked LSTM layers.
            sequence_length: Number of time steps per input; the LSTM output at
                             step ``sequence_length - 1`` is the one classified.
        """
        super().__init__()
        self.hidden_size = hidden_dim
        self.sequence_length = sequence_length
        self.recurrent_layer = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=layers,
            batch_first=False,
        )
        self.classifier = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return the sigmoid score computed from the LSTM output at step ``sequence_length - 1``."""
        # The final hidden/cell states returned by the LSTM are not used.
        step_outputs, _ = self.recurrent_layer(x)
        last_step = step_outputs[self.sequence_length - 1]
        return self.sigmoid(self.classifier(last_step))

#Dataset

In [None]:
class RawWordsDataset(torch.utils.data.Dataset):
    """Dataset of (question string, label) pairs backed by a DataFrame."""

    def __init__(self, data):
        """Keep the frame and copy its ``question_text`` and ``target`` columns into lists."""
        self.df = data
        frame = self.df
        self.inputs = frame.question_text.tolist()  # question strings
        self.labels = frame.target.tolist()  # matching labels

    def __getitem__(self, i):
        """Return sample ``i`` as a ``(question, label)`` tuple."""
        question = self.inputs[i]
        label = self.labels[i]
        return question, label

    def __len__(self):
        """Number of samples, taken from the label list."""
        return len(self.labels)

#Word vectorizer helper function

In [None]:
from torchtext.data import get_tokenizer
from torchtext.vocab import GloVe

# Number of tokens every question is padded or truncated to in vectorize_batch.
max_words = 25
# Dimensionality of the GloVe vectors; also passed to the model as the LSTM input size.
embed_len = 50

# torchtext's "basic_english" tokenizer, used to split each question into tokens.
tokenizer = get_tokenizer("basic_english")
# Pretrained GloVe "6B" vectors of size embed_len. The recorded cell output shows
# the archive being downloaded into .vector_cache/ when this line ran.
global_vectors = GloVe(name = "6B", dim = embed_len)

# https://coderzcolumn.com/tutorials/artificial-intelligence/how-to-use-glove-embeddings-with-pytorch

def vectorize_batch(X):
    """Turn a batch of question strings into a time-major tensor of GloVe vectors.

    Args:
        X: Iterable of question strings.

    Returns:
        Tensor of shape ``(max_words, batch, embed_len)``, i.e. laid out for an
        LSTM created with ``batch_first = False``.
    """
    # Tokenize, then force every question to exactly max_words tokens:
    # short ones are padded with empty strings, long ones are cut.
    # NOTE(review): "" is presumably out-of-vocabulary for GloVe and therefore
    # maps to the vectors' unknown-token value - confirm.
    fixed_length = []
    for question in X:
        words = tokenizer(question)
        if len(words) < max_words:
            words = words + [""] * (max_words - len(words))
        else:
            words = words[:max_words]
        fixed_length.append(words)

    # Fill a (batch, max_words, embed_len) buffer one question at a time.
    batch = torch.zeros(len(fixed_length), max_words, embed_len)
    for row, words in enumerate(fixed_length):
        batch[row] = global_vectors.get_vecs_by_tokens(words)

    # Swap the batch and time axes so the sequence dimension comes first.
    return batch.transpose(0, 1)


.vector_cache/glove.6B.zip: 862MB [02:41, 5.32MB/s]                           
100%|█████████▉| 399999/400000 [00:09<00:00, 43448.11it/s]


#Training loop function

In [None]:
def glove_train(train_dataset, val_dataset, model, hyperparameters, n_eval, device):
    """
    Trains and evaluates a model.

    Every ``n_eval`` batches (including batch 0) the whole model is saved to
    Google Drive and the loss/accuracy of the current training batch plus the
    validation loss, accuracy and F1 score are printed.

    Args:
        train_dataset:   PyTorch dataset containing training data.
        val_dataset:     PyTorch dataset containing validation data.
        model:           PyTorch model to be trained. It is not moved by this
                         function, so it must already live on ``device``.
        hyperparameters: Dictionary containing hyperparameters
                         (keys ``"batch_size"`` and ``"epochs"``).
        n_eval:          Interval (in batches) at which we evaluate our model.
        device:          Device the inputs and labels are moved to.
    """

    # Get keyword arguments
    batch_size, epochs = hyperparameters["batch_size"], hyperparameters["epochs"]

    # Initialize dataloaders
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size, shuffle=True
    )

    # Note: batch_size = len(val_dataset), so that's the whole validation set.
    # With a single full-size batch, shuffling cannot change the metrics, so it
    # is turned off.
    val_loader = torch.utils.data.DataLoader(
        val_dataset, batch_size=len(val_dataset), shuffle=False
    )

    # Initalize optimizer (for gradient descent) and loss function
    optimizer = optim.Adam(model.parameters())
    loss_fn = nn.BCELoss()

    # Directory on the mounted Drive where checkpoints are written.
    joshua_path = "/content/drive/MyDrive/Colab Notebooks/GloveLSTMSaves/"

    for epoch in range(epochs):
        print(f"Epoch {epoch + 1} of {epochs}")
        model.train()

        # Loop over each batch in the dataset. tqdm wraps the loader itself
        # (not the enumerate object) so the bar knows the number of batches.
        for batch, (X, y) in enumerate(tqdm(train_loader)):
            # Predictions and loss
            inputs = vectorize_batch(X).to(device)
            y = y.type(torch.float).to(device)

            pred = torch.flatten(model(inputs))
            loss = loss_fn(pred, y)

            # Backpropagation and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Periodically evaluate our model + log to Tensorboard
            if batch % n_eval == 0:
                # The file name only contains the batch index, so a later epoch
                # overwrites the checkpoint of the same batch from an earlier one.
                name = "GloveLSTM" + str(batch) + ".pt"
                torch.save(model, joshua_path + name)

                # Compute training loss and accuracy.
                accuracy = compute_accuracy(pred, y)
                print("loss: ", loss)
                print("accuracy: ", accuracy)

                # Compute validation loss and accuracy in eval mode, then
                # switch back to train mode for the next batches.
                model.eval()
                val_loss, val_accuracy, val_f1 = evaluate(val_loader, model, loss_fn, device)
                model.train()
                print("validation loss: ", val_loss)
                print("validation accuracy: ", val_accuracy)
                print("f1 score: ", val_f1)
                # TODO: Log the results to Tensorboard.



def compute_accuracy(outputs, labels):
    """Fraction of outputs that equal their label after rounding.

    Args:
        outputs: Tensor of predicted scores.
        labels:  Tensor of targets, compared elementwise with the rounded outputs.

    Returns:
        Number of matches divided by ``len(outputs)``.
    """
    hits = outputs.round().eq(labels)
    return hits.sum().item() / len(outputs)


def evaluate(val_loader, model, loss_fn, device):
    """
    Computes loss, accuracy and macro F1 on the first batch of ``val_loader``.

    The model is switched to eval mode for the computation and put back into
    whatever mode it was in before the call.

    Args:
        val_loader: DataLoader yielding ``(questions, labels)`` batches; only the
                    first batch is used (expected to be the whole validation set).
        model:      Model producing one score per question.
        loss_fn:    Loss applied to the flattened scores and float labels.
        device:     Device the inputs and labels are moved to.

    Returns:
        Tuple ``(loss, accuracy, f1)`` where ``loss`` is the value returned by
        ``loss_fn``. Returns ``None`` if the loader yields no batch.
    """
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            # There should only be one batch (the entire validation set)
            for (X, y) in val_loader:
                inputs = vectorize_batch(X).to(device)
                y = y.type(torch.float).to(device)

                pred = torch.flatten(model(inputs))
                loss = loss_fn(pred, y)
                # scikit-learn's signature is f1_score(y_true, y_pred):
                # ground truth first, thresholded predictions second.
                f1 = f1_score(y.cpu(), torch.round(pred).cpu(), average='macro')
                accuracy = compute_accuracy(pred, y)
                return loss, accuracy, f1
    finally:
        # Restore the caller's train/eval mode even on early return or error.
        model.train(was_training)


#Training the model

In [None]:
# Set hyperparameters
EPOCHS = 1  # number of passes over the training set
BATCH_SIZE = 32  # training batch size
N_EVAL = 100  # save a checkpoint and evaluate every N_EVAL training batches
HIDDEN_DIM = 64  # LSTM hidden state size
SEED = 17  # random_state for the train/validation split
NUM_LAYERS = 1  # number of stacked LSTM layers

Mounted at /content/drive


In [None]:
# Load the data we downloaded.
# The two %cd magics switch the working directory to /content, where the
# download output shows the CSV was saved, so the relative path below resolves.

%cd /
%cd content
data_path = 'quora_questions_train.csv'
data_pd = pd.read_csv(data_path)

/
/content


In [None]:
data_pd.head()

Unnamed: 0,qid,question_text,target
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0


In [None]:
# Hold out 5% of the labelled data for validation. The split is stratified on
# the target column and shuffled reproducibly with SEED.
data, val = train_test_split(
    data_pd,
    test_size=0.05,
    stratify=data_pd['target'],
    shuffle=True,
    random_state=SEED,
)
# Wrap both halves so the DataLoaders can serve (question, label) pairs.
train_dataset, val_dataset = RawWordsDataset(data), RawWordsDataset(val)

In [None]:
# Define the model
# Seed PyTorch's RNG first so the randomly initialised weights (and the
# DataLoader shuffling that follows) are reproducible from run to run.
torch.manual_seed(SEED)
model = LSTMNetwork(embed_len, HIDDEN_DIM, NUM_LAYERS, max_words)

In [None]:
# Move the model onto DEVICE (the GPU when CUDA is available, otherwise the CPU).
# As the last expression of the cell, the call also displays the model summary.
model.to(DEVICE)

LSTMNetwork(
  (recurrent_layer): LSTM(50, 64)
  (classifier): Linear(in_features=64, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [None]:
# Run the training loop.
# glove_train writes checkpoints under /content/drive/MyDrive, so Google Drive
# must be mounted before this cell runs.
glove_train(
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    model=model,
    hyperparameters={"epochs": EPOCHS, "batch_size": BATCH_SIZE},
    n_eval=N_EVAL,
    device = DEVICE
)

Epoch 1 of 1


0it [00:00, ?it/s]

loss:  tensor(0.6594, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
accuracy:  0.90625
validation loss:  tensor(0.6520, device='cuda:0')
validation accuracy:  0.9247247615110172
f1 score:  0.5305059814021125
loss:  tensor(0.3292, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
accuracy:  0.90625
validation loss:  tensor(0.2215, device='cuda:0')
validation accuracy:  0.9381230189719325
f1 score:  0.4840368799032969
loss:  tensor(0.4555, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
accuracy:  0.84375
validation loss:  tensor(0.2015, device='cuda:0')
validation accuracy:  0.9381230189719325
f1 score:  0.4840368799032969
loss:  tensor(0.2239, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
accuracy:  0.90625
validation loss:  tensor(0.1778, device='cuda:0')
validation accuracy:  0.9381230189719325
f1 score:  0.4840368799032969
loss:  tensor(0.1717, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward0>)
accuracy:  0.96875
validation loss:  tensor(0.1728

#Evaluation

In [None]:
# Load a saved model
# Your path will probably look different
joshua_saves_path = "/content/drive/MyDrive/ACM AI Stuff/GloveLSTMSaves/"
model_name = joshua_saves_path + "GloveLSTM38700.pt"

# The checkpoint is a whole pickled module (written with torch.save(model, ...)),
# so only load files you trust: unpickling can execute arbitrary code.
# map_location places the weights on DEVICE even if the checkpoint was saved
# from another device (e.g. trained on a GPU, evaluated on a CPU-only runtime).
# NOTE: on PyTorch releases where torch.load defaults to weights_only=True,
# a full-module checkpoint like this one also needs weights_only=False.
trained_model = torch.load(model_name, map_location=DEVICE)
trained_model.eval()

LSTMNetwork(
  (recurrent_layer): LSTM(50, 64)
  (classifier): Linear(in_features=64, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [None]:
# Load the test data.
# The two %cd magics switch the working directory to /content, where the
# download output shows the CSV was saved, so the relative path below resolves.
%cd /
%cd content
test_path = 'quora_questions_test.csv'
test_pd = pd.read_csv(test_path)

/
/content


In [None]:
test_pd.head()

Unnamed: 0,qid,question_text
0,0000163e3ea7c7a74cd7,Why do so many women become so rude and arroga...
1,00002bd4fb5d505b9161,When should I apply for RV college of engineer...
2,00007756b4a147d2b0b3,What is it really like to be a nurse practitio...
3,000086e4b7e1c7146103,Who are entrepreneurs?
4,0000c4c3fbe8785a3090,Is education really making good people nowadays?


In [None]:
# Number of entries in the qid column, i.e. the number of test rows;
# used as the bound of the prediction loop.
test_size = test_pd["qid"].size
test_size

375806

In [None]:
# Use our model to predict on the test data and save our guesses
guesses = []
for i in tqdm(range(test_size)):
  input = test_pd.loc[i, "question_text"]
  input_tokenized = tokenizer(input)
  input_tokenized = input_tokenized+[""] * (max_words-len(input_tokenized))  if len(input_tokenized)<max_words else input_tokenized[:max_words]
  input_vectorized = global_vectors.get_vecs_by_tokens(input_tokenized)
  input_vectorized = input_vectorized.to(DEVICE)
  pred = trained_model(input_vectorized)
  pred = torch.round(torch.squeeze(pred)).item()
  guesses.append(pred)

  0%|          | 0/375806 [00:00<?, ?it/s]

In [None]:
# One-column frame of predictions, in the order the guesses were appended.
final_answers_pd = pd.DataFrame(guesses, columns = ["prediction"])

In [None]:
final_answers_pd.head()

Unnamed: 0,prediction
0,1.0
1,0.0
2,0.0
3,0.0
4,0.0


In [None]:
# Double brackets keep qid as a one-column DataFrame rather than a Series.
qids = test_pd[["qid"]]

In [None]:
qids.head()

Unnamed: 0,qid
0,0000163e3ea7c7a74cd7
1,00002bd4fb5d505b9161
2,00007756b4a147d2b0b3
3,000086e4b7e1c7146103
4,0000c4c3fbe8785a3090


In [None]:
# Put qid and prediction side by side; axis = 1 concatenates columns,
# matching rows by index.
submission_pd = pd.concat([qids, final_answers_pd], axis = 1)

In [None]:
submission_pd.head()

Unnamed: 0,qid,prediction
0,0000163e3ea7c7a74cd7,1.0
1,00002bd4fb5d505b9161,0.0
2,00007756b4a147d2b0b3,0.0
3,000086e4b7e1c7146103,0.0
4,0000c4c3fbe8785a3090,0.0


In [None]:
# index=False keeps the DataFrame's row index out of the file, so the CSV
# contains only the qid and prediction columns (no unnamed leading column).
submission_pd.to_csv("/content/drive/MyDrive/ACM AI Stuff/GloveLSTMSaves/glovelstm38700.csv", index=False)