## **NLP Team Project**
Problem Statement: Classifying Quora questions as insincere or not.


### Loading datasets and preliminaries

In [None]:
# Download the training data
!gdown "https://drive.google.com/uc?export=download&id=199EtucyfnhZOIHq_jfGjODTZm3Scz5yC"

Downloading...
From: https://drive.google.com/uc?export=download&id=199EtucyfnhZOIHq_jfGjODTZm3Scz5yC
To: /content/quora_questions_train.csv
100% 124M/124M [00:01<00:00, 96.6MB/s]


In [None]:
# Download the test data
!gdown "https://drive.google.com/uc?export=download&id=1hL9DZvEY-HCOh3FXVtUIuS1CyM8zhRZn"

Downloading...
From: https://drive.google.com/uc?export=download&id=1hL9DZvEY-HCOh3FXVtUIuS1CyM8zhRZn
To: /content/quora_questions_test.csv
100% 35.0M/35.0M [00:00<00:00, 105MB/s]


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import os
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import f1_score
from tqdm import tqdm
import numpy as np
SEED = 23

### Creating the BagofWords Dataset to get data ready for training

In [None]:
class BagofwordsDataset(torch.utils.data.Dataset):
    """Dataset pairing each question's bag-of-words count vector with its label.

    Args:
        data:       DataFrame exposing `question_text` and `target` columns.
        vectorizer: Already-fitted vectorizer providing `transform()` and
                    `vocabulary_` (e.g. a scikit-learn CountVectorizer).
    """

    def __init__(self, data, vectorizer):
        self.df = data
        questions = self.df.question_text.tolist()
        # One row of term counts per question; expected to be a sparse matrix,
        # since `.toarray()` is called on its rows in __getitem__.
        self.sequences = vectorizer.transform(questions)
        # Labels, in the same row order as `sequences`
        self.labels = self.df.target.tolist()
        # token -> column index in `sequences` (NOT a count)
        self.token2idx = vectorizer.vocabulary_
        # column index -> token (inverse lookup)
        self.idx2token = dict((idx, token) for token, idx in self.token2idx.items())

    def __len__(self):
        n_samples, _ = self.sequences.shape
        return n_samples

    def __getitem__(self, i):
        # Densify only the requested row; the result keeps its leading
        # row axis, i.e. it is a 2-D array with a single row.
        row = self.sequences[i, :]
        return row.toarray(), self.labels[i]

### Neural net design

In [None]:
class StartingNetwork(torch.nn.Module):
    """Small feed-forward binary classifier: Linear -> Linear -> Sigmoid.

    Args:
        input_dim:  Number of input features per sample.
        hidden_dim: Width of the intermediate layer.

    NOTE(review): there is no activation between `fc1` and `fc2`, so the two
    layers compose into a single affine map before the sigmoid. Adding a
    nonlinearity would change the outputs of already-saved checkpoints, so it
    is deliberately left as is here.
    """

    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        # Attribute names are part of the checkpoint format; keep them stable.
        self.fc1 = nn.Linear(in_features=input_dim, out_features=hidden_dim)
        self.fc2 = nn.Linear(in_features=hidden_dim, out_features=1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map inputs of shape (..., input_dim) to values in (0, 1) of shape (..., 1)."""
        return self.sigmoid(self.fc2(self.fc1(x)))

## Training the model (and evaluating loss) in epochs! ⏰⏱

In [None]:
def starting_train(train_dataset, val_dataset, model, hyperparameters, n_eval):
    """
    Trains and evaluates a model.
    Args:
        train_dataset:   PyTorch dataset containing training data.
        val_dataset:     PyTorch dataset containing validation data.
        model:           PyTorch model to be trained. Its output must be a
                         probability per sample (the loss is nn.BCELoss).
        hyperparameters: Dictionary containing hyperparameters. Required keys:
                         "batch_size", "epochs". Optional key: "save_dir", the
                         directory checkpoints are written to (defaults to the
                         original Google Drive location).
        n_eval:          Interval (in batches) at which we checkpoint and
                         evaluate our model.
    """

    # Get keyword arguments
    batch_size, epochs = hyperparameters["batch_size"], hyperparameters["epochs"]

    # CHANGE PATH AS NECESSARY (or pass hyperparameters["save_dir"])
    save_dir = hyperparameters.get(
        "save_dir", "/content/drive/MyDrive/Colab Notebooks/BagOfWordsSaves/"
    )

    # Initialize dataloaders
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size, shuffle=True
    )

    # Note: batch_size = len(val_dataset), so that's the whole validation set.
    # Shuffling is pointless for evaluation, so it is turned off.
    val_loader = torch.utils.data.DataLoader(
        val_dataset, batch_size=len(val_dataset), shuffle=False
    )

    # Initialize optimizer (for gradient descent) and loss function
    optimizer = optim.Adam(model.parameters())
    loss_fn = nn.BCELoss()

    for epoch in range(epochs):
        print(f"Epoch {epoch + 1} of {epochs}")
        model.train()

        # Loop over each batch in the dataset. tqdm wraps the loader itself
        # (not the enumerate object) so the progress bar knows the total.
        for batch, (X, y) in enumerate(tqdm(train_loader)):
            # Predictions and loss
            X = X.type(torch.float)
            y = y.type(torch.float).reshape(-1)

            # Flatten to one probability per sample. reshape(-1) keeps a 1-D
            # tensor even for a final batch of size 1, where squeezing would
            # collapse to a 0-d tensor and break the loss' shape check.
            pred = model(X).reshape(-1)
            loss = loss_fn(pred, y)

            # Backpropagation and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Periodically checkpoint and evaluate our model
            if batch % n_eval == 0:
                # NOTE: the file name encodes only the batch index, so with
                # more than one epoch later epochs overwrite earlier files.
                name = "BagOfWords" + str(batch) + ".pt"
                torch.save(model, os.path.join(save_dir, name))

                # Compute training loss and accuracy.
                accuracy = compute_accuracy(pred.detach(), y)
                print("batch loss: ", loss.item())
                print("batch accuracy: ", accuracy)

                # Compute validation loss and accuracy.
                val_loss, val_accuracy, f1 = evaluate(val_loader, model, loss_fn)
                print("validation loss: ", val_loss)
                print("validation accuracy: ", val_accuracy)
                print("f1 score: ", f1)
                # Make sure we are back in training mode after evaluation.
                model.train()
                # TODO: Log the results to Tensorboard.



def compute_accuracy(outputs, labels):
    """Fraction of samples whose rounded output equals its label.

    Args:
        outputs: Predicted probabilities, any shape (e.g. (B,), (B, 1) or 0-d).
        labels:  Ground-truth 0/1 labels with the same number of elements.

    Returns:
        Accuracy as a Python float in [0, 1]; 0.0 when there are no samples.

    Raises:
        ValueError: if `outputs` and `labels` hold different numbers of elements.
    """
    # Flatten both sides so a (B, 1) output compared with (B,) labels cannot
    # silently broadcast to a (B, B) matrix and inflate the count.
    predicted = torch.round(torch.as_tensor(outputs)).reshape(-1)
    expected = torch.as_tensor(labels).reshape(-1)

    n_total = predicted.numel()
    if n_total != expected.numel():
        raise ValueError(
            f"outputs has {n_total} elements but labels has {expected.numel()}"
        )
    if n_total == 0:
        return 0.0

    n_correct = (predicted == expected).sum().item()
    return n_correct / n_total


def evaluate(val_loader, model, loss_fn):
    """Compute loss, accuracy and macro F1 of `model` over `val_loader`.

    Predictions from every batch the loader yields are gathered and scored
    together, so the result is the same whether the validation set arrives as
    one batch or several.

    Args:
        val_loader: Iterable of (X, y) batches.
        model:      Model producing one probability per sample.
        loss_fn:    Loss taking (predictions, targets), e.g. nn.BCELoss().

    Returns:
        (loss, accuracy, f1): loss as returned by `loss_fn` (a tensor),
        accuracy as a float, and macro-averaged F1 as a float.

    Raises:
        ValueError: if `val_loader` yields no batches.
    """
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            all_preds, all_targets = [], []
            for (X, y) in val_loader:
                X = X.type(torch.float)
                y = y.type(torch.float)
                # reshape(-1) keeps one entry per sample even for a batch of 1
                all_preds.append(model(X).reshape(-1))
                all_targets.append(y.reshape(-1))

            if not all_preds:
                raise ValueError("val_loader yielded no batches; nothing to evaluate")

            pred = torch.cat(all_preds)
            y = torch.cat(all_targets)

            loss = loss_fn(pred, y)
            # sklearn's signature is f1_score(y_true, y_pred, ...)
            f1 = f1_score(
                y.cpu().numpy(), torch.round(pred).cpu().numpy(), average='macro'
            )
            accuracy = compute_accuracy(pred, y)
            return loss, accuracy, f1
    finally:
        # Leave the model in whatever mode the caller had it in.
        model.train(was_training)

## Using train data to run the training function we wrote 😅

In [None]:
# Model size
HIDDEN_DIM = 64    # width of the hidden layer passed to StartingNetwork

# Training schedule
BATCH_SIZE = 32    # samples per optimisation step
EPOCHS = 1         # full passes over the training set
N_EVAL = 100       # checkpoint + evaluate every N_EVAL batches

In [None]:
# Work from /content so the relative CSV path below resolves.
%cd /
%cd content
data_path = 'quora_questions_train.csv'
data_pd = pd.read_csv(data_path)
# Hold out 5% of the rows for validation, stratified on the label so both
# splits keep the same class balance; SEED makes the split reproducible.
data, val = train_test_split(data_pd, test_size = 0.05, stratify = data_pd['target'], shuffle = True, random_state = SEED)
# Drop English stop words and prune the vocabulary by document frequency
# (float max_df / min_df are proportions of documents).
vectorizer = CountVectorizer(stop_words='english', max_df=0.99, min_df=0.005)
# Learn the vocabulary from the training split only (the validation split is not used for fitting).
fit = vectorizer.fit(data.question_text.tolist()) # fit() returns the fitted vectorizer itself, not a count matrix
# Both datasets share the same fitted vectorizer, so their columns line up.
train_dataset = BagofwordsDataset(data,fit)
val_dataset = BagofwordsDataset(val,fit)

/
/content


In [None]:
# Input width = number of columns in the bag-of-words count matrix.
model = StartingNetwork(
    input_dim=train_dataset.sequences.shape[1],
    hidden_dim=HIDDEN_DIM,
)

In [None]:
# Keys read by starting_train: "batch_size" and "epochs".
hyperparameters = dict(batch_size=BATCH_SIZE, epochs=EPOCHS)

In [None]:
# Train the model, checkpointing and evaluating every N_EVAL batches.
starting_train(train_dataset, val_dataset, model, hyperparameters, N_EVAL)

## Evaluation and archiving of model for later use 🤔

In [None]:
# NOTE(review): this directory ("ACM AI Stuff") differs from the one the
# training loop saves to ("Colab Notebooks") — confirm the checkpoint was
# moved here, or point this at the training save directory.
joshua_saves_path = "/content/drive/MyDrive/ACM AI Stuff/BagOfWordsSaves/"
model_name = joshua_saves_path + "BagOfWords38700.pt"

# The checkpoint is a fully pickled nn.Module (written with
# torch.save(model, ...)), not a state_dict, so weights_only must be False;
# newer PyTorch releases default it to True and refuse to unpickle the class.
# Unpickling executes arbitrary code — only load checkpoints you created.
# The StartingNetwork class must already be defined in this session.
trained_model = torch.load(model_name, map_location="cpu", weights_only=False)
trained_model.eval()

StartingNetwork(
  (fc1): Linear(in_features=110, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

## Some data processing to run test dataset! 🤓

In [None]:
# Work from /content so the relative CSV path below resolves.
%cd /
%cd content
test_path = 'quora_questions_test.csv'
# Unlabelled test questions to generate predictions for.
test_pd = pd.read_csv(test_path)

/
/content


In [None]:
test_pd.head()

Unnamed: 0,qid,question_text
0,0000163e3ea7c7a74cd7,Why do so many women become so rude and arroga...
1,00002bd4fb5d505b9161,When should I apply for RV college of engineer...
2,00007756b4a147d2b0b3,What is it really like to be a nurse practitio...
3,000086e4b7e1c7146103,Who are entrepreneurs?
4,0000c4c3fbe8785a3090,Is education really making good people nowadays?


In [None]:
test_size = test_pd["qid"].size
test_size

375806

In [None]:
test_pd.loc[0, "question_text"]

'Why do so many women become so rude and arrogant when they get just a little bit of wealth and power?'

## Preparing CSV with predictions generated on test dataset 🔮🔮

In [None]:
# The model consumes bag-of-words count vectors, not raw strings, so the test
# questions must go through the SAME fitted vectorizer that produced the
# training features.
# TODO: save the fitted vectorizer alongside the model checkpoint; for now
# this relies on `vectorizer` from the data-preparation cell of this session.
test_questions = test_pd["question_text"].fillna("").astype(str).tolist()
test_counts = vectorizer.transform(test_questions)

# Guard against pairing a checkpoint with a vectorizer of a different width.
# (Equal width does not prove the vocabularies are identical — see TODO above.)
expected_dim = trained_model.fc1.in_features
if test_counts.shape[1] != expected_dim:
    raise ValueError(
        f"vectorizer produces {test_counts.shape[1]} features but the model "
        f"expects {expected_dim}; use the vectorizer the model was trained with"
    )

PRED_BATCH_SIZE = 4096  # rows densified and scored per forward pass

guesses = []
trained_model.eval()
with torch.no_grad():  # inference only; no gradients needed
    for start in range(0, test_counts.shape[0], PRED_BATCH_SIZE):
        # Densify one slice at a time to keep memory bounded.
        dense = test_counts[start:start + PRED_BATCH_SIZE].toarray()
        X = torch.from_numpy(dense).type(torch.float)
        pred = trained_model(X).reshape(-1)
        # Round probabilities to hard 0.0 / 1.0 predictions.
        guesses.extend(torch.round(pred).tolist())

TypeError: ignored