In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from torch.utils.data import Dataset, random_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import codecs
from dataloaders import *
from processor import *

In [2]:
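# One-time setup: download and unpack the GloVe 6B word vectors.
# Uncomment and run once if the GloVe files are not already on disk, then re-comment.
# !wget http://nlp.stanford.edu/data/glove.6B.zip
# !unzip glove*.zip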

--2021-02-17 21:50:53--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2021-02-17 21:50:53--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2021-02-17 21:50:53--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip.1’

glove

In [2]:
# Setting random seed and device
SEED = 1

# Seed every RNG this notebook can draw from so that Restart & Run All is repeatable:
# NumPy, the PyTorch CPU generator, and the generators of all visible CUDA devices.
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)  # silently ignored when CUDA is unavailable

# Ask cuDNN for deterministic kernels and switch off its autotuner, which may
# otherwise pick different algorithms from run to run.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Use the first GPU when one is available, otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")

In [3]:
# Load the task-1 CSV splits. Note that `test_df` is read from dev.csv, so the
# "test" names in this cell refer to the dev split.
train_df = pd.read_csv('data/task-1/train.csv')
test_df = pd.read_csv('data/task-1/dev.csv')

# Columns used from each split: the original text and its associated edit.
training_data = train_df['original']
training_edits = train_df['edit']
test_data = test_df['original']
test_edits = test_df['edit']

# create_edited_sentences comes from a project module (star import) and is not visible here;
# presumably it substitutes each edit into its original sentence -- TODO confirm.
edited_training = pd.Series(create_edited_sentences(training_data, training_edits))
edited_test = pd.Series(create_edited_sentences(test_data, test_edits))

# Build vocabularies and tokenized corpora (create_vocab is a project helper; it returns a pair).
# Three vocabularies are built: train only, dev only, and train+dev combined.
# Within this cell only `joint_vocab` and `training_tokenized_corpus` are used afterwards.
training_vocab, training_tokenized_corpus = create_vocab(edited_training)
test_vocab, test_tokenized_corpus = create_vocab(edited_test)
joint_vocab, joint_tokenized_corpus = create_vocab(pd.concat([edited_training, edited_test]))

# Regression target for the training split.
training_grades = train_df['meanGrade']

# The training sentences are vectorized against the joint (train+dev) vocabulary.
training_vector_sentences = vectorize_sentences(training_tokenized_corpus, joint_vocab)
training_dataset = Task1Dataset(training_vector_sentences, training_grades)

# dataset_split is a project helper; the split proportions are not visible here.
train_dataset, validation_dataset = dataset_split(training_dataset)

# Only the training loader shuffles; both loaders batch through the project's collate_fn_padd
# (presumably pads each batch to a common length -- TODO confirm).
BATCH_SIZE = 32
train_loader = torch.utils.data.DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE, collate_fn=collate_fn_padd)
validation_loader = torch.utils.data.DataLoader(validation_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn_padd)

# NOTE(review): the literal 100 is presumably the GloVe embedding dimension and should agree
# with EMBEDDING_DIM = 100 defined in the model-setup cell -- confirm and consider sharing one constant.
glove_tensor, words_not_in_glove = build_embedding_tensor(joint_vocab, 100)
print("Number of words not in GloVe: {}".format(words_not_in_glove), flush=True)

Word 
Word chibok
Word stupefy
Word daca
Word schlapp
Word manassian
Word shithole
Word dotards
Word deplorableness
Word jives
Word tipline
Word ukraines
Word kushners
Word brexit
Word delingpole
Word orangeness
Word pocohontas
Word aecon
Word 300mw
Word canoodles
Word #war
Word reince
Word mcenany
Word q&amp;a
Word subreddit
Word graveling
Word cyberweapons
Word warmbier
Word #metoo
Word guantรกnamo
Word scaramucci
Word hollyweed
Word infowars.com
Word myeshia
Word npr/ipsos
Word dubke
Word sh*t
Word huffpo
Word wapo
Word maute
Word !!!!!!!!!!
Word kusher
Word #womensmarch
Word catfishing
Word bigly
Word rodchenkov
Word disinvites
Word covfefe
Word schlapps
Word puzder
Word redecorates
Word frexit
Word bamboozles
Word gianforte
Word whcd
Word bagpiping
Word wannacry
Word unfollowed
Word nobost
Word zarrab
Word transtemporalize
Word 418m
Word frightbart
Word bridgegate
Word exerpts
Word counterspies
Word 133m
Word selfie
Word trumpcare
Word strzok
Word dumbs
Word 230k
Word inaguration


In [4]:
def model_performance(output, target, print_output=False):
    """
    Compute squared-error statistics between predictions and targets.

    Args:
        output: predicted values (array-like supporting elementwise arithmetic).
        target: reference values, broadcast-compatible with `output`.
        print_output: when True, print the MSE and RMSE for this batch.

    Returns:
        A `(sse, mse)` pair: the sum and the mean of the squared errors.
    """
    residual = output - target
    squared = residual * residual

    total_squared_error = np.sum(squared)
    mean_squared_error = np.mean(squared)

    if print_output:
        # RMSE is only needed for the printed report; it is not returned.
        root_mse = np.sqrt(mean_squared_error)
        print(f'| MSE: {mean_squared_error:.2f} | RMSE: {root_mse:.2f} |')

    return total_squared_error, mean_squared_error


In [6]:
class FFNN(nn.Module):
    """
    Feed-forward regressor over the mean of pretrained word embeddings.

    Each input row is a sequence of vocabulary indices in which index 0 marks
    padding. The non-padding embeddings are averaged and passed through three
    ReLU hidden layers (20, 10, 5 units) and a single-unit linear output.

    Args:
        glove_tensor: 2-D float tensor of pretrained embeddings, one row per
            vocabulary index; its second dimension must equal `embedding_dim`.
        embedding_dim: width of each embedding vector (input size of `fc1`).
        vocab_size: kept for backward compatibility; the embedding table size
            is taken from `glove_tensor` itself.
    """

    def __init__(self, glove_tensor, embedding_dim, vocab_size):
        super(FFNN, self).__init__()
        # `from_pretrained` is a classmethod that builds a brand-new layer, so it
        # must be called on the class with `padding_idx` passed to it directly;
        # calling it on an instance discards that instance and its padding_idx.
        # The weights stay frozen (the `freeze=True` default of from_pretrained).
        self.embedding = nn.Embedding.from_pretrained(glove_tensor, padding_idx=0)
        # hidden layer
        self.fc1 = nn.Linear(embedding_dim, 20)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(20, 10)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(10, 5)
        self.relu3 = nn.ReLU()
        # output layer
        self.output = nn.Linear(5, 1)

    def forward(self, x):
        """
        Args:
            x: integer tensor of shape (batch, seq_len) holding vocabulary
                indices, with 0 used as the padding index.

        Returns:
            Tensor of shape (batch, 1) with one prediction per row.
        """
        not_pad = x.ne(0)
        # Clamp so that a row made only of padding yields a zero vector instead of 0/0 = NaN.
        sentence_lengths = not_pad.sum(1, keepdim=True).clamp(min=1)
        # Zero out padded positions: the pretrained row at index 0 is not
        # guaranteed to be all zeros, and it must not leak into the average.
        embedded = self.embedding(x) * not_pad.unsqueeze(-1)
        averaged = embedded.sum(1) / sentence_lengths
        out = self.fc1(averaged)
        out = self.relu1(out)
        out = self.fc2(out)
        out = self.relu2(out)
        out = self.fc3(out)
        out = self.relu3(out)
        out = self.output(out)
        return out

In [12]:
class CNN(nn.Module):
  """
  Convolutional regressor over pretrained word embeddings.

  A single Conv2d slides a window of `window_size` tokens across the full
  embedding width, the feature maps are ReLU-activated and max-pooled over
  time, and two linear layers (with dropout before each) produce the output.

  Args:
    glove: 2-D float tensor of pretrained embeddings, one row per vocabulary
      index; its second dimension must equal `embedding_dim`.
    vocab_size: kept for backward compatibility; the embedding table size is
      taken from `glove` itself.
    embedding_dim: width of each embedding vector (and of the conv kernel).
    output_channels: number of convolution filters.
    window_size: number of consecutive tokens covered by each filter.
    out_dim: size of the final output layer.
    dropout: dropout probability applied before each linear layer.
  """

  def __init__(self, glove, vocab_size, embedding_dim, output_channels, window_size, out_dim, dropout):
    super(CNN, self).__init__()

    # `from_pretrained` is a classmethod that builds a brand-new layer, so it is
    # called on the class with `padding_idx` passed to it directly; calling it on
    # an instance discards that instance together with its padding_idx.
    # The weights stay frozen (the `freeze=True` default of from_pretrained).
    self.embedding = nn.Embedding.from_pretrained(glove, padding_idx=0)

    self.conv = nn.Conv2d(
      in_channels=1, out_channels=output_channels,
      kernel_size=(window_size, embedding_dim))

    self.drop = nn.Dropout(dropout)
    self.fc1 = nn.Linear(output_channels, 16)
    self.fc2 = nn.Linear(16, out_dim)

  def forward(self, x):
    """
    Args:
      x: integer tensor of shape (batch, seq_len) holding vocabulary indices,
        with 0 used as the padding index.

    Returns:
      Tensor of shape (batch, out_dim).
    """
    # The convolution needs at least `window_size` tokens; right-pad shorter
    # batches with the padding index instead of failing inside Conv2d.
    min_len = self.conv.kernel_size[0]
    if x.shape[1] < min_len:
      filler = x.new_zeros(x.shape[0], min_len - x.shape[1])
      x = torch.cat([x, filler], dim=1)

    x_embed = self.embedding(x)
    # Add the single input channel expected by Conv2d.
    x_embed = x_embed.unsqueeze(1)
    x_embed = self.conv(x_embed)

    # The kernel spans the whole embedding width, so the last dimension is 1.
    x_embed = x_embed.squeeze(3)

    x_embed = F.relu(x_embed)
    # Max over all window positions: one value per filter.
    x_embed = F.max_pool1d(x_embed, x_embed.shape[2])

    x_embed = x_embed.squeeze(2)

    x_embed = self.drop(x_embed)
    x_embed = self.fc1(x_embed)
    x_embed = self.drop(x_embed)
    out = self.fc2(x_embed)

    return out

In [6]:
def eval(data_iter, model):
    """
    Score `model` on every batch of `data_iter` with gradients disabled.

    The model is switched to evaluation mode. Relies on the notebook-level
    `device`, `loss_fn` and `model_performance`.

    Returns:
        A 4-tuple of
        (batch-size-weighted mean of the per-batch loss,
         sum of squared errors divided by the number of observations,
         all predictions as a NumPy array,
         all targets as a NumPy array).
    """
    model.eval()
    loss_total, sse_total, n_seen = 0, 0, 0
    collected_preds, collected_targets = [], []

    with torch.no_grad():
        for feature, target in data_iter:
            feature = feature.to(device)
            target = target.to(device)
            batch_n = target.shape[0]
            # For an RNN model, set model.batch_size = target.shape[0] and
            # reset model.hidden = model.init_hidden() at this point.
            n_seen += batch_n

            predictions = model(feature).squeeze(1)
            loss = loss_fn(predictions, target)

            # Squared-error bookkeeping is done on CPU NumPy copies.
            pred_np = predictions.detach().cpu().numpy()
            trg_np = target.detach().cpu().numpy()
            batch_sse = model_performance(pred_np, trg_np)[0]

            # Weight the batch loss by its size so the final division gives a per-observation figure.
            loss_total += loss.item() * batch_n
            sse_total += batch_sse
            collected_preds.extend(pred_np)
            collected_targets.extend(trg_np)

    return (
        loss_total / n_seen,
        sse_total / n_seen,
        np.array(collected_preds),
        np.array(collected_targets),
    )

In [10]:
EMBEDDING_DIM = 100
HIDDEN_DIM = 50
# Alternative model:
# model = FFNN(glove_tensor, EMBEDDING_DIM, len(joint_vocab)).to(device)
model = CNN(
    glove_tensor,
    len(joint_vocab),
    EMBEDDING_DIM,
    output_channels=3,
    window_size=5,
    out_dim=1,
    dropout=0.2,
)
# The training and evaluation loops move every batch to `device`, so the model
# has to live there too; otherwise a CUDA run fails with a device mismatch.
# Done before the optimizer is built so it is constructed from the on-device parameters.
model = model.to(device)
optimizer = optim.Adam(model.parameters())  # Adam with its default hyper-parameters
loss_fn = nn.MSELoss()

def train(train_loader, validation_loader, model, number_epoch):
    """
    Fit `model` for `number_epoch` passes over `train_loader`.

    After every epoch the model is scored on `validation_loader` via `eval`
    and one summary line with train/validation loss, MSE and RMSE is printed.
    Relies on the notebook-level `device`, `optimizer`, `loss_fn`,
    `model_performance` and `eval`. Returns nothing.
    """
    print("Training model.")
    for epoch in range(1, number_epoch + 1):
        model.train()
        # Per-epoch accumulators: weighted loss, squared error, and observations seen so far.
        running_loss, running_sse, seen = 0, 0, 0
        for feature, target in train_loader:
            feature = feature.to(device)
            target = target.to(device)
            batch_n = target.shape[0]
            # For an RNN model, set model.batch_size = target.shape[0] and
            # reset model.hidden = model.init_hidden() at this point.
            seen += batch_n

            predictions = model(feature).squeeze(1)
            optimizer.zero_grad()
            loss = loss_fn(predictions, target)
            batch_sse = model_performance(
                predictions.detach().cpu().numpy(),
                target.detach().cpu().numpy(),
            )[0]
            loss.backward()
            optimizer.step()

            # Weight the batch loss by its size so the epoch figure is per observation.
            running_loss += loss.item() * batch_n
            running_sse += batch_sse

        valid_loss, valid_mse = eval(validation_loader, model)[:2]

        epoch_loss = running_loss / seen
        epoch_mse = running_sse / seen
        print(f'| Epoch: {epoch:02} | Train Loss: {epoch_loss:.2f} | Train MSE: {epoch_mse:.2f} | Train RMSE: {epoch_mse**0.5:.2f} | \
        Val. Loss: {valid_loss:.2f} | Val. MSE: {valid_mse:.2f} |  Val. RMSE: {valid_mse**0.5:.2f} |')

In [13]:
# Run the training loop for 200 epochs; one metrics line is printed per epoch.
train(
    train_loader=train_loader,
    validation_loader=validation_loader,
    model=model,
    number_epoch=200,
)

Training model.
| Epoch: 01 | Train Loss: 0.30 | Train MSE: 0.30 | Train RMSE: 0.55 |         Val. Loss: 0.31 | Val. MSE: 0.31 |  Val. RMSE: 0.56 |
| Epoch: 02 | Train Loss: 0.29 | Train MSE: 0.29 | Train RMSE: 0.54 |         Val. Loss: 0.32 | Val. MSE: 0.32 |  Val. RMSE: 0.56 |
| Epoch: 03 | Train Loss: 0.29 | Train MSE: 0.29 | Train RMSE: 0.54 |         Val. Loss: 0.32 | Val. MSE: 0.32 |  Val. RMSE: 0.56 |
| Epoch: 04 | Train Loss: 0.29 | Train MSE: 0.29 | Train RMSE: 0.54 |         Val. Loss: 0.32 | Val. MSE: 0.32 |  Val. RMSE: 0.56 |
| Epoch: 05 | Train Loss: 0.29 | Train MSE: 0.29 | Train RMSE: 0.54 |         Val. Loss: 0.31 | Val. MSE: 0.31 |  Val. RMSE: 0.56 |
| Epoch: 06 | Train Loss: 0.29 | Train MSE: 0.29 | Train RMSE: 0.54 |         Val. Loss: 0.32 | Val. MSE: 0.32 |  Val. RMSE: 0.56 |
| Epoch: 07 | Train Loss: 0.29 | Train MSE: 0.29 | Train RMSE: 0.54 |         Val. Loss: 0.32 | Val. MSE: 0.32 |  Val. RMSE: 0.56 |
| Epoch: 08 | Train Loss: 0.28 | Train MSE: 0.28 | Train RMS