In [21]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, random_split
import nltk
import torch.optim as optim
import re
import pandas as pd
import spacy

!python3 -m spacy download en_core_web_sm
ner = spacy.load("en_core_web_sm")
nltk.download('stopwords')

Collecting en-core-web-sm==3.0.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0-py3-none-any.whl (13.7 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


[nltk_data] Downloading package stopwords to /Users/Matt/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [33]:
# Load data
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('dev.csv')
train_proportion = 0.8

original_words = []
for i in range(len(train_df)):
  original_words.append(re.search(r'<(.*)/>', train_df['original'][i]).group(1))
  train_df.at[i, 'original'] = re.sub(r'<.*/>', train_df['edit'][i], train_df['original'][i])
train_df['originalWord'] = original_words


In [30]:
class Task1DatasetNN(Dataset):

  def __init__(self, train_data, labels, max_len, word2idx, entity2idx, stopwords=nltk.corpus.stopwords.words('english')):
    self.x_train = train_data
    self.y_train = labels
    self.stopwords = stopwords
    self.max_len = max_len
    self.word2idx = word2idx
    self.entity2idx = entity2idx

  def __len__(self):
    return len(self.y_train)

  def __getitem__(self, item):
    headline = self.x_train[item]

    # Remove punctuation
    headline = re.sub(r'[^\w\s]', '', headline)

    # Map headlines to list of entity IDs
    # entities = [self.entity2idx[str(e)] for e in ner(headline).ents if str(e) in self.entity2idx]

    # Tokenize, lowercase, remove stop words
    headline = [w.lower() for w in headline.split() if w not in self.stopwords and len(w)]

    # Get headline vector
    vectorized_headline = [self.word2idx[tok] for tok in headline if tok in self.word2idx]
    headline_tensor = torch.zeros((self.max_len,)).long()
    headline_tensor[:len(vectorized_headline)] = torch.LongTensor(vectorized_headline)

    target_tensor = torch.FloatTensor([self.y_train[item]])
    extra_features = []

    return {
      'encoding': headline_tensor,
      'extra_features': torch.tensor(extra_features),
      'target': target_tensor
    }

In [4]:
TOKEN_PATTERN = re.compile("\w+")
def tokenize(sentence):
  return TOKEN_PATTERN.findall(sentence)

# Remove angle brackets and lowercase
def preprocess(word):
  if len(word) > 2 and word[0] == '<' and word[-2] == '/' and word[-1] == '>' :
    word = word[1:-2]
  return word.lower()

# Create vocab of all words
def create_vocab(data):
  # Let us put the tokenized corpus in a list
  tokenized_corpus = [[preprocess(t) for t in tokenize(s)] for s in data]

  # Create single list of all vocabulary
  vocabulary = []
  for sentence in tokenized_corpus:
    for token in sentence:
      if token not in vocabulary:
        vocabulary.append(token)

  return vocabulary, tokenized_corpus

# TODO: need to tie this in with NER
def build_word2idx(tokenized_corpus):
  vocabulary = []
  for sentence in tokenized_corpus:
    for token in sentence:
      if token not in vocabulary:
        vocabulary.append(token)
  w2i = {w: idx+1 for (idx, w) in enumerate(vocabulary)}
  w2i['<pad>'] = 0
  return w2i

# Get NER lookup table from training data
def build_entity2idx(corpus):
  entities = {}
  idx = 0
  for s in corpus:
    doc = ner(s)
    for e in doc.ents:
      e = str(e)
      if e not in entities:
        entities[e] = idx
        idx+=1
  return entities

In [5]:
# e2i = get_entity2idx(corpus)
# print(e2i)
# print(type(e2i['Trump']))
# efs = get_entity_features(corpus, e2i)
# print(efs[:10])

In [6]:
class FFNN(nn.Module):
  def __init__(self, embedding_dim, hidden_dim, vocab_size, num_classes):
    super(FFNN, self).__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
    self.fc1 = nn.Linear(embedding_dim, hidden_dim)
    self.relu1 = nn.ReLU()
    self.fc2 = nn.Linear(hidden_dim, num_classes)
    self.relu2 = nn.ReLU()

  # Average sentence embedding (ignoring padding)
  # Maps embedded : (batch_size, max_sent_len, embedding_dim) => (batch_size, embeddind_dim)
  def average_embedding(self, embedded):
    # Want to calculate the avg embedding across all words
    # This is the max_sent_len dimension so dim=1
    total_emb = torch.sum(embedded, dim=1)

    # Count number of non-zero embeddings to average over
    non_zero_embs = torch.sum(embedded != 0, dim=[1])
    avg_emb = total_emb / non_zero_embs

    return avg_emb

  def forward(self, x):
    # x has shape (batch_size, max_sent_len)
    embedded = self.embedding(x)

    # embedding has shape (batch_size, max_sent_len, embedding_dim)
    # Compute the average embeddings of shape (batch_size, embedding_dim)
    averaged = self.average_embedding(embedded)

    # TODO: extra features

    out = self.fc1(averaged)
    out = self.relu1(out)
    out = self.fc2(out)
    # out = self.relu2(out)
    return out


In [7]:
EPOCHS = 10
LRATE = 0.1
EMBEDDING_DIM = 20 # ~4th root of vocab size
HIDDEN_DIM = 50
OUTPUT_DIM = 1

# Set training and test data
training_data = train_df['original']
training_labels = train_df['meanGrade']
test_data = test_df['original']

_vocab, tokenized_corpus = create_vocab(training_data)
max_headline_len = max([len([w for w in s.split()]) for s in training_data])
w2i = build_word2idx(tokenized_corpus)
# e2i = build_entity2idx(training_data)
e2i = {}

train_and_dev = Task1DatasetNN(training_data, training_labels, max_headline_len, w2i, e2i)

# TODO: what processing should be done on both train and dev and what should be done on just train???
# Split training data into train and dev sets
train_examples = round(len(training_data)*train_proportion)
dev_examples = len(training_data) - train_examples
train_dataset, dev_dataset = random_split(train_and_dev, (train_examples, dev_examples))

# train_sent_tensor, train_label_tensor
i2w = {v:k for k,v in w2i.items()}

# for a, d in zip(training_data, train_and_dev):
#   d = d['encoding'].tolist()
#   dec = [i2w[i] for i in d]
#   print(a)
#   print(dec)


In [8]:
# Create data loaders
BATCH_SIZE = 50
train_loader = torch.utils.data.DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
dev_loader = torch.utils.data.DataLoader(dev_dataset, batch_size=BATCH_SIZE)

# Construct the model
model = FFNN(EMBEDDING_DIM, HIDDEN_DIM, len(w2i), OUTPUT_DIM)
print(model)

# we use the stochastic gradient descent (SGD) optimizer
optimizer = optim.SGD(model.parameters(), lr=LRATE)

# we use the binary cross-entropy loss with sigmoid (applied to logits)
# Recall that we did not apply any activation to our output layer, hence we need
# to make our outputs look like probabilities.
loss_fn = nn.MSELoss()

# Start training
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
EPOCHS = 10
for epoch in range(EPOCHS):
  # to ensure the dropout (explained later) is "turned on" while training
  # good practice to include even if do not use here
  model.train()

  for batch in train_loader:

    encoding = batch['encoding'].to(device)
    target = batch['target'].to(device).squeeze(1)
    # extra_features = batch['extra_features'].to(device)

    # we zero the gradients as they are not removed automatically
    optimizer.zero_grad()

    # squeeze is needed as the predictions will have the shape (batch size, 1)
    # and we need to remove the dimension of size 1
    predictions = model(encoding).squeeze(1)

    # Compute the loss
    loss = loss_fn(predictions, target)
    train_loss = loss.item()

    # calculate the gradient of each parameter
    loss.backward()

    # update the parameters using the gradients and optimizer algorithm
    optimizer.step()

  print(f'| Epoch: {epoch:02} | Train Loss: {train_loss:.3f}')

FFNN(
  (embedding): Embedding(7265, 20, padding_idx=0)
  (fc1): Linear(in_features=20, out_features=50, bias=True)
  (relu1): ReLU()
  (fc2): Linear(in_features=50, out_features=1, bias=True)
  (relu2): ReLU()
)
| Epoch: 00 | Train Loss: 0.246
| Epoch: 01 | Train Loss: 0.219
| Epoch: 02 | Train Loss: 0.457
| Epoch: 03 | Train Loss: 0.537
| Epoch: 04 | Train Loss: 0.451
| Epoch: 05 | Train Loss: 0.240
| Epoch: 06 | Train Loss: 0.418
| Epoch: 07 | Train Loss: 0.493
| Epoch: 08 | Train Loss: 0.415
| Epoch: 09 | Train Loss: 0.337
