# Data Cleaning/Embedding

In [3]:
import pandas as pd
import torchtext
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
import spacy

In [4]:
# Pretrained GloVe word vectors. The first run downloads a ~823MB archive;
# later runs reuse the locally cached copy.
#   name="6B" -> the 6B-token release (Wikipedia 2014 + Gigaword 5 corpus)
#   dim=50    -> every word is represented by a 50-dimensional vector
glove = torchtext.vocab.GloVe(name="6B", dim=50)

In [5]:
# Describe how each CSV column is converted before batching.

# Tweet text: tokenised with the Field default tokenizer (whitespace
# str.split), mapped to vocabulary indices, and batched as
# (padded token tensor, lengths) because include_lengths=True.
text_field = torchtext.legacy.data.Field(
    sequential=True,
    use_vocab=True,
    include_lengths=True,
    batch_first=True,
)

# Party label: preprocessing turns the raw string into an int
# (1 when it equals 'D', 0 for anything else), so no vocabulary,
# padding token or unknown token is needed.
label_field = torchtext.legacy.data.Field(
    sequential=False,
    use_vocab=False,
    batch_first=True,
    pad_token=None,
    unk_token=None,
    preprocessing=lambda x: int(x == 'D'),
)

# (column name, Field) pairs; a None Field means the column is not processed.
# NOTE(review): assumes this order matches the column order of the CSV files - confirm.
fields = [
    ('tweet', text_field),        # processed as text
    ('id', None),                 # not needed
    ('conversation_id', None),    # not needed
    ('party', label_field),       # processed as the label
]

# Load the three splits from the current directory, skipping each file's header row.
trainds, valds, testds = torchtext.legacy.data.TabularDataset.splits(
    path='',
    format='csv',
    train='train_set.csv',
    validation='val_set.csv',
    test='test_set.csv',
    fields=fields,
    skip_header=True,
)

In [6]:
# Build the token -> index vocabulary for the tweet field from the training
# split only; the validation and test splits contribute no tokens to it.
text_field.build_vocab(trainds)

In [7]:
def get_data_loader(batch_size):
  """Create one BucketIterator per dataset split.

  Every iterator uses the same settings: batches of `batch_size` examples,
  the tweet token count as the sort key, sorting within each batch, and a
  single pass over the data (repeat=False).

  Args:
    batch_size: number of examples per batch, shared by all three iterators.

  Returns:
    Tuple (traindl, valdl, testdl) built from the module-level
    trainds, valds and testds datasets, in that order.
  """
  def tweet_length(example):
    # attribute the examples are sorted on: number of tokens in the tweet
    return len(example.tweet)

  traindl, valdl, testdl = (
      torchtext.legacy.data.BucketIterator(split,
                                           batch_size=batch_size,
                                           sort_key=tweet_length,
                                           sort_within_batch=True,
                                           repeat=False)
      for split in (trainds, valds, testds)
  )
  return traindl, valdl, testdl

# Model

In [8]:
# Example taken from lab

class TweetRNN(nn.Module):
    """Single-layer RNN classifier over one-hot encoded token indices.

    Each token index is expanded to a one-hot vector of width ``input_size``,
    the sequence is run through ``nn.RNN`` and the output of one time step is
    mapped to 2 class logits by a linear layer.

    Args:
        input_size: number of distinct token indices (vocabulary size).
        hidden_size: width of the RNN hidden state.
    """

    def __init__(self, input_size, hidden_size):
        super(TweetRNN, self).__init__()
        # Identity matrix used as a one-hot lookup table: row i is the one-hot
        # vector of token i. It is registered as a buffer so that it follows
        # the module through .to()/.cuda(); persistent=False keeps it out of
        # state_dict, so checkpoints contain only the rnn and fc weights.
        # NOTE: the table holds input_size x input_size floats, so memory
        # grows quadratically with the vocabulary size.
        self.register_buffer("emb", torch.eye(input_size), persistent=False)
        self.hidden_size = hidden_size
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, 2)

    def forward(self, x, lengths=None):
        """Compute class logits for a batch of token-index sequences.

        Args:
            x: integer tensor of token indices, shape (batch, seq_len).
            lengths: optional number of real (non-padding) tokens of each
                sequence, assumed to be >= 1 with padding at the end. When
                given, the classifier reads the RNN output at each sequence's
                last real token; when omitted, it reads the final time step
                of the padded batch.

        Returns:
            Tensor of shape (batch, 2) with unnormalised class scores.
        """
        # Look up the one-hot encoding of every token
        x = self.emb[x]
        # No explicit initial state: nn.RNN starts from zeros allocated on the
        # same device as its input, which avoids a CPU/GPU mismatch.
        out, _ = self.rnn(x)
        if lengths is None:
            last = out[:, -1, :]
        else:
            # Pick, per example, the output at index (length - 1) so that
            # trailing padding does not influence the prediction.
            last_idx = torch.as_tensor(lengths, device=out.device).long() - 1
            rows = torch.arange(out.size(0), device=out.device)
            last = out[rows, last_idx]
        # Pass the selected time step to the classifier
        return self.fc(last)

# Training

In [9]:
def train_network(model, train_loader, valid_loader, num_epochs=5, learning_rate=1e-5, plot=False):
    """Train ``model`` with Adam and cross-entropy, reporting metrics per epoch.

    The losses printed and plotted for an epoch are sample-weighted means
    over all batches of that epoch. Validation runs in eval mode with
    gradient tracking disabled. The model's train/eval mode is restored to
    what it was on entry.

    Args:
        model: nn.Module called as ``model(batch.tweet[0])`` that returns
            class logits.
        train_loader: iterable of batches exposing ``tweet`` (indexed with
            ``[0]`` to obtain the model input) and ``party`` (class labels).
        valid_loader: iterable with the same batch interface, used only for
            evaluation.
        num_epochs: number of passes over ``train_loader``.
        learning_rate: learning rate handed to Adam.
        plot: when True, draw the accuracy and loss curves after training.

    Returns:
        None. Per-epoch metrics and the final accuracies are printed.
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    train_losses, valid_losses, train_acc, valid_acc = [], [], [], []
    epochs = []
    was_training = model.training

    for epoch in range(num_epochs):
        # ---- training pass ----
        model.train()
        loss_sum, seen = 0.0, 0
        for batch in train_loader:
            optimizer.zero_grad()
            pred = model(batch.tweet[0])
            loss = criterion(pred, batch.party)
            loss.backward()
            optimizer.step()
            batch_len = batch.party.shape[0]
            # criterion returns the batch mean; weight it by the batch size
            # so that a smaller final batch does not skew the epoch mean
            loss_sum += loss.item() * batch_len
            seen += batch_len
        train_loss = loss_sum / seen if seen else float("nan")

        # ---- validation pass: no weight updates, no autograd graph ----
        model.eval()
        loss_sum, seen = 0.0, 0
        with torch.no_grad():
            for batch in valid_loader:
                pred = model(batch.tweet[0])
                batch_len = batch.party.shape[0]
                loss_sum += criterion(pred, batch.party).item() * batch_len
                seen += batch_len
        valid_loss = loss_sum / seen if seen else float("nan")

        train_losses.append(train_loss)
        valid_losses.append(valid_loss)
        epochs.append(epoch)
        train_acc.append(get_accuracy(model, train_loader))
        valid_acc.append(get_accuracy(model, valid_loader))
        print("Epoch %d; Train Loss %f; Val Loss %f; Train Acc %f; Val Acc %f" % (
            epoch+1, train_loss, valid_loss, train_acc[-1], valid_acc[-1]))

        # TODO: Save model

    # leave the model in the mode the caller handed it over in
    model.train(was_training)

    if not epochs:
        # num_epochs <= 0: there is nothing to plot or report
        return

    # plotting
    if plot:
        plt.title("Accuracy Curve")
        plt.plot(epochs, train_acc, label="Train")
        plt.plot(epochs, valid_acc, label="Validation")
        plt.xlabel("Epoch")
        plt.ylabel("Accuracy")
        plt.legend(loc='best')
        plt.show()

        plt.title("Loss Curve")
        plt.plot(epochs, train_losses, label="Train")
        plt.plot(epochs, valid_losses, label="Validation")
        plt.xlabel("Epoch")
        plt.ylabel("Loss")
        plt.legend(loc='best')
        plt.show()

    print("Final Training Accuracy: {}".format(train_acc[-1]))
    print("Final Validation Accuracy: {}".format(valid_acc[-1]))

def get_accuracy(model, data):
    """Return the fraction of examples in ``data`` that ``model`` classifies correctly.

    The forward passes run with gradient tracking disabled. When ``model`` is
    an ``nn.Module`` it is switched to eval mode for the duration of the call
    and put back into its previous mode afterwards.

    Args:
        model: callable invoked as ``model(batch.tweet[0])`` returning one
            row of class scores per example.
        data: iterable of batches exposing ``tweet`` and ``party`` (labels).

    Returns:
        correct / total as a float, or NaN when ``data`` yields no examples.
    """
    is_module = isinstance(model, nn.Module)
    was_training = model.training if is_module else False
    if is_module:
        model.eval()

    correct, total = 0, 0
    try:
        with torch.no_grad():  # evaluation only: do not build autograd graphs
            for batch in data:
                output = model(batch.tweet[0])
                # index of the highest score in each row = predicted class
                pred = output.max(1, keepdim=True)[1]
                correct += pred.eq(batch.party.view_as(pred)).sum().item()
                total += batch.party.shape[0]
    finally:
        if is_module:
            model.train(was_training)

    # accuracy over zero examples is undefined; report NaN instead of raising
    return correct / total if total else float("nan")

In [None]:
# The RNN consumes one-hot vectors, so its input width is the number of
# entries in the vocabulary built from the training split.
input_size = len(text_field.vocab.itos)
model = TweetRNN(input_size=input_size, hidden_size=100)

# Batches of 256 examples for every split; train for 30 epochs and plot the curves.
train_loader, valid_loader, test_loader = get_data_loader(batch_size=256)
train_network(model=model,
              train_loader=train_loader,
              valid_loader=valid_loader,
              num_epochs=30,
              plot=True)