# 1. Preprocess dataset 

In [10]:
import csv
import re

from collections import Counter
from gensim.models import Word2Vec
from random import random
from nltk import word_tokenize
from nltk.translate.bleu_score import sentence_bleu
from torch import nn
from torch.autograd import Variable

import numpy as np
import torch
import torch.nn.functional as F

In [11]:
train_emotion = []
train_tweets = []
with open('dataset/train.csv') as csvfile:
    spamreader = csv.reader(csvfile, delimiter=',', quotechar='"', skipinitialspace=True)
    line_count = 0
    for row in spamreader:
        line_count += 1
        if line_count == 1: continue # skip header
        if not row: continue
        emotion = row[0]
        tweet = row[1]
        tweet = tweet.replace('@USERNAME', '')
        tweet = tweet.replace('[#TRIGGERWORD#]', '')
        tweet = result = re.sub(r"http\S+", "", tweet)
        train_tweets.append(tweet)
        train_emotion.append(emotion)

In [12]:
sentences = train_tweets

# Lower-case the sentence, tokenize them and add <SOS> and <EOS> tokens
sentences = [["<SOS>"] + word_tokenize(sentence.lower()) + ["<EOS>"] for sentence in sentences]

# Create the vocabulary. Note that we add an <UNK> token to represent words not in our vocabulary.
word_counts = Counter([word for sentence in sentences for word in sentence])
vocabulary = ["<UNK>"] + [e[0] for e in list(word_counts.items()) if e[1] > 2]
vocabularySize = len(vocabulary)
word2index = {word:index for index,word in enumerate(vocabulary)}
one_hot_embeddings = np.eye(vocabularySize)

In [13]:
# Create emotion array
emotions = sorted(list(set(train_emotion)))
emotions

['anger', 'disgust', 'fear', 'joy', 'sad', 'surprise']

In [14]:
# Build the word2vec embeddings
wordEncodingSize = 300
filtered_sentences = [[word for word in sentence if word in word2index] for sentence in sentences]
w2v = Word2Vec(filtered_sentences, min_count=0, size=wordEncodingSize)
w2v_embeddings = np.concatenate((np.zeros((1, wordEncodingSize)), w2v.wv.syn0))

In [15]:
def preprocess_numberize(sentence):
    """
    Given a sentence, in the form of a string, this function will preprocess it
    into list of numbers (denoting the index into the vocabulary).
    """
    tokenized = word_tokenize(sentence.lower())
        
    # Add the <SOS>/<EOS> tokens and numberize (all unknown words are represented as <UNK>).
    tokenized = ["<SOS>"] + tokenized + ["<EOS>"]
    numberized = [word2index.get(word, 0) for word in tokenized]
    
    return numberized

def preprocess_one_hot(sentence):
    """
    Given a sentence, in the form of a string, this function will preprocess it
    into a numpy array of one-hot vectors.
    """
    numberized = preprocess_numberize(sentence)
    
    # Represent each word as it's one-hot embedding
    one_hot_embedded = one_hot_embeddings[numberized]
    
    return one_hot_embedded

def preprocess_word2vec(sentence):
    """
    Given a sentence, in the form of a string, this function will preprocess it
    into a numpy array of word2vec embeddings.
    """
    numberized = preprocess_numberize(sentence)
    
    # Represent each word as it's one-hot embedding
    w2v_embedded = w2v_embeddings[numberized]
    
    return w2v_embedded

def compute_bleu(reference_sentence, predicted_sentence):
    """
    Given a reference sentence, and a predicted sentence, compute the BLEU similary between them.
    """
    reference_tokenized = word_tokenize(reference_sentence.lower())
    predicted_tokenized = word_tokenize(predicted_sentence.lower())
    return sentence_bleu([reference_tokenized], predicted_tokenized)

# 1. Build a Emotion Decoder

In [16]:
use_cuda = False
class DecoderLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(DecoderLSTM, self).__init__()
        self.hidden_size = hidden_size

        self.lstm = nn.LSTM(input_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden):
        output = F.relu(input)
        output, hidden = self.lstm(output, hidden)
        output = F.log_softmax(self.out(output[0]), dim=1) 
        return output, hidden

    def initHidden(self):
        result = Variable(torch.zeros(1, 1, self.hidden_size))
        if use_cuda:
            return result.cuda()
        else:
            return result
'''
# decoder for one hot embedding
decoder=DecoderLSTM(input_size=len(vocabulary), 
                    hidden_size=300, 
                    output_size=len(emotions))
'''
# decoder for word2vec embedding
decoder=DecoderLSTM(input_size=wordEncodingSize, 
                    hidden_size=300, 
                    output_size=len(emotions))
decoder

DecoderLSTM(
  (lstm): LSTM(300, 300)
  (out): Linear(in_features=300, out_features=6, bias=True)
)

# 2. Train the Emotion Decoder

In [17]:
# build some helper function
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np

def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)
    
import time
import math

def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [18]:
def train(target_variable, 
          emotion,
          decoder, 
          decoder_optimizer, 
          criterion, 
          embeddings=w2v_embeddings,
          teacher_force=True): 
    """
    Given a single training sample, go through a single step of training.
    """
    loss = 0
    decoder_optimizer.zero_grad()
    
    decoder_input = Variable(torch.FloatTensor([[embeddings[target_variable[0].data[0]]]]))
    decoder_input = decoder_input.cuda() if use_cuda else decoder_input
    decoder_hidden = (decoder.initHidden(), decoder.initHidden())

    for di in range(0,target_variable.size(0)):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
        topv, topi = decoder_output.data.topk(1)

        if teacher_force:
            ni = target_variable[di].data[0]
        else:          
            ni = topi[0][0]
        
        decoder_input = Variable(torch.FloatTensor([[embeddings[ni]]]))
        decoder_input = decoder_input.cuda() if use_cuda else decoder_input
        if di == target_variable.size(0) - 2: 
            loss += criterion(decoder_output, emotion)
        if vocabulary[ni] == "<EOS>":
            break

    loss.backward()
    
    torch.nn.utils.clip_grad_norm(decoder.parameters(), 10.0)

    decoder_optimizer.step()

    return loss.data[0] / target_variable.size(0)

decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=0.001) 
criterion = nn.CrossEntropyLoss()  

num_epochs = 1
numberized_emotion = [emotions.index(emotion) for emotion in train_emotion]
target_emotion = Variable(torch.LongTensor(numberized_emotion))
start = time.time()
total_loss = 0
avg_loss = []
for _ in range(num_epochs):
    for i,sentence in enumerate(train_tweets):
        
        numberized = preprocess_numberize(sentence)
        if len(numberized) == 2:
            continue
        target_variable = Variable(torch.LongTensor(numberized[1:]))

        loss = train(target_variable, target_emotion[i], decoder, decoder_optimizer, criterion)
        total_loss += loss
        avg_loss.append(total_loss/(i+1))
        if i % 1000 == 0:
            print('%s (%d %d%%) %.6f' % 
                  (timeSince(start, (i+1)/len(train_tweets)), i, (i+1)/len(train_tweets)*100, total_loss/(i+1)))

0m 0s (- 209m 9s) (0 0%) 0.117485
2m 10s (- 331m 30s) (1000 0%) 0.101095
3m 51s (- 292m 14s) (2000 1%) 0.101818
5m 18s (- 265m 43s) (3000 1%) 0.100999
7m 5s (- 264m 22s) (4000 2%) 0.100529
9m 4s (- 268m 57s) (5000 3%) 0.100627
10m 52s (- 266m 50s) (6000 3%) 0.100287
12m 40s (- 264m 45s) (7000 4%) 0.100329
14m 10s (- 257m 24s) (8000 5%) 0.100434
15m 53s (- 254m 45s) (9000 5%) 0.100931
17m 32s (- 251m 25s) (10000 6%) 0.100947
19m 7s (- 247m 25s) (11000 7%) 0.100847
20m 50s (- 245m 19s) (12000 7%) 0.100751
22m 23s (- 241m 34s) (13000 8%) 0.100467
23m 50s (- 237m 13s) (14000 9%) 0.100260
25m 20s (- 233m 39s) (15000 9%) 0.100184
26m 42s (- 229m 10s) (16000 10%) 0.099966
28m 1s (- 224m 42s) (17000 11%) 0.099831
29m 28s (- 221m 31s) (18000 11%) 0.099709
30m 57s (- 218m 49s) (19000 12%) 0.099400
32m 18s (- 215m 17s) (20000 13%) 0.099109
33m 46s (- 212m 42s) (21000 13%) 0.098867
35m 11s (- 209m 59s) (22000 14%) 0.098651
36m 37s (- 207m 24s) (23000 15%) 0.098347
38m 1s (- 204m 48s) (24000 15%) 0

KeyboardInterrupt: 

In [19]:
len(train_tweets)
showPlot(avg_loss)

In [20]:
# after training, save model 
torch.save(decoder.state_dict(), 'decoder.pt')

In [None]:
# load previously training model:
torch.load(decoder.load_state_dict(), ('decoder.pt'))

# 3. Evaluate the Emotion decoder

In [21]:
dev_tweets = []
with open('dataset/dev.csv') as csvfile:
    spamreader = csv.reader(csvfile, delimiter=',', quotechar='"', skipinitialspace=True)
    line_count = 0
    for row in spamreader:
        line_count += 1
        if line_count == 1: continue # skip header
        if not row: continue
        tweet = row[1]
        tweet = tweet.replace('@USERNAME', '')
        tweet = tweet.replace('[#TRIGGERWORD#]', '')
        tweet = result = re.sub(r"http\S+", "", tweet)
        dev_tweets.append(tweet)

In [22]:
dev_emotions = []
with open('dataset/trial-v3.csv') as csvfile:
    spamreader = csv.reader(csvfile, delimiter=',', quotechar='"', skipinitialspace=True)
    line_count = 0
    for row in spamreader:
        line_count += 1
        if line_count == 1: continue # skip header
        if not row: continue
        dev_emotions.append(row[0])

In [23]:
def evaluate(decoder, 
             target_variable, 
             embeddings=w2v_embeddings, 
             teacher_force=True):
    
    decoder_input = Variable(torch.FloatTensor([[embeddings[target_variable[0].data[0]]]]))
    decoder_input = decoder_input.cuda() if use_cuda else decoder_input
    decoder_hidden = (decoder.initHidden(),decoder.initHidden())
    
    softmax = nn.Softmax()
    for di in range(0,target_variable.size(0)):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
        topv, topi = decoder_output.data.topk(1)
        
        if teacher_force:
            ni = target_variable[di].data[0]
        else:          
            ni = topi[0][0]

        decoder_input = Variable(torch.FloatTensor([[embeddings[ni]]]))
        decoder_input = decoder_input.cuda() if use_cuda else decoder_input

        if di == target_variable.size(0) - 2: # last output 
            if dev_emotions[i] == emotions[topi[0][0]]:
                return True
            #print (dev_emotions[i], emotions[topi[0][0]])
            
        if vocabulary[ni] == "<EOS>":
            break
    return False

# evaluate the model
print ("ground truth, model prediction")
correct_prediction_counts = 0
for i,tweet in enumerate(dev_tweets): 
    numberized = preprocess_numberize(tweet)
    if len(numberized) == 2: continue
    target_variable = Variable(torch.LongTensor(numberized[1:]))
    
    if evaluate(decoder, target_variable):
        correct_prediction_counts += 1
    
    if i % 100 == 0:
        print (correct_prediction_counts, " correct predictions in ", i+1)
        print ("acurray: ", correct_prediction_counts/(i+1))


ground truth, model prediction
0  correct predictions in  1
acurray:  0.0
36  correct predictions in  101
acurray:  0.3564356435643564
75  correct predictions in  201
acurray:  0.373134328358209
106  correct predictions in  301
acurray:  0.3521594684385382
138  correct predictions in  401
acurray:  0.34413965087281795
172  correct predictions in  501
acurray:  0.34331337325349304
213  correct predictions in  601
acurray:  0.3544093178036606
258  correct predictions in  701
acurray:  0.3680456490727532
298  correct predictions in  801
acurray:  0.37203495630461925
333  correct predictions in  901
acurray:  0.36958934517203107
374  correct predictions in  1001
acurray:  0.37362637362637363
413  correct predictions in  1101
acurray:  0.3751135331516803
447  correct predictions in  1201
acurray:  0.37218984179850123
484  correct predictions in  1301
acurray:  0.372021521906226
523  correct predictions in  1401
acurray:  0.37330478229835834
564  correct predictions in  1501
acurray:  0.3757