In [0]:
import gdown
import os
from pandas_profiling import ProfileReport

# https://drive.google.com/file/d/1l_J0P9A_AD8d_rzZHJ5Fg8F4y1nGP_x3/view?usp=sharing

url = f'https://drive.google.com/uc?id=1l_J0P9A_AD8d_rzZHJ5Fg8F4y1nGP_x3'
filename = 'dataset.csv'
if not os.path.exists(filename):
    gdown.download(url, filename, quiet=True)

In [0]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim

In [0]:
columns = ['emotion', 'text']

df = pd.read_csv(filename, names=columns)

In [0]:
Xraw = df['text'].values
yraw = df['emotion'].values

### Preprocessing

In [0]:
import nltk

nltk.download("punkt")
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [0]:
!python -m pip install -U symspellpy

Requirement already up-to-date: symspellpy in /usr/local/lib/python3.6/dist-packages (6.5.2)


In [0]:
text_raw = df['text'].values
print(text_raw[0])

On days when I feel close to my partner and other friends.   
When I feel at peace with myself and also experience a close  
contact with people whom I regard greatly.


In [0]:
import pkg_resources
from symspellpy import SymSpell, Verbosity

sym_spell = SymSpell()

dictionary_path = pkg_resources.resource_filename(
    "symspellpy", "frequency_dictionary_en_82_765.txt")

sym_spell.load_dictionary(dictionary_path, 0, 1)

spell = lambda term: ' '.join([sym_spell.lookup(t, Verbosity.CLOSEST, 
                                      max_edit_distance=2, include_unknown=True)[0].term for t in term.split()])

In [0]:
from nltk.tokenize import word_tokenize

process_text = lambda t: word_tokenize(t.lower()) if type(t) is str else []

In [0]:
text_prep = list(map(process_text, text_raw))
' | '.join(text_prep[0])

'on | days | when | i | feel | close | to | my | partner | and | other | friends | . | when | i | feel | at | peace | with | myself | and | also | experience | a | close | contact | with | people | whom | i | regard | greatly | .'

In [0]:
all_text  = df['text'].str.cat()
all_text_prep = process_text(all_text)

In [0]:
from collections import Counter

word_freq = Counter(all_text_prep)

In [0]:
word_freq.most_common()[-10:]

[('classmate.when', 1),
 ('baptism', 1),
 ('pepole', 1),
 ('baptised', 1),
 ('deepened.when', 1),
 ('gym', 1),
 ('stack', 1),
 ('questioning', 1),
 ('inserted', 1),
 ('randomly', 1)]

In [0]:
vocab = list(word_freq.keys())
len(vocab)

13438

### GLOVE pretrained word embedding

In [0]:
# !wget http://nlp.stanford.edu/data/wordvecs/glove.42B.300d.zip

In [0]:
# !unzip glove.42B.300d.zip

In [0]:
# lines = []

# with open(f'{glove_path}/glove.42B.300d.txt', 'rb') as f:
#     for l in f:
#         line_decode = l.decode(encoding='utf-8')
#         line = line_decode.lower().split()

#         if line[0] in word_freq:
#             lines.append(line_decode)

In [0]:
# lines[0]
# len(lines)

In [0]:
# word2vecfilename = 'relevant_word2vec.txt'

# with open(word2vecfilename, 'w') as wf:
#     wf.write("".join(lines))

In [0]:
word2vecfilename = 'relevant_word2vec.txt'

# https://drive.google.com/file/d/1E2FCguEoggAVak1dCXksXlfq7ruOiFU1/view?usp=sharing
w2vurl = f'https://drive.google.com/uc?id=1E2FCguEoggAVak1dCXksXlfq7ruOiFU1'

if not os.path.exists(word2vecfilename):
    gdown.download(w2vurl, word2vecfilename, quiet=True)

In [0]:
word2vec = {}
with open(word2vecfilename) as fr:
    for l in fr:
        line = l.split()
        word = line[0]
        wordvec = np.array(line[1:], dtype=np.float64)
        word2vec[word] = wordvec

In [0]:
np.random.seed(18)
word2vec['UNK'] = np.random.randn(300, 1)
# word2vec['UNK']

In [0]:
# processed_text = process_text(text)
# processed_text

In [0]:
def process_input(inputs):
    return [
         np.array([word2vec[tk].flatten() for tk in process_text(text) if tk in word2vec])
         for text in inputs
    ]

In [0]:
Xp = process_input(list(text_raw))

In [0]:
classes = list(set(yraw))

yci = np.array([classes.index(c) for c in yraw], dtype=np.float64)

In [0]:
from sklearn.preprocessing import OneHotEncoder

enc = OneHotEncoder(sparse=False)

enc.fit(yraw.reshape(-1, 1))

yenc = enc.transform(yraw.reshape(-1, 1))

In [0]:
from sklearn.model_selection import train_test_split

In [0]:
Xenc = np.array(Xp)

In [107]:
X_train, X_test, y_train, y_test = train_test_split(Xenc, yci, test_size=0.2)

print(len(X_train), len(X_test), y_train.shape, y_test.shape)

5956 1490 (5956,) (1490,)


In [108]:
y_train[0]

4.0

### Bidirectional LSTM Encoder-Decoder architecture

In [109]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'device: {device}')

device: cuda


In [0]:
# X_train = torch.tensor(X_train, device=device)
# y_train = torch.tensor(y_train, device=device)

# X_test = torch.tensor(X_test, device=device)
# y_test = torch.tensor(y_test, device=device)

In [0]:
class Encoder(nn.Module):
    def __init__(self, n_class, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size=300, hidden_size=self.hidden_size)

    def forward(self, inp, hidden_state):
        # print(inp.shape)
        # print(f'forward: {inp}')
        return self.lstm(inp.view((1, 1, -1)), hidden_state)
    
    def init_hidden(self):
        return (torch.zeros(1, 1, self.hidden_size, device=device),
                torch.zeros(1, 1, self.hidden_size, device=device))

class Decoder(nn.Module):
    def __init__(self, n_class, hidden_size):
        super().__init__()
        # self.lin_layer = nn.Linear(hidden_size, 512)
        self.out_layer = nn.Linear(hidden_size, n_class)

    def forward(self, inp):
        # out1 = torch.tanh(self.lin_layer(inp))
        return F.log_softmax(self.out_layer(inp).squeeze(), dim=0)

In [0]:
encoder = Encoder(7, 1024).to(device)
decoder = Decoder(7, 1024).to(device)

enc_optim = optim.Adam(encoder.parameters())
dec_optim = optim.Adam(decoder.parameters())

criterion = nn.NLLLoss()

In [0]:
def train(X_i, y_i, encoder, decoder, enc_optim, dec_optim, criterion):
    hidden, cell = encoder.init_hidden()

    enc_optim.zero_grad()
    dec_optim.zero_grad()

    X_i = torch.tensor(X_i, dtype=torch.float32, device=device)
    y_i = torch.tensor(y_i, dtype=torch.long, device=device).view(1)
    # y_i = int(y_i)

    n_len = X_i.size(0)
    # print(f'Shape of X_i: {X_i.shape}, {X_i.dtype}')
    # print(f'Shape of y_i: {y_i.shape}, {y_i.dtype}')

    for ei in range(n_len):
        _, (hidden, cell) = encoder(X_i[ei], (hidden, cell))

    # print(f'hidden: {hidden}, {type(hidden)}')

    decoder_output = decoder(hidden).squeeze()
    # print(f'decoder output: {decoder_output}, {decoder_output.shape}')
    # print(f'y_i: {y_i}, {y_i.shape}')

    loss = criterion(decoder_output.view(1, 7), y_i)
    # print(f'Loss: {loss}')

    loss.backward()

    enc_optim.step()
    dec_optim.step()

    return loss.item()

In [0]:
def train_iters(X, y, n_epochs, encoder, decoder, enc_optim, dec_optim, criterion):
    for i in range(n_epochs):
        num_X = len(X)

        loss = 0
        for xi in range(num_X):
            if xi == (num_X - 1):
                torch.save(encoder.state_dict(), './encoder.tm')
                torch.save(decoder.state_dict(), './decoder.tm')
                print(f'Training data: {xi}/{num_X}', end='\n')
                # training_accuracy = get_accuracy(X_train, y_train, encoder, decoder)
                testing_accuracy = get_accuracy(X_test, y_test, encoder, decoder)

                print(f'Testing accuracy: {testing_accuracy:.4f}')
            loss += train(X[xi], y[xi], encoder, decoder, enc_optim, dec_optim, criterion)

        loss /= num_X
        print(f'Epoch: {i}, Loss: {loss:.5f}')

In [0]:
def predict(X, encoder, decoder):
    with torch.no_grad():
        X = torch.tensor(X, dtype=torch.float32, device=device)
        X_len = X.size(0)

        hidden, cell = encoder.init_hidden()

        for ei in range(X_len):
            _, (hidden, cell) = encoder(X[ei], (hidden, cell))

        decoder_output = decoder(hidden).squeeze()

        return decoder_output

In [0]:
from sklearn.metrics import accuracy_score

def get_accuracy(X, y, encoder, decoder):
    y_pred = [np.argmax(predict(Xi, encoder, decoder).cpu()) for Xi in X]
    accuracy = accuracy_score(y, y_pred)
    return accuracy

In [0]:
train_iters(X_train, y_train, 4000, encoder, decoder, enc_optim, dec_optim, criterion)

Training data: 5955/5956
Testing accuracy: 0.5785
Epoch: 0, Loss: 1.41183
Training data: 5955/5956
Testing accuracy: 0.5933
Epoch: 1, Loss: 0.95683
Training data: 5955/5956
Testing accuracy: 0.5919
Epoch: 2, Loss: 0.65850
Training data: 5955/5956
Testing accuracy: 0.5966
Epoch: 3, Loss: 0.36715
Training data: 5955/5956
Testing accuracy: 0.5792
Epoch: 4, Loss: 0.20456
Training data: 5955/5956
Testing accuracy: 0.5866
Epoch: 5, Loss: 0.12315
Training data: 5955/5956
Testing accuracy: 0.5872
Epoch: 6, Loss: 0.10360
Training data: 5955/5956
Testing accuracy: 0.5859
Epoch: 7, Loss: 0.08695
Training data: 5955/5956
Testing accuracy: 0.5852
Epoch: 8, Loss: 0.08023


In [0]:
y_train_pred = np.array([np.argmax(predict(xi, encoder, decoder).cpu()) for xi in X_train])

In [0]:
print(classification_report(y_train, y_train_pred))

In [0]:
from sklearn.metrics import classification_report
y_test_pred = np.array([np.argmax(predict(xi, encoder, decoder).cpu()) for xi in X_test])
print(classification_report(y_test, y_test_pred))