In [0]:
import gdown
import os
from pandas_profiling import ProfileReport

# https://drive.google.com/file/d/1l_J0P9A_AD8d_rzZHJ5Fg8F4y1nGP_x3/view?usp=sharing

url = f'https://drive.google.com/uc?id=1l_J0P9A_AD8d_rzZHJ5Fg8F4y1nGP_x3'
filename = 'dataset.csv'
if not os.path.exists(filename):
    gdown.download(url, filename, quiet=True)

In [0]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim

In [0]:
columns = ['emotion', 'text']

df = pd.read_csv(filename, names=columns)

In [0]:
Xraw = df['text'].values
yraw = df['emotion'].values

### Preprocessing

In [0]:
import nltk

nltk.download("punkt")
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [0]:
!python -m pip install -U symspellpy

Collecting symspellpy
[?25l  Downloading https://files.pythonhosted.org/packages/6d/0b/2daa14bf1ed649fff0d072b2e51ae98d8b45cae6cf8fdda41be01ce6c289/symspellpy-6.5.2-py3-none-any.whl (2.6MB)
[K     |████████████████████████████████| 2.6MB 2.8MB/s 
Installing collected packages: symspellpy
Successfully installed symspellpy-6.5.2


In [0]:
text_raw = df['text'].values
print(text_raw[0])

On days when I feel close to my partner and other friends.   
When I feel at peace with myself and also experience a close  
contact with people whom I regard greatly.


In [0]:
import pkg_resources
from symspellpy import SymSpell, Verbosity

sym_spell = SymSpell()

dictionary_path = pkg_resources.resource_filename(
    "symspellpy", "frequency_dictionary_en_82_765.txt")

sym_spell.load_dictionary(dictionary_path, 0, 1)

spell = lambda term: ' '.join([sym_spell.lookup(t, Verbosity.CLOSEST, 
                                      max_edit_distance=2, include_unknown=True)[0].term for t in term.split()])

In [0]:
from nltk.tokenize import word_tokenize

process_text = lambda t: word_tokenize(t.lower()) if type(t) is str else []

In [0]:
text_prep = list(map(process_text, text_raws))
' | '.join(text_prep[0])

'in | days | when | a | feel | close | to | my | partner | and | other | friends | when | a | feel | at | peace | with | myself | and | also | experience | a | close | contact | with | people | whom | a | regard | greatly'

In [0]:
all_text  = df['text'].str.cat()
all_text_prep = process_text(all_text)

In [0]:
from collections import Counter

word_freq = Counter(all_text_prep)

In [0]:
word_freq.most_common()[-100:]

[('tears.when', 1),
 ('others.during', 1),
 ('persoon', 1),
 ('ashamed.our', 1),
 ('organizor', 1),
 ('ponder', 1),
 ('apologetic', 1),
 ('afterwards.it', 1),
 ('mr.w', 1),
 ('p.m.the', 1),
 ('die.one', 1),
 ('sunned', 1),
 ('snatched', 1),
 ('starteed', 1),
 ('freely.when', 1),
 ('before.my', 1),
 ('favour.on', 1),
 ('zip', 1),
 ('happened.one', 1),
 ('thirsty', 1),
 ('fetched', 1),
 ('thermos', 1),
 ('pour', 1),
 ('flurry', 1),
 ('phillipines', 1),
 ('dim', 1),
 ('dawn.i', 1),
 ('didcovered', 1),
 ('thay', 1),
 ('out.a', 1),
 ('exam.once', 1),
 ('athletic', 1),
 ('flaws', 1),
 ('muttered', 1),
 ('day.my', 1),
 ('scissors', 1),
 ('fastidious', 1),
 ('intolerable.i', 1),
 ('p3', 1),
 ('admited', 1),
 ('hong', 1),
 ('kong', 1),
 ('number.one', 1),
 ('ghosts.when', 1),
 ('team-mate', 1),
 ('disgusting.when', 1),
 ('riot', 1),
 ('replied', 1),
 ('best.i', 1),
 ('mates.i', 1),
 ('flatter', 1),
 ('sex.during', 1),
 ('organizors', 1),
 ('bright', 1),
 ('attendants', 1),
 ('prefects', 1),
 ('

In [0]:
vocab = list(word_freq.keys())
len(vocab)

True

### GLOVE pretrained word embedding

In [0]:
# !wget http://nlp.stanford.edu/data/wordvecs/glove.42B.300d.zip

--2020-05-13 14:36:28--  http://nlp.stanford.edu/data/wordvecs/glove.42B.300d.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/wordvecs/glove.42B.300d.zip [following]
--2020-05-13 14:36:28--  https://nlp.stanford.edu/data/wordvecs/glove.42B.300d.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/wordvecs/glove.42B.300d.zip [following]
--2020-05-13 14:36:28--  http://downloads.cs.stanford.edu/nlp/data/wordvecs/glove.42B.300d.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Le

In [0]:
# !unzip glove.42B.300d.zip

Archive:  glove.42B.300d.zip
  inflating: glove.42B.300d.txt      


In [0]:
# lines = []

# with open(f'{glove_path}/glove.42B.300d.txt', 'rb') as f:
#     for l in f:
#         line_decode = l.decode(encoding='utf-8')
#         line = line_decode.lower().split()

#         if line[0] in word_freq:
#             lines.append(line_decode)

In [0]:
# lines[0]
# len(lines)

9629

In [0]:
# word2vecfilename = 'relevant_word2vec.txt'

# with open(word2vecfilename, 'w') as wf:
#     wf.write("".join(lines))

In [0]:
word2vecfilename = 'relevant_word2vec.txt'

# https://drive.google.com/file/d/1E2FCguEoggAVak1dCXksXlfq7ruOiFU1/view?usp=sharing
w2vurl = f'https://drive.google.com/uc?id=1E2FCguEoggAVak1dCXksXlfq7ruOiFU1'

if not os.path.exists(word2vecfilename):
    gdown.download(w2vurl, word2vecfilename, quiet=True)

In [0]:
word2vec = {}
with open(word2vecfilename) as fr:
    for l in fr:
        line = l.split()
        word = line[0]
        wordvec = np.array(line[1:], dtype=np.float64)
        word2vec[word] = wordvec

In [0]:
np.random.seed(18)
word2vec['UNK'] = np.random.randn(300, 1)
# word2vec['UNK']

In [0]:
def process_input(inputs):
    return [
         np.array([
          word2vec[tk].flatten() if tk in word2vec else word2vec['UNK'].flatten() for tk in process_text(text)
          ])
         for text in inputs
    ]

In [0]:
Xp = process_input(list(text_raw))

In [0]:
classes = list(set(yraw))

yci = np.array([classes.index(c) for c in yraw], dtype=np.float64)

In [0]:
from sklearn.preprocessing import OneHotEncoder

enc = OneHotEncoder(sparse=False)

enc.fit(yraw.reshape(-1, 1))

yenc = enc.transform(yraw.reshape(-1, 1))

In [0]:
from sklearn.model_selection import train_test_split

In [0]:
Xenc = np.array(Xp)

In [0]:
X_train, X_test, y_train, y_test = train_test_split(Xenc, yci)

print(len(X_train), len(X_test), y_train.shape, y_test.shape)

5584 1862 (5584,) (1862,)


In [0]:
y_train[0]

4.0

### Bidirectional LSTM Encoder-Decoder architecture

In [0]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'device: {device}')

device: cpu


In [0]:
# X_train = torch.tensor(X_train, device=device)
# y_train = torch.tensor(y_train, device=device)

# X_test = torch.tensor(X_test, device=device)
# y_test = torch.tensor(y_test, device=device)

In [0]:
class Encoder(nn.Module):
    def __init__(self, n_class, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size=300, hidden_size=self.hidden_size)

    def forward(self, inp, hidden_state):
        print(inp.shape)
        # print(f'forward: {inp}')
        return self.lstm(inp.view((1, 1, -1)), hidden_state)
    
    def init_hidden(self):
        return (torch.zeros(1, 1, self.hidden_size, device=device).double(),
                torch.zeros(1, 1, self.hidden_size, device=device).double())

class Decoder(nn.Module):
    def __init__(self, n_class, hidden_size):
        super().__init__()
        self.lin_layer = nn.Linear(hidden_size, 512)
        self.out_layer = nn.Linear(512, n_class)

    def forward(self, inp):
        out1 = F.tanh(self.lin_layer(inp))
        return F.softmax(self.out_layer(out1))

In [0]:
encoder = Encoder(7, 1024).to(device)
decoder = Decoder(7, 1024).to(device)

In [0]:
enc_optim = optim.Adam(encoder.parameters())
dec_optim = optim.Adam(decoder.parameters())

In [0]:
def train(X_i, y_i, encoder, decoder, enc_optim, dec_optim):
    hidden = encoder.init_hidden()

    enc_optim.zero_grad()
    dec_optim.zero_grad()

    X_i = torch.tensor(X_i, device=device).double()
    y_i = torch.tensor(y_i, device=device).double()

    n_len = X_i.size(0)
    print(f'Shape of X_i: {X_i.shape}, {X_i.dtype}')
    print(f'Shape of y_i: {y_i.shape}, {y_i.dtype}')

    for ei in range(n_len):
        _, hidden = encoder(X_i[ei], hidden)
        print(type(hidden))

    print(f'hidden: {hidden}, {type(hidden)}')

    decoder_output = decoder(torch.cat(hidde, dim=0))

    loss = nn.NLLLoss()(decoder_output, y_i)

    loss.backward()

    enc_optim.step()
    dec_optim.step()

    return loss.item()

In [0]:
def train_iters(X, y, n_epochs, encoder, decoder, enc_optim, dec_optim):
    for i in range(n_epochs):
        num_X = len(X)

        loss = 0
        for xi in range(num_X):
            loss += train(X[xi], y[xi], encoder, decoder, enc_optim, dec_optim)

        print(f'Epoch: {i}, Loss: {loss:.5f}')

In [0]:
train_iters(X_train, y_train, 10, encoder, decoder, enc_optim, dec_optim)

Shape of X_i: torch.Size([11, 300]), torch.float64
Shape of y_i: torch.Size([]), torch.float64
torch.Size([300])


RuntimeError: ignored