In [90]:
import datetime
import os
import re
import sys
import time

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Embedding, Flatten, LSTM
from keras.models import load_model, save_model
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, Flatten, SpatialDropout1D, MaxPooling1D
from keras.utils import np_utils
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize.casual import TweetTokenizer


In [4]:
training_data = pd.read_csv('/home/malits/data/emotion/train_data.csv',
                           encoding='utf-8')

In [18]:
list(set(training_data.sentiment))

['love',
 'sadness',
 'enthusiasm',
 'empty',
 'hate',
 'surprise',
 'boredom',
 'anger',
 'happiness',
 'neutral',
 'fun',
 'worry',
 'relief']

### Preprocessing utils 

In [43]:
stop_words = set(stopwords.words('english'))
tokenizer = Tokenizer()

new_stops = set(stop_words)
new_stops.remove("not")

for s in stop_words:
    new_stops.add(s.replace('\'', ''))
    
filters = '!"#$%&()*+,-./:;<=>?[\]^_`{|}~\t\n'

In [44]:
def preprocess(texts):
    clean_tokens = []
    
    for text in texts:
        seq = text_to_word_sequence(text,filters=filters,lower=True)
        
        text = str(text)
        text = text.replace('\'', '')
        text = text.lower()
        toks = [t for t in seq if not t.startswith("@")]
        toks = [tok for tok in toks if tok not in new_stops]
        clean_tokens.append(toks)
        
    return clean_tokens

In [45]:
training_data["processed_content"] = preprocess(training_data.content)

### Tokenization

In [49]:
word_sequences = training_data.processed_content

tokenizer = Tokenizer()
tokenizer.fit_on_texts(word_sequences)
word_indices = tokenizer.texts_to_sequences(word_sequences)
word_index = tokenizer.word_index

print(f"Tokenized to Word indices as {np.array(word_indices).shape}")

Tokenized to Word indices as (30000,)


### Padding Word Sequences

In [51]:
MAX_SEQUENCE_LENGTH = 20

In [52]:
padded_data = pad_sequences(word_indices, maxlen=MAX_SEQUENCE_LENGTH)

### Embedding Layer using GloVe 50D pre-trained embeddings

In [54]:
EMBEDDING_DIM = 50

In [62]:
def make_embedding_layer(dim=EMBEDDING_DIM):
    embeddings_index = {}
    f = open(os.path.join('/home/malits/data/glove/', 'glove.6B.50d.txt'),
                            'r', encoding='utf-8')
    # Open and parse GloVe file
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()

    print('Loaded GloVe Vectors')

    embedding_matrix = np.zeros((len(word_index) + 1, dim))
    for word, i in word_index.items():
        # populate embedding matrix with GloVe vectors
        # leave unkown words to be all zeros
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
            
    print(f'Emedding Matrix Generated with Shape {embedding_matrix.shape}')
    
    embedding_layer = Embedding(len(word_index) + 1,dim, 
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=False)
    
    return embedding_layer
        

In [65]:
def make_binary_encodings(data):
    label_encoder = LabelEncoder()
    integer_encoded = label_encoder.fit_transform(data)
    le_name_mapping = dict(zip(label_encoder.transform(label_encoder.classes_),
                               label_encoder.classes_))
    print(f"Label Encoding Classes as {le_name_mapping}")
    
    binarized_data = np_utils.to_categorical(integer_encoded)
    print(f"One Hot Encoded class shape {binarized_data.shape}")
    
    return binarized_data

### Building The Model

In [85]:
embedding = make_embedding_layer()
binary_encodings = make_binary_encodings(training_data.sentiment)

model = Sequential()
model.add(embedding)
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(binary_encodings.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

epochs = 5
batch_size = 64

Loaded GloVe Vectors
Emedding Matrix Generated with Shape (27968, 50)
Label Encoding Classes as {0: 'anger', 1: 'boredom', 2: 'empty', 3: 'enthusiasm', 4: 'fun', 5: 'happiness', 6: 'hate', 7: 'love', 8: 'neutral', 9: 'relief', 10: 'sadness', 11: 'surprise', 12: 'worry'}
One Hot Encoded class shape (30000, 13)


W0811 11:40:54.514535 140563970615104 deprecation.py:506] From /home/malits/anaconda3/envs/sonar/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:3445: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


### Training the Model

In [86]:
x_train, x_test, y_train, y_test = train_test_split(padded_data, 
                                                    binary_encodings)


In [87]:
batch_size = 64
num_epochs = 100

In [91]:
history = model.fit(x_train, y_train, epochs=num_epochs, 
                    batch_size=batch_size,
                    callbacks=[EarlyStopping(monitor='val_loss',
                                            patience=3,
                                            min_delta=0.0001)])

Epoch 1/100
Epoch 2/100
  128/22500 [..............................] - ETA: 21s - loss: 2.0475 - acc: 0.3359



Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 

Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
