## Importing libraries

In [1]:
import pandas as pd
import numpy as np
import sqlite3

import matplotlib.pyplot as plt
import seaborn as sns



from sklearn import preprocessing
#encoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
#tokenization
from keras.preprocessing.text import Tokenizer

from keras.utils import to_categorical

import random

#modeling

from keras import models
from keras import layers
from keras.layers import Lambda, Dense, Dropout
from keras.callbacks import  EarlyStopping

# to save model
import pickle
from keras.models import Sequential, model_from_json


Using TensorFlow backend.


## Import Data

In [2]:
conn = sqlite3.connect( r"./data/emotion.db")
c = conn.cursor()

In [3]:
twitter_df = pd.read_sql_query("SELECT * FROM emotions_preprocessed_sample;", conn)

In [4]:
twitter_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300252 entries, 0 to 300251
Data columns (total 3 columns):
clean_text    300252 non-null object
emotion       300252 non-null object
text          300252 non-null object
dtypes: object(3)
memory usage: 6.9+ MB


In [5]:
twitter_df.shape

(300252, 3)

In [6]:
twitter_df.head()

Unnamed: 0,clean_text,emotion,text
0,only follower where have all the dodgy porn on...,sadness,Only 9 followers? Where have all the dodgy por...
1,just spent over on a raffle and did not win,sadness,just spent over $40 on a raffle and didn't win...
2,think i have hit my mid life crisis,sadness,Think I've hit my mid life crisis #depressed
3,i guess i do not have my best friend anymore n...,sadness,I Guess I Don't Have My Best friend Anymore No...
4,sitting outside the gym with a bag of chip and...,sadness,Sitting outside the gym with a bag of chips an...


## tokenization

In [7]:
num_words = 10000
comments = twitter_df['clean_text']
tokenizer = Tokenizer(num_words = num_words+1)
tokenizer.fit_on_texts(comments)
sequences = tokenizer.texts_to_sequences(comments)
print('sequences type:', type(sequences))

sequences type: <class 'list'>


In [8]:
one_hot_results= tokenizer.texts_to_matrix(comments, mode='binary')
print('one_hot_results type:', type(one_hot_results))

one_hot_results type: <class 'numpy.ndarray'>


In [9]:
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 67427 unique tokens.


In [10]:
reverse_index = dict([(value, key) for (key, value) in word_index.items()])
comment_idx_to_preview = 318
print('Original text:')
print(comments[comment_idx_to_preview])

Original text:
three yard separated from rose three measly yard poor choice to go for the punt block but still proud of


In [11]:
decoded_review = ' '.join([reverse_index.get(i) for i in sequences[comment_idx_to_preview]])
print('Decoded review from Tokenizer:')
print(decoded_review)

Decoded review from Tokenizer:
three yard separated from rose three yard poor choice to go for the punt block but still proud of


In [12]:
target = twitter_df['emotion']
le = preprocessing.LabelEncoder() #Initialize. le used as abbreviation fo label encoder
le.fit(target)
print("Original class labels:")
print(list(le.classes_))
print(target.shape)

Original class labels:
['anger', 'fear', 'joy', 'sadness']
(300252,)


In [13]:
target_cat = le.transform(target)
#If you wish to retrieve the original descriptive labels post production
print('New target labels:')
print(target_cat)
print('shape')
print (len(target_cat))
print('One hot labels; 4 binary columns, one for each of the categories.') #Each row will be all zeros except for the category for that observation.

New target labels:
[3 3 3 ... 0 0 0]
shape
300252
One hot labels; 4 binary columns, one for each of the categories.


In [14]:
target_onehot = to_categorical(target_cat)
print(target_onehot)
print('\n')
print('One hot labels shape:')
print(np.shape(target_onehot))
len(one_hot_results)*.3

[[0. 0. 0. 1.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]
 ...
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]]


One hot labels shape:
(312094, 4)


93628.2

In [16]:
random.seed(42)
test_index = random.sample(range(0,300252), 90075)

In [17]:
test = one_hot_results[test_index]

In [18]:
len(test)

90075

In [19]:
train = np.delete(one_hot_results, test_index, 0)

MemoryError: 

In [None]:
label_test = target_onehot[test_index]
label_train = np.delete(target_onehot, test_index, 0)
print("Test label shape:", np.shape(label_test))
print("Train label shape:", np.shape(label_train))
print("Test shape:", np.shape(test))
print("Train shape:", np.shape(train))

In [None]:
from keras import backend as K
def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall
def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision
def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [None]:
comments.shape[0]

In [None]:
model = models.Sequential()
# model.add(Dropout(.2))
# model.add(layers.Dense(1000, activation='relu', ))
model.add(Dropout(.2))
model.add(layers.Dense(500, activation='relu', input_shape=(10000,)))
model.add(Dropout(.2))
model.add(layers.Dense(100, activation='relu'))
model.add(Dropout(.2))
model.add(layers.Dense(50, activation='relu'))
model.add(Dropout(.2))
model.add(layers.Dense(4, activation='softmax'))
model.compile(optimizer='RMSprop',
             loss='categorical_crossentropy',
             metrics=['accuracy', f1_m, precision_m, recall_m])
es_callback = (EarlyStopping(monitor='val_loss', patience=3))

In [None]:
results = model.fit(train,
                   label_train,
                   epochs=120,
                   batch_size=250,
                   validation_split=0.2,
                   callbacks = [es_callback])

In [None]:
# evaluate the model
loss, accuracy, f1_score, precision, recall = model.evaluate(test, label_test, verbose=0)

In [None]:
print(loss, accuracy, f1_score, precision, recall )

In [None]:
label_pred = model.predict(test)
label_pred =label_pred >.5

In [None]:
#get test with prediction results in a dataframe
classes = list(le.inverse_transform([0, 1, 2, 3]))
pred = pd.DataFrame(label_pred, columns = ['anger', 'fear', 'joy', 'sadness'] )
test_text = all_data.iloc[test_index]
test_text.reset_index(inplace = True)
pred_comp = pd.concat([test_text,pred] , axis = 1)
pred_comp.drop(columns = 'index' , inplace = True)

In [None]:
pd.set_option('display.max_colwidth', -1)
pred_comp.head(5)

In [None]:
pred_comp.head(5)

In [None]:
pred_comp.query(anger == True)

In [None]:
# Creating the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(label_test.argmax(axis=1), label_pred.argmax(axis=1))

In [None]:
cm

In [3]:
!python --version

Python 3.7.3


In [1]:
from mlxtend.plotting import plot_confusion_matrix
# import matplotlib.pyplot as plt
# fig, ax = plot_confusion_matrix(conf_mat=cm)
# plt.xticks(ticks = range(4), labels = list(target.value_counts().keys()), size  ='small')
# plt.yticks(ticks = range(4), labels = list(target.value_counts().keys()), size  ='small')

# # interp.plot_confusion_matrix(figsize=(10,10))

# plt.show()

In [None]:
pred_comp.head()

In [None]:
def visualize_training_results(results):
    history = results.history
    plt.figure()
    plt.plot(history['val_loss'])
    plt.plot(history['loss'])
    plt.legend(['val_loss', 'loss'])
    plt.title('Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.show()
    plt.figure()
    plt.plot(history['val_f1_m'])
    plt.plot(history['f1_m'])
    plt.legend(['val_f1_m', 'f1_m'])
    plt.title('F1 Scores')
    plt.xlabel('Epochs')
    plt.ylabel('F1 Scores')
    plt.show()

In [None]:
visualize_training_results(results)

In [None]:
# serialize model to JSON
model_json = model.to_json()
with open("keras_model.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("keras_model.json.h5")
print("Saved model to disk")
 

In [None]:
# later...
 
# load json and create model
json_file = open('keras_model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)

# load weights into new model
loaded_model.load_weights("keras_model.json.h5")
print("Loaded model from disk")