In [2]:
# Ref: https://towardsdatascience.com/multi-class-text-classification-with-lstm-1590bee1bd17
import io
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from keras.callbacks import EarlyStopping
from keras.layers import (Bidirectional, Dense, Dropout, Embedding,
                          GaussianNoise, LSTM, SpatialDropout1D)
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from sklearn.model_selection import train_test_split

In [3]:
import sys

# Add ../src to the module search path so modules stored there can be imported.
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
SRC_DIR = "../src"
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

## Read data

In [5]:
from preprocessing import *

In [None]:
# Read the tweets CSV and pass it through the project's preprocess_text helper,
# telling it which column holds the raw text.
text_col = 'content'
data = preprocess_text(pd.read_csv('../Dataset/Tweets.csv'), text_col)
# Preview the first ten rows.
data.head(10)

# Categorize into 3 categories

In [None]:
# Categorize into 3 categories
#
# Collapse the fine-grained emotion labels into negative / positive / neutral.
# Only the 'sentiment' column is rewritten. A row-only assignment such as
# `data.loc[mask] = 'negative'` writes the label string into EVERY column of
# the matching rows, which would replace the tweet text itself with the label.
SENTIMENT_TO_CATEGORY = {
    'anger': 'negative',
    'hate': 'negative',
    'worry': 'negative',
    'sadness': 'negative',
    'boredom': 'negative',
    'relief': 'positive',
    'happiness': 'positive',
    'love': 'positive',
    'enthusiasm': 'positive',
    'surprise': 'positive',
    'fun': 'positive',
    'empty': 'neutral',
}
# Labels absent from the mapping (e.g. an existing 'neutral') are left as-is.
data['sentiment'] = data['sentiment'].replace(SENTIMENT_TO_CATEGORY)
data.sentiment.unique()

In [None]:
# Keep a reference to the full labelled frame (an alias, not a copy), then
# pull out one sub-frame per sentiment category.
saved_data = data
neutral_data, negative_data, positive_data = (
    data.loc[data['sentiment'].eq(category)]
    for category in ('neutral', 'negative', 'positive')
)

In [None]:
# Obtain 5000 from each category
# A fixed seed makes the draw, and everything trained on it, repeatable
# across kernel restarts; unseeded sampling gave a different dataset per run.
SAMPLE_SEED = 42
SAMPLES_PER_CATEGORY = 5000
data = pd.concat([
    category_frame.sample(n=SAMPLES_PER_CATEGORY, random_state=SAMPLE_SEED)
    for category_frame in (neutral_data, negative_data, positive_data)
])
# Shuffle data
data = data.sample(frac=1, random_state=SAMPLE_SEED).reset_index(drop=True)
# Check the number of items in each category
for i in data.sentiment.unique():
    print(len(data.loc[data['sentiment'] == i]), i)

## Create embedding layer

In [18]:
from embedding import *
from keras.preprocessing.text import Tokenizer

In [27]:
# create embedding layers from file
embeddings, dim = get_embeddings("../glove/glove.6B.100d.txt")

# The tokenizer is fitted on the embedding vocabulary itself (all keys joined
# into a single space-separated text), not on the tweets. filters='' turns off
# the tokenizer's default character filtering.
tokenizer = Tokenizer(filters='')
tokenizer.fit_on_texts([' '.join(embeddings.keys())])
word_index = tokenizer.word_index

# Build the weight matrix from the loaded vectors and the tokenizer's index.
embedding_matrix = get_embedding_matrix(embeddings, word_index, dim)

In [None]:
# Reload the GloVe vectors. Uses get_embeddings, the loader name the
# embedding-matrix cell calls, instead of the stale camelCase getEmbeddings.
embeddings, dim = get_embeddings("../glove/glove.6B.100d.txt")

## Dataset

In [None]:
# Maximum number of tokens kept per tweet; passed as `maxlen` to pad_sequences
# when the tokenized tweets are padded.
MAX_SEQUENCE_LENGTH = 30

In [None]:
# Turn every tweet into a sequence of integer word indices.
X = tokenizer.texts_to_sequences(data['content'].values)
# Longest tokenized tweet, for comparison against MAX_SEQUENCE_LENGTH.
print(max(len(sequence) for sequence in X))
# Bring every sequence to exactly MAX_SEQUENCE_LENGTH entries.
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)

In [None]:
#TODO: apply the column for data Y
# One-hot encode the sentiment labels: one indicator column per distinct label.
Y = pd.get_dummies(data.sentiment).values
print('Shape of label tensor:', Y.shape)

In [None]:
# Split features and labels with a fixed seed.
# NOTE(review): test_size=0.80 leaves only 20% of the rows for training.
# Confirm this is intended and not a swapped 0.20.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.80,
    random_state=42,
)
print(*(part.shape for part in (X_train, Y_train)))
print(*(part.shape for part in (X_test, Y_test)))

## Build Model

In [None]:
# Embedding layer initialised from the pre-built embedding matrix and frozen
# (trainable=False), so the vectors are not updated during training.
embedding_dim = embedding_matrix.shape[1]
embedding_layer = Embedding(embedding_matrix.shape[0],
                            embedding_dim,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

# Stack: embeddings -> noise / dropout regularisation -> BiLSTM -> dense head.
model = Sequential()
model.add(embedding_layer)
# No input_shape here: the layer infers its input from the embedding output.
# The argument is only meaningful on the first layer of a Sequential model and
# must not include the batch dimension, so the previous
# input_shape=(None, MAX_SEQUENCE_LENGTH, embedding_dim) was both ignored and
# malformed.
model.add(GaussianNoise(0.6))
model.add(SpatialDropout1D(0.6))
model.add(Bidirectional(LSTM(64, dropout=0.6, recurrent_dropout=0.3)))
model.add(Dropout(0.6))
model.add(Dense(64, activation='relu'))
# One softmax output per label column in Y_train.
model.add(Dense(Y_train.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['categorical_accuracy'])
#model.summary()

## Train the model

In [None]:
# Training hyper-parameters.
epochs, batch_size = 10, 50

# Fit with early stopping: training halts once val_loss has failed to improve
# by at least 0.001 for 3 consecutive epochs.
# NOTE(review): the validation data is the same (X_test, Y_test) split that is
# evaluated afterwards, so the stopping point is tuned on the reported test
# set. Confirm this is acceptable or hold out a separate validation split.
history = model.fit(
    X_train,
    Y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_data=(X_test, Y_test),
    verbose=1,
    callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.001)],
)

In [None]:
# Score the trained model on the held-out split; with the single metric
# configured at compile time the result is [loss, categorical_accuracy].
accr = model.evaluate(X_test, Y_test)
print(accr)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(*accr))

In [None]:
# Training vs. validation loss per epoch, drawn on the current axes.
loss_axes = plt.gca()
loss_axes.set_title('Loss')
loss_axes.plot(history.history['loss'], label='train')
loss_axes.plot(history.history['val_loss'], label='test')
loss_axes.legend()
plt.show();

In [None]:
#history.history

In [None]:
# Training vs. validation categorical accuracy per epoch, drawn on the current axes.
accuracy_axes = plt.gca()
accuracy_axes.set_title('Accuracy')
accuracy_axes.plot(history.history['categorical_accuracy'], label='train')
accuracy_axes.plot(history.history['val_categorical_accuracy'], label='test')
accuracy_axes.legend()
plt.show();