In [1]:
import pandas as pd
import numpy as np
import sys
import os
import re

from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from bs4 import BeautifulSoup
from collections import defaultdict

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical

from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten, LSTM, Activation
from keras.layers import Conv1D, MaxPooling1D, Embedding, Merge, Dropout
from keras.models import Model
from keras.models import Sequential

Using Theano backend.


In [2]:
data = pd.read_csv('C:/Users/Sekhar/OneDrive/INSOFE/20170923-batch29-cse7321c-cute04-team12-master/data_new.csv')

In [3]:
##Specify these metrics before hand. Gonna be useful later on.
MAX_SEQUENCE_LENGTH = 500
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2

In [4]:
##Write a function to clean the conversations
def clean_converse(raw_converse):
    #Convert the words to lower and split
    words = raw_converse.lower().split()
    #Stopwords
    stop = set(stopwords.words('english'))
    #Remove stop words
    words = [w for w in words if w not in stop]
    #Get the cleaned review
    return(" ".join(words))

In [5]:
##Append the cleaned reviews in the texts list and the target variable in the labels list
texts = []
labels = []
for idx in range(data.converse.shape[0]):
    texts.append(clean_converse(data.converse[idx].encode('ascii','ignore')))
    labels.append(data.categories[idx])

In [6]:
##Keras has some nice pre-processing steps for all the text. Below I'm tokenize the words and converting them to sequences
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

In [7]:
##Same as above, getting the word indexes. Will be useful later on
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 39147 unique tokens.


In [8]:
##Padding all the sequences to be of equal length
data_pad = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [9]:
##Okay, there is a peculiar reason why I'm converting the data into categorical here. "to_categorical" from sklearn is not working
##straight with the strings in the target variable. So I'm doing this.
labels = pd.Categorical(labels)


In [10]:
##Labels.codes gives you the numerical presentations which will be easy to one-hot vector
labels.codes

array([5, 1, 1, ..., 4, 4, 4], dtype=int8)

In [11]:
##Create one-hot vectors
labels = to_categorical(np.asarray(labels.codes))


In [12]:
len(labels)

57244

In [13]:
##Split the data into train and test
x_train,x_test,y_train,y_test = train_test_split(data_pad,labels,test_size = 0.2)

In [14]:
##This is the architecture for RNN, copied straight from the assignment.
embedding_vector_length = 32
model_LSTM = Sequential()
model_LSTM.add(Embedding(MAX_NB_WORDS, embedding_vector_length, input_length=MAX_SEQUENCE_LENGTH))
model_LSTM.add(Dropout(0.2))
model_LSTM.add(LSTM(100))
model_LSTM.add(Dropout(0.2))
model_LSTM.add(Dense(6, activation='sigmoid'))
model_LSTM.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['recall'])
print(model_LSTM.summary())

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_1 (Embedding)          (None, 500, 32)       640000      embedding_input_1[0][0]          
____________________________________________________________________________________________________
dropout_1 (Dropout)              (None, 500, 32)       0           embedding_1[0][0]                
____________________________________________________________________________________________________
lstm_1 (LSTM)                    (None, 100)           53200       dropout_1[0][0]                  
____________________________________________________________________________________________________
dropout_2 (Dropout)              (None, 100)           0           lstm_1[0][0]                     
___________________________________________________________________________________________

In [None]:
##Fit the model, damn this is taking a lot of time.
model_LSTM.fit(x_train, y_train, validation_data=(x_test,y_test), nb_epoch=5, batch_size=256)      

Train on 45795 samples, validate on 11449 samples
Epoch 1/5
 2048/45795 [>.............................] - ETA: 2192s - loss: 1.7435 - recall: 0.6660

In [21]:
##This is the accuracy without dropout layers, epochs = 5 and batch size = 128. Highest train accuracy is 84.4%    
scores = model_LSTM.evaluate(x_test, y_test, verbose=0)
print("Recall: %.2f%%" % (scores[1]*100))

Accuracy: 80.04%
