In [11]:
# Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.optimizers import SGD

# NLTK
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

import Stemmer

# Other
import re
import timeit
import string
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split


In [14]:

def normalize(v):
    norm = np.linalg.norm(v)
    if norm == 0: 
        return v
    return v / norm


def build_data(train_input_, train_label_index):
    x = []
    y = []
    
    for i in range(len(train_input_)):
        for j in range(len(train_input_[i])):
            #x.append(normalize(train_input_[i][j]))
            x.append(train_input_[i][j])
            y.append(train_label_index[i][j])
                    
    return (np.array(x), np.array(y))



In [15]:

data_path = 'dataV2-Small/'
TRAIN_SIZE = 10000
TEST_SIZE = int(TRAIN_SIZE*.2)

#Load Embeddings
    
#Train data embeddings
train_input_ = np.load(data_path + 'train_input.npy')

#Train labels as indexes
train_label_index_ = np.load(data_path + 'train_label_index.npy')


#Test data embeddings
test_input_ = np.load(data_path + 'test_input.npy')

#Test labels as indexes
test_label_index_ = np.load(data_path + 'test_label_index.npy')


print("Data Loaded!!!\n")

x_train, y_train = build_data(train_input_[:TRAIN_SIZE], train_label_index_[:TRAIN_SIZE])

print("Train data len:", len(x_train))
assert len(x_train) == len(y_train), "**Size Mismatch!!!***"


x_test, y_test = build_data(test_input_[:TEST_SIZE], test_label_index_[:TEST_SIZE])

print("Train data len:", len(x_test))
assert len(x_test) == len(y_test), "**Size Mismatch!!!***"

print("\nData formatting done!!!")


# transform labels into one hot representation
y_train_one_hot = (np.arange(np.max(y_train) + 1) == y_train[:, None]).astype(float)

y_test_one_hot = (np.arange(np.max(y_test) + 1) == y_test[:, None]).astype(float)

#lr = np.arange(OP_DIM)
#test_labels_one_hot = (lr==test_labels).astype(np.float)

"""
#removing zeroes and ones from the labels:
y_train_one_hot[y_train_one_hot==0] = 0.01
y_train_one_hot[y_train_one_hot==1] = 0.99

y_test_one_hot[y_test_one_hot==0] = 0.01
y_test_one_hot[y_test_one_hot==1] = 0.99
"""

print("\nx_train shape", x_train.shape)
print("y_train shape", y_train.shape)
print("y_train_one_hot shape", y_train_one_hot.shape)


print("\nx_test shape", x_test.shape)
print("y_test shape", y_test.shape)
print("y_test_one_hot shape", y_test_one_hot.shape)



Data Loaded!!!

Train data len: 73447
Train data len: 17288

Data formatting done!!!

x_train shape (73447, 25)
y_train shape (73447,)
y_train_one_hot shape (73447, 8)

x_test shape (17288, 25)
y_test shape (17288,)
y_test_one_hot shape (17288, 8)


In [16]:
y_train_one_hot[0]

array([0., 0., 0., 0., 0., 0., 0., 1.])

In [None]:
model = Sequential()
model.add(Dense(5000, activation='relu', input_dim=X_train.shape[1]))
model.add(Dropout(0.1))
model.add(Dense(600, activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(y_train.shape[1], activation='softmax'))

sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss='categorical_crossentropy',
              optimizer=sgd,
              metrics=['accuracy',])

model.fit(X_train, y_train,epochs=5,batch_size=2000)

score = model.evaluate(X_test, y_test, batch_size=2000)
score



In [20]:
#MLP Network architecture

IP_DIM = 25 #x_train.shape[1]
OP_DIM = 8  #y_train_one_hot.shape[1]


print('Building model...')
model_mlp = Sequential()

#max_features = 20000 #size of embedding
#Embedding(input_dim, output_dim, embeddings_initializer='uniform', ***, input_length=None)
#o/p will be model.output_shape == (None, 10 :input_dim, 64:output_dim), where None is the batch dimension of the matrix given.
#model_mlp.add(Embedding(max_features, 100, input_length=50))

## Dense(64) is a fully-connected layer with 64 hidden units.
# in the first layer, you must specify the expected input data shape: here, 20-dimensional vectors.
model_mlp.add(Dense(100, input_dim= IP_DIM, activation='relu'))
model_mlp.add(Dropout(0.5))

model_mlp.add(Dense(50, activation='relu'))
model_mlp.add(Dropout(0.5))

model_mlp.add(Dense(OP_DIM, activation='softmax'))

#sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
model_mlp.compile(loss='categorical_crossentropy', 
                  optimizer='adam', 
                  #optimizer = sgd,
                  metrics=['accuracy'])


print("Train data len:", len(x_train))

EPOCHS = 5
BATCH_SIZE = 32

print('\n\nTraining Model...')

start = timeit.default_timer()

#batch_size: Integer or None. Number of samples per gradient update. 
#If unspecified, batch_size will default to 32.
model_mlp.fit(x_train, y_train_one_hot,
              batch_size = BATCH_SIZE,
              epochs = EPOCHS,
              validation_data=(x_test, y_test_one_hot))

#model_lstm.fit(data, np.array(labels), validation_split=0.2, epochs=3)

print("\n\nTotal training time: %.4f seconds." % (timeit.default_timer() - start))

start = timeit.default_timer()
score, acc = model_mlp.evaluate(x_test, y_test_one_hot, batch_size = BATCH_SIZE)

print("\nTesting time: %.4f seconds." % (timeit.default_timer() - start))
print('\nTest score:', score)

print('Test accuracy:', acc)

Building model...
Train data len: 73447


Training Model...
Train on 73447 samples, validate on 17288 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


Total training time: 25.8022 seconds.

Testing time: 0.3590 seconds.

Test score: 0.24893969592536175
Test accuracy: 0.9136973623322536


In [18]:
model_mlp.output_shape

(None, 8)

In [158]:

"""
model.predict(x, batch_size=None, verbose=0, steps=None)
Generates output predictions for the input samples.

Computation is done in batches.

Arguments

x: The input data, as a Numpy array (or list of Numpy arrays if the model has multiple inputs).

batch_size: Integer. If unspecified, it will default to 32.

verbose: Verbosity mode, 0 or 1.

steps: Total number of steps (batches of samples) before declaring the prediction round finished. 
Ignored with the default value of None.

Returns: Numpy array(s) of predictions.

Raises

ValueError: In case of mismatch between the provided input data and the model's expectations, 
or in case a stateful model receives a number of samples that is not a multiple of the batch size.

"""

sample = process_sample(["Is this is a fucking joke?", "This is good.", "This is insane man!!!"])

print(model_lstm.predict_classes(sample))

print(model_mlp.predict_classes(sample))



(None, 50, 1)

In [187]:
#LSTM Network architecture

IP_DIM = 25 #x_train.shape[1]
OP_DIM = 8  #y_train_one_hot.shape[1]

print('Building model...')

#The network starts with an embedding layer.
#Turns positive integers (indexes) into dense vectors of fixed size allowing the n/w to represent a word in a meaningful way.
#eg. [[4], [20]] -> [[0.25, 0.1], [0.6, -0.2]]
#This layer can only be used as the first layer in a model.

#keras.layers.Embedding(input_dim, output_dim, embeddings_initializer='uniform', ***, input_length=None)

#input_dim: int > 0. Size of the vocabulary, i.e. maximum integer index + 1.

#output_dim: int >= 0. Dimension of the dense embedding.

#input_length: Length of input sequences, when it is constant. 
#This argument is required if you are going to connect Flatten then Dense layers upstream 
#(without it, the shape of the dense outputs cannot be computed).

#eg. model.add(Embedding(1000, 64, input_length=10))

# the model will take as input an integer matrix of size (batch, input_length).
# where the largest integer (i.e. word index) in the input should be no larger than 999 (vocabulary size).

# o/p will be model.output_shape == (None, 10 :input_dim, 64:output_dim), where None is the batch dimension of the matrix given.

model_lstm = Sequential()

model_lstm.add(Embedding(20000, 100, input_length=IP_DIM))

model_lstm.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model_lstm.add(Dense(1, activation='softmax'))

model_lstm.compile(loss='categorical_crossentropy', 
                   optimizer='rmsprop', 
                   metrics=['accuracy'])

print("Train data len:", len(X_train), ", Test data len:", len(X_test))

EPOCHS = 3
BATCH_SIZE = 32

print('\n\nTraining Model...')

start = timeit.default_timer()

#batch_size: Integer or None. Number of samples per gradient update. 
#If unspecified, batch_size will default to 32.
model_lstm.fit(x_train_seq, y_train_one_hot,
              batch_size = BATCH_SIZE,
              epochs = EPOCHS,
              validation_data=(x_test, y_test_one_hot))

#model_lstm.fit(data, np.array(labels), validation_split=0.2, epochs=3)

print("\n\nTotal training time: %.4f seconds." % (timeit.default_timer() - start))

start = timeit.default_timer()
score, acc = model_lstm.evaluate(x_test_seq, y_test, batch_size = BATCH_SIZE)

print("\nTesting time: %.4f seconds." % (timeit.default_timer() - start))
print('\nTest score:', score)
print('Test accuracy:', acc)


Building model...
Train data len: 19826 , Test data len: 4957


Training Model...
Train on 19826 samples, validate on 4957 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


Total training time: 254.2815 seconds.

Testing time: 4.7194 seconds.

Test score: 0.41748238686642775
Test accuracy: 0.8448658461938203
