## Import Library

In [1]:
# import all libraries
import keras
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.layers.convolutional import Conv1D
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import pandas as pd
import numpy as np
import spacy

nlp=spacy.load("en")

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## Load Dataset

In [2]:
# load the dataset
train = pd.read_csv("./datasets/training.1600000.processed.noemoticon.csv" , encoding= "latin-1")
Y_train = train[train.columns[0]]
X_train = train[train.columns[5]]

In [3]:
# split the data into test and train
from sklearn.model_selection import train_test_split

trainset1x, trainset2x, trainset1y, trainset2y = train_test_split(X_train.values, Y_train.values, test_size=0.02,random_state=42 )
trainset2y=pd.get_dummies(trainset2y)

In [4]:
pd.DataFrame(data=trainset2x, columns=['text']).tail()

Unnamed: 0,text
395,@natestamp Probably with your girlfriend
396,It's my first day of school! Starting my cours...
397,@moneyhighway @BudgetPulse Ty both I apprecia...
398,inconsistent method signatures make me sad
399,@meg2e63 hows the beach retreat? I wanted to ...


## Data Preprocessing

In [5]:
#  function to remove stopwords
def stopwords(sentence):
    new=[]
    sentence=nlp(sentence)
    for w in sentence:
        if (w.is_stop == False) & (w.pos_ !="PUNCT"):
            new.append(w.string.strip())
        c=" ".join(str(x) for x in new)
    
    return c

In [6]:
# function to lemmatize the tweets
def lemmatize(sentence):
    sentence=nlp(sentence)
    str=""
    for w in sentence:
        str+=" "+w.lemma_
        
    return nlp(str)

In [7]:
# loading the glove model
def loadGloveModel(gloveFile):
    print("Loading Glove Model...")
    
    f = open(gloveFile,'r')
    model = {}
    
    for line in f:
        splitLine = line.split()
        word = splitLine[0]
        embedding = [float(val) for val in splitLine[1:]]
        model[word] = embedding
    
    print ("Done."),len(model),(" words loaded!")
    return model

In [8]:
# load the glove model
glove_model = loadGloveModel("./datasets/glove/glove.twitter.27B.200d.txt")

Loading Glove Model
Done.


#### Vectorizing the sentences

In [9]:
# vectorising the sentences
def sent_vectorizer(sent, model):
    sent_vec = np.zeros(200)
    numw = 0
    for w in sent.split():
        try:
            sent_vec = np.add(sent_vec, model[str(w)])
            numw+=1
        except:
            pass
    
    return sent_vec

In [10]:
# obtain a clean vector
cleanvector = []
for i in range(trainset2x.shape[0]):
    document=trainset2x[i]
    document=document.lower()
    document=lemmatize(document)
    document=str(document)
    cleanvector.append(sent_vectorizer(document, glove_model))

In [11]:
# getting the input and output in proper shape
cleanvector=np.array(cleanvector)
cleanvector =cleanvector.reshape(len(cleanvector),200,1)

In [12]:
# tokenizing the sequences
tokenizer = Tokenizer(num_words=16000)
tokenizer.fit_on_texts(trainset2x)
sequences = tokenizer.texts_to_sequences(trainset2x)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
data = pad_sequences(sequences, maxlen=15, padding="post")
print(data.shape)

Found 1873 unique tokens.
(400, 15)


#### Data vector preparation

In [13]:
# reshape the data and preparing to train
data = data.reshape(len(cleanvector),15,1)
from sklearn.model_selection import train_test_split
trainx, validx, trainy, validy = train_test_split(data, trainset2y, test_size=0.3, random_state=42)

In [14]:
# calculate the number of words
nb_words=len(tokenizer.word_index)+1

In [15]:
# obtain theembedding matrix
embedding_matrix = np.zeros((nb_words, 200))
for word, i in word_index.items():
    embedding_vector = glove_model.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

Null word embeddings: 346


In [16]:
trainy = np.array(trainy)
validy = np.array(validy)

In [17]:
print(trainx.shape)

(280, 15, 1)


## RNN Model Architecture

In [18]:
# building a simple RNN model
def build_rnn_model():
    model = Sequential()
    model.add(keras.layers.InputLayer(input_shape=(15,1)))
    keras.layers.embeddings.Embedding(nb_words, 15, weights=[embedding_matrix], input_length=15, trainable=False)
 
    model.add(keras.layers.recurrent.SimpleRNN(units=100, activation='relu', use_bias=True))
    model.add(keras.layers.Dense(units=1000, input_dim=2000, activation='sigmoid'))
    model.add(keras.layers.Dense(units=500, input_dim=1000, activation='relu'))
    model.add(keras.layers.Dense(units=2, input_dim=500, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

## Training Model

In [19]:
# compiling the model
rnn_model = build_rnn_model()
rnn_model.fit(trainx, trainy, epochs=10, batch_size=120, validation_data=(validx, validy))

Train on 280 samples, validate on 120 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x182e4dbfd0>

## Predict!

In [20]:
# sentence = "I love this impressive RNN model so much!"" # pass into the pipeline
# prediction = rnn_model.predict(sentence)

<hr/>