In [61]:
import numpy as np 
import os
print(os.listdir("./input"))
import re
from nltk.corpus import stopwords
import pandas as pd
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Embedding, LSTM, Bidirectional, Dense, GlobalMaxPool1D
from keras.models import Model
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from keras.preprocessing.text import Tokenizer

['IMDB Dataset.csv']


In [62]:
maxLength = 100
maxVocabNumber = 100000

In [63]:
def cleanText(text, removeStopwords=True, performStemming=True):
    
    remove_special_char = re.compile('r[^a-z\d]', re.IGNORECASE)
    replace_numerics = re.compile(r'\d+', re.IGNORECASE)
    text = remove_special_char.sub('', text)
    text = replace_numerics.sub('', text)

    stop_words = set(stopwords.words('english')) 
    stemmer = SnowballStemmer('english')
    lemmatizer = WordNetLemmatizer()
    
    text = text.lower().split()

    
    processedText = list()
    for word in text:        
        if removeStopwords:
            if word in stop_words:
                continue
        if performStemming:
            word = stemmer.stem(word)
            
        word = lemmatizer.lemmatize(word)
        word = lemmatizer.lemmatize(word, 'v')
            
        processedText.append(word)

    text = ' '.join(processedText)

    return text

In [65]:
imdb = pd.read_csv('./input/IMDB Dataset.csv')
X = [cleanText(text) for text in list(imdb['review'])]
Y = [1 if sentiment=='positive' else 0 for sentiment in list(imdb['sentiment'])]

In [66]:
tokenizer = Tokenizer(num_words=maxVocabNumber)
tokenizer.fit_on_texts(X)

In [6]:
X_tokenized = pad_sequences(tokenizer.texts_to_sequences(X), maxlen=maxLength)

In [7]:
inp = Input(shape=(maxLength,))
embedding = Embedding(maxVocabNumber, 100)(inp)
bilstm = Bidirectional(LSTM(60, return_sequences = True))(embedding)
maxpool = GlobalMaxPool1D()(bilstm)
out = Dense(60, activation='relu')(maxpool)
out = Dense(1, activation='sigmoid')(out)
model = Model(inputs=inp, outputs=out)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 100, 100)          10000000  
_________________________________________________________________
bidirectional_1 (Bidirection (None, 100, 120)          77280     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 120)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 60)                7260      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 61        
Total params: 10,084,601
Trainable params: 10,084,601
Non-trainable params: 0
________________________________________________________________

In [10]:
batchSize = 1000
epochs = 4
hist = model.fit(X_tokenized, Y, batch_size=batchSize, epochs=epochs, verbose=1, shuffle=True, validation_split=0.5)

Train on 25000 samples, validate on 25000 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [68]:
review = ["It is a very bad movie and very disgusting"]
review = [cleanText(text) for text in review]
review = pad_sequences(tokenizer.texts_to_sequences(review),maxlen=maxLength)
print(review)
model.predict(review)

[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0   21
     3 1552]]


array([[0.18235047]], dtype=float32)

In [17]:
model.save("sentimentanalysis.h5")
print("Saved model to disk")

Saved model to disk


In [46]:
from keras.models import load_model

# Returns a compiled model identical to the previous one
model = load_model('sentimentanalysis.h5')

In [55]:
model.predict(review)

array([[0.97607625]], dtype=float32)