# IMDB Review Sentiment Analysis
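This notebook trains an LSTM to classify IMDB movie reviews as positive or negative. TODO: expand this description of the problem and add links to the dataset and background reading.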

In [1]:
import os
import sys
from glob import glob

import numpy as np
import pandas as pd
from keras.layers import Dense, LSTM, Dropout
from keras.layers.embeddings import Embedding
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split

Using TensorFlow backend.


In [2]:
# TODO: check whether the dataset is already present locally and download it if not.
# NOTE: Keras ships this dataset pre-tokenized via keras.datasets.imdb.load_data();
# the raw text files are read manually here for clarity.
VOCAB_PATH = 'data/imdb.vocab'
TRAIN_PATH = 'data/train'
TEST_PATH = 'data/test'

SEED = 2018  # random seed, presumably intended for reproducible runs
VOCAB_SIZE = 100  # passed to Tokenizer(num_words=...) and used as the Embedding input size
MAX_REVIEW_LEN = 250  # maxlen given to pad_sequences and input_length of the Embedding
NUM_EPOCHS = 5  # epochs passed to KerasClassifier
BATCH_SIZE = 64  # batch size passed to KerasClassifier

In [3]:
def get_x_y(file_path):
    """Load review texts and sentiment labels from a directory of text files.

    ``file_path`` is expected to contain ``pos`` and ``neg`` sub-directories,
    each holding one ``*.txt`` file per review.

    Args:
        file_path: Directory that contains the ``pos`` and ``neg`` folders.

    Returns:
        A tuple ``(x, y)`` of parallel lists. ``x`` holds the full text of
        each file and ``y`` holds 1 for files under ``pos`` and 0 for files
        under ``neg``. All positive samples come first, then all negative
        ones; within a class, files are ordered by path. Both lists are
        empty when no files match.
    """
    x = []
    y = []
    for sentiment, label in (('pos', 1), ('neg', 0)):
        # glob() returns matches in arbitrary, OS-dependent order; sorting
        # makes the sample order reproducible across machines and runs.
        file_names = sorted(glob(os.path.join(file_path, sentiment, '*.txt')))
        for file_name in file_names:
            # Decode explicitly as UTF-8 rather than relying on the platform's
            # default encoding, which can fail on non-ASCII review text.
            with open(file_name, encoding='utf-8') as file_:
                x.append(file_.read())
        y.extend([label] * len(file_names))

    return x, y

In [4]:
# Read the review texts and their 0/1 sentiment labels for the train and test
# splits from disk.
x_train, y_train = get_x_y(TRAIN_PATH)
x_test, y_test = get_x_y(TEST_PATH)

In [5]:
# Build the tokenizer's word index from the training reviews only; the test
# reviews are never used for fitting. num_words=VOCAB_SIZE limits the
# vocabulary used when texts are later converted to sequences (see the Keras
# Tokenizer documentation for the exact cut-off).
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(x_train)

In [6]:
# Convert every review into a sequence of word indices using the fitted
# tokenizer, then pad/truncate each sequence to MAX_REVIEW_LEN entries.
# The same transformation is applied to the training and the testing split.
x_train = pad_sequences(tokenizer.texts_to_sequences(x_train), maxlen=MAX_REVIEW_LEN)
x_test = pad_sequences(tokenizer.texts_to_sequences(x_test), maxlen=MAX_REVIEW_LEN)

In [7]:
def create_model(embedding_vector_length=32, dropout_rate=0.2):
    """Build and compile the LSTM sentiment classifier.

    The network is an embedding layer, dropout, a 100-unit LSTM, dropout, and
    a single sigmoid output unit, compiled with binary cross-entropy loss and
    the Adam optimizer, tracking accuracy.

    Args:
        embedding_vector_length: Output dimension of the embedding layer.
        dropout_rate: Rate used by both dropout layers.

    Returns:
        The compiled ``Sequential`` model.
    """
    layer_stack = [
        Embedding(VOCAB_SIZE, embedding_vector_length, input_length=MAX_REVIEW_LEN),
        Dropout(dropout_rate),
        LSTM(100),
        Dropout(dropout_rate),
        Dense(1, activation='sigmoid'),
    ]
    network = Sequential(layer_stack)
    network.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return network

In [8]:
# model = create_model()
# model.fit(x_train, y_train, epochs=NUM_EPOCHS, batch_size=BATCH_SIZE)
# scores = model.evaluate(x_test, y_test, verbose=0)
# print("Accuracy: %.2f%%" % (scores[1]*100))

In [9]:
# Wrap create_model in the scikit-learn estimator interface so it can be
# handed to GridSearchCV; epochs and batch size are fixed for every fit.
model = KerasClassifier(build_fn=create_model, epochs=NUM_EPOCHS, batch_size=BATCH_SIZE, verbose=1)

In [10]:
# Hyper-parameter grid to search; each key is a keyword argument of
# create_model.
params = {
    'dropout_rate': [0.1, 0.2, 0.5]
}

# get_x_y returns every positive review followed by every negative one, so an
# unshuffled K-fold split would yield folds dominated by a single class.
# Shuffled, stratified folds keep both classes represented in every fold, and
# seeding the splitter with SEED makes the split reproducible.
grid = GridSearchCV(
    estimator=model,
    scoring="accuracy",
    param_grid=params,
    cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=SEED),
)

In [None]:
# Run the grid search on the training data; the returned object is inspected
# below for the best score/parameters and the per-combination results.
grid_result = grid.fit(x_train, y_train)

Epoch 1/5

In [None]:
# Report the best configuration first, then the mean and standard deviation of
# the cross-validated score for every parameter combination that was tried.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))