In [1]:
# import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from string import punctuation
from sklearn import svm
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
#nltk.download('perluniprops')
from nltk import ngrams
from itertools import chain

import pickle

### Fetch the data

In [2]:
with open('tokenized_reviews.pickle', 'rb') as handle:
    df = pickle.load(handle)

df.head()

Unnamed: 0,Score,Label,WordIndeces
0,5,1,"[2, 16, 123, 317, 7, 1, 4998, 516, 102, 51, 20..."
1,1,0,"[40, 372, 2210, 22, 5809, 1960, 1075, 1, 1075,..."
2,4,1,"[9, 8, 4, 7126, 13, 49, 82, 275, 4, 162, 9315,..."
3,2,0,"[35, 18, 19, 250, 10, 1, 2625, 570, 11, 24049,..."
4,5,1,"[37, 3580, 31, 4, 37, 86, 78, 20, 4, 2073, 214..."


### Save data as labels and input

In [3]:
input_array = df.WordIndeces.values
labels = df.Label.values

In [4]:
tmp = np.concatenate(input_array).ravel()
input_data = np.reshape(tmp,(len(input_array),500,1))
print(input_data.shape)

(568454, 500, 1)


## Create a model

In [5]:
# Add your import statements here
from keras import Input, Model
from keras.layers import Activation, TimeDistributed, Dense, RepeatVector, Embedding
from keras.layers.recurrent import LSTM
from keras.callbacks import TensorBoard, EarlyStopping
from keras.optimizers import Adam, RMSprop

def create_model(X_seq_len, output_n, hidden_dim, b_size):
    """ Define a keras sequence-to-sequence model. 
    
    Arguments:
    input_n - integer, the number of inputs for the network (the length of a one-hot vector from `X`)
    X_seq_len - integer, the length of a sequence from `X`. Should be constant and you made sure by using padding
    output_n - integer, the number of outputs for the network (the length of a one-hot vector from `Y`)
    Y_seq_len - integer, the length of a sequence from `Y`. Should be constant and you made sure by using padding
    hidden_dim - integer, number of units in the LSTM's memory cell.
    embedding_dim - output dimension of the embedding layer.
    
    Returns:
    The compiled keras model
    
    """
    # Input and embedding layers
    input_layer = Input(batch_shape=(b_size,X_seq_len),shape=(X_seq_len,1))
    input_layer.reshape((b_size,X_seq_len,1))
    print(input_layer)
    
    # Create the encoder LSTM.
    # correct number of units?
    encoder_LSTM = LSTM(units=hidden_dim, activation='tanh', recurrent_activation='hard_sigmoid')
    encoder_output = encoder_LSTM(input_layer)
    
    # Add a fully connected layer and a softmax to the outputs of the decoder
    fully_connected_output = TimeDistributed(Dense(output_n))(encoder_output)
    softmax_output = Activation('softmax')(fully_connected_output)
    
    # Create final model and compile it
    model = Model([input_layer], softmax_output)
    
    # Compile the model. Use a loss function, optimizer, and metrics of your choice
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    
    # Add these arguments to the model for convenience
    model.hidden_dim = hidden_dim
    
    return model

Using TensorFlow backend.


In [10]:
from keras.models import Sequential

def create_model(X_seq_len, output_n, hidden_dim):

    model = Sequential()
    model.add(LSTM(hidden_dim, input_shape=(X_seq_len, 1)))
    model.add(Dense(output_n,activation='sigmoid'))
    
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    return model

## Small fix

In [11]:
# lessen the amount of data
num_smaller = 10000
input_data_lessen = input_data[0:num_smaller]
labels_lessen = labels[0:num_smaller]

## First model, uneven split (bleh bad)

In [13]:
X_seq_len = len(input_data[0]) #the input data is padded
output_dim = 1
hidden_dim = 10
num_epochs = 5
b_size = 64

model = create_model(X_seq_len,output_dim,hidden_dim)
name = 'first_model'

# Define a tensorboard callback
tb = TensorBoard(log_dir='./logs/'+name)
    
# Print model summary and train
model.summary()
current_history = model.fit(input_data_lessen, labels_lessen, verbose=1, batch_size=b_size, epochs=num_epochs, callbacks=[tb], validation_split=0.2);
    

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_4 (LSTM)                (None, 10)                480       
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 11        
Total params: 491
Trainable params: 491
Non-trainable params: 0
_________________________________________________________________
Train on 8000 samples, validate on 2000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


## Fetch even data

In [17]:
with open('X_train_even.pickle', 'rb') as handle:
    X_train_even = pickle.load(handle)
    
# with open('X_test_even.pickle', 'rb') as handle:
#     X_test_even = pickle.load(handle)
    
with open('Y_train_even.pickle', 'rb') as handle:
    Y_train_even = pickle.load(handle)
    
# with open('Y_test_even.pickle', 'rb') as handle:
#     Y_test_even = pickle.load(handle)

print('Total number of even training data points: %d' %len(X_train_even))

Total number of even training data points: 199483


## First model on even data

#### Create a smaller data set for testing

In [23]:
lesser_number = 10000

small_Y = Y_train_even[0:lesser_number]
small_X = X_train_even[0:lesser_number]
neg_share = len(small_Y[small_Y==1])/lesser_number

print('The share of negative reviews:')
print(neg_share)
print('It is approximately even!')

The share of negative reviews:
0.4924
It is approximately even!


In [24]:
X_seq_len = len(small_X[0]) #the input data is padded
output_dim = 1
hidden_dim = 10
num_epochs = 5
b_size = 64

model = create_model(X_seq_len,output_dim,hidden_dim)
name = 'first_model_even_data'

# Define a tensorboard callback
tb = TensorBoard(log_dir='./logs/'+name)
    
# Print model summary and train
model.summary()
current_history = model.fit(small_X, small_Y, verbose=1, batch_size=b_size, epochs=num_epochs, callbacks=[tb], validation_split=0.2);
    

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_8 (LSTM)                (None, 10)                480       
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 11        
Total params: 491
Trainable params: 491
Non-trainable params: 0
_________________________________________________________________
Train on 8000 samples, validate on 2000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
