In [1]:
from collections import Counter
from datetime import datetime
 
import json
 
from keras.layers import Embedding, LSTM, Dense, Conv1D, MaxPooling1D, Dropout, Activation
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
 
import numpy as np

Using TensorFlow backend.


In [2]:
# Load the reviews. The file holds one JSON document per line, so it is parsed
# line by line while streaming instead of first reading the whole file into a
# single string and splitting it (which keeps several copies in memory).
t = datetime.now()
# JSON text is UTF-8; state it explicitly rather than relying on the
# platform's default encoding.
with open("data/dataset/review.json", encoding="utf-8") as f:
    # Blank lines (e.g. a trailing newline) are skipped instead of being
    # handed to json.loads, which would raise on an empty string.
    reviews = [json.loads(line) for line in f if line.strip()]
print(datetime.now() - t)  # elapsed load time

0:01:39.482764


In [4]:
# Build a class-balanced sample: the first `limit` negative and the first
# `limit` positive reviews, kept in the order they appear in `reviews`
# (the result is NOT shuffled).
texts = [review['text'] for review in reviews]

# Binary labels from the star rating: 3 stars or fewer -> 0 (negative),
# more than 3 stars -> 1 (positive).
binstars = [0 if review['stars'] <= 3 else 1 for review in reviews]
balanced_texts = []
balanced_labels = []
limit = 100000
neg_pos_counts = [0, 0]  # [negatives kept so far, positives kept so far]
for text, polarity in zip(texts, binstars):
    if neg_pos_counts[polarity] < limit:
        balanced_texts.append(text)
        balanced_labels.append(polarity)
        neg_pos_counts[polarity] += 1
        # Once both classes have reached the quota nothing more can be added,
        # so stop instead of scanning the remaining reviews.
        if min(neg_pos_counts) >= limit:
            break

# Displayed as the cell output: how many samples of each label were kept.
Counter(balanced_labels)

Counter({0: 100000, 1: 100000})

In [8]:
# Vectorize the balanced reviews: each text becomes a fixed-length row of
# integer word indices.
max_words = 20000  # value given to the Tokenizer as num_words
max_len = 300      # every padded row has exactly this many entries

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(balanced_texts)

# One list of word indices per review, then padded into a 2-D array.
sequences = tokenizer.texts_to_sequences(balanced_texts)
data = pad_sequences(sequences, maxlen=max_len)

# Peek at the first ten rows as a sanity check.
print(data[:10])

[[   0    0    0 ...,  688  249   84]
 [   0    0    0 ...,  441    2  122]
 [   0    0    0 ...,   51    1 1081]
 ..., 
 [   0    0    0 ...,    2    1 4743]
 [   0    0    0 ...,   29   54  107]
 [   0    0    0 ...,    2 1513  261]]


In [11]:
# Build the model: embedding -> dropout -> 1-D convolution + max pooling ->
# LSTM -> one sigmoid unit for the binary (0 = negative, 1 = positive) label.
model = Sequential()
# 20000 and 300 mirror the Tokenizer num_words and the pad_sequences maxlen
# that were used to build `data`.
model.add(Embedding(20000, 128, input_length=300))
model.add(Dropout(0.2))

model.add(Conv1D(64, 5, activation='relu'))
model.add(MaxPooling1D(pool_size=4))

model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Keras takes the validation_split fraction from the END of the arrays, before
# any shuffling. The balanced sample is stored in file order with a per-class
# cap, so its tail can be dominated by a single class. Shuffle samples and
# labels together (fixed seed for repeatability) so the training and
# validation halves both see the same class mix.
labels = np.array(balanced_labels)
shuffled_idx = np.random.RandomState(42).permutation(len(labels))
model.fit(data[shuffled_idx], labels[shuffled_idx], validation_split=0.5, epochs=3)  # Train rocky!

Train on 100000 samples, validate on 100000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fcae06305f8>

In [12]:
import pickle
 
# Write both artifacts to disk: the fitted tokenizer as a pickle file and the
# trained network as an HDF5 file.
with open("keras_tokenizer.pickle", "wb") as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)
model.save("yelp_sentiment_model.hdf5")

In [14]:
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
import pickle
 
# Reload the saved tokenizer and model from disk.
# NOTE: pickle.load can execute arbitrary code while unpickling; only load a
# tokenizer file that you created yourself.
with open("keras_tokenizer.pickle", "rb") as f:
    tokenizer = pickle.load(f)

model = load_model("yelp_sentiment_model.hdf5")

# replace with the data you want to classify
newtexts = ["Your new data", "More new data", "Everything sucks"]

# Vectorize with the already-fitted tokenizer (do not call "fit" on it again),
# using the same maxlen as at training time. The results get their own names
# so the training `sequences` / `data` arrays are not overwritten, which
# would make a later re-run of the training cell fit on these three samples.
new_sequences = tokenizer.texts_to_sequences(newtexts)
new_data = pad_sequences(new_sequences, maxlen=300)

# One sigmoid output per text; training labels were 1 for positive reviews
# and 0 for negative ones, so values near 1 mean "positive".
predictions = model.predict(new_data)
print(predictions)

[[ 0.9240576 ]
 [ 0.79750961]
 [ 0.13258116]]
