Download the amazon product dataset from [here](http://jmcauley.ucsd.edu/data/amazon/)

In [5]:
import json

def get_reviews():
    for line in open("/Users/plee/workspace/ipython_notebooks/data/Apps_for_Android_5.json", "rb").readlines():
        js = json.loads(line)    
        yield js['reviewText'], js['overall']

I am just taking the reviews with less than 75 words

In [8]:
import numpy as np

SENTENCE_LENGTH = 75

reviews = []
scores = []
for review, score in get_reviews():
    if score != 3:
        word_count = len(review.split(' '))
        if word_count < SENTENCE_LENGTH:
            reviews.append(str(review))
            scores.append(score)

In [7]:
y = map(lambda x: 0 if x < 3 else 1, scores)

In [13]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

class Vectorizer:
    def __init__(self, maxlen):
        self.tokenizer = Tokenizer()
        self.maxlen = maxlen
        
    def fit(self, texts):
        self.tokenizer.fit_on_texts(texts)
        self.word_index = self.tokenizer.word_index

    def transform(self, texts):
        sequences = self.tokenizer.texts_to_sequences(texts)
        return pad_sequences(sequences, maxlen=self.maxlen)

    def fit_transform(self, texts):
        self.fit(texts)
        return self.transform(texts)

In [98]:
# import numpy as np

# reviews = []
# scores = []
# for review, score in get_reviews():
#     if score != 3:
#         word_count = len(review.split(' '))
#         if word_count < 75:
#             reviews.append(str(review))
#             scores.append(score)
            
# from sklearn.preprocessing import MinMaxScaler
# scaler = MinMaxScaler()
# y = scaler.fit_transform(np.array(scores).reshape((len(scores),1)))






In [14]:
v = Vectorizer(SENTENCE_LENGTH)
X = v.fit_transform(reviews)
input_dim = len(v.word_index) + 1
del reviews


In [16]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding, Conv1D, MaxPooling1D, Dropout, Flatten, AveragePooling1D
dropout = 0.1
nb_filter = 4
nb_conv = 3
model = Sequential()
model.add(Embedding(input_dim, 32, input_length=SENTENCE_LENGTH))
model.add(Dropout(dropout))

model.add(Conv1D(nb_filter, nb_conv, padding='same'))

model.add(MaxPooling1D(pool_size=6, padding='same'))

model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])

In [17]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 75, 32)            4245632   
_________________________________________________________________
dropout_1 (Dropout)          (None, 75, 32)            0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 75, 4)             388       
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 13, 4)             0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 52)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 53        
Total params: 4,246,073
Trainable params: 4,246,073
Non-trainable params: 0
_________________________________________________________________


In [88]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding, Conv1D, MaxPooling1D, Dropout, Flatten, AveragePooling1D
dropout = 0.1
nb_filter = 4
nb_conv = 5
model = Sequential()
model.add(Embedding(input_dim, 16, input_length=Vectorizer.MAX_SEQUENCE_LENGTH))
model.add(Dropout(dropout))

model.add(Conv1D(nb_filter, nb_conv, padding='same'))

model.add(MaxPooling1D(pool_size=6, padding='same'))

model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['accuracy'])

In [93]:

# def pickling(obj, name):
#     open(name, 'wb').write(pickle.dumps(obj))

# def get_reviews():
#     for line in open("reviews_Apps_for_Android_5.json", "rb").readlines():
#         js = json.loads(line)
#         yield js['reviewText'], js['overall']

import os
i = 3

In [90]:
i += 1
folder = "lstm_%s"%i
os.mkdir(folder)
from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding, Conv1D, MaxPooling1D, Dropout, Flatten, AveragePooling1D, LSTM
dropout = 0.5
embedding_vector_length = 16
model = Sequential()
model.add(Embedding(input_dim, embedding_vector_length, input_length=75))
model.add(Dropout(dropout))
#without return_sequences, LSTM will become 2d instead of 3d as expected
model.add(LSTM(embedding_vector_length, return_sequences=True))
model.add(Dropout(dropout))
model.add(LSTM(embedding_vector_length, return_sequences=True))
model.add(Dropout(dropout))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
#model.add(Dense(1, activation='softmax'))

model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])

with open(os.path.join(folder, 'summary.txt'), 'wb') as f:
    pr = lambda x: f.write("%s\n"%x)
    model.summary(print_fn=pr)
    
from keras.callbacks import EarlyStopping
early_stopping_monitor = EarlyStopping(patience=2)

callbacks = []
callbacks.append(early_stopping_monitor)
history = model.fit(X, y, epochs=5, validation_split=0.2,
        callbacks=callbacks)

open(os.path.join(folder, 'history.txt'), 'w').write(str(history.history))


### I have done the above code in a terminal instead

In [4]:
from keras.models import load_model
model = load_model('amazon')
from perry import unpickling
#vector class needs to be defined first
v = unpickling('vectorizer.pkl')

In [16]:
reviews[:18]

[u"Loves the song, so he really couldn't wait to play this. A little less interesting for him so he doesn't play long, but he is almost 3 and likes to play the older games, but really cute for a younger child.",
 u'Oh, how my little grandson loves this app. He\'s always asking for "Monkey." Grandma has tired of it long before he has. Finding the items on each page that he can touch and activate is endlessly entertaining for him, at least for now. Well worth the $.99.',
 u"I found this at a perfect time since my daughter's favorite song at the moment is five little monkeys. It's easy for little ones to operate and fun with all the things you can play with. The different generes are nice too, gives its a little variety.  Worth the money. Highly recommended!",
 u"My 1 year old goes back to this game over and over again.  It is simple but very easy for a toddler to use.  I've even caught my 5 year old playing it as well.  Good app for little guys!",
 u"There are three different versions of

In [34]:
model.predict_proba(v.transform(["I am happy"]))



array([[ 0.91688085]], dtype=float32)

In [54]:
import pandas as pd

test = pd.read_csv('/tmp/all_test.csv')
train = pd.read_csv('/tmp/all_train.csv')

In [55]:
from keras.models import load_model



In [59]:
def true_y(df):
    true_y = []
    for y in df['Unnamed: 3']:
        true_y.append(0 if y < 3 else 1)
    return true_y

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
y = scaler.fit_transform(np.array(scores).reshape((604106,1)))

import numpy as np
from vectorizer import Vectorizer
from sklearn.metrics import f1_score

In [61]:
pred_y = model.predict_proba(v.transform(test['Segment']))
len(pred_y[pred_y[:,0] == test_y])


f1_score(pred_y[:,0], y)

0    2
1    2
2    2
3    2
4    2
5    2
6    2
7    2
8    2
9    2
Name: Unnamed: 3, dtype: int64

In [62]:
model.predict_proba(v.transform(train['Segment'][:10]))



array([[ 0.87394553],
       [ 0.29533932],
       [ 0.9721629 ],
       [ 0.8462379 ],
       [ 0.85978496],
       [ 0.61218536],
       [ 0.83538377],
       [ 0.83985728],
       [ 0.34078816],
       [ 0.89358461]], dtype=float32)

In [60]:
ddfrom sklearn.metrics import f1_score
f1_score(true_y, pred_y)

0.68888888888888877

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVC, SVR
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
v = TfidfVectorizer(ngram_range=(1,2))
X = v.fit_transform(reviews)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
     X, y, test_size=0.3, random_state=42)

clf = SVR()
#clf = GradientBoostingRegressor()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

import numpy as np
np.abs(y_pred - y_test).sum()/len(y_test)
