In [9]:
from __future__ import division, print_function, absolute_import
from sklearn.feature_extraction.text import *
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split

import keras
from keras.datasets import mnist
from keras.layers import Dense, Flatten, LSTM, Dropout
from keras.layers import Conv2D, MaxPooling2D
from keras.models import Sequential
from keras.models import model_from_json

import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import random
import sklearn
import string

In [3]:
def split_train_test(df, test_size, random_state=None):
    """Randomly split ``df`` into a train frame and a test frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to split.
    test_size : float or int
        Forwarded unchanged to ``train_test_split`` as ``test_size``.
    random_state : int, optional
        Seed forwarded to ``train_test_split``. When omitted, a seed is drawn
        with ``random.randint(0, 99)`` exactly as before, so the split differs
        between runs unless ``random.seed(...)`` was called first. Pass an
        explicit value to make the split reproducible.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
        Both frames have their index reset with ``drop=True`` (old index
        discarded).
    """
    if random_state is None:
        random_state = random.randint(0, 99)
    train, test = train_test_split(df, test_size=test_size, random_state=random_state)
    # Non-inplace reset: returns new frames instead of mutating the objects
    # handed back by sklearn (in-place edits on row subsets of ``df`` can
    # trigger pandas' SettingWithCopyWarning).
    return train.reset_index(drop=True), test.reset_index(drop=True)

# Minimized dataset used throughout this notebook.
dataset_path = 'new_data_william.csv'

# The first CSV column is used as the row index.
df = pd.read_csv(dataset_path, index_col=0, sep=',')

# Two-stage split giving Training: 60%, Validation: 20%, Testing: 20%:
#   1) hold out 20% of all rows as the test set;
#   2) hold out 25% of the remaining 80% (= 20% of all rows) for validation.
df_train, df_test = split_train_test(df, test_size=0.2)
df_train, df_val = split_train_test(df_train, test_size=0.25)

# Preview the first rows of the training split.
df_train.head()

Unnamed: 0,reviewer_id,asin,overall,category,score,good,bad,review_text
0,A3I92N6EHQI3IP,B000ERAON2,5,10,0.85,1,0,ultimate portable amazing much prices fallen p...
1,A2UQK3DAZ8NO2T,B000N48GEU,5,10,0.960784,1,0,potential king compacts nikon two newest lcd e...
2,A1LKSZ9CYJ6829,300110308,3,5,0.727273,0,0,crossfire become conversation according author...
3,A2YC63G8RC4ILQ,B002QEBMAK,4,10,0.888889,1,0,great external drive money elements replaced s...
4,A3DRQQ9DIFW5W9,425234339,3,5,0.1,0,1,least better last got book library given buyin...


In [5]:
class TfidfFeaturizer(object):
    """Turn one text column of a DataFrame into a dense tf-idf feature matrix.

    The underlying ``TfidfVectorizer`` is created in ``fit_transform`` with
    ``ngram_range=(1, 3)`` and ``max_features=self.max_features``.

    Parameters
    ----------
    col_name : str
        Name of the DataFrame column that holds the documents.
    max_features : int
        Forwarded to ``TfidfVectorizer`` as ``max_features``.
    """

    def __init__(self, col_name='review_text', max_features=1000):
        self.col_name = col_name
        self.max_features = max_features
        # Set by fit_transform(); None means "not fitted yet".
        self.vectorizer = None

    def fit_transform(self, df):
        """Fit a new vectorizer on ``df[col_name]`` and return its tf-idf matrix.

        Any previously stored vectorizer is replaced. The sparse result is
        converted to a dense array with ``.toarray()``.
        """
        docs = self.__create_doc_list(df)
        self.vectorizer = TfidfVectorizer(
            ngram_range=(1, 3),
            max_features=self.max_features
        )
        return self.vectorizer.fit_transform(docs).toarray()

    def transform(self, df):
        """Featurize ``df[col_name]`` with the already fitted vectorizer.

        Returns the dense tf-idf array, or ``None`` when ``fit_transform`` has
        not been called yet (callers must check for ``None``).
        """
        if self.vectorizer is None:
            return None
        docs = self.__create_doc_list(df)
        return self.vectorizer.transform(docs).toarray()

    def __create_doc_list(self, df):
        """Return the text column as a plain list of ``str`` documents.

        Missing values (NaN/None) become empty strings and every other value
        is converted with ``str``. Without this, a single missing review --
        e.g. an empty text field read back from CSV as NaN -- would be handed
        to the vectorizer as a non-string document.
        """
        return df[self.col_name].fillna('').astype(str).tolist()

# Passed to TfidfFeaturizer as max_features.
n_tfidf_features = 500
tfidf_featurizer = TfidfFeaturizer(col_name='review_text', max_features=n_tfidf_features)

print('Fitting tf-idf featurizer.')
# NOTE(review): the featurizer is fitted on the full `df`, i.e. including the
# rows that were split off into df_val / df_test -- confirm this matches how
# the saved model's features were built before changing it.
# The returned matrix is not stored; it is only echoed as the cell output.
tfidf_featurizer.fit_transform(df)

Fitting tf-idf featurizer.


array([[ 0.     ,  0.     ,  0.     , ...,  0.14104,  0.     ,  0.     ],
       [ 0.     ,  0.     ,  0.     , ...,  0.     ,  0.     ,  0.     ],
       [ 0.     ,  0.     ,  0.     , ...,  0.     ,  0.     ,  0.     ],
       ..., 
       [ 0.     ,  0.     ,  0.     , ...,  0.     ,  0.     ,  0.     ],
       [ 0.     ,  0.     ,  0.     , ...,  0.     ,  0.     ,  0.     ],
       [ 0.     ,  0.     ,  0.     , ...,  0.     ,  0.     ,  0.     ]])

In [30]:
import pickle

# Persist the fitted featurizer object to vectorizer_v2.pk using pickle protocol 2.
# pickle stores class instances by reference to their class, so code that loads
# this file needs a TfidfFeaturizer class available under the same module path.
with open('vectorizer_v2.pk', 'wb') as vectorizer_file:
    pickle.dump(tfidf_featurizer, vectorizer_file, protocol=2)

In [11]:
import pandas as pd

# Load data_wei_kitchen_0.3.csv with its first column as the row index.
# NOTE(review): presumably the low-score ("bad") kitchen reviews -- confirm
# what the 0.3 suffix denotes.
bad_review = pd.read_csv('data_wei_kitchen_0.3.csv', index_col=0, sep=',')

# Featurize with the already fitted tf-idf featurizer (no re-fitting here).
tfidf_bad_review = tfidf_featurizer.transform(bad_review)

In [13]:
# Rebuild the network architecture from its JSON description. The context
# manager closes the file even if reading raises, which the previous manual
# open()/close() pair did not guarantee.
with open('model.json', 'r') as json_file:
    loaded_model_json = json_file.read()
model = model_from_json(loaded_model_json)
# load weights into new model
model.load_weights("model.h5")
print("Loaded model from disk")

Loaded model from disk


In [18]:
# Run the loaded model on the tf-idf features and keep its raw output per row
# in a new 'predict_prob' column.
predicted_prob = model.predict(tfidf_bad_review)
bad_review['predict_prob'] = predicted_prob

In [19]:
# `Sequential.predict_classes` has been removed from newer Keras/TensorFlow
# releases, so the labels are derived from `predict` directly, using the same
# rule `predict_classes` applied: argmax over the last axis for multi-unit
# outputs, a 0.5 threshold (as int32) for a single output unit.
class_scores = model.predict(tfidf_bad_review)
if class_scores.shape[-1] > 1:
    bad_review['predict_classes'] = class_scores.argmax(axis=-1)
else:
    bad_review['predict_classes'] = (class_scores > 0.5).astype('int32')



In [25]:
# Fraction of rows in bad_review whose predicted class is 0.
counter_0 = sum(1 for label in bad_review['predict_classes'] if label == 0)
print(counter_0 / len(bad_review))

0.7777777777777778


In [20]:
# Write the bad_review frame (row index included) to bad_review.csv.
bad_review.to_csv(path_or_buf='bad_review.csv', index=True)

In [22]:
# Load data_wei_kitchen_0.8.csv (first column = row index) and featurize it
# with the already fitted tf-idf featurizer.
good_review = pd.read_csv('data_wei_kitchen_0.8.csv', sep=',', index_col=0)
tfidf_good_review = tfidf_featurizer.transform(good_review)

# Run inference once and derive both columns from the same output.
# `Sequential.predict_classes` has been removed from newer Keras/TensorFlow
# releases; the labels below follow the rule it applied: argmax over the last
# axis for multi-unit outputs, a 0.5 threshold (as int32) for a single unit.
good_review_scores = model.predict(tfidf_good_review)
good_review['predict_prob'] = good_review_scores
if good_review_scores.shape[-1] > 1:
    good_review['predict_classes'] = good_review_scores.argmax(axis=-1)
else:
    good_review['predict_classes'] = (good_review_scores > 0.5).astype('int32')

# Persist the scored frame (row index included).
good_review.to_csv('good_review.csv')



In [26]:
# Fraction of rows in good_review whose predicted class is 0.
# NOTE(review): despite the `_1` suffix this counts class 0, the same label
# counted for bad_review -- confirm class 0 (not class 1) is intended here.
counter_1 = sum(1 for label in good_review['predict_classes'] if label == 0)
print(counter_1 / len(good_review))

0.56455042527339
