In [1]:
from __future__ import division, print_function, absolute_import
from sklearn.feature_extraction.text import *
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.utils import resample

import keras
from keras.datasets import mnist
from keras.layers import Dense, Flatten, LSTM, Dropout
from keras.layers import Conv2D, MaxPooling2D
from keras.models import Sequential
from keras.models import model_from_json
from nltk import word_tokenize

import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import random
import sklearn
import string

import nltk
nltk.download('punkt')

Using TensorFlow backend.


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\s4341237\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
def split_train_test(df, test_size):
    train, test = train_test_split(df, test_size=test_size, random_state=random.randint(0, 99))
    train.reset_index(drop=True, inplace=True)
    test.reset_index(drop=True, inplace=True)
    return train, test

dataset_path = 'new_data_william.csv' # Minimized dataset

df = pd.read_csv(dataset_path, sep=',', index_col=0)
print("Old data size:", len(df))
df_majority = df[df.good==0]
df_minority = df[df.good==1]
df_minority_upsampled = resample(df_minority, replace=True, 
        n_samples=len(df_majority), random_state=123)
df = pd.concat([df_majority, df_minority_upsampled])

del(df_majority)
del(df_minority)
del(df_minority_upsampled)

# Training: 60%, Validation: 20%, Testing: 20%
df_train, df_test = split_train_test(df, 0.200)
df_train, df_val = split_train_test(df_train, 0.250)
print("New data size:", len(df))

df_train.head()

Old data size: 98188
New data size: 146376


Unnamed: 0,reviewer_id,asin,overall,category,score,good,bad,review_text
0,A2IQYGB0H03MVU,B00021XIJW,4,10,0.990385,1,0,great way boost wireless range picked linksys ...
1,A31RNXQHD106YY,60516054,3,5,0.5,0,0,smug ethnocentrism book smug ethnocentrism sho...
2,AT7W1EXT0PF2Y,471272426,1,5,0.197531,0,1,skeptic dictionary warning review contains mas...
3,A1MH1V09NGMJ2P,B000BVCSMG,5,10,0.847222,1,0,power conditioning important snake oil first a...
4,A2ZMI5UAMCTQHA,B0013A1XDE,5,10,0.967213,1,0,compared note updated review note amateur thin...


In [7]:
class TfidfFeaturizer(object):
    def __init__(self, col_name='review_text', max_features=1000):
        self.col_name = col_name
        self.max_features = max_features
        self.vectorizer = None
        
    def fit_transform(self, df):
        docs = self.__create_doc_list(df)
        self.vectorizer = TfidfVectorizer(ngram_range=(0,3),
                                          analyzer='word',
                                          token_pattern='[a-zA-Z0-9]+', 
                                          tokenizer=word_tokenize,
                                          max_features=self.max_features)
        return (self.vectorizer.fit_transform(docs)).toarray()

    def transform(self, df):
        if self.vectorizer is not None:
            docs = self.__create_doc_list(df)
            return (self.vectorizer.transform(docs)).toarray()
        else:
            return None

    def __create_doc_list(self, df):
        return df[self.col_name].tolist()

n_tfidf_features = 500
tfidf_featurizer = TfidfFeaturizer(max_features=n_tfidf_features)

print('Fitting tf-idf featurizer.')
tfidf_featurizer.fit_transform(df)

#print('Featurizing training set.')
#tfidf_train = tfidf_featurizer.transform(df_train)
#n_train = tfidf_train.shape[0]
#print('Shape:', tfidf_train.shape)

#print('Featurizing validation set.')
#tfidf_val = tfidf_featurizer.transform(df_val)
#n_val = tfidf_val.shape[0]
#print('Shape:', tfidf_val.shape)

#print('Featurizing test set.')
#tfidf_test = tfidf_featurizer.transform(df_test)
#n_test = tfidf_test.shape[0]
#print('Shape:', tfidf_test.shape)

Fitting tf-idf featurizer.


array([[ 0.97183459,  0.        ,  0.        , ...,  0.        ,
         0.0385293 ,  0.        ],
       [ 0.93216633,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.95128994,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ..., 
       [ 0.97934358,  0.        ,  0.        , ...,  0.        ,
         0.01310616,  0.        ],
       [ 0.937278  ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.9526439 ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.06239837]])

In [None]:
matrix_train = tfidf_train
matrix_val   = tfidf_val
matrix_test  = tfidf_test

In [None]:
# Configure labels
target_class = 'good'

labels_train = np.array(df_train[target_class])
labels_val = np.array(df_val[target_class])
labels_test = np.array(df_test[target_class])

In [None]:
del(tfidf_train)
del(tfidf_val)
del(tfidf_test)
del(df)
del(df_train)
del(df_val)
del(df_test)

In [None]:
batch_size = 100
epochs = 30

model = Sequential()
model.add(Dense(500, input_dim=500, activation='relu'))
model.add(Dropout(0.25))
model.add(Dense(250, activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(100, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# Fit the model
model.fit(matrix_train, labels_train,
          epochs=epochs,
          batch_size=batch_size,
          verbose=2,
          validation_data=(matrix_test, labels_test))

In [8]:
import pickle
with open('vectorizer_v3.pk', 'wb') as fin:
    pickle.dump(tfidf_featurizer, fin, protocol=2)

In [10]:
import pandas as pd

bad_review = pd.read_csv('Dataset/data_wei_kitchen_0.3.csv', sep=',', index_col=0)
tfidf_bad_review = tfidf_featurizer.transform(bad_review)

In [11]:
json_file = open('model_v3.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
model = model_from_json(loaded_model_json)
# load weights into new model
model.load_weights("model_v3.h5")
print("Loaded model from disk")

Loaded model from disk


In [12]:
bad_review['predict_prob'] = model.predict(tfidf_bad_review)

In [13]:
bad_review['predict_classes'] = model.predict_classes(tfidf_bad_review)



In [14]:
counter_0 = 0
for i in bad_review['predict_classes']:
    if i == 0:
        counter_0 = counter_0 + 1
print(counter_0/len(bad_review))        

0.8590785907859079


In [15]:
bad_review.to_csv('bad_review.csv')

In [17]:
good_review = pd.read_csv('Dataset/data_wei_kitchen_0.8.csv', sep=',', index_col=0)
tfidf_good_review = tfidf_featurizer.transform(good_review)
good_review['predict_prob'] = model.predict(tfidf_good_review)
good_review['predict_classes'] = model.predict_classes(tfidf_good_review)
good_review.to_csv('good_review.csv')



In [18]:
counter_1 = 0
for i in good_review['predict_classes']:
    if i == 1:
        counter_1 = counter_1 + 1
print(counter_1/len(good_review))

0.7135479951397327
