In [2]:
import csv
import json
import io
import re
import array
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
import random
from pprint import pprint
import collections
from scipy.stats import norm
import gzip
import math
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [3]:
#####################
# 01_Helper functions
#####################

In [4]:
def ci_lower_bound(pos, n, confidence):
    """Lower bound of the Wilson score interval for a binomial proportion.

    pos        -- number of positive outcomes
    n          -- total number of outcomes
    confidence -- two-sided confidence level, e.g. 0.95

    Returns 0 when n is 0, otherwise the lower bound as a float.
    """
    if n == 0:
        return 0

    # Two-sided critical value of the standard normal distribution.
    z = norm.ppf(1-(1-confidence)/2)
    z_squared = z*z
    phat = 1.0*pos/n  # observed proportion; the 1.0* forces float division

    centre = phat + z_squared/(2*n)
    margin = z * math.sqrt((phat*(1-phat)+z_squared/(4*n))/n)
    return (centre - margin)/(1+z_squared/n)

In [5]:
def parse(path):
    """Lazily yield one evaluated record per line of a gzip-compressed file.

    path -- location of the gzip file; every line must contain a single
            Python expression (presumably a dict literal per review --
            confirm against the data set).

    The file is opened in a `with` block so the handle is released once the
    generator is exhausted, closed, or garbage-collected, including when the
    consumer stops iterating early.
    """
    with gzip.open(path, 'r') as g:
        for l in g:
            # SECURITY: eval() executes whatever code the line contains.
            # Only feed this function trusted data dumps. If the file is
            # strict JSON, json.loads(l) is the safe replacement; for plain
            # Python literals use ast.literal_eval(l).
            yield eval(l)

In [6]:
# Returns a sorted list of the unique, lower-cased words found in the training data set
def generate_corpus(train_data_list):
    """Collect the vocabulary of a list of text documents.

    train_data_list -- iterable of strings

    Every non-word character is treated as a separator and words are
    lower-cased before being collected. Returns a list of the distinct words
    in sorted order; an empty input gives an empty list.
    """
    all_words = set()
    for document in train_data_list:
        # Replace every non-word character by a space, then split on whitespace.
        for word in re.sub(r"[^\w]", " ", document).split():
            all_words.add(word.lower())

    # Sorted so the vocabulary order does not depend on set iteration order,
    # which can differ between interpreter runs.
    return sorted(all_words)

In [7]:
# For a string passed in, returns an ordered list of frequencies associated with each word in the corpus
def return_dict_of_sentence(string_passed, corpus):
    """Count how often each corpus word occurs in `string_passed`.

    string_passed -- text to analyse
    corpus        -- iterable of lower-case vocabulary words (a dict keyed by
                     word, or a list of words); it is only read, never modified

    Returns a new list with one integer per corpus word, in the iteration
    order of `corpus`. Matching is case-insensitive: the text is split on
    non-word characters and every token is lower-cased before counting.
    Words that are not in the corpus are ignored.
    """
    tokens = re.sub(r"[^\w]", " ", string_passed).split()
    # Count into a fresh Counter rather than into `corpus` itself, so counts
    # from one sentence never leak into the next call.
    counts = collections.Counter(token.lower() for token in tokens)
    # A Counter yields 0 for words that never occurred in the sentence.
    return [counts[word] for word in corpus]

In [9]:
#####################
# 02_Main function
#####################

reviews = []
number_of_data_points_to_read = 15000 #Select number of data points to read
path= 'reviews_Health_and_Personal_Care_5.json.gz'

# Read at most number_of_data_points_to_read records from the file.
# The limit is checked against the number of records already collected, so
# exactly that many are kept (the earlier 1-based counter stopped one short).
for line in parse(path):
    if len(reviews) >= number_of_data_points_to_read:
        break
    reviews.append(line)
print("done with file")


done with file


In [10]:
# helpful_dict: review id -> the review's 'helpful' entry
# review_dict:  review id -> the review's text
helpful_dict = collections.defaultdict(list)
review_dict = collections.defaultdict(str)

# Keep only reviews whose second 'helpful' value is non-zero (presumably the
# total number of helpfulness votes -- confirm against the data set).
voted_reviews = [review for review in reviews if review['helpful'][1] != 0]

# Ids are consecutive and start at 1. The loop variable is deliberately not
# called `id`, which would shadow the builtin for the rest of the session.
for review_id, review in enumerate(voted_reviews, 1):
    helpful_dict[review_id] = review['helpful']
    review_dict[review_id] = review['reviewText']

# Number of reviews kept (previously the next unused id, i.e. count + 1).
print(len(review_dict))
print("Read-in done")

7247
Read-in done


In [11]:
##### Split into testing and training sets #####
split_seed = 42
random.seed(split_seed)

# Build both lists from the same sorted key sequence so that review text k and
# target k always belong to the same review, independent of dict ordering.
review_ids = sorted(review_dict)
review_texts = [review_dict[key] for key in review_ids]
# Target: Wilson lower bound (95% confidence) computed from each 'helpful' pair.
helpfulness_scores = [ci_lower_bound(helpful_dict[key][0], helpful_dict[key][1], 0.95) for key in review_ids]

# train_test_split does not use Python's `random` module, so the seed has to
# be passed as random_state to make the 80/20 split reproducible.
train_set_x, test_set_x, train_set_y, test_set_y = train_test_split(
    review_texts,
    helpfulness_scores,
    test_size=0.2,
    random_state=split_seed,
    shuffle=True,
)

print("Split into sets")

Split into sets


In [12]:
##### Create corpus, convert reviews to feature vectors #####

corpus_cache_path = 'hpc_corpus.csv'

# Load the vocabulary from the cache file (one word per row) when it exists;
# otherwise build it from the training reviews and write the cache.
# Delete the cache file to force regeneration, e.g. after the training data changed.
try:
    with io.open(corpus_cache_path, 'rb') as csvfile:
        # `if row` skips blank rows, which would otherwise raise IndexError.
        corpus_words = [row[0] for row in csv.reader(csvfile) if row]
except IOError:
    corpus_words = generate_corpus(train_set_x)
    with io.open(corpus_cache_path, 'wb') as myfile:
        wr = csv.writer(myfile)
        for word in corpus_words:
            wr.writerow([word])

# Vocabulary as a dictionary: word -> 0
corpus = {word: 0 for word in corpus_words}
print("Corpus generated")

Corpus generated


In [None]:
### Build one feature vector per training review
# return_dict_of_sentence receives a review text together with the corpus
# dictionary and hands back the word counts for that review.
features = [return_dict_of_sentence(review_text, corpus) for review_text in train_set_x]

# Both numbers should match: one feature vector per training target.
print(len(features))
print(len(train_set_y))

In [None]:
### Run MLP Regression on training set

# Fixed random_state so weight initialisation and shuffling are reproducible.
clf = MLPRegressor(random_state=42)
clf.fit(X = features, y = train_set_y)
score = clf.score(X = features, y = train_set_y)  # coefficient of determination (R^2)
train_predict = clf.predict(features)

# Report regression quality on the training set
print("TRAIN DATA SET")
print("R^2 score")
print(score)
print("Average predicted score")
print(np.mean(train_predict))
print("Actual score range")
# Targets are floats, so format them instead of concatenating with "-".
print("{} - {}".format(min(train_set_y), max(train_set_y)))
print("Training error")
meanSquaredError = mean_squared_error(train_set_y, train_predict)
print("MSE: {}".format(meanSquaredError))
rootMeanSquaredError = math.sqrt(meanSquaredError)
print("RMSE: {}".format(rootMeanSquaredError))

In [None]:
###################
# 03_Running the test data set through feature vector calculated above
###################

### Build one feature vector per test review
# return_dict_of_sentence receives a review text together with the corpus
# dictionary and hands back the word counts for that review.
test_features = [return_dict_of_sentence(review_text, corpus) for review_text in test_set_x]

predictions = clf.predict(test_features)

# Report regression quality on the held-out test set
print("TEST DATA SET")
print("Average predicted score")
print(np.mean(predictions))
print("Testing error")
meanSquaredError = mean_squared_error(test_set_y, predictions)
print("MSE: {}".format(meanSquaredError))
rootMeanSquaredError = math.sqrt(meanSquaredError)
print("RMSE: {}".format(rootMeanSquaredError))

In [None]:
###################
# 04_Predict usefulness score of new review
###################

# Review texts to score; add more strings to score several at once.
input_set_x = ["It was great!"]

### Build one feature vector per input review
# return_dict_of_sentence receives a review text together with the corpus
# dictionary and hands back the word counts for that review.
input_features = [return_dict_of_sentence(review_text, corpus) for review_text in input_set_x]

input_prediction = clf.predict(input_features)

print(input_set_x)
print(input_prediction)
