In [485]:
import numpy as np
import pandas as pd
import json
import string
import re
import nltk
import math
from sklearn.linear_model import LinearRegression
from nltk.corpus import stopwords
STOPS = set(stopwords.words('english'))
from nltk.stem.porter import PorterStemmer
STEMMER = PorterStemmer()


# Read in businesses from json file

In [382]:
%%time
#150,000 businesses
fields_wanted = ['business_id','name','stars','review_count']
review_threshold = 50 #amount of reviews a business needs in order to be considered
read_line_limit = np.inf
bus = pd.DataFrame(columns=fields_wanted)
with open('../data/yelp_academic_dataset_business.json',encoding='utf-8') as d:
    #print('num businesses: ' + str(len(d.readlines())))
    counter = 0
    for line in d:
        L = json.loads(line)
        if L['review_count'] < review_threshold:
            continue
        less_fields = {key: L[key] for key in fields_wanted }
        bus.loc[counter] = less_fields
        counter += 1
        if counter == read_line_limit:
            break

CPU times: total: 1min 39s
Wall time: 1min 43s


# Read in reviews from json file
> - 6.9 million total reviews
> -Review fields
> - review_id (string), user_id (string), business_id (string), stars (float), useful (int - how many other users marked it as useful), funny (int), cool (int), text (string), date (ex. '2018-07-07 22:09:11')


In [585]:
%%time
review_fields_wanted = ['review_id','business_id','text','stars']
review_limit = 6900
rev = pd.DataFrame(columns=review_fields_wanted)
with open('../data/yelp_academic_dataset_review.json',encoding='utf-8') as d:
    counter = 0
    for line in d:
        L = json.loads(line)
        less_fields = {key: L[key] for key in review_fields_wanted }
        rev.loc[counter] = less_fields
        counter += 1
        if counter == review_limit:
            break
            
rev = rev.rename(columns = {'text':'_text','stars':'_stars'})


CPU times: total: 23.2 s
Wall time: 24.3 s


# Pre-process review text data
> - remove punctuation, tokenize words into list, 
> - remove stop words (ex. is, am, to, the, etc.) 
> - stem words using porter stemmer

In [586]:
%%time
edit_limit = review_limit
def preprocess(text_str):
    #remove punctuation, make lowercase, and tokenize into a list of words
    out_L = re.sub(r'[^\w\s]', '', text_str).lower().split()
    #remove stop words and stem the words
    out_L = [STEMMER.stem(x1) for x1 in out_L if x1 not in STOPS]
    return out_L

CPU times: total: 0 ns
Wall time: 1.01 ms


In [587]:
%%time
original_text = rev['_text'].copy()
Xtext = np.array(rev['_text'].map(preprocess))
Ystars = np.array(rev['_stars'])
#above is 46.4 MB of memory for 69,000 reviews

CPU times: total: 11.8 s
Wall time: 12.1 s


# Naive bayesian classifier
> - Positive review is >= 4 stars
> - Negative review is <= 3 stars
> - Uses a naive bayesian statistical model to classify positive and negative reviews

In [639]:
%%time
#count occurrences of words in pos/neg reviews
#word_L_ser is a series whose elements are lists of words
#label_ser is a series of the same length as word_L_ser whose elements
#are True if the corresponding word_L_ser entry is labeled positive, and False if negative
#pos_freq is a dictionary whose keys are words and values are occurrences in positive reviews
#neg_freq is the same as pos_freq but for negative reviews
def pos_neg_count_L(word_L, label,pos_freq ,neg_freq ):
    toadd = None
    other = None
    if label:
        toadd = pos_freq
        other = neg_freq
    else:
        toadd = neg_freq
        other = pos_freq
    for word in word_L:
        if word in toadd:
            toadd[word] += 1
        else:
            toadd[word] = 1
        if word not in other:
            other[word] = 0  
                
#Calculate probability of each word occurring in pos/neg review
#Perform Laplacian (add 1) smoothing to the probabilities
def count_to_prob(pos_freq,neg_freq):
    pos_sum = sum(pos_freq.values())
    pos_len = len(pos_freq)
    pos_probs = {x:math.log((pos_freq[x]+1)/(pos_sum+pos_len)) for x in pos_freq.keys()}

    neg_sum = sum(neg_freq.values())
    neg_len = len(neg_freq)
    neg_probs = {x : math.log((neg_freq[x]+1)/(neg_sum+neg_len)) for x in neg_freq.keys()}
    
    return pos_probs, neg_probs


def calc_log_prior(Y):
    pos_rev = np.sum(Y_train)
    neg_rev = Y_train.shape[0] - pos_rev
    return math.log(pos_rev/neg_rev)

def predict_naive_bayes(X,pos_p_in,neg_p_in, log_prior):
    pred = np.ones(X.shape[0])*log_prior
    for i in range(X.shape[0]):
        for word in X[i]:
            pred[i] += pos_p_in.get(word,0) - neg_p_in.get(word,0)
    return (pred >= 0).astype(int)

def score_naive_bayes(H,Y):
    acc = np.sum(H == Y)/Y.shape[0]
    true_pos = np.sum(H == Y)
    false_neg = np.sum(np.logical_and(H == 0, Y == 1))
    false_pos = np.sum(np.logical_and(H == 1, Y == 0))
    prec = true_pos/(true_pos + false_pos)
    recall = true_pos/(true_pos + false_neg)
    return (acc,prec,recall)

CPU times: total: 0 ns
Wall time: 942 µs


# Naive bayesian classifier results
> - Train accuracy is 93%
> - Test accuracy is 86%
> - This was using the first 6900 reviews

In [640]:
%%time
test_size = 0.2
end_train = math.ceil((1-test_size)*Xtext.shape[0])
X_train = Xtext[:end_train]
Y_train = (Ystars[:end_train] >= 4).astype(int)
X_test = Xtext[end_train:]
Y_test = (Ystars[end_train:] >= 4).astype(int)

pos_count2 = dict()
neg_count2 = dict()
add_count = np.vectorize(lambda x,y: pos_neg_count_L(x,y,pos_count2,neg_count2))
add_count(X_train,Y_train)

pos_rev_p, neg_rev_p = count_to_prob(pos_count,neg_count)

H_train = predict_naive_bayes(X_train,pos_rev_p,neg_rev_p, calc_log_prior(Y_train))
H_test = predict_naive_bayes(X_test,pos_rev_p,neg_rev_p, calc_log_prior(Y_train))


print('train: ' , list(zip(['accuracy','precision','recall'],score_naive_bayes(H_train,Y_train))))
print('test: ' , list(zip(['accuracy','precision','recall'],score_naive_bayes(H_test,Y_test))))

train:  [('accuracy', 0.928623188405797), ('precision', 0.9496109670248241), ('recall', 0.9767530487804879)]
test:  [('accuracy', 0.8623188405797102), ('precision', 0.9008327024981075), ('recall', 0.9527622097678142)]
CPU times: total: 516 ms
Wall time: 506 ms
