In [1]:
import json
import operator

In [8]:
# Extract restaurant business IDs (and their star ratings) from business.json
def filter_restaurants():
    """Write the ID and star rating of every restaurant to ``restaurants.json``.

    Reads ``./business.json`` as JSON Lines (one business object per line) and,
    for every business whose ``categories`` value contains ``'Restaurants'``,
    writes one JSON line ``{"business_id": ..., "stars": ...}`` to
    ``restaurants.json`` in the current working directory.

    Blank lines and businesses with a missing/empty ``categories`` field are
    skipped instead of aborting the whole run.

    Raises:
        FileNotFoundError: if ``./business.json`` does not exist.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a restaurant record lacks ``business_id`` or ``stars``.
    """
    print("Filtering Restaurants..")
    # Both handles live in one `with` so the output file is flushed and closed
    # even when a malformed input line raises part-way through.
    with open('./business.json') as businesses, \
            open('restaurants.json', 'w') as restaurants:
        for line in businesses:
            if not line.strip():
                continue  # tolerate blank / trailing lines
            business_data = json.loads(line)
            # `categories` may be absent or null for some businesses.
            categories = business_data.get('categories')
            if categories and 'Restaurants' in categories:
                restaurants.write(json.dumps(
                    {'business_id': business_data['business_id'],
                     'stars': business_data['stars']}))
                restaurants.write('\n')

In [9]:
#Sort the reviews by business ID 
def sortData(filename):
    """Rewrite the JSON Lines file ``filename`` ordered by ``business_id``.

    Every line is parsed as a JSON object, the objects are sorted (stably) on
    their ``business_id`` value, and the file is overwritten with the sorted
    objects, one JSON document per line.
    """
    print("Sorting Reviews by Restaurant ID..")

    # Load the whole file; it is overwritten below, so nothing can be streamed.
    with open(filename) as source:
        records = [json.loads(line) for line in source]

    ordered = sorted(records, key=lambda record: record['business_id'])

    with open(filename, 'w') as sink:
        for record in ordered:
            sink.write(json.dumps(record) + '\n')

In [10]:
# Create new dataset of 15000 reviews
# as program crashes if there are too many
def extract_json(max_reviews=15000):
    """Build ``labeled_reviews.json`` from the restaurant reviews in ``review.json``.

    Steps:
      1. ``filter_restaurants()`` regenerates ``restaurants.json``.
      2. ``./review.json`` is streamed line by line; reviews whose
         ``business_id`` belongs to a restaurant are written to
         ``labeled_reviews.json`` as ``{"business_id": ..., "text": ...}`` with
         newlines/carriage returns removed from the text.
      3. ``sortData`` orders the resulting file by ``business_id``.

    Args:
        max_reviews: Maximum number of reviews to keep (default 15000, the
            original hard-coded cap). ``None`` disables the cap.
    """
    filter_restaurants()

    print("Extracting reviews based on Restaurant ID..")

    # A set gives O(1) membership tests; the previous list made every review
    # lookup a linear scan over all restaurants.
    with open('./restaurants.json') as rest:
        restaurant_ids = {
            json.loads(line)['business_id'] for line in rest if line.strip()
        }

    review_count = 0
    # Both files share one `with` so the output is closed even if parsing fails.
    with open('./review.json') as reviews, \
            open('labeled_reviews.json', 'w') as training_set:
        for line in reviews:
            if max_reviews is not None and review_count >= max_reviews:
                break
            if not line.strip():
                continue
            review = json.loads(line)
            if review['business_id'] not in restaurant_ids:
                continue

            # Flatten the text to a single line so each review stays on one
            # JSON line and tokenises cleanly later.
            text = review['text'].replace('\n', ' ').replace('\r', '').strip()
            training_set.write(json.dumps(
                {'business_id': review['business_id'], 'text': text}))
            training_set.write('\n')
            review_count += 1

    sortData('./labeled_reviews.json')

In [11]:
# Run the extraction pipeline: filter restaurants, write labeled_reviews.json, sort it.
extract_json()

Filtering Restaurants..
Extracting reviews based on Restaurant ID..
Sorting Reviews by Restaurant ID..


In [12]:
# Label each review as positive, negative or neutral based on the
# positive/negative word lists in positiveWordset.txt and negativeWordset.txt

from sklearn.feature_extraction.text import CountVectorizer

def labelReviews(filename):
    """Add a ``label`` field to every review in the JSON Lines file ``filename``.

    Each review's ``text`` is tokenised with scikit-learn's ``CountVectorizer``
    analyzer (lower-cased, English stop words removed). The label is derived
    from how many entries of ``./positiveWordset.txt`` and
    ``./negativeWordset.txt`` occur among the review's distinct tokens:

      * positive - negative >  1  -> ``"Positive"``
      * positive - negative < -1  -> ``"Negative"``
      * otherwise                 -> ``"Neutral"``

    The file is rewritten in place with the labelled reviews.

    Args:
        filename: Path to a JSON Lines file whose objects have a ``text`` key.
    """
    print("Labeling reviews as Positive/Negative/Neutral..")

    def read_wordset(path):
        # strip() also drops '\r' from CRLF files; blank lines are ignored.
        with open(path) as wordset:
            return [word for word in (line.strip() for line in wordset) if word]

    positiveWords = read_wordset('./positiveWordset.txt')
    negativeWords = read_wordset('./negativeWordset.txt')

    with open(filename) as f:
        data = [json.loads(line) for line in f if line.strip()]

    # Build the analyzer once instead of fitting a new vectorizer per review.
    # This avoids the get_feature_names() call removed in scikit-learn 1.2 and
    # the "empty vocabulary" ValueError that fit_transform raises for a review
    # consisting only of stop words (such a review is now labelled Neutral).
    analyze = CountVectorizer(stop_words="english").build_analyzer()

    for review in data:
        # Distinct tokens, matching the vocabulary the per-review fit produced;
        # a set makes each membership test O(1).
        tokens = set(analyze(review["text"]))
        posAggregate = sum(word in tokens for word in positiveWords)
        negAggregate = sum(word in tokens for word in negativeWords)
        score = posAggregate - negAggregate

        if score < -1:
            review["label"] = "Negative"
        elif score > 1:
            review["label"] = "Positive"
        else:
            review["label"] = "Neutral"

    with open(filename, 'w') as f:
        for review in data:
            f.write(json.dumps(review))
            f.write("\n")

# Label the extracted reviews; the file is rewritten in place with a "label" field.
labelReviews('./labeled_reviews.json')

Labeling reviews as Positive/Negative/Neutral..


In [13]:
# Main Analysis Program Begins

def load_data():
    """Read ``./labeled_reviews.json`` into three parallel lists.

    Returns:
        A tuple ``(texts, labels, business_ids)`` where position ``i`` of each
        list comes from the ``text``, ``label`` and ``business_id`` keys of the
        ``i``-th JSON line in the file.
    """
    texts, labels, business_ids = [], [], []

    with open("./labeled_reviews.json") as handle:
        for raw_line in handle:
            record = json.loads(raw_line)
            texts.append(record['text'])
            labels.append(record['label'])
            business_ids.append(record['business_id'])

    return texts, labels, business_ids


# Parallel lists of review texts, labels and business IDs read from labeled_reviews.json.
data, data_labels, data_id = load_data()

In [14]:
def transform_to_features(data):
    """Turn an iterable of documents into a dense bag-of-words count matrix.

    A word-level ``CountVectorizer`` (case preserved, i.e. ``lowercase=False``)
    is fitted on ``data`` and the resulting matrix is returned via
    ``toarray()``, one row per document.
    """
    from sklearn.feature_extraction.text import CountVectorizer

    bag_of_words = CountVectorizer(lowercase=False, analyzer='word')
    document_term_matrix = bag_of_words.fit_transform(data)
    # Densify because the training code downstream expects a plain array.
    return document_term_matrix.toarray()

# Dense count matrix (one row per loaded review) used as model input below.
features_nd = transform_to_features(data)

In [30]:
# TRAINING AND PREDICTION MODEL

def train_then_build_model(data_labels, features_nd, data, data_id):
    """Train a logistic-regression sentiment model and report its results.

    The samples are split 80/20 (``random_state=1234``), a
    ``LogisticRegression`` is fitted on the training part, and the predictions
    for the held-out part are aggregated per restaurant: each restaurant's
    overall sentiment is the most frequently predicted label among its test
    reviews (ties resolve in the order positive, neutral, negative).

    An HTML table with up to 15 restaurants (predicted sentiment next to the
    star rating from ``./restaurants.json``) is displayed, followed by
    accuracy, macro-averaged precision/recall, the F1 score derived from those
    two averages, and 3-fold cross-validation accuracy on the training split.

    Args:
        data_labels: Sentiment label for each sample.
        features_nd: Feature matrix with one row per sample.
        data: Unused; kept so existing callers keep working.
        data_id: Business ID for each sample, parallel to ``features_nd``.

    Returns:
        None. Results are displayed/printed.
    """
    import numpy as np
    from IPython.display import HTML, display
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import accuracy_score, confusion_matrix
    from sklearn.model_selection import cross_val_score, train_test_split

    # Split the row indices together with X and y so every test row can be
    # traced back to its business ID directly. The previous lookup searched the
    # whole dense matrix (as Python lists) for an equal row, which was
    # quadratic, memory hungry, and picked the wrong review whenever two
    # reviews had identical feature vectors. The X/y partition is unchanged.
    row_indices = np.arange(len(data_id))
    X_train, X_test, y_train, y_test, _, test_indices = train_test_split(
        features_nd,
        data_labels,
        row_indices,
        train_size=0.80,
        random_state=1234)

    log_model = LogisticRegression()
    log_model = log_model.fit(X=X_train, y=y_train)
    y_pred = log_model.predict(X_test)

    # Per-restaurant tally of predicted sentiments on the test split.
    label_to_key = {'Positive': 'positive', 'Negative': 'negative'}
    restReview = {}
    for row_index, predicted in zip(test_indices, y_pred):
        business_id = data_id[row_index]
        counts = restReview.setdefault(
            business_id, {'positive': 0, 'neutral': 0, 'negative': 0})
        # Anything that is not Positive/Negative counts as neutral.
        counts[label_to_key.get(predicted, 'neutral')] += 1

    # max() returns the first maximal key in insertion order, which matches the
    # tie-breaking of the former stable descending sort.
    overallReview = {
        business_id: max(counts, key=counts.get)
        for business_id, counts in restReview.items()
    }

    yelpReview = {}
    with open('./restaurants.json') as rest:
        for line in rest:
            if not line.strip():
                continue
            yelp_r = json.loads(line)
            yelpReview[yelp_r['business_id']] = yelp_r['stars']

    # Local table rows; deliberately not reusing the `data` parameter name.
    table_rows = [['Business ID', "Prediction", "Actual Rating"]]
    for business_id in list(overallReview)[:15]:
        table_rows.append(
            [business_id, overallReview[business_id], yelpReview[business_id]])

    display(HTML(
        '<table><tr>{}</tr></table>'.format(
            '</tr><tr>'.join(
                '<td>{}</td>'.format(
                    '</td><td>'.join(str(cell) for cell in row))
                for row in table_rows)
        )
    ))

    accuracy = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    diagonal = np.diag(cm).astype(float)
    actual_totals = cm.sum(axis=1)
    predicted_totals = cm.sum(axis=0)
    # A class that never occurs (or is never predicted) in the test split has a
    # zero denominator; score it 0 instead of letting NaN poison the averages.
    per_class_recall = np.divide(
        diagonal, actual_totals,
        out=np.zeros_like(diagonal), where=actual_totals != 0)
    per_class_precision = np.divide(
        diagonal, predicted_totals,
        out=np.zeros_like(diagonal), where=predicted_totals != 0)
    recall = np.mean(per_class_recall)
    precision = np.mean(per_class_precision)
    if precision + recall:
        f1score = (2 * precision * recall) / (precision + recall)
    else:
        f1score = 0.0
    print("Accuracy = {}".format(accuracy))
    print("Precision = {}".format(precision))
    print("Recall = {}".format(recall))
    print("F1-score = {}".format(f1score))
    print("Cross Validation Score = {}\n".format(cross_val_score(
        log_model, X_train, y_train, cv=3, scoring="accuracy")))

    
# Train the classifier and show the per-restaurant table plus evaluation metrics.
train_then_build_model(data_labels, features_nd, data, data_id)



0,1,2
Business ID,Prediction,Actual Rating
Xg5qEQiB-7L6kGJ5F4K3bQ,positive,5.0
yGMCl0vYigshkXiZFIDTNw,negative,3.0
oFHvr1cAktvU-bQgrl4aPw,positive,4.0
k2b3niokS_tosjah_rzCPw,positive,3.5
r48H_sNUGmcRGX1LsEc2mg,positive,3.0
75RP4HSsSJOe_e7e2e3jQQ,neutral,4.0
f_eiOrEcMnkHB7GvQVOHkQ,positive,4.0
YRyYbOSwvHkZsZOLv98oQg,positive,4.0
Cdk3wRR7TwJb1JW7agPJXw,neutral,4.0


Accuracy = 0.8176666666666667
Precision = 0.7067339474662129
Recall = 0.623658457978303
F1-score = 0.6626024052361132




Cross Validation Score = [0.8067983 0.80225   0.8032008]

