In [45]:
import pickle
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

In [2]:
from sklearn.externals import joblib

In [3]:
!ls -lrth models/

total 131M
-rw-rw-r-- 1 ubuntu ubuntu  19M May  7 01:05 Vectorizer.sav
-rw-rw-r-- 1 ubuntu ubuntu 2.6M May  7 01:41 Logistic-Regression.model
-rw-rw-r-- 1 ubuntu ubuntu  13M May  7 02:06 Neural_network.model
-rw-rw-r-- 1 ubuntu ubuntu  12M May  7 02:08 BernoulliNB.model
-rw-rw-r-- 1 ubuntu ubuntu  19M May  7 02:51 61vectorizer-2GB-RSE.sav
-rw-rw-r-- 1 ubuntu ubuntu  19M May  7 02:57 79vectorizer-2GB-RSE.sav
-rw-rw-r-- 1 ubuntu ubuntu  19M May  7 02:57 15vectorizer-2GB-RSE.sav
-rw-rw-r-- 1 ubuntu ubuntu  19M May  7 03:07 58vectorizer-2GB-RSE.sav
-rw-rw-r-- 1 ubuntu ubuntu  13M May  7 03:24 v.dat


In [104]:
test_file = 'data/test/10.json'
summaries_file = 'data/test/summaries.json'
model_lr = 'models/Logistic-Regression.model'
model_nn = 'models/Neural_network.model'
model_BNB = 'models/BernoulliNB.model'
vectorizer_file = 'models/Vectorizer.sav'

In [39]:
vectorizer =  pickle.load(open(vectorizer_file, 'rb'))

In [40]:
ds_test = pd.read_csv("/home/ubuntu/temp/ds_rse_test.csv", sep="\t")

In [41]:
ds_test['Y'] = ds_test['stars'] > 3 
X_test = ds_test['text']
y_test = ds_test['Y'].astype(int)

In [42]:
X_test_vec = vectorizer.transform(X_test)

In [81]:
clf = pickle.load(open(model_lr, 'rb'))

In [218]:
y_predict = clf.predict(X_test_vec)
accuracy_score(y_test[:], y_predict)

0.8907642456436801

In [219]:
def summary_lookup(for_rating):
    with open(summaries_file,'r') as json_file:    
        d = json.load(json_file)
    if for_rating > 0.7:
        return d['summaries'][0]['Rank_1']
    elif for_rating > 0.5:
        return d['summaries'][1]['Rank_2']
    elif for_rating > 0.3:
        return d['summaries'][2]['Rank_3']
    else:
        return d['summaries'][3]['Rank_4']
    

In [220]:
summary_lookup(0.6)

u'The food is pretty good. Few people have enjoyed it while some have not. The taste and presentation of food is good but it is a little pricey. Few times the orders brought in were wrong but it is worth trying. Some dishes were amazing and some were bland. Neat concept of food .Worth trying it for your own personal experience.'

In [225]:
class Restaurant():
    
    def __init__(self, business_id="", name="", reviews=[]):
        # Each rocket has an (x,y) position.
        self.bid = business_id
        self.name = name
        self.reviews = reviews
        self.pos = 0.0
        self.neg = 0.0
        self.predict()
        self.get_overall_rating()
        self.summary = summary_lookup(self.overall)
       
    def predict(self):
        for r in self.reviews:
            if clf.predict(vectorizer.transform([str(r)])):
                self.pos += 1.0
            else:
                self.neg += 1.0
        
    def get_overall_rating(self):
        self.overall = self.pos/(self.pos + self.neg)
        
    def print_summary(self):
        print("-------------------------")
        print("Business ID", self.bid)
        print("Over All Rating %f", self.overall)
        print("Summary", self.summary)
        print("-------------------------\n\n")
        

## Summary Extraction 

<h5> Algorithm </h5>

  
<pre>
class Resturant:
  name 
  id 
  reviews = []
  pos  = 0
  neg  = 0
  overall_rating = pos / (pos + neg)
  summary = lookup(overall_rating)
  
1. For Each Restaurant in Test Json
    2. For each Reviews:
       3. v = vectorizer.transform(review[i])
       4. if clf.predict(v): 
            pos += 1
          else:
            neg += 1
    6. overall_rating = pos/(pos+neg) 
    7. res.summary = lookup(overall_rating)
</pre>

In [226]:
res = []
for restaurant in data["yelp"]:
    res.append(Restaurant(business_id=restaurant["_id"], reviews=restaurant["reviews"]))


In [227]:
for r in res:
    r.print_summary()

-------------------------
('Business ID', u'--6MefnULPED_I942VcFNA')
('Over All Rating %f', 0.5945945945945946)
('Summary', u'The food is pretty good. Few people have enjoyed it while some have not. The taste and presentation of food is good but it is a little pricey. Few times the orders brought in were wrong but it is worth trying. Some dishes were amazing and some were bland. Neat concept of food .Worth trying it for your own personal experience.')
-------------------------


-------------------------
('Business ID', u'--7zmmkVg-IMGaXbuVd0SQ')
('Over All Rating %f', 0.8297872340425532)
('Summary', u'The food is amazing and the taste lives up to the price. Definitely recommended coming here for any day of the week. Great place to go for breakfast, lunch or dinner. Even the simplest looking dishes tastes very delicious. The food portion is perfect and scrumptious. Overall very delightful and highly recommended.')
-------------------------


-------------------------
('Business ID', u'