In [68]:
# Importing packages
import argparse
import json
import os
import pickle as pk
import pprint
import re
import sys
import urllib
from collections import defaultdict
from os import getcwd, chdir

import requests
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC


# This client code can run on Python 2.x or 3.x.  Your imports can be
# simpler if you only need one of those.
try:
    # For Python 3.0 and later
    from urllib.error import HTTPError
    from urllib.parse import quote
    from urllib.parse import urlencode
except ImportError:
    # Fall back to Python 2's urllib2 and urllib
    from urllib2 import HTTPError
    from urllib import quote
    from urllib import urlencode

In [69]:
# OAuth credentials for the Yelp API.
# You can find them on
# https://www.yelp.com/developers/v3/manage_app
#
# The YELP_CLIENT_ID / YELP_CLIENT_SECRET environment variables take
# precedence, so the notebook can run without editing this cell.
# NOTE(review): the literal fallbacks below are real-looking credentials
# committed to the notebook; they are kept only so existing runs keep working.
# Rotate them and remove the fallbacks once the environment variables are set.
CLIENT_ID = os.environ.get("YELP_CLIENT_ID", "CWrjzJHEP9wnCstt8u7CJg")
CLIENT_SECRET = os.environ.get(
    "YELP_CLIENT_SECRET",
    "zdFTrdoQvZ4FnJ6nmunRR947PXkYKPSIQJOxNRfOklmkbPOxwx8xNLIo4vsdkQc7",
)

# API constants, you shouldn't have to change these.
API_HOST = 'https://api.yelp.com'
SEARCH_PATH = '/v3/businesses/search'
BUSINESS_PATH = '/v3/businesses/'  # Business ID will come after slash.
REVIEWS_PATH = '/reviews'
TOKEN_PATH = '/oauth2/token'
GRANT_TYPE = 'client_credentials'


# Defaults for our simple example.
DEFAULT_TERM = 'restaurants'
DEFAULT_LOCATION = 'Singapore'
SEARCH_LIMIT = 50  # page size sent as the `limit` search parameter

# Categories of restaurant. Each entry is sent as the `categories` search
# parameter and also becomes a top-level key of the JSON output.
# NOTE(review): "indpak" looks like a Yelp category alias while the others are
# capitalised display names -- confirm the API accepts both forms.
CATEGORIES = ["Singaporean", "French", "Japanese", "Korean","indpak"]

In [70]:
def obtain_bearer_token(host, path):
    """Request an OAuth bearer token from the API's token endpoint.

    Sends a form-encoded POST carrying the module-level ``CLIENT_ID``,
    ``CLIENT_SECRET`` and ``GRANT_TYPE`` to ``host`` + ``path``.

    Args:
        host (str): The domain host of the API.
        path (str): The path of the token endpoint after the domain.

    Returns:
        str: The ``access_token`` field of the JSON response.

    Raises:
        AssertionError: ``CLIENT_ID`` or ``CLIENT_SECRET`` is empty.
        requests.exceptions.HTTPError: The endpoint answered with a 4xx/5xx
            status.
        KeyError: The response is successful but has no ``access_token``.
    """
    url = '{0}{1}'.format(host, quote(path.encode('utf8')))
    assert CLIENT_ID, "Please supply your client_id."
    assert CLIENT_SECRET, "Please supply your client_secret."
    data = urlencode({
        'client_id': CLIENT_ID,
        'client_secret': CLIENT_SECRET,
        'grant_type': GRANT_TYPE,
    })
    headers = {
        'content-type': 'application/x-www-form-urlencoded',
    }
    response = requests.request('POST', url, data=data, headers=headers)
    # Fail on the HTTP status itself; otherwise a rejected request only shows
    # up as a KeyError on 'access_token' with the real cause hidden.
    response.raise_for_status()
    return response.json()['access_token']

In [71]:
def requestAPI(host, path, bearer_token, url_params=None):
    """Send an authenticated GET request and decode the JSON reply.

    Args:
        host (str): The domain host of the API; ``path`` is appended to it.
        path (str): The path of the API after the domain. It is UTF-8 encoded
            and percent-quoted before being appended to ``host``.
        bearer_token (str): OAuth bearer token, sent in the ``Authorization``
            header.
        url_params (dict): An optional set of query parameters, handed to
            ``requests`` unchanged.

    Returns:
        The decoded JSON body of the response.
    """
    quoted_path = quote(path.encode('utf8'))
    endpoint = '%s%s' % (host, quoted_path)
    auth_headers = {'Authorization': 'Bearer {0}'.format(bearer_token)}

    # The HTTP status is not inspected here: whatever JSON body comes back,
    # including an error payload, is returned to the caller.
    reply = requests.request('GET', endpoint, headers=auth_headers, params=url_params)
    return reply.json()

def search(bearer_token, term, location,category,offset):
    """Query the Search API by a search term and location.

    Results are requested in pages of ``SEARCH_LIMIT`` businesses, sorted by
    review count.

    Args:
        bearer_token (str): OAuth bearer token used to authenticate the call.
        term (str): The search term passed to the API.
        location (str): The search location passed to the API.
        category (str): Value sent as the ``categories`` filter.
        offset (int): Value sent as the ``offset`` parameter (number of
            results to skip).

    Returns:
        dict: The JSON response from the request.
    """
    # Pass the raw strings: requests URL-encodes query parameters itself, so
    # replacing spaces with '+' beforehand would send a literal plus sign.
    url_params = {
        'term': term,
        'location': location,
        'limit': SEARCH_LIMIT,
        'categories': category,
        'sort_by': 'review_count',
        'offset': offset
    }
    return requestAPI(API_HOST, SEARCH_PATH, bearer_token, url_params=url_params)

In [72]:
def get_business_reviews(bearer_token, business_id):
    """Fetch the reviews of one business from the Business Reviews API.

    Args:
        bearer_token (str): OAuth bearer token used to authenticate the call.
        business_id (str): The ID of the business to query.

    Returns:
        dict: The JSON response from the request.
    """
    # Endpoint layout: <BUSINESS_PATH><business id><REVIEWS_PATH>
    reviews_endpoint = ''.join((BUSINESS_PATH, business_id, REVIEWS_PATH))
    payload = requestAPI(API_HOST, reviews_endpoint, bearer_token)
    return payload
def query_api(bearer_token, term, location, category,offset):
    """Run one business search and hand back its raw result.

    This is a thin wrapper: every argument is forwarded to ``search``.

    Args:
        bearer_token (str): OAuth bearer token used to authenticate the call.
        term (str): The search term to query.
        location (str): The location of the business to query.
        category (str): Category filter forwarded to ``search``.
        offset (int): Result offset forwarded to ``search``.

    Returns:
        Whatever ``search`` returns for these arguments.
    """
    search_result = search(
        bearer_token=bearer_token,
        term=term,
        location=location,
        category=category,
        offset=offset,
    )
    return search_result

In [73]:
if __name__ == '__main__':
    bearer_token = obtain_bearer_token(API_HOST, TOKEN_PATH)
    restaurants_by_category = defaultdict(list)   # category -> list of business ids
    restaurant_reviews_count = defaultdict(int)   # business id -> "total" from the reviews response
    restaurant_reviews = defaultdict(list)        # business id -> list of review texts

    # For each category, walk up to 10 result pages of SEARCH_LIMIT businesses.
    # The offset is derived from the page index, so it restarts at 0 for every
    # category and each page is fetched exactly once.
    for category in CATEGORIES:
        for page in range(10):
            offset = page * SEARCH_LIMIT
            response = query_api(bearer_token, DEFAULT_TERM, DEFAULT_LOCATION, category, offset)
            businesses = response["businesses"]
            if not businesses:
                # An empty page means the category has no further results.
                break
            for restaurant in businesses:
                # Keep only businesses with more than two reviews.
                if restaurant["review_count"] > 2:
                    restaurants_by_category[category].append(restaurant["id"])

    # unique restaurants (a business may be listed under several categories)
    restaurants = list({
        restaurant_id
        for restaurants_in_category in restaurants_by_category.values()
        for restaurant_id in restaurants_in_category
    })

    # get reviews for business
    for restaurant in restaurants:
        reviews = get_business_reviews(bearer_token, restaurant)
        restaurant_reviews_count[restaurant] = reviews["total"]
        for review in reviews["reviews"]:
            restaurant_reviews[restaurant].append(review["text"])


In [74]:
# Change your path here
folder_path = "/home/ethi/ethi/nus/New_Media/Workshops/Data"
# The working directory is switched so the relative file names below (and any
# relative paths opened later in the notebook) resolve inside this folder.
chdir(folder_path)

# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load these files if they come from a trusted source.
# The files are opened in `with` blocks so the handles are closed after loading.
with open("classifier_svm.pk", "rb") as classifier_file:
    classifier_svm = pk.load(classifier_file, encoding="latin-1")
with open("trainset.pk", "rb") as trainset_file:
    trainset = pk.load(trainset_file, encoding="latin-1")

# A negation cue, then the run of words/whitespace that follows it, closed by
# the next character that is neither a word character nor whitespace.
_NEGATION_SCOPE = re.compile(r"\b(?:never|nothing|nowhere|noone|none|not|haven't|hasn't|hasnt|hadn't|hadnt|can't|cant|couldn't|couldnt|shouldn't|shouldnt|won't|wont|wouldn't|wouldnt|don't|dont|doesn't|doesnt|didn't|didnt|isnt|isn't|aren't|arent|aint|ain't|hardly|seldom)\b[\w\s]+[^\w\s]", flags=re.IGNORECASE)

# A word together with the whitespace that precedes it.
_SCOPED_WORD = re.compile(r'(\s+)(\w+)')


def _prefix_scoped_words(match):
    """Prefix every whitespace-preceded word inside one matched scope with NEG_."""
    return _SCOPED_WORD.sub(r'\1NEG_\2', match.group(0))


def neg_tag(text):
    """Mark the words that follow a negation cue with a ``NEG_`` prefix.

    Tagging starts after a cue such as "not" or "never" (matched
    case-insensitively) and runs up to the next punctuation character. The cue
    itself is left untouched. A cue that is not followed by any punctuation
    produces no match, so the text comes back unchanged.

    Args:
        text (str): The text to tag.

    Returns:
        str: ``text`` with the scoped words prefixed by ``NEG_``.
    """
    return _NEGATION_SCOPE.sub(_prefix_scoped_words, text)


# Training list of [negation-tagged text, label] pairs, one per document of
# `trainset` (each document is indexed as doc[0] = text, doc[1] = label).
train_set_neg = [[neg_tag(doc[0]), doc[1]] for doc in trainset]

# `trainset` is not reloaded from disk here: the object loaded at the top of
# this cell is only read above, so a second pickle load (through another
# unclosed file handle) would yield the same data.

# Tagged texts without their labels, UTF-8 encoded for the vectorizer.
train_nolab = [tagged_text.encode("utf-8") for tagged_text, _label in train_set_neg]

vectorizer = TfidfVectorizer()

# Fitting learns the TF-IDF vocabulary from the tagged training texts.
# get_score() later calls vectorizer.transform() on new reviews and feeds the
# result to classifier_svm.
# NOTE(review): presumably this has to reproduce the feature space the pickled
# SVM was trained on -- confirm against the training notebook.
train_vectors = vectorizer.fit_transform(train_nolab)




In [75]:
#get score for reviews
def get_score(reviews):
    """Classify each review text as negative (-1) or positive (1).

    Every text is negation-tagged with ``neg_tag``, vectorised with the
    module-level ``vectorizer`` and classified by ``classifier_svm``.
    Predictions below zero become -1; all others become 1.

    Args:
        reviews (list of str): Review texts to score.

    Returns:
        list of dict: One ``{"text": <original text>, "score": -1 or 1}`` entry
        per review, in input order. Empty when ``reviews`` is empty.
    """
    # A business can have no stored review texts; return early instead of
    # handing zero samples to the vectorizer and classifier.
    # NOTE(review): scikit-learn estimators generally reject zero-sample input
    # -- confirm for the installed version.
    if len(reviews) == 0:
        return []

    tagged_texts = [neg_tag(text) for text in reviews]
    features = vectorizer.transform(tagged_texts)
    predictions = classifier_svm.predict(features)

    return [
        {"text": text, "score": -1 if prediction < 0 else 1}
        for text, prediction in zip(reviews, predictions)
    ]

In [76]:
# build json data
def build_restaurant_data(restaurants):
    """Assemble the JSON-ready records for a list of restaurants.

    Args:
        restaurants (list): Restaurant identifiers, used as keys into the
            module-level ``restaurant_reviews_count`` and
            ``restaurant_reviews`` mappings.

    Returns:
        list of dict: One record per identifier, in input order, with keys
        ``"name"`` (the identifier itself), ``"nb_reviews"`` (its stored
        review count) and ``"reviews"`` (its review texts scored by
        ``get_score``).
    """
    return [
        {
            "name": restaurant_id,
            "nb_reviews": restaurant_reviews_count[restaurant_id],
            "reviews": get_score(restaurant_reviews[restaurant_id]),
        }
        for restaurant_id in restaurants
    ]
# Map each category to the records of its restaurants. A comprehension is used
# so that no loop variable is left behind at module level; in particular the
# notebook-wide `restaurants` list of unique ids is no longer rebound to the
# last category's list.
json_data = {
    category_name: build_restaurant_data(category_restaurants)
    for category_name, category_restaurants in restaurants_by_category.items()
}

In [66]:
# Json Dump to file
# `result` is already the serialised JSON text, so it is written as-is.
# Passing it through json.dump again would store one escaped JSON string
# instead of the JSON object itself.
result = json.dumps(json_data, indent=2, sort_keys=True)
with open('day2.json', 'w') as outfile:
    outfile.write(result)