## Overview

Your goal is to use data science to produce a list of which restaurants you believe will be on the DC Michelin Guide, and how many stars each of your submitted restaurants earned. You are welcome to use any tools that you see fit.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import unicodedata
import requests
import json
import re

#### Using Yelp's API, collect data from Chicago, New York, and San Francisco (all Michelin-reviewed cities), plus DC.

In [None]:
# Request headers passed as `headers=` to every Yelp API call in this notebook.
# NOTE(review): the credential appears to be redacted in this copy (`*****` is
# not valid Python); supply the real authorization value before running.
authorization = {'Authorization': ***** }

In [None]:
# Page through Yelp's restaurant search for Washington, DC (offsets 0, 20,
# ..., 280), keeping each decoded JSON response.
dc_yelp = []

for offset in range(0, 300, 20):
    try:
        # Let requests build and escape the query string.
        response = requests.get('https://api.yelp.com/v3/businesses/search',
                                params={'term': 'restaurants',
                                        'location': 'Washington, DC',
                                        'offset': offset,
                                        'sort_by': 'rating',
                                        'price': '2,3,4'},
                                headers=authorization)
        dc_yelp.append(response.json())
    except (requests.exceptions.RequestException, ValueError) as err:
        # Still best effort, but report the skipped page instead of hiding it.
        print('Skipping Washington, DC offset {}: {}'.format(offset, err))

In [None]:
# Page through Yelp's restaurant search for Chicago (offsets 0, 20, ..., 280),
# keeping each decoded JSON response.
chicago_yelp = []

for offset in range(0, 300, 20):
    try:
        # Let requests build and escape the query string.
        response = requests.get('https://api.yelp.com/v3/businesses/search',
                                params={'term': 'restaurants',
                                        'location': 'Chicago',
                                        'offset': offset,
                                        'sort_by': 'rating',
                                        'price': '2,3,4'},
                                headers=authorization)
        chicago_yelp.append(response.json())
    except (requests.exceptions.RequestException, ValueError) as err:
        # Still best effort, but report the skipped page instead of hiding it.
        print('Skipping Chicago offset {}: {}'.format(offset, err))

In [None]:
# Page through Yelp's restaurant search for New York (offsets 0, 20, ..., 280),
# keeping each decoded JSON response.
ny_yelp = []

for offset in range(0, 300, 20):
    try:
        # Let requests build and escape the query string.
        response = requests.get('https://api.yelp.com/v3/businesses/search',
                                params={'term': 'restaurants',
                                        'location': 'New York',
                                        'offset': offset,
                                        'sort_by': 'rating',
                                        'price': '2,3,4'},
                                headers=authorization)
        ny_yelp.append(response.json())
    except (requests.exceptions.RequestException, ValueError) as err:
        # Still best effort, but report the skipped page instead of hiding it.
        print('Skipping New York offset {}: {}'.format(offset, err))

In [None]:
# Page through Yelp's restaurant search for San Francisco (offsets 0, 20, ...,
# 280), keeping each decoded JSON response.
sf_yelp = []

for offset in range(0, 300, 20):
    try:
        # Let requests build and escape the query string.
        response = requests.get('https://api.yelp.com/v3/businesses/search',
                                params={'term': 'restaurants',
                                        'location': 'San Francisco',
                                        'offset': offset,
                                        'sort_by': 'rating',
                                        'price': '2,3,4'},
                                headers=authorization)
        sf_yelp.append(response.json())
    except (requests.exceptions.RequestException, ValueError) as err:
        # Still best effort, but report the skipped page instead of hiding it.
        print('Skipping San Francisco offset {}: {}'.format(offset, err))

#### Read data into dataframes.

In [None]:
def df_maker(yelp):
    """Flatten Yelp search-response pages into one DataFrame.

    Parameters
    ----------
    yelp : list of dict
        Decoded search responses. Only each page's 'businesses' list is read;
        a page without that key contributes no rows.

    Returns
    -------
    pandas.DataFrame
        One row per business with columns 'restaurant', 'rest_id', 'rating',
        'price', 'num_reviews' and 'category' (the business's category
        aliases joined with single spaces). A field missing from a business
        is stored as a missing value.
    """
    columns = ['restaurant', 'rest_id', 'rating', 'price', 'num_reviews', 'category']
    rows = []
    for page in yelp:
        for business in page.get('businesses', []):
            aliases = [entry['alias']
                       for entry in business.get('categories', [])
                       if 'alias' in entry]
            # Build each row as a unit: collecting every field in its own
            # list lets one business without, say, 'price' shift all later
            # prices onto the wrong restaurants (or make the lists unequal).
            rows.append({'restaurant': business.get('name'),
                         'rest_id': business.get('id'),
                         'rating': business.get('rating'),
                         'price': business.get('price'),
                         'num_reviews': business.get('review_count'),
                         'category': " ".join(aliases)})

    return pd.DataFrame(rows, columns=columns)

In [None]:
# Build one DataFrame per city from the raw search pages.
dc, chi, sf, ny = [df_maker(pages)
                   for pages in (dc_yelp, chicago_yelp, sf_yelp, ny_yelp)]

In [None]:
# Strip accents from restaurant names: decompose each name with NFKD, then
# keep only what encodes to ASCII.
def _to_ascii(name):
    return unicodedata.normalize('NFKD', name).encode('ascii', 'ignore')

for frame in (dc, sf, ny, chi):
    frame.restaurant = frame.restaurant.apply(_to_ascii)

#### I manually prepared a list of all US Michelin-starred restaurants. Since only some of them were picked up on the first sweep of Yelp's API, I'll need to make a second request, this time asking for specific restaurant names. If this creates duplicates (it will), I'll deal with them later.


In [None]:
# Load the Michelin star list from its CSV file.
michelin_csv = '/users/nick/desktop/michelin_stars.csv'
stars = pd.read_csv(michelin_csv)

In [None]:
# For each city column of the Michelin table, collect its non-null entries.
sf_stars = list(stars.loc[stars.SanFran.notnull()].SanFran)

ny_stars = list(stars.loc[stars.NewYork.notnull()].NewYork)

chi_stars = list(stars.loc[stars.Chicago.notnull()].Chicago)

In [None]:
# Search Yelp for each Michelin-listed San Francisco restaurant by name and
# keep every returned business whose name contains, or is contained in, the
# listed name. Each hit is stored as a one-element list for df_maker_v2.
sf_michelin = []

for i in stars.loc[stars.SanFran.notnull()].SanFran:
    # Passing the term through `params` escapes characters such as '&'.
    response = requests.get('https://api.yelp.com/v3/businesses/search',
                            params={'term': i,
                                    'location': 'San Francisco',
                                    'categories': 'restaurants'},
                            headers=authorization)
    try:
        businesses = response.json().get('businesses', [])
        # Compare as text whether the listed name arrived as bytes or text.
        listed_name = i.decode('utf-8') if isinstance(i, bytes) else i
    except ValueError:
        # Response was not JSON, or the name is not valid UTF-8: skip it.
        continue
    # Iterate the returned list directly; indexing up to 'total' relies on
    # it never exceeding len(businesses).
    for business in businesses:
        yelp_name = business.get('name')
        if not yelp_name:
            continue
        if yelp_name in listed_name or listed_name in yelp_name:
            sf_michelin.append([business])

In [None]:
# Search Yelp for each Michelin-listed New York restaurant by name and keep
# every returned business with more than 15 reviews whose name contains, or
# is contained in, the listed name. Each hit is stored as a one-element list
# for df_maker_v2.
ny_michelin = []

for i in stars.loc[stars.NewYork.notnull()].NewYork:
    # Passing the term through `params` escapes characters such as '&'.
    response = requests.get('https://api.yelp.com/v3/businesses/search',
                            params={'term': i,
                                    'location': 'New York',
                                    'categories': 'restaurants'},
                            headers=authorization)
    try:
        businesses = response.json().get('businesses', [])
        # Compare as text whether the listed name arrived as bytes or text.
        listed_name = i.decode('utf-8') if isinstance(i, bytes) else i
    except ValueError:
        # Response was not JSON, or the name is not valid UTF-8: skip it.
        continue
    # Iterate the returned list directly; indexing up to 'total' relies on
    # it never exceeding len(businesses).
    for business in businesses:
        yelp_name = business.get('name')
        if not yelp_name:
            continue
        if yelp_name in listed_name or listed_name in yelp_name:
            if business.get('review_count', 0) > 15:
                ny_michelin.append([business])

In [None]:
# Search Yelp for each Michelin-listed Chicago restaurant by name and keep
# every returned business whose name contains, or is contained in, the listed
# name. Each hit is stored as a one-element list for df_maker_v2.
chi_michelin = []

for i in stars.loc[stars.Chicago.notnull()].Chicago:
    # Passing the term through `params` escapes characters such as '&'.
    response = requests.get('https://api.yelp.com/v3/businesses/search',
                            params={'term': i,
                                    'location': 'Chicago',
                                    'categories': 'restaurants'},
                            headers=authorization)
    try:
        businesses = response.json().get('businesses', [])
        # Compare as text whether the listed name arrived as bytes or text.
        listed_name = i.decode('utf-8') if isinstance(i, bytes) else i
    except ValueError:
        # Response was not JSON, or the name is not valid UTF-8: skip it.
        continue
    # Iterate the returned list directly; indexing up to 'total' relies on
    # it never exceeding len(businesses).
    for business in businesses:
        yelp_name = business.get('name')
        if not yelp_name:
            continue
        if yelp_name in listed_name or listed_name in yelp_name:
            chi_michelin.append([business])


In [None]:
def df_maker_v2(reviews):
    """Flatten name-lookup hits into a DataFrame.

    Parameters
    ----------
    reviews : list of list of dict
        Each inner list holds Yelp business dicts (the lookup cells append
        one-element lists).

    Returns
    -------
    pandas.DataFrame
        One row per business with columns 'restaurant', 'rest_id', 'rating',
        'price', 'num_reviews' and 'category' (the business's category
        aliases joined with single spaces). A field missing from a business
        is stored as a missing value.
    """
    columns = ['restaurant', 'rest_id', 'rating', 'price', 'num_reviews', 'category']
    rows = []
    for group in reviews:
        for business in group:
            aliases = [entry['alias']
                       for entry in business.get('categories', [])
                       if 'alias' in entry]
            # Build each row as a unit: collecting every field in its own
            # list lets one business without, say, 'price' shift all later
            # prices onto the wrong restaurants (or make the lists unequal).
            rows.append({'restaurant': business.get('name'),
                         'rest_id': business.get('id'),
                         'rating': business.get('rating'),
                         'price': business.get('price'),
                         'num_reviews': business.get('review_count'),
                         'category': " ".join(aliases)})

    return pd.DataFrame(rows, columns=columns)

In [None]:
# Turn each city's name-lookup hits into a DataFrame.
sf_stars, ny_stars, chi_stars = [df_maker_v2(hits)
                                 for hits in (sf_michelin, ny_michelin, chi_michelin)]

#### Updating dataframe with known Michelin star counts (from manually created list of Michelin starred restaurants).

In [None]:
# Copy each matched restaurant's star count from the Michelin table onto the
# Yelp-derived San Francisco frame. A Yelp name matches when it contains, or
# is contained in, a listed name.
if 'stars' not in sf_stars.columns:
    sf_stars['stars'] = np.nan  # guarantees the column exists even with no match

sf_listed = stars.loc[stars.SanFran.notnull()]
for a, b in enumerate(sf_stars.restaurant):
    try:
        # Compare as text whether the names arrived as bytes or text.
        yelp_name = b.decode('utf-8') if isinstance(b, bytes) else b
        # Pair every listed name with its own index label, so the star lookup
        # stays correct even if the null rows are not all at the bottom.
        for x, y in zip(sf_listed.index, sf_listed.SanFran):
            listed_name = y.decode('utf-8') if isinstance(y, bytes) else y
            if yelp_name in listed_name or listed_name in yelp_name:
                sf_stars.loc[a, 'stars'] = stars.loc[x, 'sf_stars']
    except (TypeError, UnicodeError):
        # Name could not be compared as text: record zero stars, as before.
        sf_stars.loc[a, 'stars'] = 0

In [None]:
# Copy each matched restaurant's star count from the Michelin table onto the
# Yelp-derived New York frame. A Yelp name matches when it contains, or is
# contained in, a listed name.
if 'stars' not in ny_stars.columns:
    ny_stars['stars'] = np.nan  # guarantees the column exists even with no match

ny_listed = stars.loc[stars.NewYork.notnull()]
for a, b in enumerate(ny_stars.restaurant):
    try:
        # Compare as text whether the names arrived as bytes or text.
        yelp_name = b.decode('utf-8') if isinstance(b, bytes) else b
        # Pair every listed name with its own index label, so the star lookup
        # stays correct even if the null rows are not all at the bottom.
        for x, y in zip(ny_listed.index, ny_listed.NewYork):
            listed_name = y.decode('utf-8') if isinstance(y, bytes) else y
            if yelp_name in listed_name or listed_name in yelp_name:
                ny_stars.loc[a, 'stars'] = stars.loc[x, 'ny_stars']
    except (TypeError, UnicodeError):
        # Name could not be compared as text: record zero stars, as before.
        ny_stars.loc[a, 'stars'] = 0

In [None]:
# Copy each matched restaurant's star count from the Michelin table onto the
# Yelp-derived Chicago frame. A Yelp name matches when it contains, or is
# contained in, a listed name.
if 'stars' not in chi_stars.columns:
    chi_stars['stars'] = np.nan  # guarantees the column exists even with no match

chi_listed = stars.loc[stars.Chicago.notnull()]
for a, b in enumerate(chi_stars.restaurant):
    try:
        # Compare as text whether the names arrived as bytes or text.
        yelp_name = b.decode('utf-8') if isinstance(b, bytes) else b
        # Pair every listed name with its own index label, so the star lookup
        # stays correct even if the null rows are not all at the bottom.
        for x, y in zip(chi_listed.index, chi_listed.Chicago):
            listed_name = y.decode('utf-8') if isinstance(y, bytes) else y
            if yelp_name in listed_name or listed_name in yelp_name:
                chi_stars.loc[a, 'stars'] = stars.loc[x, 'chi_stars']
    except (TypeError, UnicodeError):
        # Name could not be compared as text: record zero stars, as before.
        chi_stars.loc[a, 'stars'] = 0

In [None]:
# Append the name-lookup rows to the first-pass New York rows with a fresh
# 0..n-1 index. First-pass rows carry no star count, so fill those with 0.
ny = pd.concat([ny, ny_stars], ignore_index=True)
# Assign the filled column back: an in-place method called on `ny.stars` is
# not guaranteed to write through to `ny`.
ny['stars'] = ny['stars'].fillna(0)

In [None]:
# Append the name-lookup rows to the first-pass Chicago rows with a fresh
# 0..n-1 index. First-pass rows carry no star count, so fill those with 0.
chi = pd.concat([chi, chi_stars], ignore_index=True)
# Assign the filled column back: an in-place method called on `chi.stars` is
# not guaranteed to write through to `chi`.
chi['stars'] = chi['stars'].fillna(0)

In [None]:
# Append the name-lookup rows to the first-pass San Francisco rows with a
# fresh 0..n-1 index. First-pass rows carry no star count, so fill those
# with 0.
sf = pd.concat([sf, sf_stars], ignore_index=True)
# Assign the filled column back: an in-place method called on `sf.stars` is
# not guaranteed to write through to `sf`.
sf['stars'] = sf['stars'].fillna(0)

#### DC touch-ups

In [None]:
# Looking through the DC restaurants pulled from Yelp, many that I think will
# be Michelin contenders were missing. So, as above, I'll return to Yelp's API.
contenders = [
    "Kinship",
    "Marcel's",
    "Masseria",
    "Riggsby",
    "Garrison",
    "Mintwood",
    "Red Hen",
    "G by Mike Isabella",
    "Fiola Mare",
    "Le Diplomate",
    "Boss Shepherd's",
    "minibar",
    "The Dabney",
    "Metier",
    "Little Serow",
    "Source",
    "Obelisk",
    "Plume",
]

In [None]:
# Search Yelp for each hand-picked DC contender by name and keep every
# returned business whose name contains, or is contained in, the contender
# name. Each hit is stored as a one-element list for df_maker_v2.
dc_contenders = []

for i in contenders:
    # Passing the term through `params` escapes characters such as '&'.
    response = requests.get('https://api.yelp.com/v3/businesses/search',
                            params={'term': i,
                                    'location': 'Washington, DC',
                                    'categories': 'restaurants'},
                            headers=authorization)
    try:
        businesses = response.json().get('businesses', [])
        # Compare as text whether the contender name is bytes or text.
        listed_name = i.decode('utf-8') if isinstance(i, bytes) else i
    except ValueError:
        # Response was not JSON, or the name is not valid UTF-8: skip it.
        continue
    # Iterate the returned list directly; indexing up to 'total' relies on
    # it never exceeding len(businesses).
    for business in businesses:
        yelp_name = business.get('name')
        if not yelp_name:
            continue
        if yelp_name in listed_name or listed_name in yelp_name:
            dc_contenders.append([business])

In [None]:
# Convert the contender hits to a frame and append them to the first-pass DC
# rows, renumbering the index from zero.
dc_contenders = df_maker_v2(dc_contenders)
dc = pd.concat([dc, dc_contenders], ignore_index=True)

#### Because I made two passes through Yelp's API, there are duplicates, and I need to get rid of them. Also, some restaurants have multiple Yelp pages; I want to keep only the page with the highest number of reviews.

In [None]:
def duplicate_dropper(city):
    """Remove duplicate restaurant rows from `city` in place.

    Two passes, matching the original intent:

    1. If a restaurant name has a starred row (stars > 0), drop its rows
       whose star count is 0.
    2. Among rows that still share a name, keep the one with the most
       reviews. On a tie the later row is kept.

    Parameters
    ----------
    city : pandas.DataFrame
        Must have 'restaurant', 'num_reviews' and 'stars' columns. It is
        modified in place and its index is reset to 0..n-1.

    Returns
    -------
    None
    """
    # Unique labels make the label-based drops below unambiguous.
    city.reset_index(drop=True, inplace=True)

    # Pass 1: decide every drop before mutating, rather than dropping rows
    # while iterating over the frame.
    starred_names = city.loc[city['stars'] > 0, 'restaurant'].unique()
    shadowed = city['restaurant'].isin(starred_names) & (city['stars'] == 0)
    city.drop(city.index[shadowed.values], inplace=True)

    # Pass 2: a stable ascending sort followed by keep='last' picks the row
    # with the most reviews per name, preferring the later row on ties.
    # Rows with no review count sort first so they are the least preferred.
    ranked = city.sort_values('num_reviews', kind='mergesort', na_position='first')
    keepers = ranked.drop_duplicates(subset='restaurant', keep='last').index
    city.drop(city.index.difference(keepers), inplace=True)

    city.reset_index(drop=True, inplace=True)

In [None]:
# De-duplicate the three Michelin-city frames in place.
for city_frame in (chi, ny, sf):
    duplicate_dropper(city_frame)

In [None]:
# 'dc' has no 'stars' column (which duplicate_dropper uses), so it is
# de-duplicated here instead: for each restaurant name keep only the row with
# the most reviews, preferring the later row on a tie.
dc.reset_index(drop=True, inplace=True)

# Decide what to keep before mutating, rather than dropping rows while
# iterating and relying on a bare `except` to absorb the resulting KeyErrors.
# A stable ascending sort plus keep='last' selects the most-reviewed row.
dc_ranked = dc.sort_values('num_reviews', kind='mergesort', na_position='first')
dc_keepers = dc_ranked.drop_duplicates(subset='restaurant', keep='last').index
dc.drop(dc.index.difference(dc_keepers), inplace=True)

dc.reset_index(drop=True, inplace=True)

#### I want review text, which requires another (tailored) API request.

In [None]:
# Fetch the review texts Yelp returns for each New York restaurant and join
# them into one string per restaurant, in the same order as ny.rest_id.
ny_reviews = []

for i in ny.rest_id:
    response = requests.get(u'https://api.yelp.com/v3/businesses/{}/reviews'.format(i), headers=authorization)
    try:
        # An error payload has no 'reviews' key; treat it as no reviews.
        store_reviews = response.json().get('reviews', [])
    except ValueError:
        store_reviews = []  # body was not JSON
    # Iterate the returned list itself (indexing up to 'total' can overrun
    # it), and always append exactly one entry so ny_reviews stays aligned
    # with the rows of `ny`.
    ny_reviews.append(" ".join(review.get('text', '') for review in store_reviews))

In [None]:
# Fetch the review texts Yelp returns for each San Francisco restaurant and
# join them into one string per restaurant, in the same order as sf.rest_id.
sf_reviews = []

for i in sf.rest_id:
    response = requests.get(u'https://api.yelp.com/v3/businesses/{}/reviews'.format(i), headers=authorization)
    try:
        # An error payload has no 'reviews' key; treat it as no reviews.
        store_reviews = response.json().get('reviews', [])
    except ValueError:
        store_reviews = []  # body was not JSON
    # Iterate the returned list itself (indexing up to 'total' can overrun
    # it), and always append exactly one entry so sf_reviews stays aligned
    # with the rows of `sf`.
    sf_reviews.append(" ".join(review.get('text', '') for review in store_reviews))

In [None]:
# Fetch the review texts Yelp returns for each Chicago restaurant and join
# them into one string per restaurant, in the same order as chi.rest_id.
chi_reviews = []

for i in chi.rest_id:
    response = requests.get(u'https://api.yelp.com/v3/businesses/{}/reviews'.format(i), headers=authorization)
    try:
        # An error payload has no 'reviews' key; treat it as no reviews.
        store_reviews = response.json().get('reviews', [])
    except ValueError:
        store_reviews = []  # body was not JSON
    # Iterate the returned list itself (indexing up to 'total' can overrun
    # it), and always append exactly one entry so chi_reviews stays aligned
    # with the rows of `chi`.
    chi_reviews.append(" ".join(review.get('text', '') for review in store_reviews))

In [None]:
# Fetch the review texts Yelp returns for each DC restaurant and join them
# into one string per restaurant, in the same order as dc.rest_id.
dc_reviews = []

for i in dc.rest_id:
    response = requests.get(u'https://api.yelp.com/v3/businesses/{}/reviews'.format(i), headers=authorization)
    try:
        # An error payload has no 'reviews' key; treat it as no reviews.
        store_reviews = response.json().get('reviews', [])
    except ValueError:
        store_reviews = []  # body was not JSON
    # Iterate the returned list itself (indexing up to 'total' can overrun
    # it), and always append exactly one entry so dc_reviews stays aligned
    # with the rows of `dc`.
    dc_reviews.append(" ".join(review.get('text', '') for review in store_reviews))

In [None]:
# Combine each restaurant's category aliases and review text into one
# 'text' column per city.
for frame, review_texts in ((chi, chi_reviews),
                            (ny, ny_reviews),
                            (sf, sf_reviews),
                            (dc, dc_reviews)):
    frame['text'] = frame.category + ' ' + review_texts

In [None]:
# Change dollar signs to integers so price can be used for modeling.
# '$' is mapped too: the name-based lookups carry no price filter, and an
# unmapped price would become NaN. The codes for '$$'..'$$$$' are unchanged.
price_map = { '$': 0,
              '$$': 1,
              '$$$': 2,
              '$$$$': 3 }

for frame in (chi, sf, dc, ny):
    frame['price'] = frame['price'].map(price_map)

### DONE WITH DATA CLEANING

#### I will be modeling on San Francisco and New York data, and testing against Chicago data. 

#### Some light text analysis. 

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams, with the built-in English stop-word list.
tfidf_options = {'ngram_range': (1, 2), 'stop_words': 'english'}
tvec = TfidfVectorizer(**tfidf_options)

In [None]:
# Training set: San Francisco and New York rows stacked with a fresh index.
sf_ny = pd.concat([sf, ny], ignore_index=True)

In [None]:
# Fit the vectorizer on the training text only.
tvec.fit(sf_ny['text'])

In [None]:
# Turn both corpora into dense TF-IDF frames sharing the fitted vocabulary.
vocab = tvec.get_feature_names()
train_matrix = tvec.transform(sf_ny.text).todense()
test_matrix = tvec.transform(chi.text).todense()
text_train = pd.DataFrame(train_matrix, columns=vocab)
text_test = pd.DataFrame(test_matrix, columns=vocab)

# Targets are the star counts: SF + NY for training, Chicago for testing.
y_train, y_test = sf_ny.stars, chi.stars

#### Quick check of the predictive power of text.

In [None]:
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier

# Baseline: a default random forest trained on the TF-IDF text features
# alone and scored on the Chicago rows.
rfc = RandomForestClassifier()

rfc.fit(text_train, y_train)
y_pred = rfc.predict(text_test)

# `metrics` was used without ever being imported, so this line raised
# NameError; it is now imported at the top of this cell.
print(metrics.accuracy_score(y_test, y_pred))

In [None]:
from sklearn.metrics import confusion_matrix

# Rows are true star counts, columns are predicted star counts.
confusion_matrix(y_true=y_test, y_pred=y_pred)

#### Text wound up not being useful. Yelp's API limits access to user reviews to three per restaurant, AND to roughly one sentence per review. And while "star restaurant", "michelin", and "michelin star" were (somewhat) predictive in Chicago, those are words unlikely to appear in DC reviews. Still, ever optimistic, I decided to create one column, 'natm', that flags whether 'newamerican' or 'tasting menu' appeared in the category/review text. The thinking is that "new American" really means cutting-edge fine dining, which describes many of DC's most hyped restaurants.

In [None]:
# Ten text features with the largest random-forest importances.
importance_table = pd.DataFrame(rfc.feature_importances_,
                                index=text_train.columns,
                                columns=['importance'])
feature_importances = importance_table.sort_values('importance', ascending=False).head(10)
feature_importances

In [None]:
# Flag restaurants whose category aliases or review text mention
# 'newamerican' or 'tasting menu'. The combined 'text' column is searched
# because 'category' holds only space-joined aliases: the two-word phrase
# 'tasting menu' could never match there.
def _mentions_natm(text):
    lowered = text.lower()  # review wording may be capitalized
    return 1 if ('newamerican' in lowered) or ('tasting menu' in lowered) else 0

sf_ny['natm'] = sf_ny.text.apply(_mentions_natm)
chi['natm'] = chi.text.apply(_mentions_natm)
dc['natm'] = dc.text.apply(_mentions_natm)

## Modeling

### Random Forest

In [None]:
from sklearn.grid_search import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import StratifiedKFold

In [None]:
# Training features and target from the combined SF + NY frame.
X_train = sf_ny[['num_reviews', 'price', 'rating', 'natm']]
y_train = sf_ny['stars']

In [None]:
# Hold-out features and target from the Chicago frame.
X_test = chi[['num_reviews', 'price', 'rating', 'natm']]
y_test = chi['stars']

In [None]:
# Standardize the feature matrices.
# NOTE(review): train and test are each scaled by their own freshly fitted
# scaler, so each set is standardized against its own statistics. Confirm
# this per-city scaling is intended rather than reusing the training scaler.
train_scaler = StandardScaler()
X_train = train_scaler.fit_transform(X_train)
test_scaler = StandardScaler()
X_test = test_scaler.fit_transform(X_test)

In [None]:
# Grid-search a random forest over class weighting, split criterion and
# minimum leaf size.
param_grid = dict(class_weight=[None, 'balanced'],
                  criterion=['gini', 'entropy'],
                  min_samples_leaf=[1, 2])

rfc = RandomForestClassifier()
rf_search = GridSearchCV(rfc, param_grid)
gs = rf_search.fit(X_train, y_train)

In [None]:
# Report the winning parameters, the hold-out score and the feature importances.
print(gs.best_params_)
print(gs.score(X_test, y_test))
print(gs.best_estimator_.feature_importances_)

In [None]:
# Confusion matrix of the tuned random forest on the hold-out rows.
y_pred = gs.predict(X_test)
confusion_matrix(y_true=y_test, y_pred=y_pred)

### Gradient Boosting Classifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
# Gradient-boosting estimator with default settings; it is the base
# estimator for the grid search in the next cell.
gbc = GradientBoostingClassifier()

In [None]:
# Grid-search gradient boosting over leaf size and the max_features setting,
# with 5-fold cross-validation.
param_grid = dict(min_samples_leaf=[1, 2, 3],
                  max_features=[.2, .3, .4, .6, .8, None])

gbc_search = GridSearchCV(gbc, param_grid, cv=5)
gs = gbc_search.fit(X_train, y_train)


In [None]:
# Report the winning parameters and the hold-out score.
print(gs.best_params_)
print(gs.score(X_test, y_test))

In [None]:
# Confusion matrix of the tuned gradient-boosting model on the hold-out rows.
y_pred = gs.predict(X_test)
confusion_matrix(y_true=y_test, y_pred=y_pred)

### K Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# K-nearest-neighbors estimator with default settings; it is the base
# estimator for the grid search in the next cell.
knn = KNeighborsClassifier()

In [None]:
# Grid-search KNN over neighbor weighting and distance metric, with 5-fold
# cross-validation.
param_grid = dict(weights=['uniform', 'distance'],
                  metric=['braycurtis', 'minkowski'])

knn_search = GridSearchCV(knn, param_grid, cv=5)
gs = knn_search.fit(X_train, y_train)

In [None]:
# Report the winning parameters and the hold-out score.
print(gs.best_params_)
print(gs.score(X_test, y_test))

In [None]:
# Confusion matrix of the tuned KNN model on the hold-out rows.
y_pred = gs.predict(X_test)
confusion_matrix(y_true=y_test, y_pred=y_pred)

### Support Vector Classifier

In [None]:
from sklearn import svm
from sklearn.svm import SVC

# `SVC` was used without being imported (only the `svm` module was), so this
# cell raised NameError. Note that `svm` is rebound from the module to the
# estimator instance, as before.
svm = SVC()

In [None]:
# Grid-search the SVC over C and gamma (seven log-spaced values each, from
# 1e-3 to 1e3) and class weighting, with 5-fold cross-validation.
log_scale = np.logspace(-3, 3, 7)
param_grid = dict(C=log_scale,
                  gamma=np.logspace(-3, 3, 7),
                  class_weight=[None, 'balanced'])

svc_search = GridSearchCV(svm, param_grid, cv=5)
gs = svc_search.fit(X_train, y_train)

In [None]:
# Report the winning parameters and the hold-out score.
print(gs.best_params_)
print(gs.score(X_test, y_test))

In [None]:
# Confusion matrix of the tuned SVC on the hold-out rows.
y_pred = gs.predict(X_test)
confusion_matrix(y_true=y_test, y_pred=y_pred)

### Voting Ensemble

#### Using a voting ensemble as added 'regularization'. Only those restaurants that were consistently (across multiple models) identified as Michelin material get the nod.

In [None]:
# `SVC` is never imported elsewhere in the notebook; import it here so this
# cell does not raise NameError.
from sklearn.svm import SVC

# Ensemble members. probability=True is set on the SVC because the ensemble
# is built with voting='soft' in the next cell.
svm = SVC(gamma=0.01, probability=True)
rfc = RandomForestClassifier(criterion='entropy', min_samples_leaf=2)
gbc = GradientBoostingClassifier(max_features=0.4, min_samples_leaf=2)
knn = KNeighborsClassifier(metric='braycurtis')

In [None]:
# The original import had trailing parentheses (`import VotingClassifier()`),
# which is a syntax error.
from sklearn.ensemble import VotingClassifier

# Soft-voting ensemble of the four models, fitted on the training rows.
voter = VotingClassifier(estimators=[('svm', svm), ('knn', knn), ('rfc', rfc), ('gbc', gbc)],
                         voting='soft').fit(X_train, y_train)

print(voter.score(X_test, y_test))

In [None]:
# Confusion matrix of the voting ensemble on the hold-out rows.
y_pred = voter.predict(X_test)
confusion_matrix(y_true=y_test, y_pred=y_pred)

## DC Predictions

In [None]:
# The same four features used for training, standardized with a scaler
# fitted on the DC rows themselves.
dc_features = dc[['num_reviews', 'price', 'rating', 'natm']]
dc_X = StandardScaler().fit_transform(dc_features)

In [None]:
# Predicted star class for every DC restaurant, stored on the frame.
dc['predictions'] = dc_pred = voter.predict(dc_X)

In [None]:
# Store the ensemble's class probabilities for each DC restaurant.
# Compute them once (the original recomputed them for every column) and
# look each column up by its class label via `voter.classes_`, so that a
# star level absent from the training data cannot shift the later columns
# or raise IndexError.
dc_proba = voter.predict_proba(dc_X)
proba_position = {label: pos for pos, label in enumerate(voter.classes_)}

for star_count, column in ((0, 'none'), (1, 'one'), (2, 'two'), (3, 'three')):
    if star_count in proba_position:
        dc[column] = dc_proba[:, proba_position[star_count]]
    else:
        # The model never saw this class, so it assigns it no probability.
        dc[column] = 0.0

#### Look at the cluster of points around the origin. Not the entirety of the cell defined by 0.0 and 0.1 on the 'two' axis and 0.0 and 0.02 on the 'three' axis -- just the bottom left corner of that cell. That cluster represents ~180 restaurants, and among them 9 restaurants that my algorithm predicts. That seems to me very fluky, a coincidence of review count equalling the review count of a Michelin starred restaurant, for instance. So I'm going to drop all predictions within that cluster.

In [None]:
# Scatter of two-star vs. three-star probability, colored by predicted class
# (no regression line).
sns.lmplot(x='two', y='three', hue='predictions', data=dc, fit_reg=False)

In [None]:
# Won't include Off the Record because it is a bar -- not a restaurant.
# Final picks: predicted at least one star AND a combined two-/three-star
# probability above 0.03.
predicted_starred = dc['predictions'] > 0
outside_origin_cluster = (dc['two'] + dc['three']) > 0.03
dc[predicted_starred & outside_origin_cluster]