In [9]:
import re, math, collections, itertools, os
import nltk, nltk.classify.util, nltk.metrics
from nltk.classify import NaiveBayesClassifier
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist
from nltk import precision,recall
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
import numpy as np
import io
import hashlib
from sklearn.cross_validation import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
%matplotlib inline
import json
import os
from nltk.corpus import sentiwordnet as swn

In [10]:
config_dir = 'config'
cuisines_file = config_dir + '/cuisines.txt'
city_data_dir = 'CITY_DATA'
cities_file = city_data_dir + '/cities_data.json'

In [11]:
def read_cuisines():
    with open(cuisines_file, 'r') as f:
        tup = [(line.split(" ")[0],line.split(" ")[1]) for line in f]
    return tup

In [12]:
cuisines = [c[0] for c in read_cuisines()]
print u'Cuisines we are interested in are: "{0}"'.format(cuisines)

Cuisines we are interested in are: "['Chinese', 'Mexican', 'Italian', 'Sushi', 'Greek', 'French', 'Thai', 'Spanish', 'Indian', 'Mediterranean']"


In [13]:
# Load and read city names and pincodes
with open(cities_file) as data_file:    
    data = json.load(data_file)

city_names = [city_data["name"] for city_data in data["geonames"]]
city_lats = [city_data["lat"] for city_data in data["geonames"]]
city_lngs = [city_data["lng"] for city_data in data["geonames"]]
city_states = [city_data["adminCode1"] for city_data in data["geonames"]]
city_info = zip(city_names,city_states,city_lats,city_lngs)
print u'Data of {0} cities retrieved successfully...'.format(len(city_info))

Data of 1000 cities retrieved successfully...


In [14]:
def tokenize(text):
    return filter(None,re.split(r"[\W]",text.lower()))

In [15]:
path = 'YELP_REVIEW_DATA'
#Finding the weighted objective score of reviews/cuisine/city
all_cities = [x[0] for x in os.walk(path)]
all_cities.pop(0)
city_weight_dict = {}
for city in all_cities:
    tup_list_for_weights = []
    for cuisine in cuisines:
        tot_cus_weight = 0
        print city+os.sep+cuisine+".json"
        with open(city+os.sep+cuisine+".json") as data_file:    
            review_data = json.load(data_file)
            for res,reviews in review_data.iteritems():
                res_total = 0
                if len(reviews) != 0:
                    for review in reviews:
                        tot_rev_toks = []
                        for rev in review:
                            tot_rev_toks.append(tokenize(rev))
                        tok_final = [val for sublist in tot_rev_toks for val in sublist]
                        tot_score = 0
                        for tok in tok_final:
                            scores = swn.senti_synsets(tok)
                            if scores:
                                tot_score += (scores[0].pos_score()+scores[0].neg_score())
                    res_total += tot_score
            tot_cus_weight += res_total
            tup_list_for_weights.append((cuisine,tot_cus_weight))
    city_weight_dict.update({city.split("/")[1]:tup_list_for_weights})
    #print city_weight_dict

YELP_REVIEW_DATA/Borough of Queens/Chinese.json
YELP_REVIEW_DATA/Borough of Queens/Mexican.json
YELP_REVIEW_DATA/Borough of Queens/Italian.json
YELP_REVIEW_DATA/Borough of Queens/Sushi.json
YELP_REVIEW_DATA/Borough of Queens/Greek.json
YELP_REVIEW_DATA/Borough of Queens/French.json
YELP_REVIEW_DATA/Borough of Queens/Thai.json
YELP_REVIEW_DATA/Borough of Queens/Spanish.json
YELP_REVIEW_DATA/Borough of Queens/Indian.json
YELP_REVIEW_DATA/Borough of Queens/Mediterranean.json
{'Borough of Queens': [('Chinese', 2.25), ('Mexican', 8.75), ('Italian', 6.0), ('Sushi', 4.375), ('Greek', 0), ('French', 0), ('Thai', 0), ('Spanish', 6.125), ('Indian', 8.875), ('Mediterranean', 5.375)]}
YELP_REVIEW_DATA/Boston/Chinese.json
YELP_REVIEW_DATA/Boston/Mexican.json
YELP_REVIEW_DATA/Boston/Italian.json
YELP_REVIEW_DATA/Boston/Sushi.json
YELP_REVIEW_DATA/Boston/Greek.json
YELP_REVIEW_DATA/Boston/French.json
YELP_REVIEW_DATA/Boston/Thai.json
YELP_REVIEW_DATA/Boston/Spanish.json
YELP_REVIEW_DATA/Boston/Indian

ValueError: No JSON object could be decoded

In [16]:
cuisine_popularity = [c[1] for c in read_cuisines()]
cuisine_total_weight = 0
for c in read_cuisines():
    cuisine_total_weight += int(c[1].split("%")[0])
cuisine_dict = {}
for c in read_cuisines():
    cuisine_dict.update({c[0]:int(c[1].split("%")[0])})
print u'Popularity of Cuisines we are interested in are: "{0}"'.format(cuisine_popularity)
print cuisine_dict

Popularity of Cuisines we are interested in are: "['76%\n', '74%\n', '71%\n', '32%\n', '32%\n', '26%\n', '24%\n', '22%\n', '19%\n', '16%']"
{'Greek': 32, 'Indian': 19, 'Mexican': 74, 'Chinese': 76, 'Spanish': 22, 'Thai': 24, 'Sushi': 32, 'Mediterranean': 16, 'French': 26, 'Italian': 71}


In [17]:
agg_weight_dict = {}
for k,v in city_weight_dict.iteritems():
    agg_weight = []
    for cus in v:
        agg_weight.append((cus[0],(cus[1]*cuisine_total_weight)/cuisine_dict[cus[0]]))
    agg_weight_dict.update({k:sorted(agg_weight, key=lambda tup: tup[1],reverse=True)})

In [18]:
#determine position of indian food in each city
ranking_tup_list = []
for k,v in agg_weight_dict.iteritems():
    for cnt,tup in enumerate(v):
        if (tup[0] == "Indian"):
            ranking_tup_list.append((k,cnt+1))
print ranking_tup_list

[('Lincoln', 3), ('Nashville', 5), ('Jackson', 9), ('Honolulu', 9), ('Chicago', 1), ('Miami', 1), ('Little Rock', 10), ('Kansas City', 1), ('Denver', 9), ('Lansing', 9), ('Montgomery', 9), ('Manhattan', 9), ('Houston', 10), ('Columbus', 2), ('Indianapolis', 1), ('Columbia', 6), ('Phoenix', 10), ('Charlotte', 10), ('Philadelphia', 9), ('Boston', 6), ('Dallas', 2), ('Fort Worth', 3), ('Borough of Queens', 1), ('Brooklyn', 2), ('New York', 2), ('Hartford', 9), ('Los Angeles', 4), ('Des Moines', 9), ('Madison', 9), ('Oklahoma City', 9), ('Memphis', 9), ('Jacksonville', 9)]


In [19]:
print sorted(ranking_tup_list, key=lambda tup: tup[1])[:15]

[('Chicago', 1), ('Miami', 1), ('Kansas City', 1), ('Indianapolis', 1), ('Borough of Queens', 1), ('Columbus', 2), ('Dallas', 2), ('Brooklyn', 2), ('New York', 2), ('Lincoln', 3), ('Fort Worth', 3), ('Los Angeles', 4), ('Nashville', 5), ('Columbia', 6), ('Boston', 6)]
