In [282]:
import json
import random
import re
from collections import defaultdict, Counter
from functools import reduce

from IPython.display import HTML, display
import numpy as np
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB, ComplementNB
from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer, OneHotEncoder
from sklearn.model_selection import ShuffleSplit, cross_val_score
import tabulate

In [283]:
objs = []

with open("./business.json") as f:
    # Each line in business.json is a separate json object
    for line in f:
        # Read the line, convert it into a dictionary with json.loads,
        # then convert THAT into a defaultdict so that if we ask for a
        # field that a line doesn't have, it will give an empty string
        # instead of throwing an error
        objs.append(defaultdict(lambda: "", json.loads(line)))

In [284]:
# Get all unique columns
keys = set()
for obj in objs:
    keys.update(obj.keys())

In [286]:
# Split categories into list
for obj in objs:
    # This would be easier, but no guarantee categories always split by ", "
    # and not just ","
    # obj["categories"] = obj["categories"].split(", ")
    cats = obj["categories"]
    # Might be null
    if cats:
        obj["categories"] = [s.strip() for s in cats.split(",")]
    else:
        obj["categories"] = []

AttributeError: 'list' object has no attribute 'split'

In [287]:
len(objs)

192609

In [288]:
# Filter just restaurants
restaurants = [o for o in objs if "Restaurants" in o["categories"]]

In [289]:
len(restaurants)

59371

In [290]:
len(restaurants) / len(objs) * 100

30.824623979149468

In [291]:
# Promote "attributes" fields to full-blown columns
for r in restaurants:
    attrs = r["attributes"]
    if not attrs:
        continue
    for k in attrs.keys():
        keys.add(k)
        r[k] = attrs[k]
keys.remove("attributes") # Don't want in final table

In [292]:
# Filter just restaurants with prices
restaurants = [r for r in restaurants if r["RestaurantsPriceRange2"] in ["1", "2", "3", "4"]]

In [293]:
len(restaurants)

52043

In [294]:
prices = [r["RestaurantsPriceRange2"] for r in restaurants]

In [295]:
valRE = re.compile("u?'(.+)'")
def cleanVal(val):
    m = valRE.match(val)
    if m:
        return m[1]
    return val

In [296]:
def columnEncode(colName):
    col = [cleanVal(r[colName]) for r in restaurants]
    binCol = LabelBinarizer().fit_transform(col)
    return np.array(binCol)

In [297]:
multinomialColumnNames = [
    'AcceptsInsurance',
    'AgesAllowed',
    'Alcohol',
    'BYOB',
    'BYOBCorkage',
    'BikeParking',
    'BusinessAcceptsBitcoin',
    'BusinessAcceptsCreditCards',
    'ByAppointmentOnly',
    'Caters',
    'CoatCheck',
    'Corkage',
    'DogsAllowed',
    'DriveThru',
    'GoodForDancing',
    'GoodForKids',
    'HappyHour',
    'HasTV',
    'NoiseLevel',
    'Open24Hours',
    'OutdoorSeating',
    'RestaurantsAttire',
    'RestaurantsCounterService',
    'RestaurantsDelivery',
    'RestaurantsGoodForGroups',
    'RestaurantsReservations',
    'RestaurantsTableService',
    'RestaurantsTakeOut',
    'Smoking',
    'WheelchairAccessible',
    'WiFi'
]

In [298]:
rowOrder = list(range(len(prices)))
random.shuffle(rowOrder)
breakPoint = int(0.75*len(prices))
trainIndices = rowOrder[:breakPoint]
testIndices = rowOrder[breakPoint:]

trainPrices = np.array(prices)[trainIndices]

In [299]:
multinomialEncodedCols = {}
for c in multinomialColumnNames:
    multinomialEncodedCols[c] = columnEncode(c)

In [300]:
# Taken individually, features are not very meaningful
table = [[c,
          np.mean(cross_val_score(MultinomialNB(), multinomialEncodedCols[c], prices, cv=10, n_jobs=-1)),
          np.mean(cross_val_score(ComplementNB(), multinomialEncodedCols[c], prices, cv=10, n_jobs=-1))]
        for c in multinomialColumnNames]
display(HTML(tabulate.tabulate(table, tablefmt='html')))

exception calling callback for <Future at 0x7f9a16a9fa58 state=finished raised BrokenProcessPool>
joblib.externals.loky.process_executor._RemoteTraceback: 
'''
Traceback (most recent call last):
  File "/home/wt/.local/share/virtualenvs/412proj-b0lHi2qv/lib/python3.7/site-packages/joblib/externals/loky/process_executor.py", line 391, in _process_worker
    call_item = call_queue.get(block=True, timeout=timeout)
  File "/usr/local/lib/python3.7/multiprocessing/queues.py", line 99, in get
    if not self._rlock.acquire(block, timeout):
KeyboardInterrupt
'''

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/home/wt/.local/share/virtualenvs/412proj-b0lHi2qv/lib/python3.7/site-packages/joblib/externals/loky/_base.py", line 625, in _invoke_callbacks
    callback(self)
  File "/home/wt/.local/share/virtualenvs/412proj-b0lHi2qv/lib/python3.7/site-packages/joblib/parallel.py", line 309, in __call__
    self.parallel.dispatch_next(

KeyboardInterrupt: 

In [301]:
multinomialModels = {}
for c in multinomialColumnNames:
    col = np.array(multinomialEncodedCols[c])[trainIndices]
    multinomialModels[c] = MultinomialNB()
    multinomialModels[c].fit(col, trainPrices)

In [302]:
def metaPredict(i):
    prob = np.array([1,1,1,1])
    for colName in multinomialColumnNames:
        val = np.array(multinomialEncodedCols[colName][i]).reshape(1,-1)
        model = multinomialModels[colName]
        newProbs = model.predict_proba(val)
        prob = np.multiply(prob, newProbs)
    return prob

In [303]:
priceOpts = ["1", "2", "3", "4"]

In [304]:
predictedProbs = reduce(np.multiply, [multinomialModels[c].predict_proba(
                                        np.array(multinomialEncodedCols[c])[testIndices]
                                      )for c in multinomialColumnNames])

In [305]:
getCategory = lambda vec: priceOpts[np.argmax(vec)]
predictions = np.apply_along_axis(getCategory, axis=1, arr=predictedProbs)

In [306]:
right = 0
wrong = 0
for (p, a) in zip(predictions, np.array(prices)[testIndices]):
    if p == a:
        right += 1
    else:
        wrong += 1

In [307]:
wrong / (right + wrong)

0.47029436630543386

In [197]:
complementModels = {}
for c in multinomialColumnNames:
    col = np.array(multinomialEncodedCols[c])[trainIndices]
    complementModels[c] = ComplementNB()
    complementModels[c].fit(col, trainPrices)

In [182]:
trainPrices[0:5]

array(['1', '1', '1', '2', '3'], dtype='<U1')

In [185]:
np.array(prices)[trainIndices[0:5]]

array(['1', '1', '1', '2', '3'], dtype='<U1')

In [198]:
def metaPredictComplement(i):
    prob = np.array([1,1,1,1])
    for colName in multinomialColumnNames:
        val = np.array(multinomialEncodedCols[colName][i]).reshape(1,-1)
        model = complementModels[colName]
        newProbs = model.predict_proba(val)
        prob = np.multiply(prob, newProbs)
    return prob

In [200]:
rightC = 0
wrongC = 0
for i in testIndices:
    truePrice = prices[i]
    predPrice = priceOpts[np.argmax(metaPredictComplement(i))]
    if truePrice == predPrice:
        rightC += 1
    else:
        wrongC += 1

In [201]:
(wrongC) / (rightC + wrongC) * 100

32.142033663822914

In [221]:
nnColNames = [
    'city',
    'GoodForKids',
    'RestaurantsReservations',
    'HasTV',
    'state',
    'review_count',
    'RestaurantsTakeOut',
    'RestaurantsDelivery',
    'stars'
]

def isNNRestaurant(r):
    for c in nnColNames:
        if c not in r or r[c] == '' or r[c] == 'None':
            return False
    return True

nnRestaurants = [r for r in restaurants if isNNRestaurant(r)]
len(nnRestaurants)

40900

In [241]:
nnPrices = [r["RestaurantsPriceRange2"] for r in nnRestaurants]

In [235]:
def toBin(str):
    s = str.strip().lower()
    if s == 'true':
        return 1
    elif s == 'false':
        return 0
    else:
        raise Exception(s)

nnFVector = [[toBin(r["GoodForKids"]),
              toBin(r["RestaurantsReservations"]),
              toBin(r["HasTV"]),
              toBin(r["RestaurantsTakeOut"]),
              toBin(r["RestaurantsDelivery"])] for r in nnRestaurants]

In [239]:
nnState = LabelBinarizer().fit_transform([r["state"].upper() for r in nnRestaurants])

In [240]:
nnGVector = [[r["review_count"], r["stars"]] for r in nnRestaurants]

In [243]:
nnRowOrder = list(range(len(nnPrices)))
random.shuffle(nnRowOrder)
nnBreakPoint = int(0.75*len(nnPrices))

nnTrainIndices = nnRowOrder[:nnBreakPoint]
nnTestIndices = nnRowOrder[nnBreakPoint:]

nnTrainPrices = np.array(nnPrices)[nnTrainIndices]

In [246]:
binNB = BernoulliNB()
binNB.fit(np.array(nnFVector)[nnTrainIndices], nnTrainPrices)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [247]:
stateNB = MultinomialNB()
stateNB.fit(np.array(nnState)[nnTrainIndices], nnTrainPrices)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [248]:
numNB = GaussianNB()
numNB.fit(np.array(nnGVector)[nnTrainIndices], nnTrainPrices)

GaussianNB(priors=None, var_smoothing=1e-09)

In [255]:
Counter(binNB.predict(np.array(nnFVector)[nnTestIndices]))

Counter({'1': 5617, '2': 4305, '3': 303})

In [256]:
Counter(stateNB.predict(np.array(nnState)[nnTestIndices]))

Counter({'2': 7874, '1': 2351})

In [257]:
Counter(numNB.predict(np.array(nnGVector)[nnTestIndices]))

Counter({'1': 8416, '2': 1665, '3': 144})

In [259]:
binaryPred = binNB.predict_proba(np.array(nnFVector)[nnTestIndices])
statePred = stateNB.predict_proba(np.array(nnState)[nnTestIndices])
numericPred = numNB.predict_proba(np.array(nnGVector)[nnTestIndices])

testMetaPredictions = np.multiply(binaryPred, statePred, numericPred)

In [262]:
getCategory = lambda vec: priceOpts[np.argmax(vec)]
nnPredictions = np.apply_along_axis(getCategory, axis=1, arr=testMetaPredictions)

In [277]:
right = 0
wrong = 0
for (p,a) in zip(nnPredictions, np.array(nnPrices)[nnTestIndices]):
    if p == a:
        right += 1
    else:
        wrong += 1

In [278]:
wrong / (right + wrong) * 100

31.716381418092908