## Imports

In [1]:
import pandas as pd
import altair as alt

from sklearn.model_selection import train_test_split

## Data Cleaning

In [2]:
# Load the Yelp business records. lines=True tells pandas to parse the file as
# one JSON object per line rather than as a single JSON document.
df = pd.read_json(
    "data/yelp_academic_dataset_business.json",
    lines=True,
)

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,6iYb2HFDywm3zjuRg0shjw,Oskar Blues Taproom,921 Pearl St,Boulder,CO,80302,40.017544,-105.283348,4.0,86,1,"{'RestaurantsTableService': 'True', 'WiFi': 'u...","Gastropubs, Food, Beer Gardens, Restaurants, B...","{'Monday': '11:0-23:0', 'Tuesday': '11:0-23:0'..."
1,tCbdrRPZA0oiIYSmHG3J0w,Flying Elephants at PDX,7000 NE Airport Way,Portland,OR,97218,45.588906,-122.593331,4.0,126,1,"{'RestaurantsTakeOut': 'True', 'RestaurantsAtt...","Salad, Soup, Sandwiches, Delis, Restaurants, C...","{'Monday': '5:0-18:0', 'Tuesday': '5:0-17:0', ..."
2,bvN78flM8NLprQ1a1y5dRg,The Reclaimory,4720 Hawthorne Ave,Portland,OR,97214,45.511907,-122.613693,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Restau...","Antiques, Fashion, Used, Vintage & Consignment...","{'Thursday': '11:0-18:0', 'Friday': '11:0-18:0..."
3,oaepsyvc0J17qwi8cfrOWg,Great Clips,2566 Enterprise Rd,Orange City,FL,32763,28.914482,-81.295979,3.0,8,1,"{'RestaurantsPriceRange2': '1', 'BusinessAccep...","Beauty & Spas, Hair Salons",
4,PE9uqAjdw0E4-8mjGl3wVA,Crossfit Terminus,1046 Memorial Dr SE,Atlanta,GA,30316,33.747027,-84.353424,4.0,14,1,"{'GoodForKids': 'False', 'BusinessParking': '{...","Gyms, Active Life, Interval Training Gyms, Fit...","{'Monday': '16:0-19:0', 'Tuesday': '16:0-19:0'..."


In [3]:
# Keep only businesses that have attributes, categories and hours recorded.
# Reassign instead of using inplace=True: the result is the same frame under
# the same name, but the operation stays chainable and avoids in-place
# mutation of the loaded data.
df = df.dropna(subset=["attributes", "categories", "hours"])

# Per-column check for any remaining missing values.
df.isnull().any()

business_id     False
name            False
address         False
city            False
state           False
postal_code     False
latitude        False
longitude       False
stars           False
review_count    False
is_open         False
attributes      False
categories      False
hours           False
dtype: bool

In [4]:
def split_categories(raw):
    """Turn a comma-separated category string into a list of category names.

    Every space is removed before splitting, so a multi-word category becomes
    a single token (e.g. "Beer Gardens" -> "BeerGardens").

    A value that is already a list is returned unchanged. The cell overwrites
    the column it reads from, so without this guard a second run would fail
    with AttributeError ('list' object has no attribute 'replace').
    """
    if isinstance(raw, list):
        return raw
    return raw.replace(" ", "").split(",")


df['categories'] = df['categories'].apply(split_categories)

df.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,6iYb2HFDywm3zjuRg0shjw,Oskar Blues Taproom,921 Pearl St,Boulder,CO,80302,40.017544,-105.283348,4.0,86,1,"{'RestaurantsTableService': 'True', 'WiFi': 'u...","[Gastropubs, Food, BeerGardens, Restaurants, B...","{'Monday': '11:0-23:0', 'Tuesday': '11:0-23:0'..."
1,tCbdrRPZA0oiIYSmHG3J0w,Flying Elephants at PDX,7000 NE Airport Way,Portland,OR,97218,45.588906,-122.593331,4.0,126,1,"{'RestaurantsTakeOut': 'True', 'RestaurantsAtt...","[Salad, Soup, Sandwiches, Delis, Restaurants, ...","{'Monday': '5:0-18:0', 'Tuesday': '5:0-17:0', ..."
2,bvN78flM8NLprQ1a1y5dRg,The Reclaimory,4720 Hawthorne Ave,Portland,OR,97214,45.511907,-122.613693,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Restau...","[Antiques, Fashion, Used, Vintage&Consignment,...","{'Thursday': '11:0-18:0', 'Friday': '11:0-18:0..."
4,PE9uqAjdw0E4-8mjGl3wVA,Crossfit Terminus,1046 Memorial Dr SE,Atlanta,GA,30316,33.747027,-84.353424,4.0,14,1,"{'GoodForKids': 'False', 'BusinessParking': '{...","[Gyms, ActiveLife, IntervalTrainingGyms, Fitne...","{'Monday': '16:0-19:0', 'Tuesday': '16:0-19:0'..."
5,D4JtQNTI4X3KcbzacDJsMw,Bob Likes Thai Food,3755 Main St,Vancouver,BC,V5V,49.251342,-123.101333,3.5,169,1,"{'GoodForKids': 'True', 'Alcohol': 'u'none'', ...","[Restaurants, Thai]","{'Monday': '17:0-21:0', 'Tuesday': '17:0-21:0'..."


In [5]:
# Boolean mask: True where a business's category list contains 'Restaurants'.
is_restaurant = df['categories'].map(lambda cats: 'Restaurants' in cats)

# Keep only the restaurant rows.
df_food = df[is_restaurant]

df_food.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,6iYb2HFDywm3zjuRg0shjw,Oskar Blues Taproom,921 Pearl St,Boulder,CO,80302,40.017544,-105.283348,4.0,86,1,"{'RestaurantsTableService': 'True', 'WiFi': 'u...","[Gastropubs, Food, BeerGardens, Restaurants, B...","{'Monday': '11:0-23:0', 'Tuesday': '11:0-23:0'..."
1,tCbdrRPZA0oiIYSmHG3J0w,Flying Elephants at PDX,7000 NE Airport Way,Portland,OR,97218,45.588906,-122.593331,4.0,126,1,"{'RestaurantsTakeOut': 'True', 'RestaurantsAtt...","[Salad, Soup, Sandwiches, Delis, Restaurants, ...","{'Monday': '5:0-18:0', 'Tuesday': '5:0-17:0', ..."
5,D4JtQNTI4X3KcbzacDJsMw,Bob Likes Thai Food,3755 Main St,Vancouver,BC,V5V,49.251342,-123.101333,3.5,169,1,"{'GoodForKids': 'True', 'Alcohol': 'u'none'', ...","[Restaurants, Thai]","{'Monday': '17:0-21:0', 'Tuesday': '17:0-21:0'..."
12,HPA_qyMEddpAEtFof02ixg,Mr G's Pizza & Subs,474 Lowell St,Peabody,MA,01960,42.541155,-70.973438,4.0,39,1,"{'RestaurantsGoodForGroups': 'True', 'HasTV': ...","[Food, Pizza, Restaurants]","{'Monday': '11:0-21:0', 'Tuesday': '11:0-21:0'..."
13,ufCxltuh56FF4-ZFZ6cVhg,Sister Honey's,247 E Michigan St,Orlando,FL,32806,28.513265,-81.374707,4.5,135,1,"{'BusinessParking': '{'garage': False, 'street...","[Restaurants, American(New), Bakeries, Dessert...","{'Tuesday': '11:0-18:0', 'Wednesday': '11:0-18..."


## Data Splitting

In [6]:
# Hold out 30% of the restaurant rows as the test set; the fixed random_state
# makes the split reproducible from run to run.
train_df, test_df = train_test_split(
    df_food,
    test_size=0.30,
    random_state=123,
)

In [7]:
# (rows, columns) of the training split.
train_df.shape

(29852, 14)

In [8]:
# (rows, columns) of the held-out test split.
test_df.shape

(12794, 14)

## Exploratory Data Analysis

In [9]:
# Count businesses per state. The count column is named explicitly so the
# chart below does not depend on the pandas version: older releases call the
# value_counts() column 0 rather than 'count' after reset_index().
# NOTE(review): this uses `df` (every business kept after the null drop,
# including rows that end up in test_df), not `train_df` - confirm that
# exploring the full data set here is intended.
states = (
    df['state']
    .value_counts()
    .rename_axis('state')
    .reset_index(name='count')
)

alt.Chart(states, title='Number of businesses per state').mark_bar().encode(
    x=alt.X('state:N', sort='-y', title='State'),
    y=alt.Y('count:Q', title='Number of businesses')
)

In [10]:
# Count businesses per city. The count column is named explicitly so the
# chart below does not depend on the pandas version: older releases call the
# value_counts() column 0 rather than 'count' after reset_index().
# NOTE(review): this uses `df` (every business kept after the null drop,
# including rows that end up in test_df), not `train_df` - confirm that
# exploring the full data set here is intended.
cities = (
    df['city']
    .value_counts()
    .rename_axis('city')
    .reset_index(name='count')
)

alt.Chart(cities, title='Number of businesses per city').mark_bar().encode(
    x=alt.X('city:N', sort='-y', title='City'),
    y=alt.Y('count:Q', title='Number of businesses')
)

In [21]:
# Restrict the training split to rows whose state code is 'BC'.
in_bc = train_df['state'].eq('BC')
bc_food = train_df[in_bc]

In [26]:
# Each `categories` value is a list of category names, so explode the column
# to one category per row before counting. Calling value_counts() directly on
# the list-valued column tries to hash the lists, which fails because lists
# are unhashable.
categories = (
    bc_food['categories']
    .explode()
    .value_counts()
    .rename_axis('categories')
    .reset_index(name='count')
)

alt.Chart(categories, title='Restaurant categories in BC (training split)').mark_bar().encode(
    x=alt.X('categories:N', sort='-y', title='Category'),
    y=alt.Y('count:Q', title='Number of restaurants')
)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
58536,LxxgaqxNAKEYeQ3C32LrYA,One Shot Asian Fusion Bistro,8197 Main Street,Vancouver,BC,V5X 3L2,49.211167,-123.102191,3.0,5,0,"{'RestaurantsAttire': 'u'casual'', 'Alcohol': ...","[Korean, Japanese, Restaurants]","{'Monday': '11:0-21:0', 'Tuesday': '11:0-21:0'..."
129392,b6mYGDUgl1PC5Bpfy6fc3A,Westley Military Surplus,525 Front St,New Westminster,BC,V3L 1A4,49.203217,-122.906941,3.0,6,1,"{'BusinessParking': '{'garage': False, 'street...","[Restaurants, OutdoorGear, Fishing, Pizza, Hat...","{'Monday': '10:0-17:30', 'Tuesday': '10:0-17:3..."
110206,o22_Ixj-B5JHCBYYevXv9Q,Cho Sun BBQ Korean Restaurant,3486 Kingsway,Vancouver,BC,V5R 5L6,49.231742,-123.030398,3.0,156,1,"{'OutdoorSeating': 'False', 'RestaurantsReserv...","[Korean, Restaurants]","{'Monday': '11:30-23:30', 'Tuesday': '11:30-23..."
137181,EgM75hqM50IGdOxE4M10QA,Pomegranate Grillhouse & Cafe,4361 Gallant Avenue,North Vancouver,BC,V7G 1L1,49.326692,-122.950440,3.0,30,1,"{'GoodForKids': 'True', 'RestaurantsReservatio...","[Pizza, Restaurants, Cafes, Burgers]","{'Monday': '11:0-19:0', 'Wednesday': '11:0-20:..."
33011,SFDEM4EU8fYAAaFe3OHkng,House of Empanadas,976 Denman Street,Vancouver,BC,V6G 2M1,49.289621,-123.138044,3.0,9,0,"{'BusinessParking': '{'garage': False, 'street...","[LatinAmerican, Restaurants]","{'Monday': '8:0-21:0', 'Tuesday': '8:0-21:0', ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45123,VRIQwLCLYJqzmxzGCUUPOQ,Sip Bowl La Mian,2255 W 41 Ave,Vancouver,BC,V6M 4L3,49.234805,-123.158742,3.5,43,1,"{'Ambience': '{'touristy': False, 'hipster': F...","[Restaurants, Noodles, Chinese]","{'Monday': '0:0-0:0', 'Tuesday': '11:0-21:0', ..."
90013,SHrLUxpK6LDt3N1f6QQb9A,RV’s Butter Kitchen,1355 Hornby St,Vancouver,BC,V6Z 1W7,49.276869,-123.130848,4.5,18,1,"{'BusinessParking': '{'garage': False, 'street...","[Indian, Restaurants]","{'Monday': '12:0-21:0', 'Tuesday': '12:0-21:0'..."
24644,FiC58EBZOGDtWrY5p12X2g,Eh! Restaurant,1050 Alberni Street,Vancouver,BC,V6E 2A3,49.284625,-123.123225,3.0,19,1,"{'NoiseLevel': 'u'quiet'', 'Alcohol': 'u'none'...","[American(New), Canadian(New), ModernEuropean,...","{'Monday': '7:0-19:0', 'Tuesday': '7:0-19:0', ..."
127290,9yVdZHqGclsEkYLlmKdBpA,Original Tandoori Kitchen,7215 Main Street,Vancouver,BC,V5X 3J3,49.219006,-123.101867,3.0,58,1,"{'BusinessParking': '{'garage': False, 'street...","[Indian, Restaurants]","{'Monday': '11:30-23:0', 'Tuesday': '11:30-23:..."
