# Restructuring Categories as Binary per zip code

In [1]:
import pandas as pd 

# Let's import `west_df.csv` to create binary columns of shop categories

In [2]:
df = pd.read_csv('./datasets/west_df.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,category1,category2,category3,category4,category5,city,latitude,longitude,price,rating,zip_code
0,0,mexican,?,?,?,?,Los Angeles,33.97499,-118.24696,1.0,3.5,90001
1,2,desserts,chocolate,?,?,?,Los Angeles,33.97363,-118.24989,2.0,5.0,90001
2,4,foodtrucks,mexican,?,?,?,Los Angeles,34.060716,-118.344931,1.0,4.5,90017
3,6,bakeries,cupcakes,customcakes,?,?,Los Angeles,33.974865,-118.240467,2.0,4.0,90001
4,8,mexican,?,?,?,?,South Gate,33.956748,-118.223968,1.0,4.0,90280


In [3]:
df.shape

(69335, 12)

# We're only focusing on `categories` in this notebook!

In [4]:
cats = ['category1', 'category2', 'category3', 'category4', 'category5', 'zip_code']
cat_df = df[cats]
cat_df.head()

Unnamed: 0,category1,category2,category3,category4,category5,zip_code
0,mexican,?,?,?,?,90001
1,desserts,chocolate,?,?,?,90001
2,foodtrucks,mexican,?,?,?,90017
3,bakeries,cupcakes,customcakes,?,?,90001
4,mexican,?,?,?,?,90280


In [5]:
#this code groups by zip code and collects all the unique categories for each column
categories_df = cat_df.astype(str).groupby('zip_code').agg(lambda x: ' '.join(x.unique()))

In [6]:
categories_df.head()

Unnamed: 0_level_0,category1,category2,category3,category4,category5
zip_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
90000,salad,tradamerican,wraps,?,?
90001,mexican desserts bakeries foodtrucks burgers p...,? chocolate cupcakes seafood comfortfood mexic...,? customcakes sandwiches hotdog catering comfo...,?,?
90002,salvadoran chinese coffee,? juicebars breakfast_brunch,? desserts,?,?
90003,mexican foodtrucks soulfood bakeries pizza sea...,foodstands mexican ? noodles sandwiches soulfo...,? tradamerican waffles soulfood,?,?
90004,bakeries salvadoran sushi coffee breakfast_bru...,coffee ? cafes vegan comfortfood chinese korea...,cakeshop ? vegetarian soup noodles korean germ...,?,?


In [7]:
categories_df.shape

(2655, 5)

In [8]:
#combining all category columns together in order to turn into lists 
categories_df['all'] = categories_df['category1'] + ' ' + categories_df['category2'] + ' ' + categories_df['category3'] + ' ' + categories_df['category4'] + ' ' + categories_df['category5'] 

In [9]:
categories_df.head()

Unnamed: 0_level_0,category1,category2,category3,category4,category5,all
zip_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
90000,salad,tradamerican,wraps,?,?,salad tradamerican wraps ? ?
90001,mexican desserts bakeries foodtrucks burgers p...,? chocolate cupcakes seafood comfortfood mexic...,? customcakes sandwiches hotdog catering comfo...,?,?,mexican desserts bakeries foodtrucks burgers p...
90002,salvadoran chinese coffee,? juicebars breakfast_brunch,? desserts,?,?,salvadoran chinese coffee ? juicebars breakfas...
90003,mexican foodtrucks soulfood bakeries pizza sea...,foodstands mexican ? noodles sandwiches soulfo...,? tradamerican waffles soulfood,?,?,mexican foodtrucks soulfood bakeries pizza sea...
90004,bakeries salvadoran sushi coffee breakfast_bru...,coffee ? cafes vegan comfortfood chinese korea...,cakeshop ? vegetarian soup noodles korean germ...,?,?,bakeries salvadoran sushi coffee breakfast_bru...


In [10]:
categories_df['all'][0].split(' ')

['salad', 'tradamerican', 'wraps', '?', '?']

In [11]:
import numpy as np

# Getting unique categories for each zip code

In [12]:
categories_df['list_cat'] = categories_df['all'].str.split(' ').apply(np.unique)

In [13]:
categories_df.head()

Unnamed: 0_level_0,category1,category2,category3,category4,category5,all,list_cat
zip_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
90000,salad,tradamerican,wraps,?,?,salad tradamerican wraps ? ?,"[?, salad, tradamerican, wraps]"
90001,mexican desserts bakeries foodtrucks burgers p...,? chocolate cupcakes seafood comfortfood mexic...,? customcakes sandwiches hotdog catering comfo...,?,?,mexican desserts bakeries foodtrucks burgers p...,"[?, bakeries, breakfast_brunch, burgers, cater..."
90002,salvadoran chinese coffee,? juicebars breakfast_brunch,? desserts,?,?,salvadoran chinese coffee ? juicebars breakfas...,"[?, breakfast_brunch, chinese, coffee, dessert..."
90003,mexican foodtrucks soulfood bakeries pizza sea...,foodstands mexican ? noodles sandwiches soulfo...,? tradamerican waffles soulfood,?,?,mexican foodtrucks soulfood bakeries pizza sea...,"[?, bakeries, bbq, breakfast_brunch, burgers, ..."
90004,bakeries salvadoran sushi coffee breakfast_bru...,coffee ? cafes vegan comfortfood chinese korea...,cakeshop ? vegetarian soup noodles korean germ...,?,?,bakeries salvadoran sushi coffee breakfast_bru...,"[?, asianfusion, bakeries, bbq, beer_and_wine,..."


In [14]:
#sanity check
categories_df['list_cat'][50]

array(['?', 'breakfast_brunch', 'cafes', 'cocktailbars', 'coffee',
       'foodtrucks', 'italian', 'mexican', 'poke', 'salad', 'sandwiches',
       'sportsbars', 'streetvendors', 'tradamerican', 'venues'],
      dtype='<U16')

# Creating an object containing unique categories of ALL zipcodes

In [15]:
unique_cats = categories_df['list_cat'].apply(pd.Series).stack().unique()

In [16]:
len(unique_cats)

469

In [17]:
type(unique_cats)

numpy.ndarray

In [18]:
#sanity check 
'mexican' in unique_cats

True

In [19]:
for word in categories_df['list_cat'][0]:
    print(word)

?
salad
tradamerican
wraps


# This code is **VERY** important
- This code creates a dictionary and if one of the 469 categories is in the unique list of categories per zip code, we will add 1 to the dictionary. If not, we will add 0. 

In [21]:
cat_dictionary = {}
def categorize(cell):
    for category in unique_cats:
        #for cell in text:
            #for word in cell:
        if category in cell:
            cat_dictionary[category] = 1
        else: 
            cat_dictionary[category] = 0
    return cat_dictionary

In [22]:
#creating new column from array to list 
categories_df['cat_list'] = categories_df['list_cat'].tolist()

In [23]:
#sanity check
categories_df.head(1)

Unnamed: 0_level_0,category1,category2,category3,category4,category5,all,list_cat,cat_list
zip_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
90000,salad,tradamerican,wraps,?,?,salad tradamerican wraps ? ?,"[?, salad, tradamerican, wraps]","[?, salad, tradamerican, wraps]"


In [24]:
#sanity check
categories_df['cat_list'][5]

array(['?', 'argentine', 'asianfusion', 'bakeries', 'bars', 'bbq',
       'beerbar', 'breakfast_brunch', 'bubbletea', 'burgers', 'cafes',
       'chicken_wings', 'chickenshop', 'chinese', 'cocktailbars',
       'coffee', 'desserts', 'donuts', 'empanadas', 'foodtrucks',
       'hotdogs', 'hotpot', 'icecream', 'italian', 'izakaya', 'japanese',
       'juicebars', 'karaoke', 'korean', 'latin', 'lounges',
       'newamerican', 'noodles', 'peruvian', 'ramen', 'salvadoran',
       'sandwiches', 'shavedice', 'soup', 'sportsbars', 'steak', 'sushi',
       'tapasmallplates', 'thai', 'vietnamese'], dtype='<U16')

#### Reading in zip codes file to append to future category dataframe since we need to append to the dataframe with avg latitude, longitude, and count of businesses based on price and ratings. 

In [25]:
main_zips = pd.read_csv('./datasets/MAINZIPCODES.CSV')
main_zips.head()

Unnamed: 0.1,Unnamed: 0,STATE,zipcode
0,3574,AK,99501
1,3580,AK,99502
2,3586,AK,99503
3,3592,AK,99504
4,3610,AK,99507


In [26]:
main_zips.shape

(2423, 3)

In [27]:
categories_df.shape

(2655, 8)

In [28]:
main_zips_list = list(main_zips['zipcode'])

In [29]:
features = ['category1', 'category2', 'category3', 'category4', 'category5', 'all', 'list_cat','cat_list']
categories_df = categories_df[features]

In [30]:
#here is the dataframe with the appended zip code
categories_df.reset_index(level=0, inplace=True)
categories_df.head()

Unnamed: 0,zip_code,category1,category2,category3,category4,category5,all,list_cat,cat_list
0,90000,salad,tradamerican,wraps,?,?,salad tradamerican wraps ? ?,"[?, salad, tradamerican, wraps]","[?, salad, tradamerican, wraps]"
1,90001,mexican desserts bakeries foodtrucks burgers p...,? chocolate cupcakes seafood comfortfood mexic...,? customcakes sandwiches hotdog catering comfo...,?,?,mexican desserts bakeries foodtrucks burgers p...,"[?, bakeries, breakfast_brunch, burgers, cater...","[?, bakeries, breakfast_brunch, burgers, cater..."
2,90002,salvadoran chinese coffee,? juicebars breakfast_brunch,? desserts,?,?,salvadoran chinese coffee ? juicebars breakfas...,"[?, breakfast_brunch, chinese, coffee, dessert...","[?, breakfast_brunch, chinese, coffee, dessert..."
3,90003,mexican foodtrucks soulfood bakeries pizza sea...,foodstands mexican ? noodles sandwiches soulfo...,? tradamerican waffles soulfood,?,?,mexican foodtrucks soulfood bakeries pizza sea...,"[?, bakeries, bbq, breakfast_brunch, burgers, ...","[?, bakeries, bbq, breakfast_brunch, burgers, ..."
4,90004,bakeries salvadoran sushi coffee breakfast_bru...,coffee ? cafes vegan comfortfood chinese korea...,cakeshop ? vegetarian soup noodles korean germ...,?,?,bakeries salvadoran sushi coffee breakfast_bru...,"[?, asianfusion, bakeries, bbq, beer_and_wine,...","[?, asianfusion, bakeries, bbq, beer_and_wine,..."


In [31]:
#sanity
categories_df.shape

(2655, 9)

In [32]:
#sanity
len(main_zips_list)

2423

In [33]:
categories_df.dtypes

zip_code     object
category1    object
category2    object
category3    object
category4    object
category5    object
all          object
list_cat     object
cat_list     object
dtype: object

In [34]:
#changing zip code column to numeric
categories_df["zip_code"] = pd.to_numeric(categories_df["zip_code"])

In [35]:
categories_df.dtypes

zip_code      int64
category1    object
category2    object
category3    object
category4    object
category5    object
all          object
list_cat     object
cat_list     object
dtype: object

In [36]:
df = categories_df.loc[categories_df['zip_code'].isin(main_zips_list)]

In [37]:
df.head()

Unnamed: 0,zip_code,category1,category2,category3,category4,category5,all,list_cat,cat_list
1,90001,mexican desserts bakeries foodtrucks burgers p...,? chocolate cupcakes seafood comfortfood mexic...,? customcakes sandwiches hotdog catering comfo...,?,?,mexican desserts bakeries foodtrucks burgers p...,"[?, bakeries, breakfast_brunch, burgers, cater...","[?, bakeries, breakfast_brunch, burgers, cater..."
2,90002,salvadoran chinese coffee,? juicebars breakfast_brunch,? desserts,?,?,salvadoran chinese coffee ? juicebars breakfas...,"[?, breakfast_brunch, chinese, coffee, dessert...","[?, breakfast_brunch, chinese, coffee, dessert..."
3,90003,mexican foodtrucks soulfood bakeries pizza sea...,foodstands mexican ? noodles sandwiches soulfo...,? tradamerican waffles soulfood,?,?,mexican foodtrucks soulfood bakeries pizza sea...,"[?, bakeries, bbq, breakfast_brunch, burgers, ...","[?, bakeries, bbq, breakfast_brunch, burgers, ..."
4,90004,bakeries salvadoran sushi coffee breakfast_bru...,coffee ? cafes vegan comfortfood chinese korea...,cakeshop ? vegetarian soup noodles korean germ...,?,?,bakeries salvadoran sushi coffee breakfast_bru...,"[?, asianfusion, bakeries, bbq, beer_and_wine,...","[?, asianfusion, bakeries, bbq, beer_and_wine,..."
5,90005,vietnamese korean chicken_wings lounges thai j...,? bbq korean steak desserts coffee tapasmallpl...,? bars burgers soup korean juicebars cocktailb...,?,?,vietnamese korean chicken_wings lounges thai j...,"[?, argentine, asianfusion, bakeries, bars, bb...","[?, argentine, asianfusion, bakeries, bars, bb..."


In [38]:
#sanity
df.shape

(2423, 9)

In [40]:
df['cat_list'][157]

array(['?', 'acaibowls', 'cafes', 'juicebars'], dtype='<U9')

In [41]:
#sanity
type(categorize(df['cat_list'][20]))

dict

In [42]:
#sanity
len(df)

2423

In [43]:
df.reset_index(inplace = True)

In [44]:
#saving dataframe for security
df.to_csv('./datasets/cat_sanitycheck.csv')

In [45]:
df.head()

Unnamed: 0,index,zip_code,category1,category2,category3,category4,category5,all,list_cat,cat_list
0,1,90001,mexican desserts bakeries foodtrucks burgers p...,? chocolate cupcakes seafood comfortfood mexic...,? customcakes sandwiches hotdog catering comfo...,?,?,mexican desserts bakeries foodtrucks burgers p...,"[?, bakeries, breakfast_brunch, burgers, cater...","[?, bakeries, breakfast_brunch, burgers, cater..."
1,2,90002,salvadoran chinese coffee,? juicebars breakfast_brunch,? desserts,?,?,salvadoran chinese coffee ? juicebars breakfas...,"[?, breakfast_brunch, chinese, coffee, dessert...","[?, breakfast_brunch, chinese, coffee, dessert..."
2,3,90003,mexican foodtrucks soulfood bakeries pizza sea...,foodstands mexican ? noodles sandwiches soulfo...,? tradamerican waffles soulfood,?,?,mexican foodtrucks soulfood bakeries pizza sea...,"[?, bakeries, bbq, breakfast_brunch, burgers, ...","[?, bakeries, bbq, breakfast_brunch, burgers, ..."
3,4,90004,bakeries salvadoran sushi coffee breakfast_bru...,coffee ? cafes vegan comfortfood chinese korea...,cakeshop ? vegetarian soup noodles korean germ...,?,?,bakeries salvadoran sushi coffee breakfast_bru...,"[?, asianfusion, bakeries, bbq, beer_and_wine,...","[?, asianfusion, bakeries, bbq, beer_and_wine,..."
4,5,90005,vietnamese korean chicken_wings lounges thai j...,? bbq korean steak desserts coffee tapasmallpl...,? bars burgers soup korean juicebars cocktailb...,?,?,vietnamese korean chicken_wings lounges thai j...,"[?, argentine, asianfusion, bakeries, bars, bb...","[?, argentine, asianfusion, bakeries, bars, bb..."


In [46]:
len(df)

2423

### This is the code where the magic happens! Remember that `categorize` definition? We're going to apply it!

In [47]:
the_dict = {}
for i in range(0, len(df)):
    unique_or_not = dict(categorize(df['cat_list'][i]))
    the_dict[i] = unique_or_not

In [48]:
the_cat_df = pd.DataFrame.from_dict(the_dict)

In [49]:
the_cat_df[25].sum()
#at category 25, there are 65 zip codes that have that category

65

In [50]:
the_cat_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2413,2414,2415,2416,2417,2418,2419,2420,2421,2422
?,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
acaibowls,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
accessories,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
active,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
acupuncture,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [51]:
the_cat_df.shape

(469, 2423)

In [52]:
the_cat_df.head()
#notice how the rows are categories and columns are the zip codes

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2413,2414,2415,2416,2417,2418,2419,2420,2421,2422
?,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
acaibowls,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
accessories,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
active,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
acupuncture,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Finally transposed!

In [53]:
df = the_cat_df.T

In [54]:
df.head()

Unnamed: 0,?,acaibowls,accessories,active,acupuncture,adultedu,afghani,african,airportlounges,amusementparks,...,wine_bars,wineries,winetasteclasses,winetastingroom,winetours,womenscloth,wraps,yelpevents,yoga,zoos
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Removing `?` column

In [55]:
df.drop(labels = '?', axis = 'columns', inplace = True)

In [56]:
df.head()

Unnamed: 0,acaibowls,accessories,active,acupuncture,adultedu,afghani,african,airportlounges,amusementparks,antiques,...,wine_bars,wineries,winetasteclasses,winetastingroom,winetours,womenscloth,wraps,yelpevents,yoga,zoos
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [57]:
df.shape

(2423, 468)

In [58]:
#saving it for security
df.to_csv('./datasets/category_binary_THEONE.csv')

# Sanity Checks

In [59]:
binary = pd.read_csv('./datasets/category_binary_THEONE.csv')
binary.head()

Unnamed: 0.1,Unnamed: 0,acaibowls,accessories,active,acupuncture,adultedu,afghani,african,airportlounges,amusementparks,...,wine_bars,wineries,winetasteclasses,winetastingroom,winetours,womenscloth,wraps,yelpevents,yoga,zoos
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [60]:
binary.shape

(2423, 469)

### Let's read in another dataframe to append the zip code and make sure we have the right binaries for each category!

In [61]:
lists = pd.read_csv('./datasets/cat_sanitycheck.csv')
lists.head()

Unnamed: 0.1,Unnamed: 0,index,zip_code,category1,category2,category3,category4,category5,all,list_cat,cat_list
0,0,1,90001,mexican desserts bakeries foodtrucks burgers p...,? chocolate cupcakes seafood comfortfood mexic...,? customcakes sandwiches hotdog catering comfo...,?,?,mexican desserts bakeries foodtrucks burgers p...,['?' 'bakeries' 'breakfast_brunch' 'burgers' '...,['?' 'bakeries' 'breakfast_brunch' 'burgers' '...
1,1,2,90002,salvadoran chinese coffee,? juicebars breakfast_brunch,? desserts,?,?,salvadoran chinese coffee ? juicebars breakfas...,['?' 'breakfast_brunch' 'chinese' 'coffee' 'de...,['?' 'breakfast_brunch' 'chinese' 'coffee' 'de...
2,2,3,90003,mexican foodtrucks soulfood bakeries pizza sea...,foodstands mexican ? noodles sandwiches soulfo...,? tradamerican waffles soulfood,?,?,mexican foodtrucks soulfood bakeries pizza sea...,['?' 'bakeries' 'bbq' 'breakfast_brunch' 'burg...,['?' 'bakeries' 'bbq' 'breakfast_brunch' 'burg...
3,3,4,90004,bakeries salvadoran sushi coffee breakfast_bru...,coffee ? cafes vegan comfortfood chinese korea...,cakeshop ? vegetarian soup noodles korean germ...,?,?,bakeries salvadoran sushi coffee breakfast_bru...,['?' 'asianfusion' 'bakeries' 'bbq' 'beer_and_...,['?' 'asianfusion' 'bakeries' 'bbq' 'beer_and_...
4,4,5,90005,vietnamese korean chicken_wings lounges thai j...,? bbq korean steak desserts coffee tapasmallpl...,? bars burgers soup korean juicebars cocktailb...,?,?,vietnamese korean chicken_wings lounges thai j...,['?' 'argentine' 'asianfusion' 'bakeries' 'bar...,['?' 'argentine' 'asianfusion' 'bakeries' 'bar...


In [62]:
lists.shape

(2423, 11)

In [83]:
binary.drop(labels = ['Unnamed: 0'], axis = 'columns', inplace = True)
binary.head()

Unnamed: 0,acaibowls,accessories,active,acupuncture,adultedu,afghani,african,airportlounges,amusementparks,antiques,...,wine_bars,wineries,winetasteclasses,winetastingroom,winetours,womenscloth,wraps,yelpevents,yoga,zoos
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [87]:
#sanity
binary.head()

Unnamed: 0,acaibowls,accessories,active,acupuncture,adultedu,afghani,african,airportlounges,amusementparks,antiques,...,wine_bars,wineries,winetasteclasses,winetastingroom,winetours,womenscloth,wraps,yelpevents,yoga,zoos
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [88]:
#sanity
lists.head()

Unnamed: 0.1,Unnamed: 0,index,zip_code,category1,category2,category3,category4,category5,all,list_cat,cat_list
0,0,1,90001,mexican desserts bakeries foodtrucks burgers p...,? chocolate cupcakes seafood comfortfood mexic...,? customcakes sandwiches hotdog catering comfo...,?,?,mexican desserts bakeries foodtrucks burgers p...,['?' 'bakeries' 'breakfast_brunch' 'burgers' '...,['?' 'bakeries' 'breakfast_brunch' 'burgers' '...
1,1,2,90002,salvadoran chinese coffee,? juicebars breakfast_brunch,? desserts,?,?,salvadoran chinese coffee ? juicebars breakfas...,['?' 'breakfast_brunch' 'chinese' 'coffee' 'de...,['?' 'breakfast_brunch' 'chinese' 'coffee' 'de...
2,2,3,90003,mexican foodtrucks soulfood bakeries pizza sea...,foodstands mexican ? noodles sandwiches soulfo...,? tradamerican waffles soulfood,?,?,mexican foodtrucks soulfood bakeries pizza sea...,['?' 'bakeries' 'bbq' 'breakfast_brunch' 'burg...,['?' 'bakeries' 'bbq' 'breakfast_brunch' 'burg...
3,3,4,90004,bakeries salvadoran sushi coffee breakfast_bru...,coffee ? cafes vegan comfortfood chinese korea...,cakeshop ? vegetarian soup noodles korean germ...,?,?,bakeries salvadoran sushi coffee breakfast_bru...,['?' 'asianfusion' 'bakeries' 'bbq' 'beer_and_...,['?' 'asianfusion' 'bakeries' 'bbq' 'beer_and_...
4,4,5,90005,vietnamese korean chicken_wings lounges thai j...,? bbq korean steak desserts coffee tapasmallpl...,? bars burgers soup korean juicebars cocktailb...,?,?,vietnamese korean chicken_wings lounges thai j...,['?' 'argentine' 'asianfusion' 'bakeries' 'bar...,['?' 'argentine' 'asianfusion' 'bakeries' 'bar...


### Reading in another dataframe to append the binary columns to construct the main dataframe

In [91]:
df = pd.read_csv('./datasets/NewSummarized.csv')
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,zip_code,count,latitude,longitude,Price_1,Price_2,Price_3,Price_4,...,Rating_1.5,Rating_2,Rating_2.5,Rating_3,Rating_3.5,Rating_4,Rating_4.5,Rating_5,count_businesses,city
0,1,1,90001,30,33.97653,-118.24923,18,12,0,0,...,0,0,0,0,6,8,13,3,30,Los Angeles
1,2,2,90002,4,33.948102,-118.248582,3,1,0,0,...,0,0,0,0,0,3,1,0,4,Los Angeles
2,3,3,90003,20,33.971906,-118.272539,16,4,0,0,...,0,0,1,1,3,7,7,1,20,Los Angeles
3,4,4,90004,29,34.076563,-118.310331,10,19,0,0,...,0,0,0,0,1,19,6,3,29,Los Angeles
4,5,5,90005,45,34.059083,-118.298835,11,33,1,0,...,0,0,0,1,6,23,15,0,45,Los Angeles


In [92]:
#sanity
df.shape

(2423, 21)

In [93]:
#sanity
binary.shape

(2423, 468)

In [94]:
#sanity
lists.shape

(2423, 11)

In [95]:
#sanity
MOTHER = pd.concat([df, lists, binary], axis=1) 

In [96]:
#sanity
MOTHER.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,zip_code,count,latitude,longitude,Price_1,Price_2,Price_3,Price_4,...,wine_bars,wineries,winetasteclasses,winetastingroom,winetours,womenscloth,wraps,yelpevents,yoga,zoos
0,1,1,90001,30,33.97653,-118.24923,18,12,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,2,90002,4,33.948102,-118.248582,3,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,3,90003,20,33.971906,-118.272539,16,4,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,4,90004,29,34.076563,-118.310331,10,19,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,5,90005,45,34.059083,-118.298835,11,33,1,0,...,0,0,0,0,0,0,0,0,0,0


In [97]:
#sanity
MOTHER.shape

(2423, 500)

In [99]:
# the sanity to my sanity
# nothing should be null 
MOTHER.isnull().sum().sum()

0

# Called `MOTHEROFMOTHERS.csv`

In [100]:
MOTHER.to_csv('./datasets/MOTHEROFMOTHERS.csv')