In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read business.csv file
df_business = pd.read_csv('business.csv')

# Get the following columns: address, attributes, categories, city, name, postal_code, review_count, stars, state
df_business = df_business[['address', 'attributes', 'categories', 'city', 'name', 'postal_code', 
                           'review_count', 'stars', 'state']]

# Get businesses in the 'state' of NV
df_business = df_business.loc[df_business['state'] == ('NV')]

# Create a column to check if the business is a 'Restaurant'
df_business['is a restaurant'] = df_business['categories'].str.contains("Restaurants")

# Filter dataframe for rows that are True for 'is a restaurant'
df_business = df_business.loc[df_business['is a restaurant'] == (True)]

# Get businesses in Las Vegas of NV
df_business = df_business.loc[df_business['city'].str.contains("Vegas")]

# Drop where attributes is NaN
df_business = df_business.dropna(subset=['attributes'])
df_business

Unnamed: 0,address,attributes,categories,city,name,postal_code,review_count,stars,state,is a restaurant
17,"1775 E Tropicana Ave, Ste 29","{'OutdoorSeating': 'False', 'BusinessAcceptsCr...","Restaurants, Italian",Las Vegas,Carluccio's Tivoli Gardens,89119,40,4.0,NV,True
25,6055 E Lake Mead Blvd,"{'BikeParking': 'True', 'BusinessParking': ""{'...","Mexican, Restaurants, Patisserie/Cake Shop, Fo...",Las Vegas,Maria's Mexican Restaurant & Bakery,89156,184,4.5,NV,True
75,6125 Spring Mountain Rd,"{'RestaurantsPriceRange2': '1', 'Ambience': ""{...","Fast Food, Food, Restaurants, Ice Cream & Froz...",Las Vegas,Dairy Queen,89146,33,2.0,NV,True
135,"Artisan Hotel, 1501 W Sahara Ave","{'RestaurantsAttire': ""'dressy'"", 'Corkage': '...","Restaurants, Pizza, Italian, American (New)",Las Vegas,Artisan Fine Dining Room,89102,3,2.0,NV,True
173,241 W Charleston Blvd,"{'BusinessParking': ""{'garage': False, 'street...","Food, Pizza, Wine Bars, Bars, Restaurants, Nig...",Las Vegas,Bistro Divino,89102,3,4.5,NV,True
174,3655 Las Vegas Blvd S,"{'RestaurantsTakeOut': 'True', 'RestaurantsDel...","French, Restaurants, Creperies",Las Vegas,La Creperie,89109,535,3.5,NV,True
176,2411 W Sahara Ave,"{'RestaurantsDelivery': 'False', 'RestaurantsT...","Buffets, Restaurants",Las Vegas,Feast Buffet,89102,287,3.0,NV,True
206,"3500 Las Vegas Blvd S, Ste E11","{'RestaurantsAttire': ""'casual'"", 'Restaurants...","Sandwiches, Restaurants, Delis, Desserts, Food",Las Vegas,Stage Deli Of Las Vegas,89109,28,3.5,NV,True
214,1263 Silverado Ranch Blvd,"{'RestaurantsDelivery': 'True', 'BusinessParki...","Middle Eastern, Restaurants, Vegetarian, Juice...",Las Vegas,Pita Pit,89183,77,4.0,NV,True
216,"3342 E Sandhill Rd, Ste 11","{'GoodForDancing': 'False', 'GoodForKids': 'Fa...","Dive Bars, Food, Bars, Pubs, Restaurants, Nigh...",Las Vegas,Mr G's Pub & Grub,89121,27,4.0,NV,True


In [3]:
# Convert the dictionary objects in 'attributes' to columns in a new df
df_business["attributes"] = df_business["attributes"].apply(lambda x : dict(eval(x)) )
df_attributes = df_business["attributes"].apply(pd.Series)
df_attributes

Unnamed: 0,OutdoorSeating,BusinessAcceptsCreditCards,RestaurantsDelivery,RestaurantsReservations,RestaurantsAttire,Ambience,HasTV,BYOBCorkage,NoiseLevel,RestaurantsTakeOut,...,BestNights,DogsAllowed,DriveThru,Smoking,CoatCheck,AgesAllowed,DietaryRestrictions,AcceptsInsurance,Open24Hours,RestaurantsCounterService
17,False,True,False,True,'casual',"{'romantic': True, 'intimate': False, 'tourist...",False,'no',u'quiet',True,...,,,,,,,,,,
25,False,True,False,False,u'casual',"{'romantic': False, 'intimate': False, 'classy...",True,,'average',True,...,,,,,,,,,,
75,False,True,False,False,u'casual',"{'romantic': False, 'intimate': False, 'classy...",False,,u'average',True,...,,,,,,,,,,
135,False,True,False,True,'dressy',,True,'yes_corkage',u'quiet',False,...,,,,,,,,,,
173,,True,,,,,,,,,...,,,,,,,,,,
174,False,True,False,False,'casual',"{'romantic': False, 'intimate': False, 'touris...",False,'yes_free',u'average',True,...,,,,,,,,,,
176,False,True,False,False,'casual',"{'romantic': False, 'intimate': False, 'touris...",False,'yes_free',u'average',False,...,,,,,,,,,,
206,False,True,False,,'casual',,,,,True,...,,,,,,,,,,
214,False,True,True,False,u'casual',"{'romantic': False, 'intimate': False, 'classy...",False,,'average',True,...,,,,,,,,,,
216,False,True,False,False,'casual',"{'romantic': False, 'intimate': False, 'classy...",True,,u'quiet',True,...,"{'monday': False, 'tuesday': False, 'friday': ...",,,,,,,,,


In [4]:
# Since 'Ambience' is a subjective measurement, we decided to remove it from the dataframe. 
# The definition for words like "romantic", "hipster", or "touristy" vary between Yelp users 
# and is difficult to account for.

# Drop 'Ambience' column
df_attributes = df_attributes.drop(['Ambience'], axis = 1)
df_attributes

Unnamed: 0,OutdoorSeating,BusinessAcceptsCreditCards,RestaurantsDelivery,RestaurantsReservations,RestaurantsAttire,HasTV,BYOBCorkage,NoiseLevel,RestaurantsTakeOut,RestaurantsPriceRange2,...,BestNights,DogsAllowed,DriveThru,Smoking,CoatCheck,AgesAllowed,DietaryRestrictions,AcceptsInsurance,Open24Hours,RestaurantsCounterService
17,False,True,False,True,'casual',False,'no',u'quiet',True,2,...,,,,,,,,,,
25,False,True,False,False,u'casual',True,,'average',True,1,...,,,,,,,,,,
75,False,True,False,False,u'casual',False,,u'average',True,1,...,,,,,,,,,,
135,False,True,False,True,'dressy',True,'yes_corkage',u'quiet',False,4,...,,,,,,,,,,
173,,True,,,,,,,,2,...,,,,,,,,,,
174,False,True,False,False,'casual',False,'yes_free',u'average',True,2,...,,,,,,,,,,
176,False,True,False,False,'casual',False,'yes_free',u'average',False,1,...,,,,,,,,,,
206,False,True,False,,'casual',,,,True,2,...,,,,,,,,,,
214,False,True,True,False,u'casual',False,,'average',True,1,...,,,,,,,,,,
216,False,True,False,False,'casual',True,,u'quiet',True,1,...,"{'monday': False, 'tuesday': False, 'friday': ...",,,,,,,,,


In [5]:
# Ratio of missing NaN attributes
df_missing = pd.DataFrame([{'column':c, 'missing': (df_attributes[c].isnull().sum()/df_attributes.shape[0])} for c in df_attributes.columns])
df_missing = df_missing.sort_values('missing', ascending=False)
df_missing

# lists of columns that have a missing ratio greater than 0.25%
droppable_features = []
droppable_features.extend(df_missing[df_missing.missing > 0.25].column.tolist())
droppable_features

# Drop columns with more than 25% missing data
df_attributes.drop(droppable_features, axis=1, inplace=True)
df_attributes

Unnamed: 0,OutdoorSeating,BusinessAcceptsCreditCards,RestaurantsDelivery,RestaurantsReservations,RestaurantsAttire,HasTV,NoiseLevel,RestaurantsTakeOut,RestaurantsPriceRange2,RestaurantsGoodForGroups,WiFi,GoodForKids,Alcohol,BusinessParking
17,False,True,False,True,'casual',False,u'quiet',True,2,True,u'no',True,u'full_bar',"{'garage': False, 'street': False, 'validated'..."
25,False,True,False,False,u'casual',True,'average',True,1,True,u'no',True,u'beer_and_wine',"{'garage': False, 'street': False, 'validated'..."
75,False,True,False,False,u'casual',False,u'average',True,1,True,'no',True,u'none',"{'garage': False, 'street': False, 'validated'..."
135,False,True,False,True,'dressy',True,u'quiet',False,4,True,u'no',False,u'full_bar',"{'garage': False, 'street': False, 'validated'..."
173,,True,,,,,,,2,,,,,"{'garage': False, 'street': False, 'validated'..."
174,False,True,False,False,'casual',False,u'average',True,2,True,'no',True,'none',"{'garage': True, 'street': False, 'validated':..."
176,False,True,False,False,'casual',False,u'average',False,1,True,'free',True,u'beer_and_wine',"{'garage': True, 'street': False, 'validated':..."
206,False,True,False,,'casual',,,True,2,True,,True,,"{'garage': True, 'street': False, 'validated':..."
214,False,True,True,False,u'casual',False,'average',True,1,True,'no',True,u'none',"{'garage': False, 'street': False, 'validated'..."
216,False,True,False,False,'casual',True,u'quiet',True,1,True,'no',False,u'full_bar',"{'garage': False, 'street': False, 'validated'..."


In [6]:
#reset index to allow easier looping through the dataframe then drop index and level0 columns that appear as a result
df_attributes = df_attributes.reset_index()
df_attributes.drop(columns = ["index"], inplace = True)
#df_attributes.drop(columns = ["level_0"], inplace = True)
    

In [7]:
df_attributes['BusinessParking'].replace('None',np.nan,inplace = True)
df_attributes['BusinessParking'].fillna("False",inplace = True)
#df_attributes['BusinessParking'].replace('None',False)
# df_attributes = df_attributes.loc['True' in df_attributes['BusinessParking'].values()]
df_attributes

# def search(values, searchFor):
#     for k in values:
#         for v in values[k]:
#             if searchFor in v:
#                 return True
#     return False

# parking = []
# #for x in df_attributes['BusinessParking']:
#     #parking.append(search(x., 'True'))
    
    
# print(parking)

Unnamed: 0,OutdoorSeating,BusinessAcceptsCreditCards,RestaurantsDelivery,RestaurantsReservations,RestaurantsAttire,HasTV,NoiseLevel,RestaurantsTakeOut,RestaurantsPriceRange2,RestaurantsGoodForGroups,WiFi,GoodForKids,Alcohol,BusinessParking
0,False,True,False,True,'casual',False,u'quiet',True,2,True,u'no',True,u'full_bar',"{'garage': False, 'street': False, 'validated'..."
1,False,True,False,False,u'casual',True,'average',True,1,True,u'no',True,u'beer_and_wine',"{'garage': False, 'street': False, 'validated'..."
2,False,True,False,False,u'casual',False,u'average',True,1,True,'no',True,u'none',"{'garage': False, 'street': False, 'validated'..."
3,False,True,False,True,'dressy',True,u'quiet',False,4,True,u'no',False,u'full_bar',"{'garage': False, 'street': False, 'validated'..."
4,,True,,,,,,,2,,,,,"{'garage': False, 'street': False, 'validated'..."
5,False,True,False,False,'casual',False,u'average',True,2,True,'no',True,'none',"{'garage': True, 'street': False, 'validated':..."
6,False,True,False,False,'casual',False,u'average',False,1,True,'free',True,u'beer_and_wine',"{'garage': True, 'street': False, 'validated':..."
7,False,True,False,,'casual',,,True,2,True,,True,,"{'garage': True, 'street': False, 'validated':..."
8,False,True,True,False,u'casual',False,'average',True,1,True,'no',True,u'none',"{'garage': False, 'street': False, 'validated'..."
9,False,True,False,False,'casual',True,u'quiet',True,1,True,'no',False,u'full_bar',"{'garage': False, 'street': False, 'validated'..."


In [12]:
print(len(df_attributes['BusinessParking']))
for index in np.arange(0,len(df_attributes['BusinessParking'])-1):
    print(index) 
    if "True" in df_attributes["BusinessParking"][index]:
        print(True)
    else:
        print(False)
    



6650
0
True
1
True
2
True
3
True
4
False
5
True
6
True
7
True
8
True
9
True
10
True
11
True
12
True
13
True
14
False
15
True
16
True
17
True
18
True
19
True
20
True
21
False
22
False
23
False
24
False
25
False
26
True
27
True
28
True
29
False
30
False
31
False
32
True
33
True
34
True
35
True
36
True
37
False
38
False
39
False
40
True
41
False
42
False
43
True
44
False
45
True
46
True
47
False
48
False
49
True
50
True
51
True
52
True
53
True
54
False
55
True
56
False
57
True
58
True
59
True
60
True
61
True
62
True
63
False
64
True
65
True
66
False
67
True
68
False
69
True
70
True
71
False
72
False
73
False
74
False
75
True
76
True
77
False
78
False
79
True
80
True
81
True
82
True
83
True
84
False
85
True
86
True
87
False
88
False
89
True
90
False
91
False
92
True
93
True
94
True
95
False
96
False
97
False
98
True
99
False
100
True
101
True
102
False
103
True
104
False
105
False
106
True
107
True
108
False
109
True
110
True
111
False
112
True
113
True
114
False
115
True
116
True
117
True

1049
True
1050
True
1051
True
1052
True
1053
True
1054
True
1055
False
1056
True
1057
True
1058
True
1059
True
1060
True
1061
True
1062
True
1063
True
1064
False
1065
False
1066
True
1067
True
1068
True
1069
True
1070
True
1071
False
1072
False
1073
True
1074
False
1075
True
1076
False
1077
True
1078
True
1079
True
1080
False
1081
True
1082
False
1083
True
1084
False
1085
False
1086
False
1087
True
1088
False
1089
True
1090
True
1091
True
1092
True
1093
True
1094
False
1095
True
1096
True
1097
True
1098
True
1099
True
1100
True
1101
False
1102
True
1103
False
1104
False
1105
True
1106
True
1107
True
1108
True
1109
False
1110
True
1111
True
1112
False
1113
False
1114
False
1115
False
1116
False
1117
False
1118
False
1119
True
1120
True
1121
True
1122
True
1123
False
1124
False
1125
True
1126
True
1127
True
1128
True
1129
True
1130
True
1131
True
1132
False
1133
False
1134
True
1135
True
1136
True
1137
False
1138
True
1139
False
1140
True
1141
True
1142
False
1143
True
1144
False
1145
Tr

False
2049
False
2050
False
2051
True
2052
True
2053
False
2054
False
2055
False
2056
False
2057
True
2058
True
2059
True
2060
True
2061
True
2062
True
2063
True
2064
True
2065
True
2066
False
2067
True
2068
False
2069
False
2070
True
2071
True
2072
True
2073
True
2074
True
2075
False
2076
True
2077
False
2078
True
2079
True
2080
False
2081
True
2082
True
2083
False
2084
False
2085
True
2086
True
2087
True
2088
False
2089
True
2090
True
2091
False
2092
True
2093
False
2094
True
2095
True
2096
False
2097
True
2098
True
2099
True
2100
True
2101
True
2102
False
2103
True
2104
False
2105
True
2106
False
2107
False
2108
False
2109
True
2110
True
2111
True
2112
True
2113
True
2114
True
2115
False
2116
True
2117
False
2118
True
2119
False
2120
False
2121
True
2122
True
2123
True
2124
True
2125
False
2126
False
2127
False
2128
False
2129
True
2130
True
2131
False
2132
False
2133
True
2134
True
2135
True
2136
True
2137
False
2138
False
2139
True
2140
True
2141
False
2142
True
2143
False
2144
Fa

3048
True
3049
True
3050
False
3051
True
3052
False
3053
False
3054
True
3055
False
3056
False
3057
False
3058
True
3059
False
3060
False
3061
False
3062
False
3063
True
3064
True
3065
False
3066
True
3067
True
3068
True
3069
False
3070
True
3071
True
3072
False
3073
False
3074
True
3075
True
3076
False
3077
True
3078
True
3079
False
3080
True
3081
True
3082
True
3083
False
3084
False
3085
True
3086
False
3087
True
3088
True
3089
False
3090
True
3091
True
3092
False
3093
True
3094
False
3095
False
3096
True
3097
False
3098
True
3099
True
3100
True
3101
True
3102
False
3103
True
3104
True
3105
False
3106
False
3107
True
3108
True
3109
True
3110
True
3111
False
3112
True
3113
True
3114
False
3115
False
3116
False
3117
False
3118
True
3119
True
3120
True
3121
True
3122
False
3123
False
3124
False
3125
True
3126
True
3127
True
3128
True
3129
False
3130
False
3131
True
3132
True
3133
True
3134
True
3135
False
3136
False
3137
False
3138
True
3139
True
3140
False
3141
True
3142
True
3143
True

True
4048
False
4049
True
4050
False
4051
False
4052
False
4053
False
4054
True
4055
False
4056
True
4057
True
4058
True
4059
True
4060
False
4061
False
4062
False
4063
True
4064
True
4065
False
4066
True
4067
True
4068
False
4069
False
4070
True
4071
True
4072
False
4073
True
4074
False
4075
True
4076
False
4077
False
4078
True
4079
True
4080
False
4081
True
4082
False
4083
True
4084
True
4085
True
4086
True
4087
False
4088
True
4089
False
4090
False
4091
False
4092
True
4093
True
4094
True
4095
True
4096
False
4097
True
4098
True
4099
False
4100
False
4101
False
4102
False
4103
False
4104
False
4105
True
4106
True
4107
True
4108
False
4109
False
4110
False
4111
False
4112
True
4113
True
4114
False
4115
True
4116
False
4117
True
4118
False
4119
True
4120
False
4121
True
4122
True
4123
True
4124
True
4125
True
4126
True
4127
False
4128
False
4129
True
4130
False
4131
True
4132
False
4133
True
4134
False
4135
False
4136
True
4137
True
4138
False
4139
False
4140
False
4141
True
4142
Fals

5047
True
5048
True
5049
True
5050
False
5051
True
5052
True
5053
False
5054
False
5055
True
5056
False
5057
True
5058
False
5059
False
5060
False
5061
True
5062
True
5063
False
5064
True
5065
True
5066
False
5067
False
5068
True
5069
False
5070
False
5071
False
5072
True
5073
True
5074
True
5075
True
5076
True
5077
False
5078
False
5079
True
5080
True
5081
True
5082
True
5083
True
5084
True
5085
False
5086
False
5087
True
5088
True
5089
True
5090
False
5091
False
5092
True
5093
False
5094
True
5095
False
5096
True
5097
False
5098
False
5099
True
5100
True
5101
True
5102
True
5103
False
5104
True
5105
False
5106
False
5107
True
5108
True
5109
True
5110
False
5111
False
5112
True
5113
False
5114
True
5115
True
5116
False
5117
False
5118
False
5119
False
5120
True
5121
False
5122
False
5123
False
5124
True
5125
True
5126
True
5127
False
5128
False
5129
True
5130
True
5131
True
5132
True
5133
True
5134
True
5135
True
5136
False
5137
False
5138
True
5139
False
5140
True
5141
True
5142
True

True
6047
False
6048
False
6049
True
6050
True
6051
False
6052
False
6053
False
6054
True
6055
True
6056
False
6057
False
6058
True
6059
True
6060
False
6061
True
6062
False
6063
False
6064
True
6065
False
6066
False
6067
True
6068
True
6069
False
6070
True
6071
True
6072
False
6073
True
6074
False
6075
True
6076
False
6077
False
6078
False
6079
True
6080
True
6081
True
6082
True
6083
True
6084
False
6085
False
6086
False
6087
True
6088
True
6089
True
6090
False
6091
True
6092
True
6093
True
6094
True
6095
True
6096
True
6097
True
6098
True
6099
False
6100
True
6101
True
6102
False
6103
True
6104
True
6105
True
6106
True
6107
True
6108
True
6109
True
6110
True
6111
False
6112
False
6113
False
6114
True
6115
False
6116
True
6117
True
6118
True
6119
False
6120
True
6121
False
6122
True
6123
True
6124
False
6125
False
6126
False
6127
True
6128
True
6129
True
6130
False
6131
False
6132
False
6133
True
6134
True
6135
True
6136
True
6137
False
6138
True
6139
False
6140
True
6141
True
6142
Tr

In [9]:
# Concat df_business and df_attributes
df_combined = pd.concat([df_business, df_attributes], axis=1)
df_combined

Unnamed: 0,address,attributes,categories,city,name,postal_code,review_count,stars,state,is a restaurant,...,RestaurantsAttire,HasTV,NoiseLevel,RestaurantsTakeOut,RestaurantsPriceRange2,RestaurantsGoodForGroups,WiFi,GoodForKids,Alcohol,BusinessParking
0,,,,,,,,,,,...,'casual',False,u'quiet',True,2,True,u'no',True,u'full_bar',"{'garage': False, 'street': False, 'validated'..."
1,,,,,,,,,,,...,u'casual',True,'average',True,1,True,u'no',True,u'beer_and_wine',"{'garage': False, 'street': False, 'validated'..."
2,,,,,,,,,,,...,u'casual',False,u'average',True,1,True,'no',True,u'none',"{'garage': False, 'street': False, 'validated'..."
3,,,,,,,,,,,...,'dressy',True,u'quiet',False,4,True,u'no',False,u'full_bar',"{'garage': False, 'street': False, 'validated'..."
4,,,,,,,,,,,...,,,,,2,,,,,"{'garage': False, 'street': False, 'validated'..."
5,,,,,,,,,,,...,'casual',False,u'average',True,2,True,'no',True,'none',"{'garage': True, 'street': False, 'validated':..."
6,,,,,,,,,,,...,'casual',False,u'average',False,1,True,'free',True,u'beer_and_wine',"{'garage': True, 'street': False, 'validated':..."
7,,,,,,,,,,,...,'casual',,,True,2,True,,True,,"{'garage': True, 'street': False, 'validated':..."
8,,,,,,,,,,,...,u'casual',False,'average',True,1,True,'no',True,u'none',"{'garage': False, 'street': False, 'validated'..."
9,,,,,,,,,,,...,'casual',True,u'quiet',True,1,True,'no',False,u'full_bar',"{'garage': False, 'street': False, 'validated'..."


In [10]:
# Create new df for restaurants between 1 and 2 dollar signs
df_price_1to2 = df_combined.loc[(df_combined['RestaurantsPriceRange2'] == ('1')) 
                                | (df_combined['RestaurantsPriceRange2'] == ('2'))]

# Create new df for restaurants between 3 and 4 dollar signs
df_price_3to4 = df_combined.loc[(df_combined['RestaurantsPriceRange2'] == ('3')) 
                                | (df_combined['RestaurantsPriceRange2'] == ('4'))]

# Create new df for restaurants with 5 dollar signs
df_price_5 = df_combined.loc[df_combined['RestaurantsPriceRange2'] == ('5')]

Viewing Restaurants with a 1-2 dollar price range according to Yelp