In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read business.csv file
df_business = pd.read_csv('business.csv')

# Get the following columns: address, attributes, categories, city, name, postal_code, review_count, stars, state
df_business = df_business[['address', 'attributes', 'categories', 'city', 'name', 'postal_code', 
                           'review_count', 'stars', 'state']]

# Get businesses in the 'state' of NV
df_business = df_business.loc[df_business['state'] == ('NV')]

# Create a column to check if the business is a 'Restaurant'
df_business['is a restaurant'] = df_business['categories'].str.contains("Restaurants")

# Filter dataframe for rows that are True for 'is a restaurant'
df_business = df_business.loc[df_business['is a restaurant'] == (True)]

# Get businesses in Las Vegas of NV
df_business = df_business.loc[df_business['city'].str.contains("Vegas")]

# Drop where attributes is NaN
df_business = df_business.dropna(subset=['attributes'])
df_business

Unnamed: 0,address,attributes,categories,city,name,postal_code,review_count,stars,state,is a restaurant
17,"1775 E Tropicana Ave, Ste 29","{'OutdoorSeating': 'False', 'BusinessAcceptsCr...","Restaurants, Italian",Las Vegas,Carluccio's Tivoli Gardens,89119,40,4.0,NV,True
25,6055 E Lake Mead Blvd,"{'BikeParking': 'True', 'BusinessParking': ""{'...","Mexican, Restaurants, Patisserie/Cake Shop, Fo...",Las Vegas,Maria's Mexican Restaurant & Bakery,89156,184,4.5,NV,True
75,6125 Spring Mountain Rd,"{'RestaurantsPriceRange2': '1', 'Ambience': ""{...","Fast Food, Food, Restaurants, Ice Cream & Froz...",Las Vegas,Dairy Queen,89146,33,2.0,NV,True
135,"Artisan Hotel, 1501 W Sahara Ave","{'RestaurantsAttire': ""'dressy'"", 'Corkage': '...","Restaurants, Pizza, Italian, American (New)",Las Vegas,Artisan Fine Dining Room,89102,3,2.0,NV,True
173,241 W Charleston Blvd,"{'BusinessParking': ""{'garage': False, 'street...","Food, Pizza, Wine Bars, Bars, Restaurants, Nig...",Las Vegas,Bistro Divino,89102,3,4.5,NV,True
174,3655 Las Vegas Blvd S,"{'RestaurantsTakeOut': 'True', 'RestaurantsDel...","French, Restaurants, Creperies",Las Vegas,La Creperie,89109,535,3.5,NV,True
176,2411 W Sahara Ave,"{'RestaurantsDelivery': 'False', 'RestaurantsT...","Buffets, Restaurants",Las Vegas,Feast Buffet,89102,287,3.0,NV,True
206,"3500 Las Vegas Blvd S, Ste E11","{'RestaurantsAttire': ""'casual'"", 'Restaurants...","Sandwiches, Restaurants, Delis, Desserts, Food",Las Vegas,Stage Deli Of Las Vegas,89109,28,3.5,NV,True
214,1263 Silverado Ranch Blvd,"{'RestaurantsDelivery': 'True', 'BusinessParki...","Middle Eastern, Restaurants, Vegetarian, Juice...",Las Vegas,Pita Pit,89183,77,4.0,NV,True
216,"3342 E Sandhill Rd, Ste 11","{'GoodForDancing': 'False', 'GoodForKids': 'Fa...","Dive Bars, Food, Bars, Pubs, Restaurants, Nigh...",Las Vegas,Mr G's Pub & Grub,89121,27,4.0,NV,True


In [3]:
# Convert the dictionary objects in 'attributes' to columns in a new df
df_business["attributes"] = df_business["attributes"].apply(lambda x : dict(eval(x)) )
df_attributes = df_business["attributes"].apply(pd.Series)
df_attributes

Unnamed: 0,OutdoorSeating,BusinessAcceptsCreditCards,RestaurantsDelivery,RestaurantsReservations,RestaurantsAttire,Ambience,HasTV,BYOBCorkage,NoiseLevel,RestaurantsTakeOut,...,BestNights,DogsAllowed,DriveThru,Smoking,CoatCheck,AgesAllowed,DietaryRestrictions,AcceptsInsurance,Open24Hours,RestaurantsCounterService
17,False,True,False,True,'casual',"{'romantic': True, 'intimate': False, 'tourist...",False,'no',u'quiet',True,...,,,,,,,,,,
25,False,True,False,False,u'casual',"{'romantic': False, 'intimate': False, 'classy...",True,,'average',True,...,,,,,,,,,,
75,False,True,False,False,u'casual',"{'romantic': False, 'intimate': False, 'classy...",False,,u'average',True,...,,,,,,,,,,
135,False,True,False,True,'dressy',,True,'yes_corkage',u'quiet',False,...,,,,,,,,,,
173,,True,,,,,,,,,...,,,,,,,,,,
174,False,True,False,False,'casual',"{'romantic': False, 'intimate': False, 'touris...",False,'yes_free',u'average',True,...,,,,,,,,,,
176,False,True,False,False,'casual',"{'romantic': False, 'intimate': False, 'touris...",False,'yes_free',u'average',False,...,,,,,,,,,,
206,False,True,False,,'casual',,,,,True,...,,,,,,,,,,
214,False,True,True,False,u'casual',"{'romantic': False, 'intimate': False, 'classy...",False,,'average',True,...,,,,,,,,,,
216,False,True,False,False,'casual',"{'romantic': False, 'intimate': False, 'classy...",True,,u'quiet',True,...,"{'monday': False, 'tuesday': False, 'friday': ...",,,,,,,,,


In [4]:
# Since 'Ambience' is a subjective measurement, we decided to remove it from the dataframe. 
# The definition for words like "romantic", "hipster", or "touristy" vary between Yelp users 
# and is difficult to account for.

# Drop 'Ambience' column
df_attributes = df_attributes.drop(['Ambience'], axis = 1)
df_attributes

Unnamed: 0,OutdoorSeating,BusinessAcceptsCreditCards,RestaurantsDelivery,RestaurantsReservations,RestaurantsAttire,HasTV,BYOBCorkage,NoiseLevel,RestaurantsTakeOut,RestaurantsPriceRange2,...,BestNights,DogsAllowed,DriveThru,Smoking,CoatCheck,AgesAllowed,DietaryRestrictions,AcceptsInsurance,Open24Hours,RestaurantsCounterService
17,False,True,False,True,'casual',False,'no',u'quiet',True,2,...,,,,,,,,,,
25,False,True,False,False,u'casual',True,,'average',True,1,...,,,,,,,,,,
75,False,True,False,False,u'casual',False,,u'average',True,1,...,,,,,,,,,,
135,False,True,False,True,'dressy',True,'yes_corkage',u'quiet',False,4,...,,,,,,,,,,
173,,True,,,,,,,,2,...,,,,,,,,,,
174,False,True,False,False,'casual',False,'yes_free',u'average',True,2,...,,,,,,,,,,
176,False,True,False,False,'casual',False,'yes_free',u'average',False,1,...,,,,,,,,,,
206,False,True,False,,'casual',,,,True,2,...,,,,,,,,,,
214,False,True,True,False,u'casual',False,,'average',True,1,...,,,,,,,,,,
216,False,True,False,False,'casual',True,,u'quiet',True,1,...,"{'monday': False, 'tuesday': False, 'friday': ...",,,,,,,,,


In [5]:
# Ratio of missing NaN attributes
df_missing = pd.DataFrame([{'column':c, 'missing': (df_attributes[c].isnull().sum()/df_attributes.shape[0])} for c in df_attributes.columns])
df_missing = df_missing.sort_values('missing', ascending=False)
df_missing

# lists of columns that have a missing ratio greater than 0.25%
droppable_features = []
droppable_features.extend(df_missing[df_missing.missing > 0.25].column.tolist())
droppable_features

# Drop columns with more than 25% missing data
df_attributes.drop(droppable_features, axis=1, inplace=True)
df_attributes

Unnamed: 0,OutdoorSeating,BusinessAcceptsCreditCards,RestaurantsDelivery,RestaurantsReservations,RestaurantsAttire,HasTV,NoiseLevel,RestaurantsTakeOut,RestaurantsPriceRange2,RestaurantsGoodForGroups,WiFi,GoodForKids,Alcohol,BusinessParking
17,False,True,False,True,'casual',False,u'quiet',True,2,True,u'no',True,u'full_bar',"{'garage': False, 'street': False, 'validated'..."
25,False,True,False,False,u'casual',True,'average',True,1,True,u'no',True,u'beer_and_wine',"{'garage': False, 'street': False, 'validated'..."
75,False,True,False,False,u'casual',False,u'average',True,1,True,'no',True,u'none',"{'garage': False, 'street': False, 'validated'..."
135,False,True,False,True,'dressy',True,u'quiet',False,4,True,u'no',False,u'full_bar',"{'garage': False, 'street': False, 'validated'..."
173,,True,,,,,,,2,,,,,"{'garage': False, 'street': False, 'validated'..."
174,False,True,False,False,'casual',False,u'average',True,2,True,'no',True,'none',"{'garage': True, 'street': False, 'validated':..."
176,False,True,False,False,'casual',False,u'average',False,1,True,'free',True,u'beer_and_wine',"{'garage': True, 'street': False, 'validated':..."
206,False,True,False,,'casual',,,True,2,True,,True,,"{'garage': True, 'street': False, 'validated':..."
214,False,True,True,False,u'casual',False,'average',True,1,True,'no',True,u'none',"{'garage': False, 'street': False, 'validated'..."
216,False,True,False,False,'casual',True,u'quiet',True,1,True,'no',False,u'full_bar',"{'garage': False, 'street': False, 'validated'..."


In [27]:
#reset index to allow easier looping through the dataframe then drop index and level0 columns that appear as a result
df_attributes = df_attributes.reset_index()
df_attributes.drop(columns = ["index"], inplace = True)
df_attributes.drop(columns = ["level_0"], inplace = True)

for x in df_attributes['BusinessParking']:
     if "True" in x:
            return True
 
    

Unnamed: 0,OutdoorSeating,BusinessAcceptsCreditCards,RestaurantsDelivery,RestaurantsReservations,RestaurantsAttire,HasTV,NoiseLevel,RestaurantsTakeOut,RestaurantsPriceRange2,RestaurantsGoodForGroups,WiFi,GoodForKids,Alcohol,BusinessParking
0,False,True,False,True,'casual',False,u'quiet',True,2,True,u'no',True,u'full_bar',"{'garage': False, 'street': False, 'validated'..."
1,False,True,False,False,u'casual',True,'average',True,1,True,u'no',True,u'beer_and_wine',"{'garage': False, 'street': False, 'validated'..."
2,False,True,False,False,u'casual',False,u'average',True,1,True,'no',True,u'none',"{'garage': False, 'street': False, 'validated'..."
3,False,True,False,True,'dressy',True,u'quiet',False,4,True,u'no',False,u'full_bar',"{'garage': False, 'street': False, 'validated'..."
4,,True,,,,,,,2,,,,,"{'garage': False, 'street': False, 'validated'..."
5,False,True,False,False,'casual',False,u'average',True,2,True,'no',True,'none',"{'garage': True, 'street': False, 'validated':..."
6,False,True,False,False,'casual',False,u'average',False,1,True,'free',True,u'beer_and_wine',"{'garage': True, 'street': False, 'validated':..."
7,False,True,False,,'casual',,,True,2,True,,True,,"{'garage': True, 'street': False, 'validated':..."
8,False,True,True,False,u'casual',False,'average',True,1,True,'no',True,u'none',"{'garage': False, 'street': False, 'validated'..."
9,False,True,False,False,'casual',True,u'quiet',True,1,True,'no',False,u'full_bar',"{'garage': False, 'street': False, 'validated'..."


In [7]:
df_attributes['BusinessParking'].replace('None',np.nan,inplace = True)
df_attributes['BusinessParking'].fillna(False,inplace = True)
#df_attributes['BusinessParking'].replace('None',False)
# df_attributes = df_attributes.loc['True' in df_attributes['BusinessParking'].values()]
df_attributes

def search(values, searchFor):
    for k in values:
        for v in values[k]:
            if searchFor in v:
                return True
    return False

parking = []
for x in df_attributes['BusinessParking']:
    #parking.append(search(x., 'True'))
    
    
print(parking)

AttributeError: 'str' object has no attribute 'values'

In [None]:
# Concat df_business and df_attributes
df_combined = pd.concat([df_business, df_attributes], axis=1)
df_combined

In [None]:
# Create new df for restaurants between 1 and 2 dollar signs
df_price_1to2 = df_combined.loc[(df_combined['RestaurantsPriceRange2'] == ('1')) 
                                | (df_combined['RestaurantsPriceRange2'] == ('2'))]

# Create new df for restaurants between 3 and 4 dollar signs
df_price_3to4 = df_combined.loc[(df_combined['RestaurantsPriceRange2'] == ('3')) 
                                | (df_combined['RestaurantsPriceRange2'] == ('4'))]

# Create new df for restaurants with 5 dollar signs
df_price_5 = df_combined.loc[df_combined['RestaurantsPriceRange2'] == ('5')]

Viewing Restaurants with a 1-2 dollar price range according to Yelp