In [69]:
import pandas as pd
import numpy as np
import re

In [2]:
#Load data
google_df = pd.read_csv('/Users/marianaarrieta/Documents/Data_Projects/london_restaurants/raw_data/data_google_maps.csv')
opentable_df = pd.read_csv('/Users/marianaarrieta/Documents/Data_Projects/london_restaurants/raw_data/opentable_data.csv')


In [3]:
google_df.head()

Unnamed: 0,names,summary,cusine,price,reviews_count,rating,address,url
0,Restaurant Story,Chef Tom Sellers' novel approach to British cl...,Fine dining restaurant,££££,418 reviews,4.6,"199 Tooley St, London SE1 2JX",https://www.google.com/maps/place/Restaurant+S...
1,Galvin La Chapelle,,French restaurant,££££,"1,901 reviews",4.6,,https://www.google.com/maps/place/Galvin+La+Ch...
2,City Social,,Modern European restaurant,££££,"1,187 reviews",4.5,,https://www.google.com/maps/place/City+Social/...
3,Brawn,Mediterranean small plates menu with an 'all t...,Mediterranean restaurant,££,615 reviews,4.7,"49 Columbia Rd, London E2 7RG",https://www.google.com/maps/place/Brawn/@51.52...
4,Il Bordello,Old-school Italian restaurant with an authenti...,Italian restaurant,££,"1,170 reviews",4.5,"metropolitan wharf, 70 Wapping Wall, London E1...",https://www.google.com/maps/place/Il+Bordello/...


In [199]:
opentable_df.head()

Unnamed: 0,names,cuisine,price,reviews,rating,bookings,address
0,Seasons Mayfair,Fish,4,41.0,4.3,8.0,Mayfair
1,PAPA L's KITCHEN,African,4,257.0,4.6,6.0,St. James's
2,Coqbull Soho,Rotisserie Chicken,2,175.0,4.1,47.0,Soho
3,Ham Yard Bar and Restaurant,Modern European,2,3696.0,4.6,92.0,Soho
4,El Norte,Spanish,4,296.0,4.5,45.0,Mayfair


In [200]:
len(opentable_df)

1000

### Filter Google Data

In [4]:
#Remove duplicates, if any 
google_df.drop_duplicates(subset = ['names'], inplace = True)

#Remove rows where cusine = "Not a restaurant"
google_df = google_df[google_df.cusine != 'Not a restaurant']

#Remove rows where there are no ratings 
google_df = google_df[google_df.rating != 'Not Available']


### Filter Open Table Data

In [5]:
#Remove duplicates, if any 
opentable_df.drop_duplicates(subset = ['names'], inplace = True)

#Remove booking column 
opentable_df.drop(columns=['bookings'], inplace = True)

## Understand the Differences in Data

In [6]:
google_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 280 entries, 0 to 373
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   names          280 non-null    object
 1   summary        164 non-null    object
 2   cusine         279 non-null    object
 3   price          280 non-null    object
 4   reviews_count  280 non-null    object
 5   rating         280 non-null    object
 6   address        216 non-null    object
 7   url            280 non-null    object
dtypes: object(8)
memory usage: 19.7+ KB


In [7]:
opentable_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 400 entries, 0 to 499
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   names    400 non-null    object 
 1   cuisine  400 non-null    object 
 2   price    400 non-null    int64  
 3   reviews  358 non-null    float64
 4   rating   350 non-null    float64
 5   address  400 non-null    object 
dtypes: float64(2), int64(1), object(3)
memory usage: 21.9+ KB


In [8]:
#Different values in price
print(google_df.price.unique())
print(opentable_df.price.unique())

#Price ranges from 1 to 4, but we need to have the same format across tables
price_dict = {'£':1, '££':2, '£££':3, '££££':4, 'Not Available': np.nan}
#Map values
google_df['price'] = google_df['price'].map(price_dict)


['££££' '££' 'Not Available' '£££' '£']
[4 2 3]


In [10]:
#Clean reviews  and ratings before converting to float
google_df['reviews_count'] = google_df['reviews_count'].replace(',','', regex=True)
google_df['reviews_count'] = google_df['reviews_count'].replace('[a-zA-Z]','', regex=True)
google_df['reviews_count'] = google_df['reviews_count'].replace('',np.nan, regex=True)
google_df['reviews_count'] = google_df['reviews_count'].replace(""" ''""", np.nan, regex=True)
google_df['reviews_count'] = google_df['reviews_count'].str.strip()

google_df['rating'] = google_df['rating'].replace('Not Available', np.nan, regex=True)

In [11]:
google_df['reviews_count'] = google_df['reviews_count'].astype(float)

google_df['rating'] = google_df['rating'].astype(float)

In [12]:
#Make sure caps are the same in restaurant titles to be able to merge later
google_df['names'] = google_df['names'].str.lower()
google_df['names'] = google_df['names'].str.title()
opentable_df['names'] = opentable_df['names'].str.lower()
opentable_df['names'] = opentable_df['names'].str.title()

### Make Addresses Compatible

In [66]:
#Load postcode data
postcode_df = pd.read_csv('/Users/marianaarrieta/Documents/Data_Projects/london_restaurants/raw_data/postcodes.csv')

#split strings after each comma on the Borough column 
postcode_df["Borough"] = postcode_df["Borough"].str.split(", ")

#covert to dict
postcode_dict = postcode_df.set_index('Postcode').T.to_dict("list")

#flatten dictionary
for key, value in postcode_dict.items():
    postcode_dict[key] = postcode_dict[key][0]






In [81]:
#We want to make the addresses in the Google Maps df compatible with those in Open Table
#To do that, we want to extract the post code, and then map it to its 
#corresponding neighbourhood using the postcode dictionary above 

#Extract the postcode
google_df['address_postcode'] = google_df['address'].str[-7:]
google_df['address_postcode'] = google_df['address_postcode'].str[:3]
google_df['address_postcode'] = google_df['address_postcode'].str.replace(" ", "")


ValueError: NumPy boolean array indexing assignment cannot assign 3 input values to the 5 output values where the mask is true

In [80]:
#Make column names suitable for merging 
google_df.rename({'cusine': 'Google Cuisine', 'price': 'Google Price', 'rating': 'Google Rating', 'reviews_count': "Google Review Count"}, axis=1, inplace=True)

In [210]:
opentable_df.rename({'cuisine': 'OpenTable Cuisine', 'price': 'OpenTable Price', 'rating': 'OpenTable Rating', 'reviews': "OpenTable Review Count"}, axis=1, inplace=True)

In [211]:
restaurants_df = pd.merge(google_df, opentable_df, how="outer", on=["names"])

In [212]:
restaurants_df.head()

Unnamed: 0,names,summary,Google Cuisine,Google Price,Google Review Count,Google Rating,address_x,url,OpenTable Cuisine,OpenTable Price,OpenTable Review Count,OpenTable Rating,address_y
0,Restaurant Story,Chef Tom Sellers' novel approach to British cl...,Fine dining restaurant,4.0,418.0,4.6,"199 Tooley St, London SE1 2JX",https://www.google.com/maps/place/Restaurant+S...,,,,,
1,Galvin La Chapelle,,French restaurant,4.0,1901.0,4.6,,https://www.google.com/maps/place/Galvin+La+Ch...,,,,,
2,City Social,,Modern European restaurant,4.0,1187.0,4.5,,https://www.google.com/maps/place/City+Social/...,,,,,
3,Brawn,Mediterranean small plates menu with an 'all t...,Mediterranean restaurant,2.0,615.0,4.7,"49 Columbia Rd, London E2 7RG",https://www.google.com/maps/place/Brawn/@51.52...,,,,,
4,Il Bordello,Old-school Italian restaurant with an authenti...,Italian restaurant,2.0,1170.0,4.5,"metropolitan wharf, 70 Wapping Wall, London E1...",https://www.google.com/maps/place/Il+Bordello/...,,,,,


In [213]:
restaurants_df.to_csv("rest.csv")

In [214]:
len(restaurants_df)

647

In [None]:
import inquirer
questions = [
  inquirer.List('size',
                message="What size do you need?",
                choices=['Jumbo', 'Large', 'Standard', 'Medium', 'Small', 'Micro'],
            ),
]
answers = inquirer.prompt(questions)
print answers["size"]