# Data Preparation and Feature augmentation and engineering



### House Transactions dataset

Steps to follow:

- Read data and join with postocdes to geolocate
- Upload to GIS server

In [2]:
import pandas as pd

#Setting for pandas to show all columns
pd.set_option('display.max_columns', None)


#For GIS Analysis
import arcpy

#GIS Libraries and credentials
from arcgis.gis import GIS
from IPython.display import display
from arcgis.features import enrich_data
from arcgis.features import FeatureLayerCollection
from arcgis.geoenrichment import *
gis = GIS()

In [3]:
house_transactions = pd.read_csv('./Data/House_Prices/last_year_house_transactions_london.csv',sep=',')
uk_postcodes = pd.read_csv('./Data/House_Prices/ukpostcodes.csv',sep=',')

In [20]:
house_transactions.head()

Unnamed: 0,Transaction unique identifier,Price,Date of Transfer,postcode,Property Type,Town
0,{68FEB20C-3FF9-38DA-E053-6C04A8C051AE},460000,11/24/2017,N3 2FH,F,BARNET
1,{68FEB20C-4001-38DA-E053-6C04A8C051AE},332000,11/20/2017,E14 0HT,F,TOWER HAMLETS
2,{68FEB20C-4007-38DA-E053-6C04A8C051AE},210000,11/27/2017,UB10 8AU,F,HILLINGDON
3,{68FEB20C-400F-38DA-E053-6C04A8C051AE},260000,10/6/2017,NW2 6JH,F,BRENT
4,{68FEB20C-4014-38DA-E053-6C04A8C051AE},375000,11/30/2017,NW6 5BA,F,BRENT


In [21]:
uk_postcodes.head()

Unnamed: 0,id,postcode,latitude,longitude
0,1,AB10 1XG,57.144165,-2.114848
1,2,AB10 6RN,57.13788,-2.121487
2,3,AB10 7JB,57.124274,-2.12719
3,4,AB11 5QN,57.142701,-2.093295
4,5,AB11 6UL,57.137547,-2.112233


In [22]:
house_transactions = pd.merge(house_transactions,uk_postcodes, on='postcode')
house_transactions.head()

Unnamed: 0,Transaction unique identifier,Price,Date of Transfer,postcode,Property Type,Town,id,latitude,longitude
0,{68FEB20C-4001-38DA-E053-6C04A8C051AE},332000,11/20/2017,E14 0HT,F,TOWER HAMLETS,1250232,51.514245,0.001029
1,{64342BFD-B07B-422C-E053-6C04A8C0FB8A},325000,8/16/2017,E14 0HT,F,TOWER HAMLETS,1250232,51.514245,0.001029
2,{68FEB20C-4007-38DA-E053-6C04A8C051AE},210000,11/27/2017,UB10 8AU,F,HILLINGDON,124987,51.560992,-0.468725
3,{666758D7-90D6-3363-E053-6B04A8C0D74E},520000,11/27/2017,UB10 8AU,F,HILLINGDON,124987,51.560992,-0.468725
4,{68FEB20C-400F-38DA-E053-6C04A8C051AE},260000,10/6/2017,NW2 6JH,F,BRENT,637800,51.558771,-0.24612


In [23]:
house_transactions = house_transactions.drop(labels=['id'], axis=1)
house_transactions.head()

Unnamed: 0,Transaction unique identifier,Price,Date of Transfer,postcode,Property Type,Town,latitude,longitude
0,{68FEB20C-4001-38DA-E053-6C04A8C051AE},332000,11/20/2017,E14 0HT,F,TOWER HAMLETS,51.514245,0.001029
1,{64342BFD-B07B-422C-E053-6C04A8C0FB8A},325000,8/16/2017,E14 0HT,F,TOWER HAMLETS,51.514245,0.001029
2,{68FEB20C-4007-38DA-E053-6C04A8C051AE},210000,11/27/2017,UB10 8AU,F,HILLINGDON,51.560992,-0.468725
3,{666758D7-90D6-3363-E053-6B04A8C0D74E},520000,11/27/2017,UB10 8AU,F,HILLINGDON,51.560992,-0.468725
4,{68FEB20C-400F-38DA-E053-6C04A8C051AE},260000,10/6/2017,NW2 6JH,F,BRENT,51.558771,-0.24612


In [24]:
#how many nulls?
pd.DataFrame(house_transactions.isnull().sum().sort_values(ascending=False), columns = ['Number of Null']).head(20)

Unnamed: 0,Number of Null
longitude,0
latitude,0
Town,0
Property Type,0
postcode,0
Date of Transfer,0
Price,0
Transaction unique identifier,0


In [25]:
house_transactions.to_csv('./Data/House_Prices/house_transactions_geolocated.csv',
                         index=False)

**Create Shapefile**

In [53]:
#we create the shapefile
arcpy.env.workspace = './Data/Shapefiles'
arcpy.management.XYTableToPoint('./Data/House_Prices/house_transactions_geolocated.csv','./Data/Shapefiles/house_transactions.shp',
                                "longitude", "latitude", "",
                                arcpy.SpatialReference(4326, 115700))

<Result '.\\Data\\Shapefiles\\house_transactions.shp'>

## Hotels

In [4]:
#List of hotels, selenium scraper
hotels_list = pd.read_csv('./Data/Hotels/hotel_list_london.csv',
                    names=['Name','Price_Night_USD','Total_Reviews','Stars_Rating','Ranking_best_value','Link'])
hotels_list = hotels_list.drop(labels='Ranking_best_value', axis=1)
hotels_list.head()

Unnamed: 0,Name,Price_Night_USD,Total_Reviews,Stars_Rating,Link
0,The Arch London,257.0,1992,5.0,https://www.tripadvisor.com/Hotel_Review-g1863...
1,The Pelham – Starhotels Collezione,264.0,940,4.5,https://www.tripadvisor.com/Hotel_Review-g1863...
2,Premier Inn London Kings Cross Hotel,160.0,4899,4.5,https://www.tripadvisor.com/Hotel_Review-g1863...
3,Mondrian London at Sea Containers,286.0,4289,4.5,https://www.tripadvisor.com/Hotel_Review-g1863...
4,The Royal Horseguards,205.0,5188,4.5,https://www.tripadvisor.com/Hotel_Review-g1863...


In [5]:
#Details of hotels, including the address that we will merge
hotels_details = pd.read_csv('./Data/Hotels/hotel_details_london.csv')
hotels_details.head()

Unnamed: 0,h_address,postcode,h_amenities,h_link,h_ranking_users,h_star,r_average,r_excellent,r_poor,r_terrible,r_very_good
0,50 Great Cumberland Place,W1H 7FD,"Restaurant,Free High Speed Internet (WiFi),Fit...",https://www.tripadvisor.com/Hotel_Review-g1863...,9.0,5.0,48.0,1480.0,7.0,5.0,215.0
1,100 Shoreditch High Street,E1 6JQ,"Restaurant,Room Service,Fitness Center with Gy...",https://www.tripadvisor.com/Hotel_Review-g1863...,363.0,5.0,161.0,531.0,63.0,65.0,320.0
2,199 - 206 High Holborn,WC1V 7BD,"Free High Speed Internet (WiFi),Room Service,R...",https://www.tripadvisor.com/Hotel_Review-g1863...,76.0,4.0,78.0,1167.0,15.0,6.0,383.0
3,10 Manchester Street,W1U 4DG,"Room Service,Restaurant,Free High Speed Intern...",https://www.tripadvisor.com/Hotel_Review-g1863...,238.0,4.0,74.0,464.0,25.0,19.0,207.0
4,108 Baker Street,W1U 6LJ,"Room Service,Fitness Center with Gym / Workout...",https://www.tripadvisor.com/Hotel_Review-g1863...,365.0,4.0,312.0,914.0,127.0,76.0,814.0


In [6]:
hotels_details = pd.merge(hotels_details,uk_postcodes, on='postcode')
hotels_details = hotels_details.drop(labels='id',axis=1)
hotels_details.head()

Unnamed: 0,h_address,postcode,h_amenities,h_link,h_ranking_users,h_star,r_average,r_excellent,r_poor,r_terrible,r_very_good,latitude,longitude
0,50 Great Cumberland Place,W1H 7FD,"Restaurant,Free High Speed Internet (WiFi),Fit...",https://www.tripadvisor.com/Hotel_Review-g1863...,9.0,5.0,48.0,1480.0,7.0,5.0,215.0,51.515962,-0.159802
1,100 Shoreditch High Street,E1 6JQ,"Restaurant,Room Service,Fitness Center with Gy...",https://www.tripadvisor.com/Hotel_Review-g1863...,363.0,5.0,161.0,531.0,63.0,65.0,320.0,51.525743,-0.077353
2,199 - 206 High Holborn,WC1V 7BD,"Free High Speed Internet (WiFi),Room Service,R...",https://www.tripadvisor.com/Hotel_Review-g1863...,76.0,4.0,78.0,1167.0,15.0,6.0,383.0,51.516893,-0.122558
3,10 Manchester Street,W1U 4DG,"Room Service,Restaurant,Free High Speed Intern...",https://www.tripadvisor.com/Hotel_Review-g1863...,238.0,4.0,74.0,464.0,25.0,19.0,207.0,51.5188,-0.153879
4,108 Baker Street,W1U 6LJ,"Room Service,Fitness Center with Gym / Workout...",https://www.tripadvisor.com/Hotel_Review-g1863...,365.0,4.0,312.0,914.0,127.0,76.0,814.0,51.520914,-0.156705


In [7]:
#cheking null values
pd.DataFrame(hotels_details.isnull().sum())

Unnamed: 0,0
h_address,0
postcode,0
h_amenities,5
h_link,0
h_ranking_users,69
h_star,114
r_average,17
r_excellent,17
r_poor,17
r_terrible,17


In [8]:
#merge of the two tables by the hotel link
hotels = pd.merge(hotels_list,hotels_details, left_on='Link', right_on='h_link').reset_index()
hotels.drop(labels='h_link',axis=1, inplace=True)
hotels.head()

Unnamed: 0,index,Name,Price_Night_USD,Total_Reviews,Stars_Rating,Link,h_address,postcode,h_amenities,h_ranking_users,h_star,r_average,r_excellent,r_poor,r_terrible,r_very_good,latitude,longitude
0,0,The Arch London,257.0,1992,5.0,https://www.tripadvisor.com/Hotel_Review-g1863...,50 Great Cumberland Place,W1H 7FD,"Restaurant,Free High Speed Internet (WiFi),Fit...",9.0,5.0,48.0,1480.0,7.0,5.0,215.0,51.515962,-0.159802
1,1,The Pelham – Starhotels Collezione,264.0,940,4.5,https://www.tripadvisor.com/Hotel_Review-g1863...,15 Cromwell Place,SW7 2LA,"Restaurant,Room Service,Bar/Lounge,Fitness Cen...",191.0,5.0,68.0,534.0,22.0,5.0,214.0,51.494367,-0.175074
2,2,Premier Inn London Kings Cross Hotel,160.0,4899,4.5,https://www.tripadvisor.com/Hotel_Review-g1863...,26-30 York Way,N1 9AA,"Restaurant,Bar/Lounge,Free Internet,Air Condit...",181.0,4.0,350.0,2376.0,130.0,76.0,1506.0,51.531269,-0.122325
3,3,Mondrian London at Sea Containers,286.0,4289,4.5,https://www.tripadvisor.com/Hotel_Review-g1863...,20 Upper Ground,SE1 9PD,"Room Service,Bar/Lounge,Free High Speed Intern...",92.0,5.0,199.0,2900.0,52.0,36.0,721.0,51.508412,-0.106932
4,4,The Royal Horseguards,205.0,5188,4.5,https://www.tripadvisor.com/Hotel_Review-g1863...,2 Whitehall Court,SW1A 2EJ,"Room Service,Restaurant,Fitness Center with Gy...",175.0,5.0,352.0,3210.0,147.0,66.0,1006.0,51.505814,-0.124186


In [9]:
hotels.to_csv('./Data/Hotels/hotels_london.csv')

### GIS Enriching of datasets

We want to add the following around each of the hotels:

- Purchasing Power per Capita
- Average of household size
- Total household
- Ratio of each age group (there are 4 of them)
- Traveltime to the main city centre, finantial centre and Canary Wharf

In [18]:
hotels.shape #to check number of hotels

(1063, 18)

**Create Shapefile**

In [12]:
#we create the shapefile
arcpy.env.workspace = './Data/Shapefiles'
arcpy.management.XYTableToPoint('./Data/Hotels/hotels_london.csv','./Data/Shapefiles/hotels.shp',
                                "longitude", "latitude", "",
                                arcpy.SpatialReference(4326, 115700))

<Result '.\\Data\\Shapefiles\\hotels.shp'>

#### Publish to GIS for enriching

In [19]:
#Publish to GIS
hotels_csv = './Data/Hotels/hotels_london.csv'
item_prop = {'title':'hotels_london_tripadvisor'} #name of the layer in AGOL
#Try to add the csv to agol, if item exists will delete and add again
csv_to_publish = gis.content.add(item_properties=item_prop, data=hotels_csv)
hotels_shp = csv_to_publish.publish()
hotels_shp

**Enrich dataset**

In [20]:
uk = Country.get('GB')
uk.properties.datasets

datasets_esri = uk.data_collections

#we save in a CSV the data name sos we can gather the one sof interest
datasets_esri.to_csv('./Data/Shapefiles/datasets_esri.csv')

In [21]:
#We get the dataset in case has already been published
search_results = gis.content.search('title:hotels_london_tripadvisor', 'Feature Layer')
hotels_shp = search_results[0]
hotels_shp

In [23]:
#Enrich the dataset
data_collections = ['KeyFacts']

analysis_variables= ['KeyFacts.PAGE01_CY', #2017 Total Population Age 0-14
                     'KeyFacts.PAGE02_CY', #15-29
                     'KeyFacts.PAGE03_CY', #30-44
                     'KeyFacts.PAGE04_CY', #45-59
                     'KeyFacts.PAGE05CY', # >60
                     'KeyFacts.TOTPOP_CY', #Total populations
                     'KeyFacts.TOTHH_CY', # Total Households
                     'KeyFacts.HTYP01A_CY', #households with Single Person
                     'KeyFacts.PPPC_CY', #Purchasing Power per Capita
                     'KeyFacts.AVGHHSZ_CY', #Average Household Size
                    ]

#hotels_enriched = enrich_data.enrich_layer(hotels_shp, buffer_type='StraightLine', distance=0.5 ,units='Kilometers', 
                                         data_collections= data_collections, analysis_variables = analysis_variables)

#How to transform a feature collection to a df, first we 'query' teh feature collection adn a 
#Featureset is returned, then we use the df method to transform to a pd dataframe 

#Documentation here https://esri.github.io/arcgis-python-api/apidoc/html/arcgis.features.toc.html#featurecollection and
# here: https://esri.github.io/arcgis-python-api/apidoc/html/arcgis.features.toc.html#featureset
hotels_enriched_df = hotels_enriched.query().df

In [25]:
#We export teh shapefile for further analysis

hotels_enriched.query().save(save_location= './Data/Shapefiles',
     out_name='hotels_enriched', encoding=None)

#Lets change the name of the columns and clean the data
hotels_enriched_df = hotels_enriched_df.drop(['HasData', 'ID', 'OBJECTID', 'ENRICH_FID', 'aggregationMethod', 'areaType', 'bufferUnits', 'sourceCountry', 'SHAPE' ], axis=1)
hotels_enriched_df = hotels_enriched_df.rename(index=str, columns={"PAGE01_CY": "p_b_14", 
                                                               "PAGE02_CY": "p15_29",
                                                               "PAGE03_CY": "p30_44",
                                                               "PAGE04_CY": "p45_59",
                                                               "PAGE05_CY": "p_m_60",
                                                               "TOTPOP_CY": "tot_pop",
                                                               "TOTHH_CY": "tot_hhs",
                                                               "HTYP01A_CY": "single_hhs",
                                                               "PPPC_CY": "ppc",
                                                               "AVGHHSZ_CY": "hh_size"})

#Just to check we have the right data and save it ot a new dataframe and CSV so we dont need to run everytime
hotels_enriched_df.head(5)

Unnamed: 0,hh_size,single_hhs,Link,Name,p_b_14,p15_29,p30_44,p45_59,ppc,Price_Night_USD,Stars_Rating,tot_hhs,tot_pop,Total_Reviews,bufferRadii,bufferUnitsAlias,h_address,h_amenities,h_ranking_users,h_star,index_,latitude,longitude,postcode,r_average,r_excellent,r_poor,r_terrible,r_very_good
0,1.8,2844,https://www.tripadvisor.com/Hotel_Review-g1863...,The Arch London,1313,2291,3237,1679,45036.36,257.0,5.0,5598,10274,1992,0.5,Kilometers,50 Great Cumberland Place,"Restaurant,Free High Speed Internet (WiFi),Fit...",9.0,5.0,0,51.515962,-0.159802,W1H 7FD,48.0,1480.0,7.0,5.0,215.0
1,3.5,602,https://www.tripadvisor.com/Hotel_Review-g1863...,Ruskin Hotel,2775,2869,3168,1966,14963.16,56.0,3.5,3451,12243,626,0.5,Kilometers,386 High Street North,"Free Parking,Bar/Lounge,Free High Speed Intern...",604.0,3.0,500,51.544138,0.049934,E12 6PH,60.0,130.0,31.0,33.0,136.0
2,1.9,1798,https://www.tripadvisor.com/Hotel_Review-g1863...,The Pelham – Starhotels Collezione,1205,1360,1957,1495,56240.8,264.0,4.5,3874,7419,940,0.5,Kilometers,15 Cromwell Place,"Restaurant,Room Service,Bar/Lounge,Fitness Cen...",191.0,5.0,1,51.494367,-0.175074,SW7 2LA,68.0,534.0,22.0,5.0,214.0
3,2.5,1772,https://www.tripadvisor.com/Hotel_Review-g1863...,Premier Inn London Kings Cross Hotel,1629,4421,2335,1489,23642.09,160.0,4.5,4344,10974,4899,0.5,Kilometers,26-30 York Way,"Restaurant,Bar/Lounge,Free Internet,Air Condit...",181.0,4.0,2,51.531269,-0.122325,N1 9AA,350.0,2376.0,130.0,76.0,1506.0
4,1.9,1311,https://www.tripadvisor.com/Hotel_Review-g1863...,Mondrian London at Sea Containers,351,2164,1082,824,26184.8,286.0,4.5,2618,5073,4289,0.5,Kilometers,20 Upper Ground,"Room Service,Bar/Lounge,Free High Speed Intern...",92.0,5.0,3,51.508412,-0.106932,SE1 9PD,199.0,2900.0,52.0,36.0,721.0


In [26]:
#Lets save the data in a CSV
hotels_enriched_df.to_csv('./Data/Hotels/hotels_enriched.csv')

In [27]:
#Load data and inspect
hotels_enriched = pd.read_csv('./Data/Hotels/hotels_enriched.csv')
hotels_enriched.head()

Unnamed: 0.1,Unnamed: 0,hh_size,single_hhs,Link,Name,p_b_14,p15_29,p30_44,p45_59,ppc,Price_Night_USD,Stars_Rating,tot_hhs,tot_pop,Total_Reviews,bufferRadii,bufferUnitsAlias,h_address,h_amenities,h_ranking_users,h_star,index_,latitude,longitude,postcode,r_average,r_excellent,r_poor,r_terrible,r_very_good
0,0,1.8,2844,https://www.tripadvisor.com/Hotel_Review-g1863...,The Arch London,1313,2291,3237,1679,45036.36,257.0,5.0,5598,10274,1992,0.5,Kilometers,50 Great Cumberland Place,"Restaurant,Free High Speed Internet (WiFi),Fit...",9.0,5.0,0,51.515962,-0.159802,W1H 7FD,48.0,1480.0,7.0,5.0,215.0
1,1,3.5,602,https://www.tripadvisor.com/Hotel_Review-g1863...,Ruskin Hotel,2775,2869,3168,1966,14963.16,56.0,3.5,3451,12243,626,0.5,Kilometers,386 High Street North,"Free Parking,Bar/Lounge,Free High Speed Intern...",604.0,3.0,500,51.544138,0.049934,E12 6PH,60.0,130.0,31.0,33.0,136.0
2,2,1.9,1798,https://www.tripadvisor.com/Hotel_Review-g1863...,The Pelham – Starhotels Collezione,1205,1360,1957,1495,56240.8,264.0,4.5,3874,7419,940,0.5,Kilometers,15 Cromwell Place,"Restaurant,Room Service,Bar/Lounge,Fitness Cen...",191.0,5.0,1,51.494367,-0.175074,SW7 2LA,68.0,534.0,22.0,5.0,214.0
3,3,2.5,1772,https://www.tripadvisor.com/Hotel_Review-g1863...,Premier Inn London Kings Cross Hotel,1629,4421,2335,1489,23642.09,160.0,4.5,4344,10974,4899,0.5,Kilometers,26-30 York Way,"Restaurant,Bar/Lounge,Free Internet,Air Condit...",181.0,4.0,2,51.531269,-0.122325,N1 9AA,350.0,2376.0,130.0,76.0,1506.0
4,4,1.9,1311,https://www.tripadvisor.com/Hotel_Review-g1863...,Mondrian London at Sea Containers,351,2164,1082,824,26184.8,286.0,4.5,2618,5073,4289,0.5,Kilometers,20 Upper Ground,"Room Service,Bar/Lounge,Free High Speed Intern...",92.0,5.0,3,51.508412,-0.106932,SE1 9PD,199.0,2900.0,52.0,36.0,721.0


#### Airbnb Data

We have prefiltered the data to take into account, only private rooms, with a review in the last 9 months, more than 10 reviews and with availability.

In [28]:
airbnb = pd.read_csv('./Data/Airbnb/airbnb_listings_for_project.csv')
airbnb.head()                    

Unnamed: 0,id,listing_url,host_id,host_url,host_name,host_since,host_location,host_about,zipcode,latitude,longitude,is_location_exact,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,price,guests_included,calendar_updated,has_availability,availability_30,availability_60,availability_90,availability_365,calendar_last_scraped,number_of_reviews,last_review,review_scores_rating
0,9554,https://www.airbnb.com/rooms/9554,31655,https://www.airbnb.com/users/show/31655,Guy,40039,"London, England, United Kingdom",Please contact me before booking! Please see d...,N8 0EY,51.587767,-0.105666,f,Apartment,Private room,2,,1.0,1.0,Real Bed,35,1,2 weeks ago,t,13,21,21,262,43321,131,43315.0,97.0
1,13913,https://www.airbnb.com/rooms/13913,54730,https://www.airbnb.com/users/show/54730,Alina,40133,"London, England, United Kingdom",I am a Multi-Media Visual Artist and Creative ...,N4 3,51.568017,-0.111208,t,Apartment,Private room,2,1.0,1.0,1.0,Real Bed,45,1,a week ago,t,29,59,89,364,43321,14,43268.0,95.0
2,17402,https://www.airbnb.com/rooms/17402,67564,https://www.airbnb.com/users/show/67564,Liz,40182,"London, England, United Kingdom",Hello!\r\n\r\nWe are Liz and Jack. We look for...,W1T4BP,51.520982,-0.140024,t,Apartment,Entire home/apt,6,2.0,3.0,3.0,Real Bed,300,4,today,t,13,25,49,135,43321,31,43313.0,93.0
3,25123,https://www.airbnb.com/rooms/25123,103583,https://www.airbnb.com/users/show/103583,Grace,40273,"London, England, United Kingdom","Easy going, friendly\r\n\r\nAttentive to detai...",NW11 9,51.572243,-0.20906,t,House,Private room,2,1.5,1.0,1.0,Real Bed,29,1,5 weeks ago,t,4,4,10,285,43321,114,43221.0,95.0
4,26223,https://www.airbnb.com/rooms/26223,110865,https://www.airbnb.com/users/show/110865,Paulo,40287,"London, England, United Kingdom","Originally from Southern Europe, and now livin...",N1 2,51.54168,-0.102065,t,Apartment,Entire home/apt,4,1.0,1.0,3.0,Real Bed,150,4,today,t,29,45,75,350,43321,54,43308.0,84.0


**Create shapefile**

In [29]:
#we create the shapefile
arcpy.env.workspace = './Data/Shapefiles'
arcpy.management.XYTableToPoint('./Data/Airbnb/airbnb_listings_for_project.csv','./Data/Shapefiles/airbnb.shp',
                                "longitude", "latitude", "",
                                arcpy.SpatialReference(4326, 115700))

<Result '.\\Data\\Shapefiles\\airbnb.shp'>

### Restaurants Dataset

Steps to follow:

- Read both datasets and merge by link
- Create Shapefile (already have coordinates)

In [31]:
restaurant_list = pd.read_csv('./Data/Restaurants/rest_list_london.csv')
restaurant_list.head()

Unnamed: 0,Restaurant Name,Price Range,Cuisine type,Number of reviews,Reviews rating (out of 5),Ranking,Restaurant Link
0,Kama Sushi @ Sticky Mango,$$$$,"Japanese, Sushi, Asian, Vegetarian Friendly, G...",88,5.0,1.0,https://www.tripadvisor.com/Restaurant_Review-...
1,Amrutha Lounge,$$ - $$$,"Indian, Asian, Thai, Healthy, Vegetarian Frien...",160,5.0,2.0,https://www.tripadvisor.com/Restaurant_Review-...
2,Humble Grape,$$ - $$$,"European, Wine Bar, Vegan Options, Vegetarian ...",169,5.0,3.0,https://www.tripadvisor.com/Restaurant_Review-...
3,The Lounge Cafe,$,"British, Vegetarian Friendly, Vegan Options, G...",131,5.0,4.0,https://www.tripadvisor.com/Restaurant_Review-...
4,Pizza Union Dalston,$,"Italian, Pizza, Fast Food, Vegetarian Friendly...",368,5.0,5.0,https://www.tripadvisor.com/Restaurant_Review-...


In [32]:
restaurant_details = pd.read_csv('./Data/Restaurants/rest_details_london.csv')
restaurant_details.head()

Unnamed: 0,r_address,r_data,r_lat,r_link,r_long
0,"91 Stamford St , London SE1 9NR,",,51.506176,https://www.tripadvisor.com/Restaurant_Review-...,-0.109894
1,"38C Kensington Church Street , London W8 4BX,",,51.504189,https://www.tripadvisor.com/Restaurant_Review-...,-0.192753
2,"34-38 Southampton Street , London WC2E 7HG,",,51.510845,https://www.tripadvisor.com/Restaurant_Review-...,-0.121941
3,"1A Launceston Place , London W8 5RL,",,51.499207,https://www.tripadvisor.com/Restaurant_Review-...,-0.185377
4,"28 Church Row , London NW3 6UP,",,51.555576,https://www.tripadvisor.com/Restaurant_Review-...,-0.17878


In [33]:
#Merging both datasets and saving the results to csv too
restaurants = pd.merge(restaurant_list,restaurant_details, left_on='Restaurant Link', right_on='r_link')
restaurants.drop(labels='r_link',axis=1, inplace=True)
restaurants.head()

Unnamed: 0,Restaurant Name,Price Range,Cuisine type,Number of reviews,Reviews rating (out of 5),Ranking,Restaurant Link,r_address,r_data,r_lat,r_long
0,Kama Sushi @ Sticky Mango,$$$$,"Japanese, Sushi, Asian, Vegetarian Friendly, G...",88,5.0,1.0,https://www.tripadvisor.com/Restaurant_Review-...,"91 Stamford St , London SE1 9NR,",,51.506176,-0.109894
1,Amrutha Lounge,$$ - $$$,"Indian, Asian, Thai, Healthy, Vegetarian Frien...",160,5.0,2.0,https://www.tripadvisor.com/Restaurant_Review-...,"326 Garratt Lane , London SW18 4EJ,",,51.443775,-0.189937
2,Humble Grape,$$ - $$$,"European, Wine Bar, Vegan Options, Vegetarian ...",169,5.0,3.0,https://www.tripadvisor.com/Restaurant_Review-...,"11-13 Theberton Street , London N1 0QY,",,51.537575,-0.103633
3,The Lounge Cafe,$,"British, Vegetarian Friendly, Vegan Options, G...",131,5.0,4.0,https://www.tripadvisor.com/Restaurant_Review-...,"Welford Centre 113 Chalkhill Road , London HA9...",,51.564049,-0.276614
4,Pizza Union Dalston,$,"Italian, Pizza, Fast Food, Vegetarian Friendly...",368,5.0,5.0,https://www.tripadvisor.com/Restaurant_Review-...,"14 Kingsland Road , London E13 9PA,",,51.54689,-0.075558


In [35]:
restaurants.to_csv('./Data/Restaurants/restaurants.csv')

In [36]:
#cheking null values
pd.DataFrame(restaurants.isnull().sum())

Unnamed: 0,0
Restaurant Name,0
Price Range,5283
Cuisine type,3661
Number of reviews,0
Reviews rating (out of 5),0
Ranking,1771
Restaurant Link,0
r_address,196
r_data,18727
r_lat,117


**Create shapefile**

In [52]:
#we create the shapefile
arcpy.env.workspace = './Data/Shapefiles'
arcpy.management.XYTableToPoint('./Data/Restaurants/restaurants.csv','./Data/Shapefiles/restaurants.shp',
                                "r_long", "r_lat", "",
                                arcpy.SpatialReference(4326, 115700))

<Result '.\\Data\\Shapefiles\\restaurants.shp'>