# 
### by Marc Gou

## In the context of IBM Capstone Project

### Step 1: Get the data of London neighborhood

Source: https://en.wikipedia.org/wiki/List_of_areas_of_London

In [322]:
# Installation of useful libraries
#!conda install -c conda-forge beautifulsoup4 --yes

# Import of useful libraries
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors
import geopy as gp
import pandas as pd
#pd.set_option('display.max_columns', None)
#pd.set_option('display.max_rows', None)

from bs4 import BeautifulSoup
import urllib.request as req

In [323]:
# Download the Wikipedia page that lists the areas of London
wiki_url = 'https://en.wikipedia.org/wiki/List_of_areas_of_London'

# Parse inside a context manager so the HTTP response is closed once
# BeautifulSoup has consumed it
with req.urlopen(wiki_url) as wiki_req:
    wiki_soup = BeautifulSoup(wiki_req, 'lxml')

# Locate the neighborhood table: it is the table whose class is "wikitable sortable"
neig_table = wiki_soup.find('table', attrs={'class': 'wikitable sortable'})
if neig_table is None:
    # Fail here with a clear message rather than with an AttributeError in a later cell
    raise ValueError('No "wikitable sortable" table found at ' + wiki_url)

Now let's extract the data from this table to a dataframe:
- we can see that the headers are marked with the tag *th*
- we can also see that each neighborhood is marked with the tag *tr*, with one *td* for the value of each of its attributes

In [324]:
# Collect every header cell (<th>) of the neighborhood table
table_h = neig_table.find_all('th')

# Clean the first child of each header (remove newlines, turn non-breaking
# spaces into regular spaces) and keep only the first three column names
headers = [
    th.contents[0].replace('\n', '').replace('\xa0', ' ')
    for th in table_h
][:3]

In [325]:
def _cell_value(cell):
    """Return the value of one <td>: the link title if its first child is a link, else its text."""
    if not cell.contents:
        # Empty cell: nothing to read
        return ''
    first = cell.contents[0]
    if isinstance(first, str):
        # Plain text node
        return first.replace('\n', '')
    if str(first)[0:2] == '<a':
        # Linked value: prefer the link title, fall back to the visible text
        title = first.get('title')
        return title if title is not None else first.get_text()
    # Any other tag: use its text
    return first.get_text().replace('\n', '')


# Extract all the neighborhood data cells (<td>) as one flat list
table_n = neig_table.findAll('td')

# Each table row spans 6 consecutive cells; only the first 3 are kept, matching
# the 3 headers. The stop value len(table_n) - 5 includes the last complete row
# (the previous len(table_n) - 6 silently dropped it).
full_data = []
for i in range(0, len(table_n) - 5, 6):
    full_data.append([_cell_value(table_n[i + k]) for k in range(3)])

In [326]:
# Build the London dataframe from the scraped rows
df_lond_ngh = pd.DataFrame(full_data, columns=headers)

# Keep only the rows whose post town is LONDON (the Paris data only covers
# Paris City), then renumber the index from 0
in_london = df_lond_ngh['Post town'] == 'LONDON'
df_lond_ngh = df_lond_ngh.loc[in_london].reset_index(drop=True)

In [327]:
# Only the locations lying in an Inner London borough are kept
inner_london_bor = ['City', 'Camden', 'Greenwich', 'Hackney', 'Hammersmith and Fulham', 'Islington', 'Kensington and Chelsea', 'Lambeth', 'Lewisham', 'Southwark', 'Tower Hamlets', 'Wandsworth', 'Westminster']

# The second column holds one or several boroughs separated by ", ";
# a location qualifies as soon as one of them is an Inner London borough
is_inner_bor = [
    any(borough in inner_london_bor for borough in cell.split(", "))
    for cell in df_lond_ngh.iloc[:, 1]
]

df_lond_ngh['Inner'] = is_inner_bor
inner_only = df_lond_ngh['Inner'] == True
df_lond_ngh = df_lond_ngh[inner_only].reset_index(drop=True)

Now let's add the latitude and longitude coordinates to the dataframe

In [328]:
# Import the Nominatim geocoder and create a client
from geopy.geocoders import Nominatim

geolocator = Nominatim(user_agent='myapplication')

# Latitude / longitude of every London location, in dataframe order
lond_lats = []
lond_lngs = []

# One geocoding query per location name (first column); a query without
# result returns None and is recorded as NaN.
# NOTE(review): the query is not restricted to London - the results shown
# further down place "Barbican Estate" at latitude 55.4, far from London.
for place in df_lond_ngh.iloc[:, 0]:
    location = geolocator.geocode(place + ", United Kingdom")
    if location is None:
        lond_lats.append(np.nan)
        lond_lngs.append(np.nan)
    else:
        lond_lats.append(location.raw['lat'])
        lond_lngs.append(location.raw['lon'])

In [329]:
# Attach the geocoded coordinates as two new columns
df_lond_ngh = df_lond_ngh.assign(Latitude=lond_lats, Longitude=lond_lngs)

Now let's remove the rows with no coordinates data

In [330]:
# Drop the locations that could not be geocoded (NaN coordinates).
# reset_index returns a new frame, so its result must be assigned back;
# previously it was discarded and the index kept gaps where rows were dropped.
df_lond_ngh = df_lond_ngh.dropna(how='any').reset_index(drop=True)
df_lond_ngh.head()

Unnamed: 0,Location,London borough,Post town,Inner,Latitude,Longitude
0,Abbey Wood,Greenwich,LONDON,True,51.487621,0.1140504
1,"Acton, London","Ealing, Hammersmith and Fulham",LONDON,True,51.5081402,-0.2732607
2,Aldgate,City,LONDON,True,51.5142477,-0.0757186
3,Aldwych,Westminster,LONDON,True,51.5124367,-0.1187414
4,"Angel, London",Islington,LONDON,True,51.5319458,-0.1061056


---

### Step 2: Get the data of Paris neighborhood

Source: https://fr.wikipedia.org/wiki/Liste_des_quartiers_administratifs_de_Paris

In [331]:
# Download the Wikipedia page that lists the administrative quarters of Paris
wiki_url = 'https://fr.wikipedia.org/wiki/Liste_des_quartiers_administratifs_de_Paris'

# Parse inside a context manager so the HTTP response is closed once
# BeautifulSoup has consumed it
with req.urlopen(wiki_url) as wiki_req:
    wiki_soup = BeautifulSoup(wiki_req, 'lxml')

# Locate the neighborhood table: it is the table whose class is "wikitable sortable"
neig_table = wiki_soup.find('table', attrs={'class': 'wikitable sortable'})
if neig_table is None:
    # Fail here with a clear message rather than with an AttributeError in a later cell
    raise ValueError('No "wikitable sortable" table found at ' + wiki_url)

Now let's extract the data from this table to a dataframe:
- we can see that the headers are marked with the tag *th*
- we can also see that each neighborhood is marked with the tag *tr*, with one *td* for the value of each of its attributes

In [332]:
# Collect every header cell (<th>) of the table
table_h = neig_table.find_all('th')

# A header is either a link (use its title attribute) or plain text
# (strip the newlines)
headers = []
for th in table_h:
    first = th.contents[0]
    is_link = str(first)[0:2] == '<a'
    headers.append(first.get('title') if is_link else first.replace('\n', ''))

# Only the first two fields are used (Arrondissement of Paris and Quartiers)
headers = headers[:2]
headers

['Arrondissements de Paris', 'Quartiers']

In [333]:
# The quartier names are the <td> cells whose markup starts with a link
# ("<td><a href="). The very last <td> is not examined, as in the index-based
# loop this replaces.
table_n = neig_table.find_all('td')
quartiers = [
    cell.string
    for cell in table_n[:-1]
    if str(cell)[:12] == "<td><a href="
]

# Pair each quartier with its arrondissement: the arrondissement number
# increases by one every 4 quartiers, in table order
pc_data = [
    ["Arrondissement " + str(pos // 4 + 1), quartier]
    for pos, quartier in enumerate(quartiers)
]

In [334]:
# Build the Paris dataframe: one row per quartier, labelled with the two scraped headers
df_paris_ngh = pd.DataFrame(pc_data, columns=headers)

Now let's add the latitude and longitude coordinates to the dataframe

In [335]:
# Latitude / longitude of every Paris quartier, in dataframe order
paris_lats = []
paris_lngs = []

# One geocoding query per quartier name (second column); a query without
# result returns None and is recorded as NaN
for quartier in df_paris_ngh.iloc[:, 1]:
    location = geolocator.geocode(quartier + ", Paris, France")
    if location is None:
        paris_lats.append(np.nan)
        paris_lngs.append(np.nan)
    else:
        paris_lats.append(location.raw['lat'])
        paris_lngs.append(location.raw['lon'])

In [336]:
# Insert the latitudes and longitudes into the dataframe
df_paris_ngh['Latitude'] = paris_lats
df_paris_ngh['Longitude'] = paris_lngs

# Drop the quartiers that could not be geocoded (NaN coordinates).
# reset_index returns a new frame, so its result must be assigned back;
# previously it was discarded and the index kept gaps where rows were dropped.
df_paris_ngh = df_paris_ngh.dropna(how='any').reset_index(drop=True)
df_paris_ngh.head()

Unnamed: 0,Arrondissements de Paris,Quartiers,Latitude,Longitude
0,Arrondissement 1,Saint-Germain-l'Auxerrois,48.8596955,2.3406333
1,Arrondissement 1,Halles,48.8616513,2.3470129
2,Arrondissement 1,Palais-Royal,48.8635847,2.33620422009387
3,Arrondissement 1,Place-Vendôme,48.8674634,2.32942811682519
4,Arrondissement 2,Gaillon,48.86913515,2.33290877033551


---

### Step 3: Getting the venues data from Neighborhoods

In [337]:
# import k-means from clustering stage
from sklearn.cluster import KMeans

 # import the library to handle requests
import requests

# install and import map rendering library
# !conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
# import folium

Let's create a function to repeat the same process to all the neighborhoods in a city

In [338]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, client_id=None, client_secret=None):
    """Query the Foursquare "explore" endpoint around each neighborhood.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Name and coordinates of each neighborhood; they are consumed in parallel.
    radius : int, default 500
        Value sent as the ``radius`` query parameter.
    client_id, client_secret : str, optional
        Foursquare credentials. When omitted they are read from the environment
        variables FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET, so that no
        secret has to be stored in the notebook.

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'Neighborhood', 'Neighborhood Latitude',
        'Neighborhood Longitude', 'Venue', 'Venue Latitude', 'Venue Longitude' and
        'Venue Category'. The frame is empty (but has these columns) when no venue
        is returned.

    Raises
    ------
    RuntimeError
        If no credentials are supplied or found in the environment.
    requests.HTTPError
        If Foursquare answers with an HTTP error status.
    """
    import os

    client_id = client_id or os.environ.get('FOURSQUARE_CLIENT_ID')
    client_secret = client_secret or os.environ.get('FOURSQUARE_CLIENT_SECRET')
    if not client_id or not client_secret:
        raise RuntimeError(
            'Foursquare credentials are missing: pass client_id/client_secret or set '
            'the FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET environment variables')

    VERSION = '20180605' # Foursquare API version
    column_names = ['Neighborhood',
                    'Neighborhood Latitude',
                    'Neighborhood Longitude',
                    'Venue',
                    'Venue Latitude',
                    'Venue Longitude',
                    'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            client_id,
            client_secret,
            VERSION,
            lat,
            lng,
            radius,
            1000)

        # make the GET request; the timeout prevents a stalled connection from
        # blocking the notebook, and an HTTP error is reported as such
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        groups = response.json()["response"].get('groups') or []
        items = groups[0]['items'] if groups else []

        # keep only the relevant information of each nearby venue
        for item in items:
            venue = item['venue']
            categories = venue.get('categories') or []
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue without category is kept, with a missing category
                categories[0]['name'] if categories else None))

    # Passing the column names to the constructor also works when no venue was found
    return pd.DataFrame(venue_rows, columns=column_names)

Run the above function on each area of London

In [339]:
# Fetch the venues around every London location
london_venues = getNearbyVenues(
    names=df_lond_ngh['Location'],
    latitudes=df_lond_ngh['Latitude'],
    longitudes=df_lond_ngh['Longitude'],
)
n_london_categories = london_venues['Venue Category'].nunique(dropna=False)
print('The shape of the dataframe london_venues is: {}'.format(london_venues.shape))
print('There are {} uniques categories.'.format(n_london_categories))
london_venues.head()

The shape of the dataframe london_venues is: (7388, 7)
There are 368 uniques categories.


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Abbey Wood,51.487621,0.1140504,Co-op Food,51.48765,0.11349,Grocery Store
1,Abbey Wood,51.487621,0.1140504,Bostal Gardens,51.48667,0.110462,Playground
2,Abbey Wood,51.487621,0.1140504,Cheers Off License,51.486808,0.107396,Grocery Store
3,Abbey Wood,51.487621,0.1140504,Abbey Wood Caravan Club,51.485502,0.120014,Campground
4,"Acton, London",51.5081402,-0.2732607,London Star Hotel,51.509624,-0.272456,Hotel


Run the above function on each area of Paris

In [340]:
# Fetch the venues around every Paris quartier
paris_venues = getNearbyVenues(
    names=df_paris_ngh['Quartiers'],
    latitudes=df_paris_ngh['Latitude'],
    longitudes=df_paris_ngh['Longitude'],
)
n_paris_categories = paris_venues['Venue Category'].nunique(dropna=False)
print('The shape of the dataframe paris_venues is: {}'.format(paris_venues.shape))
print('There are {} uniques categories.'.format(n_paris_categories))
paris_venues.head()

The shape of the dataframe paris_venues is: (5566, 7)
There are 298 uniques categories.


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Saint-Germain-l'Auxerrois,48.8596955,2.3406333,Cour Carrée du Louvre,48.86036,2.338543,Pedestrian Plaza
1,Saint-Germain-l'Auxerrois,48.8596955,2.3406333,Place du Louvre,48.859841,2.340822,Plaza
2,Saint-Germain-l'Auxerrois,48.8596955,2.3406333,Le Fumoir,48.860341,2.340647,Cocktail Bar
3,Saint-Germain-l'Auxerrois,48.8596955,2.3406333,Église Saint-Germain l'Auxerrois,48.859887,2.340617,Church
4,Saint-Germain-l'Auxerrois,48.8596955,2.3406333,Kong,48.859122,2.343089,French Restaurant


Eliminate the venue categories that do not exist in the other city

In [341]:
# Distinct venue categories of each city, and the categories present in both
lond_categories = list(london_venues['Venue Category'].unique())
paris_categories = list(paris_venues['Venue Category'].unique())
common_categories = set(lond_categories).intersection(paris_categories)

Now, let's eliminate the London venues whose category does not exist in Paris:

In [342]:
# Flag every London venue whose category also occurs in Paris, then keep only those
london_venues['Common'] = london_venues['Venue Category'].isin(common_categories)
london_venues = london_venues[london_venues['Common']].reset_index(drop=True)
print('The shape of the dataframe london_venues (only including common venue categories) is: {}'.format(london_venues.shape))

The shape of the dataframe london_venues (only including common venue categories) is: (7011, 8)


Now, let's eliminate the Paris venues whose category does not exist in London:

In [343]:
# Flag every Paris venue whose category also occurs in London, then keep only those
paris_venues['Common'] = paris_venues['Venue Category'].isin(common_categories)
paris_venues = paris_venues[paris_venues['Common']].reset_index(drop=True)
print('The shape of the dataframe paris_venues (only including common venue categories) is: {}'.format(paris_venues.shape))

The shape of the dataframe paris_venues (only including common venue categories) is: (5420, 8)


Create two new dataframes based on the venue category and group it by Neighborhood (Location for London, Quartier for Paris)

In [344]:
def _onehot_and_mean(venues, key):
    """One-hot encode the venue categories and average them per neighborhood.

    Returns the one-hot frame (with the neighborhood column `key` moved to the
    front) and the per-neighborhood mean of every category column.
    """
    onehot = pd.get_dummies(venues[['Venue Category']], prefix="", prefix_sep="")
    # add the neighborhood column, then rotate the last column to the first position
    onehot[key] = venues['Neighborhood']
    ordered = onehot.columns.tolist()
    onehot = onehot[ordered[-1:] + ordered[:-1]]
    # group by neighborhood: mean frequency of each category
    return onehot, onehot.groupby(key).mean().reset_index()


# LONDON (neighborhood column named 'Location')
london_onehot, london_grouped = _onehot_and_mean(london_venues, 'Location')
print('The shape of the dataframe london_grouped is: {}'.format(london_grouped.shape))

# PARIS (neighborhood column named 'Quartier')
paris_onehot, paris_grouped = _onehot_and_mean(paris_venues, 'Quartier')
print('The shape of the dataframe paris_grouped is: {}'.format(paris_grouped.shape))

The shape of the dataframe london_grouped is: (179, 244)
The shape of the dataframe paris_grouped is: (79, 244)


In [345]:
# Rank the venue categories of one neighborhood by decreasing frequency.
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the `num_top_venues` largest values of `row`.

    The first element of `row` (the neighborhood name) is skipped; the other
    values are sorted in descending order and the index labels of the leading
    ones are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [346]:
# Create the new dataframes listing the top 5 venue categories of each neighborhood
num_top_venues = 5

indicators = ['st', 'nd', 'rd']

# Create one column per rank: 1st, 2nd, 3rd, then 4th, 5th, ...
# The suffix is chosen with an explicit bounds check instead of a bare
# `except:`, which would also have hidden unrelated errors.
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# create a new dataframe for London
london_venues_sorted = pd.DataFrame(columns=columns)
london_venues_sorted['Neighborhood'] = london_grouped['Location']
for ind in np.arange(london_grouped.shape[0]):
    london_venues_sorted.iloc[ind, 1:] = return_most_common_venues(london_grouped.iloc[ind, :], num_top_venues)

# create a new dataframe for Paris
paris_venues_sorted = pd.DataFrame(columns=columns)
paris_venues_sorted['Neighborhood'] = paris_grouped['Quartier']
for ind in np.arange(paris_grouped.shape[0]):
    paris_venues_sorted.iloc[ind, 1:] = return_most_common_venues(paris_grouped.iloc[ind, :], num_top_venues)

---

### Step 4: Clustering

In [347]:
# LONDON
kclusters_l = 18
# Drop the name column by keyword: the positional form drop('Location', 1)
# is no longer accepted by pandas 2.x
london_grouped_clustering = london_grouped.drop(columns='Location')
kmeans_lond = KMeans(n_clusters=kclusters_l, random_state=0).fit(london_grouped_clustering) # run k-means clustering
london_venues_sorted['Cluster Labels'] = kmeans_lond.labels_ # add clustering labels

# PARIS
kclusters_p = 8
paris_grouped_clustering = paris_grouped.drop(columns='Quartier')
kmeans_paris = KMeans(n_clusters=kclusters_p, random_state=0).fit(paris_grouped_clustering) # run k-means clustering
paris_venues_sorted['Cluster Labels'] = kmeans_paris.labels_ # add clustering labels

Let's create a new dataframe that includes the cluster as well as the top 5 venues for each neighborhood in London and Paris

In [348]:
# LONDON: attach the top venues and cluster label to each location, dropping
# the locations that have no venue data
london_final_df = (
    df_lond_ngh
    .join(london_venues_sorted.set_index('Neighborhood'), on='Location')
    .dropna()
    .reset_index(drop=True)
)
london_final_df['Cluster Labels'] = london_final_df['Cluster Labels'].apply(np.int64)
london_final_df.head()

Unnamed: 0,Location,London borough,Post town,Inner,Latitude,Longitude,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,Cluster Labels
0,Abbey Wood,Greenwich,LONDON,True,51.487621,0.1140504,Grocery Store,Playground,Zoo Exhibit,Ethiopian Restaurant,French Restaurant,14
1,"Acton, London","Ealing, Hammersmith and Fulham",LONDON,True,51.5081402,-0.2732607,Pub,Gym / Fitness Center,Fast Food Restaurant,Creperie,Coffee Shop,17
2,Aldgate,City,LONDON,True,51.5142477,-0.0757186,Coffee Shop,Hotel,Indian Restaurant,Pub,Cocktail Bar,16
3,Aldwych,Westminster,LONDON,True,51.5124367,-0.1187414,Theater,French Restaurant,Coffee Shop,Tea Room,Dessert Shop,17
4,"Angel, London",Islington,LONDON,True,51.5319458,-0.1061056,Pub,Coffee Shop,Hotel,Café,Gym / Fitness Center,1


In [349]:
# PARIS: attach the top venues and cluster label to each quartier, dropping
# the quartiers that have no venue data
paris_final_df = (
    df_paris_ngh
    .join(paris_venues_sorted.set_index('Neighborhood'), on='Quartiers')
    .dropna()
    .reset_index(drop=True)
)
paris_final_df['Cluster Labels'] = paris_final_df['Cluster Labels'].apply(np.int64)
paris_final_df.head()

Unnamed: 0,Arrondissements de Paris,Quartiers,Latitude,Longitude,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,Cluster Labels
0,Arrondissement 1,Saint-Germain-l'Auxerrois,48.8596955,2.3406333,French Restaurant,Hotel,Pizza Place,Clothing Store,Plaza,1
1,Arrondissement 1,Halles,48.8616513,2.3470129,French Restaurant,Wine Bar,Bar,Pizza Place,Pub,4
2,Arrondissement 1,Palais-Royal,48.8635847,2.33620422009387,Café,Hotel,French Restaurant,Japanese Restaurant,Plaza,0
3,Arrondissement 1,Place-Vendôme,48.8674634,2.32942811682519,Hotel,French Restaurant,Jewelry Store,Japanese Restaurant,Pastry Shop,3
4,Arrondissement 2,Gaillon,48.86913515,2.33290877033551,Hotel,Japanese Restaurant,French Restaurant,Jewelry Store,Italian Restaurant,0


---

### Step 5: Build a classification model for each city

Build a model for London using a support vector machine

In [350]:
# Import useful libraries
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import metrics

# Target vector: the k-means cluster label of each London neighborhood.
# Feature matrix: the per-neighborhood category frequencies that were clustered.
y_l = kmeans_lond.labels_.tolist()
X_l = london_grouped_clustering

In [351]:
# Split the data into a training set and a test set
X_train_l, X_test_l, y_train_l, y_test_l = train_test_split(X_l, y_l, test_size=0.25, random_state=4)

# Create an instance of SVM and train it with the training set
lond_SVM = svm.SVC(gamma=10, decision_function_shape='ovo')
lond_SVM.fit(X_train_l, y_train_l)

# Predict on the test set and check the accuracy.
# metrics.jaccard_similarity_score has been removed from scikit-learn; for
# multiclass labels it was equivalent to accuracy_score, which therefore
# reports the same value.
lond_predSVM = lond_SVM.predict(X_test_l)
print("Support Vector Machine's Accuracy according to Jaccard Index: ", metrics.accuracy_score(y_test_l, lond_predSVM))
print("Support Vector Machine's Accuracy according to F1 Score: ", metrics.f1_score(y_test_l, lond_predSVM, average=None), "\n")

Support Vector Machine's Accuracy according to Jaccard Index:  0.777777777778
Support Vector Machine's Accuracy according to F1 Score:  [ 0.84615385  0.          0.5         0.          0.          0.8
  0.83333333] 



  'precision', 'predicted', average, warn_for)


Build a model for Paris using a support vector machine

In [352]:
# Target vector: the k-means cluster label of each Paris quartier.
# Feature matrix: the per-quartier category frequencies that were clustered.
y_p = kmeans_paris.labels_.tolist()
X_p = paris_grouped_clustering

In [353]:
# Split the data into a training set and a test set
X_train_p, X_test_p, y_train_p, y_test_p = train_test_split(X_p, y_p, test_size=0.25, random_state=4)

# Create an instance of SVM and train it with the training set
paris_SVM = svm.SVC(gamma=10, decision_function_shape='ovo')
paris_SVM.fit(X_train_p, y_train_p)

# Predict on the test set and check the accuracy.
# metrics.jaccard_similarity_score has been removed from scikit-learn; for
# multiclass labels it was equivalent to accuracy_score, which therefore
# reports the same value.
paris_predSVM = paris_SVM.predict(X_test_p)
print("Support Vector Machine's Accuracy according to Jaccard Index: ", metrics.accuracy_score(y_test_p, paris_predSVM))
print("Support Vector Machine's Accuracy according to F1 Score: ", metrics.f1_score(y_test_p, paris_predSVM, average=None), "\n")

Support Vector Machine's Accuracy according to Jaccard Index:  0.75
Support Vector Machine's Accuracy according to F1 Score:  [ 0.8         0.8         0.90909091  0.          0.          0.8       ] 



  'precision', 'predicted', average, warn_for)


---

### Step 6: Using the classification model 

For a given neighborhood in one city, check which cluster of neighborhoods in the other city is the best match

In [354]:
# LONDON: classify every London neighborhood with the Paris model
londToParis_predSVM = paris_SVM.predict(X_l)
london_venues_sorted['Paris Cluster'] = londToParis_predSVM

# Attach venues, own cluster and predicted Paris cluster to each location,
# dropping the locations that have no venue data
london_final_wParis_df = (
    df_lond_ngh
    .join(london_venues_sorted.set_index('Neighborhood'), on='Location')
    .dropna()
    .reset_index(drop=True)
)
for label_col in ('Cluster Labels', 'Paris Cluster'):
    london_final_wParis_df[label_col] = london_final_wParis_df[label_col].apply(np.int64)
london_final_wParis_df.head(20)

Unnamed: 0,Location,London borough,Post town,Inner,Latitude,Longitude,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,Cluster Labels,Paris Cluster
0,Abbey Wood,Greenwich,LONDON,True,51.487621,0.1140504,Grocery Store,Playground,Zoo Exhibit,Ethiopian Restaurant,French Restaurant,14,4
1,"Acton, London","Ealing, Hammersmith and Fulham",LONDON,True,51.5081402,-0.2732607,Pub,Gym / Fitness Center,Fast Food Restaurant,Creperie,Coffee Shop,17,0
2,Aldgate,City,LONDON,True,51.5142477,-0.0757186,Coffee Shop,Hotel,Indian Restaurant,Pub,Cocktail Bar,16,0
3,Aldwych,Westminster,LONDON,True,51.5124367,-0.1187414,Theater,French Restaurant,Coffee Shop,Tea Room,Dessert Shop,17,0
4,"Angel, London",Islington,LONDON,True,51.5319458,-0.1061056,Pub,Coffee Shop,Hotel,Café,Gym / Fitness Center,1,0
5,"Archway, London",Islington,LONDON,True,51.5654371,-0.1349977,Coffee Shop,Grocery Store,Pub,Pizza Place,Bar,9,0
6,Balham,Wandsworth,LONDON,True,51.4428285,-0.1514426,Coffee Shop,Pizza Place,Pub,Bakery,Café,17,0
7,Barbican Estate,City,LONDON,True,55.41567615,-1.70739463328847,Italian Restaurant,Tea Room,Café,Coffee Shop,Supermarket,1,0
8,Barnsbury,Islington,LONDON,True,51.5389351,-0.114735,Park,Grocery Store,Pub,Gastropub,Café,9,4
9,Battersea,Wandsworth,LONDON,True,51.4707933,-0.172214,Pub,Café,Hotel,Bakery,Grocery Store,17,0


In [355]:
# PARIS: classify every Paris quartier with the London model
parisToLond_predSVM = lond_SVM.predict(X_p)
paris_venues_sorted['London Cluster'] = parisToLond_predSVM

# Attach venues, own cluster and predicted London cluster to each quartier,
# dropping the quartiers that have no venue data
paris_final_wLond_df = (
    df_paris_ngh
    .join(paris_venues_sorted.set_index('Neighborhood'), on='Quartiers')
    .dropna()
    .reset_index(drop=True)
)
for label_col in ('Cluster Labels', 'London Cluster'):
    paris_final_wLond_df[label_col] = paris_final_wLond_df[label_col].apply(np.int64)
paris_final_wLond_df.head(20)

Unnamed: 0,Arrondissements de Paris,Quartiers,Latitude,Longitude,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,Cluster Labels,London Cluster
0,Arrondissement 1,Saint-Germain-l'Auxerrois,48.8596955,2.3406333,French Restaurant,Hotel,Pizza Place,Clothing Store,Plaza,1,17
1,Arrondissement 1,Halles,48.8616513,2.3470129,French Restaurant,Wine Bar,Bar,Pizza Place,Pub,4,17
2,Arrondissement 1,Palais-Royal,48.8635847,2.33620422009387,Café,Hotel,French Restaurant,Japanese Restaurant,Plaza,0,17
3,Arrondissement 1,Place-Vendôme,48.8674634,2.32942811682519,Hotel,French Restaurant,Jewelry Store,Japanese Restaurant,Pastry Shop,3,16
4,Arrondissement 2,Gaillon,48.86913515,2.33290877033551,Hotel,Japanese Restaurant,French Restaurant,Jewelry Store,Italian Restaurant,0,17
5,Arrondissement 2,Vivienne,48.86885895,2.3393625582679,French Restaurant,Japanese Restaurant,Café,Boutique,Wine Bar,1,17
6,Arrondissement 2,Mail,48.8680539,2.34459476094465,French Restaurant,Cocktail Bar,Wine Bar,Bakery,Italian Restaurant,0,17
7,Arrondissement 2,Bonne-Nouvelle,48.8706233,2.3487498,Hotel,French Restaurant,Pizza Place,Burger Joint,Bar,0,17
8,Arrondissement 3,Arts-et-Métiers,48.8654414,2.3561316,French Restaurant,Hotel,Bar,Coffee Shop,Japanese Restaurant,0,17
9,Arrondissement 3,Enfants-Rouges,48.8643317,2.36260844131119,Wine Bar,French Restaurant,Bistro,Café,Japanese Restaurant,0,17


---

### Step 7: Using the map to show a few samples

Two samples from London, to see which Paris neighborhoods are the most similar to them,
and likewise two samples from Paris

In [356]:
# Import of the Map library
# !conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium

##### LONDON Case 1:  Which are the best neighborhoods in Paris to move to for a Londoner who loves living in Westminster?

In [357]:
# Select the London neighborhood of interest
data1 = london_final_wParis_df.loc[london_final_wParis_df['Location'] == 'Westminster']
if data1.empty:
    raise ValueError("'Westminster' is not present in london_final_wParis_df")

# Read the predicted Paris cluster by column name rather than by the
# positional index 12, which silently breaks if a column is added or moved
clus1 = int(data1['Paris Cluster'].iloc[0])
results1 = paris_final_wLond_df.loc[paris_final_wLond_df['Cluster Labels'] == clus1]

# create a map centered on Paris
loc1 = geolocator.geocode("Paris, France")
lati1 = float(loc1.raw['lat'])
long1 = float(loc1.raw['lon'])
map_clusters_1 = folium.Map(location=[lati1, long1], zoom_start=13)

# add one marker per Paris quartier belonging to that cluster
for lat, lon, nbr, cluster in zip(results1['Latitude'], results1['Longitude'], results1['Quartiers'], results1['Cluster Labels']):
    label = folium.Popup(str(nbr) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [float(lat), float(lon)],
        radius=5,
        popup=label,
        color='#3186cc',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_clusters_1)

map_clusters_1

##### LONDON Case 2:  Which are the best neighborhoods in Paris to move to for a Londoner who loves living in Blackwall?

In [358]:
# Select the London neighborhood of interest
data2 = london_final_wParis_df.loc[london_final_wParis_df['Location'] == 'Blackwall, London']
if data2.empty:
    raise ValueError("'Blackwall, London' is not present in london_final_wParis_df")

# Read the predicted Paris cluster by column name rather than by the
# positional index 12, which silently breaks if a column is added or moved
clus2 = int(data2['Paris Cluster'].iloc[0])
results2 = paris_final_wLond_df.loc[paris_final_wLond_df['Cluster Labels'] == clus2]

# create a map centered on Paris; lati1 / long1 are the Paris coordinates
# geocoded in the LONDON Case 1 cell, which must have been run before
map_clusters_2 = folium.Map(location=[lati1, long1], zoom_start=13)

# add one marker per Paris quartier belonging to that cluster
for lat, lon, nbr, cluster in zip(results2['Latitude'], results2['Longitude'], results2['Quartiers'], results2['Cluster Labels']):
    label = folium.Popup(str(nbr) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [float(lat), float(lon)],
        radius=5,
        popup=label,
        color='#3186cc',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_clusters_2)

map_clusters_2

##### PARIS Case 1:  Which are the best neighborhoods in London to move to for a Parisian who loves living in Place-Vendôme (Arrondissement 1)?

In [359]:
# Select the Paris quartier of interest
data3 = paris_final_wLond_df.loc[paris_final_wLond_df['Quartiers'] == 'Place-Vendôme']
if data3.empty:
    raise ValueError("'Place-Vendôme' is not present in paris_final_wLond_df")

# Read the predicted London cluster by column name rather than by the
# positional index 10, which silently breaks if a column is added or moved
clus3 = int(data3['London Cluster'].iloc[0])
results3 = london_final_wParis_df.loc[london_final_wParis_df['Cluster Labels'] == clus3]

# create a map centered on London
loc3 = geolocator.geocode("London, United Kingdom")
lati3 = float(loc3.raw['lat'])
long3 = float(loc3.raw['lon'])
map_clusters_3 = folium.Map(location=[lati3, long3], zoom_start=13)

# add one marker per London neighborhood belonging to that cluster
for lat, lon, nbr, cluster in zip(results3['Latitude'], results3['Longitude'], results3['Location'], results3['Cluster Labels']):
    label = folium.Popup(str(nbr) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [float(lat), float(lon)],
        radius=5,
        popup=label,
        color='#3186cc',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_clusters_3)

map_clusters_3

##### PARIS Case 2:  Which are the best neighborhoods in London to move to for a Parisian who loves living in Val-de-Grâce (Arrondissement 5)?

In [363]:
# Select the Paris quartier of interest
data4 = paris_final_wLond_df.loc[paris_final_wLond_df['Quartiers'] == 'Val-de-Grâce']
if data4.empty:
    raise ValueError("'Val-de-Grâce' is not present in paris_final_wLond_df")

# Read the predicted London cluster by column name rather than by the
# positional index 10, which silently breaks if a column is added or moved
clus4 = int(data4['London Cluster'].iloc[0])
results4 = london_final_wParis_df.loc[london_final_wParis_df['Cluster Labels'] == clus4]

# create a map centered on London; lati3 / long3 are the London coordinates
# geocoded in the PARIS Case 1 cell, which must have been run before
map_clusters_4 = folium.Map(location=[lati3, long3], zoom_start=13)

# add one marker per London neighborhood belonging to that cluster
for lat, lon, nbr, cluster in zip(results4['Latitude'], results4['Longitude'], results4['Location'], results4['Cluster Labels']):
    label = folium.Popup(str(nbr) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [float(lat), float(lon)],
        radius=5,
        popup=label,
        color='#3186cc',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_clusters_4)

map_clusters_4