# Import libraries

In [None]:
!pip install folium

In [None]:
!pip install geopy

In [None]:
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd 
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

import folium
import json
import requests
import codecs

try:
  import geocoder
except:
  !pip install geocoder
  import geocoder

from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim
from pandas.io.json import json_normalize
from sklearn.cluster import KMeans
from pandas.core.common import flatten

print("Libraries imported.")

# Data

In [None]:
!wget https://en.wikipedia.org/wiki/Category:Arrondissements_of_Paris

In [None]:
# Read the downloaded category page and parse it. The context manager closes
# the file handle, which a bare `codecs.open(...).read()` leaves open.
with codecs.open('Category:Arrondissements_of_Paris', encoding='utf-8') as page:
    html = page.read()
soup = BeautifulSoup(html, 'html.parser')

In [None]:
def clean_district_name(district_name):
  """Return district_name with every '►  ' expand marker removed."""
  # str.replace is a no-op when the marker is absent, so no membership test
  # is needed first.
  return district_name.replace('►  ', '')

In [None]:
# Take the text of the second <ul> on the page (presumably the category
# listing - verify against the downloaded HTML), split it into one entry per
# line and strip the expand marker from each entry.
districts = [
    clean_district_name(entry)
    for listing in soup.find_all('ul')[1:2]
    for entry in listing.text.split('\n')
]

print('There are {} districts in Paris.'.format(len(districts)))
districts

In [None]:
# Keep only the first 19 characters of each scraped name (presumably to drop
# a trailing "of Paris" - verify against the values in `districts`).
district = [name[:19] for name in districts]

In [None]:
district

In [None]:
geolocator = Nominatim(user_agent='ibm-capstone')

In [None]:
districts_lat = []
districts_lng = []


In [None]:
# Geocode every district. The result lists are rebuilt from scratch here so
# that re-running this cell cannot append duplicates and leave them longer
# than `district` when the DataFrame is built.
districts_lat = []
districts_lng = []

for dist in district:
    location = geolocator.geocode(dist, timeout=10000)
    if location is None:
        # geocode() returns None when nothing matches; fail with the offending
        # name instead of an opaque AttributeError on `.latitude`.
        raise ValueError('Could not geocode district: {!r}'.format(dist))
    districts_lat.append(location.latitude)
    districts_lng.append(location.longitude)

print('{} latitudes and {} longitudes were found.'.format(len(districts_lat), len(districts_lng)))

In [None]:
# Geocode the city itself; these coordinates centre the folium map later on.
city = 'Paris, FR'
Paris_geo = geolocator.geocode(city, timeout=10000)
Paris_lat, Paris_lng = Paris_geo.latitude, Paris_geo.longitude

print('The latitude and longitude of {} are {}, {}.'.format(city, Paris_lat, Paris_lng))

In [None]:
# One row per district together with its geocoded coordinates.
district_columns = {
    'District': district,
    'Latitude': districts_lat,
    'Longitude': districts_lng,
}
df = pd.DataFrame(district_columns)
df.head(20)

In [None]:
df.to_csv('Paris_arrondissement_data.csv', index=False)

# Get venues of each district within a radius of 5km

In [None]:
# Foursquare credentials and API version.
# NOTE(security): the literal fallbacks below are API secrets committed with
# the notebook. Prefer exporting FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET
# in the environment, then rotate the embedded values and remove them here.
import os

CLIENT_ID = os.environ.get(
    'FOURSQUARE_CLIENT_ID', 'SBZ3SIXQKQG5BNV3EK2N3FFVVDVOVA0MAM3BBLMWAADRZCNN')
CLIENT_SECRET = os.environ.get(
    'FOURSQUARE_CLIENT_SECRET', 'PH53JIKBY32YJZ1PJVDY5APEXP2ZVX5R5KH0F4RULNELOTA0')
VERSION = '20200101'  # sent as the `v` query parameter
SECTION = 'arts'      # sent as the `section` query parameter
LIMIT = 50            # sent as the `limit` query parameter
RADIUS = 5000         # sent as the `radius` query parameter (5 km, in metres)

In [None]:
def _fetch_venue_items(lat, lng):
    """Query the Foursquare explore endpoint around (lat, lng) and return its items."""
    params = {
        'client_id': CLIENT_ID,
        'client_secret': CLIENT_SECRET,
        'v': VERSION,
        'll': '{},{}'.format(lat, lng),
        'radius': RADIUS,
        'limit': LIMIT,
        'section': SECTION,
    }
    # Let requests build and encode the query string; the timeout keeps a
    # stalled connection from hanging the notebook indefinitely.
    response = requests.get(
        'https://api.foursquare.com/v2/venues/explore', params=params, timeout=30)
    # Surface HTTP errors directly instead of a KeyError on the JSON below.
    response.raise_for_status()
    return response.json()['response']['groups'][0]['items']


def _primary_category(venue):
    """Return the name of the venue's first category, or None if it has none."""
    categories = venue.get('categories') or []
    return categories[0]['name'] if categories else None


def getNearbyVenues(district, latitudes, longitudes):
    """Collect the venues Foursquare reports around each district.

    Parameters
    ----------
    district, latitudes, longitudes : iterables of equal length
        District names and the coordinates to search around; they are
        consumed in lockstep.

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'District', 'District Latitude',
        'District Longitude', 'Venue', 'Venue Latitude', 'Venue Longitude'
        and 'Venue Category'. The frame is empty (but still has these
        columns) when no venue is found.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    columns = ['District',
               'District Latitude',
               'District Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    # `name` (not `district`) so the loop variable does not shadow the parameter.
    for name, lat, lng in zip(district, latitudes, longitudes):
        print(name)
        for item in _fetch_venue_items(lat, lng):
            venue = item['venue']
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                _primary_category(venue)))

    # Passing the columns to the constructor keeps the schema intact even
    # when `rows` is empty (assigning .columns afterwards would raise).
    return pd.DataFrame(rows, columns=columns)

In [None]:
# Fetch the venues around every district listed in df.
Paris_venues = getNearbyVenues(
    district=df['District'],
    latitudes=df['Latitude'],
    longitudes=df['Longitude'],
)

In [None]:
# Preview the first rows of the venue table.
Paris_venues.head()

In [None]:
# (number of rows, number of columns) of the venue table.
Paris_venues.shape

In [None]:
# Column dtypes and non-null counts.
Paris_venues.info()

In [None]:
# Distinct venue categories across all districts.
Paris_venues['Venue Category'].unique()

# Data Analysis

In [None]:
# Number of venues recorded per district (one 'Venue' count column).
venue_dict = Paris_venues.groupby('District')[['Venue']].count()
venue_dict

In [None]:
# Bar chart of venue counts per district, largest first. The single-series
# legend carries no information, so it is switched off in the plot call
# rather than by passing an empty label string to plt.legend().
ax = venue_dict.sort_values(by='Venue', ascending=False).plot.bar(legend=False)
ax.set_xlabel('District')
ax.set_ylabel('Number of venues')
ax.set_title('Number of venues in Paris')
plt.show()

In [None]:
# Number of venues recorded per venue category (one 'Venue' count column).
venue_category = Paris_venues.groupby('Venue Category')[['Venue']].count()
venue_category

In [None]:
# Bar chart of venue counts per category, largest first. The single-series
# legend carries no information, so it is switched off in the plot call
# rather than by passing an empty label string to plt.legend().
ax = venue_category.sort_values(by='Venue', ascending=False).plot.bar(legend=False)
ax.set_xlabel('Venue Category')
ax.set_ylabel('Number of venues')
ax.set_title('Top most common venue categories in Paris')
plt.show()

In [None]:
venue_category.shape

In [None]:
Paris_venues['Venue Category'].unique()

In [None]:
# One-hot encode the venue categories: one indicator column per category.
Paris_onehot = pd.get_dummies(Paris_venues[['Venue Category']], prefix='', prefix_sep='')

# Add the district back as a Series, rather than relying on pandas to coerce
# a one-column DataFrame into a column.
Paris_onehot['District'] = Paris_venues['District']

# Move the district column to the front. Selecting it by name stays correct
# even if it is not the last column.
fixed_columns = ['District'] + [col for col in Paris_onehot.columns if col != 'District']
Paris_onehot = Paris_onehot[fixed_columns]

Paris_onehot.head()

In [None]:
Paris_onehot.shape

In [None]:
# Average the indicator columns per district: each value becomes the share of
# that district's venues falling in the category.
Paris_grouped = Paris_onehot.groupby('District', as_index=False).mean()
Paris_grouped

In [None]:
Paris_grouped.shape

In [None]:
# Print the most common venue categories of each district.
num_top_venues = 6

for dist in Paris_grouped['District']:
    print('----'+dist+'----')

    # Category frequencies of this district: its row without the leading
    # 'District' column, as a Series indexed by category name.
    frequencies = Paris_grouped.loc[Paris_grouped['District'] == dist].iloc[0, 1:]

    # Build the two-column table in one chain, so nothing is assigned onto a
    # slice (which triggers pandas' SettingWithCopyWarning).
    temp = (
        frequencies.astype(float)
        .round(2)
        .rename_axis('Venue Category')
        .reset_index(name='Frequency')
        .sort_values(by='Frequency', ascending=False)
        .reset_index(drop=True)
    )

    # Honour num_top_venues instead of a second hard-coded 6.
    print(temp[:num_top_venues])
    print('')

In [None]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the ``num_top_venues`` largest entries of ``row``.

    The first element of ``row`` is skipped before ranking; the remaining
    entries are sorted in descending order and the leading index labels are
    returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [None]:
num_top_venues = 10

# Ordinal suffixes for ranks 1-3; every later rank gets 'th'.
indicators = ['st', 'nd', 'rd']

# Column headers: 'District', '1st Most Common Venue', '2nd Most Common Venue', ...
columns = ['District']
for ind in range(num_top_venues):
    # Explicit bounds check instead of catching the IndexError with a bare
    # `except:`, which would also hide unrelated failures.
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind + 1, suffix))

# One row per district, to be filled with its ranked venue categories.
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['District'] = Paris_grouped['District']

for ind in range(Paris_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(Paris_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted

# Modelling

In [None]:
# set number of clusters
kclusters = 5

# Feature matrix: the per-district category frequencies without the label
# column. `columns=` replaces the positional axis argument, which pandas 2.0
# no longer accepts.
Paris_grouped_clustering = Paris_grouped.drop(columns='District')

# run k-means clustering; n_init is pinned to the long-standing default of 10
# so the result does not depend on the installed scikit-learn version
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(Paris_grouped_clustering)

# check cluster labels generated for the first rows of the dataframe
kmeans.labels_[0:10]

In [None]:
# Add the clustering labels as the first column. If the column already exists
# (this cell was run before), refresh it, so the labels always match the
# current model instead of being silently kept stale.
if 'Cluster Labels' in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted['Cluster Labels'] = kmeans.labels_
else:
    neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# Attach the cluster label and ranked venue columns to the district
# coordinates in df, matching rows on the district name.
Paris_merged = df.join(neighborhoods_venues_sorted.set_index('District'), on='District')


In [None]:
Paris_merged.dropna(inplace=True)
Paris_merged

In [None]:
# create map centred on the geocoded city coordinates
map_clusters = folium.Map(location=[Paris_lat, Paris_lng], zoom_start=10)

# one distinct colour per cluster label
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add one marker per district, coloured by its cluster
for lat, lon, poi, cluster in zip(Paris_merged['Latitude'], Paris_merged['Longitude'], Paris_merged['District'], Paris_merged['Cluster Labels']):
    # The label may arrive as a float if the join introduced NaNs; index the
    # palette with the label itself, since `cluster - 1` only worked for
    # cluster 0 through negative-index wraparound.
    cluster = int(cluster)
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

In [None]:
# Cluster 0: rows labelled 0, showing column 0 plus columns 4 onwards
# (positions 1-3 are skipped; presumably Latitude, Longitude and
# Cluster Labels - confirm against Paris_merged.columns).
Paris_merged.loc[Paris_merged['Cluster Labels'] == 0, Paris_merged.columns[[0] + list(range(4, Paris_merged.shape[1]))]]

In [None]:
# Cluster 1: same column selection, rows labelled 1.
Paris_merged.loc[Paris_merged['Cluster Labels'] == 1, Paris_merged.columns[[0] + list(range(4, Paris_merged.shape[1]))]]

In [None]:
# Cluster 2: same column selection, rows labelled 2.
Paris_merged.loc[Paris_merged['Cluster Labels'] == 2, Paris_merged.columns[[0] + list(range(4, Paris_merged.shape[1]))]]

In [None]:
# Cluster 3: same column selection, rows labelled 3.
Paris_merged.loc[Paris_merged['Cluster Labels'] == 3, Paris_merged.columns[[0] + list(range(4, Paris_merged.shape[1]))]]

In [None]:
# Cluster 4: same column selection, rows labelled 4.
Paris_merged.loc[Paris_merged['Cluster Labels'] == 4, Paris_merged.columns[[0] + list(range(4, Paris_merged.shape[1]))]]

In [None]:
# Cluster 5: same column selection, rows labelled 5.
# NOTE(review): kclusters is set to 5 earlier and KMeans labels run from 0 to
# n_clusters - 1, so this selection is expected to be empty - confirm whether
# this cell is still needed.
Paris_merged.loc[Paris_merged['Cluster Labels'] == 5, Paris_merged.columns[[0] + list(range(4, Paris_merged.shape[1]))]]