We import pandas and numpy

In [1]:
import pandas as pd

In [2]:
import numpy as np 

We install lxml

In [3]:
!pip install lxml



We read the information from the website. 

In [4]:
# Wikipedia page "List of postal codes of Canada: M".
site = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# read_html returns one DataFrame per parsed <table>; keep the first one,
# taking its first row as the header.
page_tables = pd.read_html(site, header=0)
table = page_tables[0]

We clean the data.

In [5]:
# Clean the scraped table:
#   1. treat the literal 'Not assigned' as missing,
#   2. drop rows that have no borough,
#   3. rename 'Postal Code' so it can be used as an attribute / join key,
#   4. where the neighbourhood is missing, fall back to the borough name.
# Step 4 is a column-wise fillna instead of a row-wise apply that returned a
# list for some rows and a Series for others; that only produced a usable
# frame as long as the column order happened to match the list order.
table = (
    table
    .replace('Not assigned', np.nan)
    .dropna(subset=['Borough'])
    .rename(columns={'Postal Code': 'PostalCode'})
    .assign(Neighbourhood=lambda frame: frame['Neighbourhood'].fillna(frame['Borough']))
)
table.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [6]:
# Rows per postal code; show the codes that are listed on more than one row.
counts = table['PostalCode'].value_counts()
counts.loc[counts.gt(1)]

Series([], Name: PostalCode, dtype: int64)

In [7]:
# Collapse the table to one row per postal code: keep the first borough listed
# for the code and join all of its neighbourhoods into one comma-separated
# string. A dictionary aggregation replaces the nested groupby/apply, which
# relied on the grouping column being handed to the applied function (newer
# pandas versions deprecate that) and then discarded the index.
# as_index=False keeps PostalCode as a regular column on a fresh RangeIndex.
table = (
    table
    .groupby('PostalCode', as_index=False)
    .agg({'Borough': 'first', 'Neighbourhood': ', '.join})
)
# Preview the codes that had more than one source row according to `counts`.
table.loc[table['PostalCode'].isin(counts[counts > 1].index), :].head()

Unnamed: 0,PostalCode,Borough,Neighbourhood


In [8]:
# Preview the postal codes that appeared on exactly one source row.
single_row_codes = counts[counts == 1].index
in_single_row_codes = table['PostalCode'].isin(single_row_codes)
table[in_single_row_codes].head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [9]:
# Shape of the cleaned table as (rows, columns).
table.shape

(103, 3)

In [10]:
# Load the coordinates file and relabel its columns so the key column carries
# the same name ('PostalCode') as in `table`.
geo_column_names = ['PostalCode', 'Latitude', 'Longitude']
geo_coords = pd.read_csv('Geospatial_Coordinates.csv')
geo_coords.columns = geo_column_names

In [11]:
# Print the (rows, columns) shape, then display the first rows.
n_rows, n_cols = geo_coords.shape
print((n_rows, n_cols))
geo_coords.head()

(103, 3)


Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [12]:
# Attach latitude/longitude to each postal code. The inner join keeps only the
# codes present in both frames.
table_n = table.merge(geo_coords, how='inner', on='PostalCode')
table_n.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [13]:
# json_normalize lives in the top-level pandas namespace since pandas 1.0; the
# pandas.io.json location is deprecated and fails to import on newer releases,
# so it is only used as a fallback for old installations.
try:
    from pandas import json_normalize # tranform JSON file into a pandas dataframe
except ImportError:
    from pandas.io.json import json_normalize
# Show every column and every row when a frame is displayed.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# import k-means from clustering stage
from sklearn.cluster import KMeans
import folium # map rendering library
import seaborn as sns
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

In [14]:
import os
from getpass import getpass

# Credentials must not be stored in the notebook. They are read from the
# environment, with an interactive (non-echoing) prompt as fallback.
# NOTE: the key pair that used to be hardcoded in this cell was saved with the
# notebook and should be treated as compromised and revoked.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID') or getpass('Foursquare CLIENT_ID: ') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET') or getpass('Foursquare CLIENT_SECRET: ') # your Foursquare Secret
VERSION = '20200731' # Foursquare API version

# Confirm that credentials were loaded without writing them to the cell output.
print('Your credentials:')
print('CLIENT_ID: ' + CLIENT_ID[:4] + '...')
print('CLIENT_SECRET: ' + '*' * 8)

Your credentials:
CLIENT_ID: <redacted>
CLIENT_SECRET: <redacted>


In [15]:
# Display the first rows of the merged table (postal code, borough,
# neighbourhood and coordinates).
table_n.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [16]:
# The postal code was only needed as the join key. Reassigning the result
# (rather than inplace=True) together with errors='ignore' keeps this cell
# safe to run more than once.
table_n = table_n.drop(columns='PostalCode', errors='ignore')
# A single print: wrapping it in a second print() also wrote "None".
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(table_n['Borough'].unique()),
        table_n.shape[0]))
table_n.head()

The dataframe has 10 boroughs and 103 neighborhoods.
None


Unnamed: 0,Borough,Neighbourhood,Latitude,Longitude
0,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,Scarborough,Woburn,43.770992,-79.216917
4,Scarborough,Cedarbrae,43.773136,-79.239476


In [17]:
# Base map centred on the mean latitude/longitude of all rows in table_n.
map_centre = [table_n.Latitude.mean(), table_n.Longitude.mean()]
tmap = folium.Map(location=map_centre, zoom_start=10)

# One blue circle marker per row; its popup reads "<neighbourhood>, <borough>".
marker_fields = zip(table_n['Latitude'], table_n['Longitude'],
                    table_n['Borough'], table_n['Neighbourhood'])
for lat, lng, borough, neighborhood in marker_fields:
    popup_text = '{}, {}'.format(neighborhood, borough)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup(popup_text, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False)
    marker.add_to(tmap)

tmap

In [18]:
def _fetch_explore_items(url, max_retries=3, timeout=10):
    """Fetch a Foursquare "explore" URL and return the items of its first group.

    The request is attempted at most ``max_retries`` times, pausing a little
    longer before each new attempt.

    Raises:
        RuntimeError: if no attempt produced a payload with the expected
            structure. The URL is deliberately left out of the message because
            it carries the client secret.
    """
    # Function-scope imports: the notebook never imports an HTTP client, so the
    # standard library is used and brought into scope right here.
    import json
    import time
    import urllib.request

    last_error = None
    for attempt in range(1, max_retries + 1):
        try:
            with urllib.request.urlopen(url, timeout=timeout) as response:
                payload = json.loads(response.read().decode('utf-8'))
            return payload['response']['groups'][0]['items']
        except (OSError, ValueError, KeyError, IndexError, TypeError) as err:
            # OSError covers URLError/HTTPError and timeouts; the others cover
            # invalid JSON and payloads without response/groups/items.
            last_error = err
            if attempt < max_retries:
                time.sleep(min(2 ** (attempt - 1), 8))
    raise RuntimeError(
        'Foursquare request failed after {} attempt(s): {!r}'.format(max_retries, last_error)
    ) from last_error


def getNearbyVenues(names, latitudes, longitudes, radius=500, limit=100, max_retries=3):
    """Query the Foursquare explore endpoint once per location and tabulate the venues.

    Args:
        names: iterable of labels, one per location (printed as progress).
        latitudes: iterable of latitudes, parallel to ``names``.
        longitudes: iterable of longitudes, parallel to ``names``.
        radius: value sent as the ``radius`` query parameter.
        limit: value sent as the ``limit`` query parameter.
        max_retries: attempts per location before giving up.

    Returns:
        DataFrame with one row per returned venue and the columns
        'Neighborhood', 'Neighbourhood Latitude', 'Neighbourhood Longitude',
        'Venue', 'Venue Latitude', 'Venue Longitude', 'Venue Category'.
        Empty input yields an empty frame with these columns. A venue without
        categories gets ``None`` as its category.

    Raises:
        RuntimeError: if a location cannot be fetched within ``max_retries``
            attempts. (Previously any failure, including a missing HTTP
            library, was retried forever by a bare ``except``.)
    """
    from urllib.parse import urlencode

    columns = ['Neighborhood',
               'Neighbourhood Latitude',
               'Neighbourhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
        # create the API request URL (urlencode escapes the values; the comma
        # of the "lat,lng" pair is kept literal)
        query = urlencode({
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': limit,
        }, safe=',')
        url = 'https://api.foursquare.com/v2/venues/explore?' + query

        # make the GET request
        try:
            items = _fetch_explore_items(url, max_retries=max_retries)
        except RuntimeError as err:
            raise RuntimeError('Could not fetch venues for {!r}: {}'.format(name, err)) from err

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            categories = venue.get('categories') or []
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # Passing the column names to the constructor also works for zero rows.
    nearby_venues = pd.DataFrame(rows, columns=columns)

    return(nearby_venues)


In [None]:
# Query venues around every neighbourhood coordinate in table_n.
venue_query = dict(
    names=table_n['Neighbourhood'],
    latitudes=table_n['Latitude'],
    longitudes=table_n['Longitude'],
    radius=500,
)
t_venues = getNearbyVenues(**venue_query)

#Checkpoint
#t_venues.to_csv('t_venues.csv', index=False)

Malvern, Rouge


In [None]:
# Checkpoint restore (disabled): uncomment to load the venues from
# 't_venues.csv' instead of querying the API again.
#t_venues = pd.read_csv('t_venues.csv')

In [None]:
# Print the (rows, columns) shape, then display the first rows.
venue_rows, venue_cols = t_venues.shape
print((venue_rows, venue_cols))
t_venues.head()

In [None]:
# Number of venues returned for each neighbourhood.
venues_by_neighbourhood = t_venues.groupby('Neighborhood')
venue_counts = venues_by_neighbourhood['Venue'].count()
venue_counts

In [None]:
# Distribution of the number of venues per neighbourhood.
# sns.distplot is deprecated since seaborn 0.11; histplot with a KDE overlay
# and density scaling draws the equivalent figure. distplot is only used on
# seaborn versions that predate histplot.
if hasattr(sns, 'histplot'):
    ax = sns.histplot(venue_counts, kde=True, stat='density')
else:
    ax = sns.distplot(venue_counts)
ax.set(title='Venues returned per neighbourhood',
       xlabel='Number of venues',
       ylabel='Density');

In [None]:
# Share and number of neighbourhoods with at least 75 venues. The comparison
# is >=, so the message says "at least" (it used to say "more than").
at_least_75 = venue_counts >= 75
print(f"""Only {at_least_75.mean() * 100:.1f}% of the neighbourhoods have at least 75 venues
The number of these neighbourhoods is {at_least_75.sum()}""")

In [None]:
# Share and number of neighbourhoods with at least 25 venues. The message now
# reports the threshold that is actually tested (it used to say 75).
at_least_25 = venue_counts >= 25
print(f"""Only {at_least_25.mean() * 100:.1f}% of the neighbourhoods have at least 25 venues
The number of these neighbourhoods is {at_least_25.sum()}""")

In [None]:
# Keep only the venues of neighbourhoods that returned at least 25 venues.
has_enough_venues = venue_counts >= 25.0
nbrhoods_used = venue_counts.loc[has_enough_venues].index
in_used_neighbourhood = t_venues['Neighborhood'].isin(nbrhoods_used)
t_venues_filtered = t_venues[in_used_neighbourhood]
print(t_venues_filtered.shape)
t_venues_filtered.head()

In [None]:
# One indicator column per venue category; with an empty prefix the column
# name is the category name itself.
category_frame = t_venues_filtered[['Venue Category']]
toronto_onehot = pd.get_dummies(category_frame, prefix="", prefix_sep="")

# Re-attach the neighbourhood of every venue (rows share the same index).
# NOTE(review): a category literally named 'Neighborhood' would be overwritten
# by this assignment - confirm that cannot occur in the data.
toronto_onehot['Neighborhood'] = t_venues_filtered['Neighborhood']

# Reorder so that 'Neighborhood' comes first, followed by the categories in
# their existing order.
fixed_columns = ['Neighborhood'] + toronto_onehot.columns.drop('Neighborhood').tolist()
toronto_onehot = toronto_onehot.loc[:, fixed_columns]

toronto_onehot.head()

In [None]:
# (rows, columns) of the one-hot frame: one row per venue, one column per
# category plus 'Neighborhood'.
toronto_onehot.shape

In [None]:
# Mean of each indicator column per neighbourhood, i.e. the share of the
# neighbourhood's venues that fall into each category.
by_neighbourhood = toronto_onehot.groupby('Neighborhood', as_index=False)
toronto_grouped = by_neighbourhood.mean()
toronto_grouped.head()

In [None]:
# (rows, columns) of the grouped frame: one row per neighbourhood.
toronto_grouped.shape

In [None]:
num_top_venues = 5

# For every neighbourhood, print its most frequent venue categories.
for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    hood_rows = toronto_grouped.loc[toronto_grouped['Neighborhood'] == hood]
    # Transpose so that each category becomes a row: (category, frequency).
    temp = hood_rows.T.reset_index()
    temp.columns = ['venue', 'freq']
    # The first transposed row is the 'Neighborhood' label itself, not a category.
    temp = temp.iloc[1:].astype({'freq': float}).round({'freq': 2})
    ranked = temp.sort_values('freq', ascending=False).reset_index(drop=True)
    print(ranked.head(num_top_venues))
    print('\n')

In [None]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest values in ``row``, skipping its first entry.

    Args:
        row: Series whose first element is not a frequency (it is skipped) and
            whose remaining elements are sortable values keyed by label.
        num_top_venues: how many labels to return at most.

    Returns:
        Array of index labels ordered from the largest value downwards,
        truncated to ``num_top_venues`` entries.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [None]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']


def _ordinal(n):
    """Return ``n`` with its English ordinal suffix (1st, 2nd, 3rd, 4th, 11th, 21st, ...)."""
    # 11, 12 and 13 take 'th' although they end in 1, 2 and 3.
    if n % 100 in (11, 12, 13) or not 1 <= n % 10 <= 3:
        return '{}th'.format(n)
    return '{}{}'.format(n, indicators[n % 10 - 1])


# create columns according to number of top venues
# (explicit suffix rule instead of a bare `except:` around a list lookup)
venue_columns = ['{} Most Common Venue'.format(_ordinal(rank))
                 for rank in range(1, num_top_venues + 1)]
columns = ['Neighborhood'] + venue_columns

# Rank the categories of every neighbourhood, then build the frame in one go
# instead of assigning row by row into an empty frame.
top_venues = [
    return_most_common_venues(toronto_grouped.iloc[pos, :], num_top_venues)
    for pos in range(toronto_grouped.shape[0])
]
neighborhoods_venues_sorted = pd.DataFrame(top_venues, columns=venue_columns,
                                           index=toronto_grouped.index)
neighborhoods_venues_sorted.insert(0, 'Neighborhood', toronto_grouped['Neighborhood'])

neighborhoods_venues_sorted.head()

In [None]:
# set number of clusters
kclusters = 5

# Feature matrix: the category frequencies without the label column.
# `columns=` is used because the positional axis argument (`drop(label, 1)`)
# is rejected by pandas 2.0 and later.
toronto_grouped_clustering = toronto_grouped.drop(columns='Neighborhood')

# run k-means clustering
# n_init is given explicitly (10 was the long-standing default) so results do
# not change with the scikit-learn version's default.
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

In [None]:
# add clustering labels
# kmeans was fitted on toronto_grouped, whose row order neighborhoods_venues_sorted
# shares, so the labels line up positionally. The labels go onto a fresh frame
# (with any earlier label column removed) so re-running the cell cannot fail
# with "column already exists".
venues_with_labels = neighborhoods_venues_sorted.drop(columns='Cluster Labels', errors='ignore')
venues_with_labels.insert(0, 'Cluster Labels', kmeans.labels_)

# Add the labels and top venues to the frame that carries latitude/longitude
# for each neighbourhood. `table_n` is that frame in this notebook; the name
# used here before (`toronto_data_w_gsptl`) is not defined in any cell shown.
toronto_merged = table_n.join(venues_with_labels.set_index('Neighborhood'), on='Neighbourhood')

toronto_merged.head() # check the last columns!


In [None]:
# create map, centred on the mean coordinate of all neighbourhoods
# (`table_n` replaces `toronto_data_w_gsptl`, which is not defined in any cell shown)
map_clusters = folium.Map(location=[table_n.Latitude.mean(),
                                    table_n.Longitude.mean()], zoom_start=10)

# set color scheme for the clusters: one evenly spaced rainbow colour per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# Rows with missing values (neighbourhoods that received no cluster label in
# the join) cannot be drawn; drop them and store the labels as integers.
# Reassigning instead of mutating in place keeps the cell re-runnable.
toronto_merged = toronto_merged.dropna().astype({'Cluster Labels': int})

# add markers to the map
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'],
                                  toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Labels run from 0 to kclusters-1, so they index the palette directly
    # (cluster-1 made cluster 0 wrap around to the last colour).
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters