# Import Necessary Libraries

In [0]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analysis
# None removes the display limit, so dataframes are shown without truncation
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import requests # HTTP client used to download the postal-code page

from geopy.geocoders import Nominatim # geocoder used to look up Toronto's coordinates

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library

from bs4 import BeautifulSoup # HTML parser for the scraped table

# Create A DataFrame

In [0]:
# Column layout shared by the scraped postal-code table
column_names = [
    'PostalCode',
    "Borough",
    'Neighborhood',
]

# Start from an empty frame that already carries those column labels;
# later cells fill the columns in place.
df = pd.DataFrame(data=None, columns=column_names)

# Import Data

In [0]:
# Download the HTML of the postal-code page. The request goes through a
# wikizero.biz mirror; the `q` parameter appears to be the base64-encoded URL of
# Wikipedia's "List of postal codes of Canada: M" page.
# NOTE(review): the mirror is served over plain HTTP - confirm it is still the intended source.
postal_codes_url = "http://www.wikizero.biz/index.php?q=aHR0cHM6Ly9lbi53aWtpcGVkaWEub3JnL3dpa2kvTGlzdF9vZl9wb3N0YWxfY29kZXNfb2ZfQ2FuYWRhOl9N"

# A timeout keeps the notebook from hanging forever on an unresponsive server.
response = requests.get(postal_codes_url, timeout=30)
# Fail loudly on 4xx/5xx instead of silently parsing an error page.
response.raise_for_status()

# Despite its name, `website_url` holds the page's HTML text, not a URL.
website_url = response.text

In [0]:
# Parse the downloaded HTML into a navigable tree using the lxml parser.
soup = BeautifulSoup(markup=website_url, features="lxml")

In [0]:
# Locate the postal-code table by its CSS classes.
My_table = soup.find("table", {"class": "wikitable sortable"})

# `find` returns None when nothing matches; stop here with a clear message
# rather than failing later with an AttributeError on None.
if My_table is None:
    raise ValueError("Could not find a 'wikitable sortable' table in the downloaded page")

In [0]:
# Collect every <td> cell of the table (these are table cells, not hyperlinks,
# despite the variable name).
links = My_table.find_all(name="td")

In [0]:
# The flat list of <td> cells is assumed to repeat in groups of three:
# postal code, borough, neighbourhood.

# Postal codes are reduced to plain text right away. `get_text(strip=True)`
# copes with attributes on the <td> tag and surrounding whitespace, which
# stripping the literal "<td>"/"</td>" strings does not.
postcode = [cell.get_text(strip=True) for cell in links[0::3]]

# Borough and neighbourhood cells are kept as parsed tags; later cells look
# inside them for a link title.
borough = list(links[1::3])
neighbourhood = list(links[2::3])

In [0]:
# Map row position -> cleaned borough name for every row whose borough cell
# does not contain "assigned" (presumably the "Not assigned" placeholder).
borough_dict = {}

for row, cell in enumerate(borough):
    if "assigned" in str(cell):
        # Skipped rows keep their raw cell in the dataframe for now.
        continue

    link = cell.find("a")
    if link is not None:
        # Prefer the link's title; fall back to its visible text so that a
        # link without a title does not become the literal string "None".
        borough_dict[row] = str(link.get("title") or link.get_text(strip=True))
    else:
        # Plain-text cell: strip the surrounding tag and any newline, matching
        # how neighbourhood cells are cleaned.
        borough_dict[row] = str(cell).replace("<td>", "").replace("</td>", "").replace("\n", "")

In [0]:
# Map row position -> cleaned neighbourhood name for every row whose
# neighbourhood cell does not contain "assigned" (presumably the
# "Not assigned" placeholder).
neighbourhood_dict = {}

for row, cell in enumerate(neighbourhood):
    if "assigned" in str(cell):
        # Skipped rows keep their raw cell in the dataframe for now.
        continue

    link = cell.find("a")
    if link is not None:
        # Prefer the link's title; fall back to its visible text so that a
        # link without a title does not become the literal string "None".
        neighbourhood_dict[row] = str(link.get("title") or link.get_text(strip=True))
    else:
        # Plain-text cell: strip the surrounding tag and the trailing newline.
        neighbourhood_dict[row] = str(cell).replace("<td>", "").replace("</td>", "").replace("\n", "")

In [0]:
# Load the three scraped lists into their dataframe columns, in column order.
# Borough and Neighborhood still hold raw table cells at this point.
for column, values in (
    ("PostalCode", postcode),
    ("Borough", borough),
    ("Neighborhood", neighbourhood),
):
    df[column] = values

In [0]:
# Overwrite the raw cells with the cleaned names collected earlier.
# Each dictionary is keyed by row label, so only the rows it contains change.
for column, replacements in (
    ("Borough", borough_dict),
    ("Neighborhood", neighbourhood_dict),
):
    for row, text in replacements.items():
        df.loc[row, column] = text

In [0]:
# Flag rows whose borough still contains "assigned" (the rows that were never
# overwritten with a cleaned name).
borough_unassigned = np.array(
    ["assigned" in str(cell) for cell in df["Borough"]], dtype=bool
)

# Keep the remaining rows in one vectorised step instead of dropping rows one
# at a time, then renumber the index from 0.
df = df.loc[~borough_unassigned].reset_index(drop=True)

In [0]:
# Flag rows whose neighbourhood still contains "assigned".
neighborhood_unassigned = np.array(
    ["assigned" in str(cell) for cell in df["Neighborhood"]], dtype=bool
)

# Those rows take their borough name as the neighbourhood name.
df.loc[neighborhood_unassigned, "Neighborhood"] = df.loc[neighborhood_unassigned, "Borough"]

df = df.reset_index(drop=True)

In [0]:
# define the dataframe columns
column_names = ['PostalCode', "Borough", 'Neighborhood']

# Collapse the table to one row per (PostalCode, Borough) pair, joining the
# neighbourhood names with ", ". Group keys and joined values come from the
# same groupby result, so they stay aligned even if a postal code appears
# under more than one borough.
df_last = (
    df.groupby(['PostalCode', "Borough"])['Neighborhood']
    .apply(', '.join)
    .reset_index()
)

# Keep the documented column order.
df_last = df_last[column_names]

# Shape of DataFrame

In [356]:
# Display (row count, column count) of the aggregated dataframe.
df_last.shape

(103, 3)

In [0]:
# Load the coordinates CSV. Later cells join it on its "Postal Code" column and
# read its Latitude/Longitude columns.
geospatial_csv_url = "https://cocl.us/Geospatial_data"
coordinates = pd.read_csv(geospatial_csv_url)

In [0]:
# Attach coordinates to each postal code. With an inner join, codes missing
# from either table are dropped.
df_last = pd.merge(
    df_last,
    coordinates,
    how="inner",
    left_on="PostalCode",
    right_on="Postal Code",
)

In [0]:
# The merge leaves a duplicate key column ("Postal Code"); remove it.
df_last = df_last.drop(columns="Postal Code")

In [371]:
# Preview the first rows to confirm Latitude/Longitude were attached.
df_last.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,"Scarborough, Toronto","Rouge, Toronto, Malvern, Toronto",43.806686,-79.194353
1,M1C,"Scarborough, Toronto","Highland Creek (Toronto), Rouge Hill, Port Uni...",43.784535,-79.160497
2,M1E,"Scarborough, Toronto","Guildwood, Morningside, Toronto, West Hill, To...",43.763573,-79.188711
3,M1G,"Scarborough, Toronto","Woburn, Toronto",43.770992,-79.216917
4,M1H,"Scarborough, Toronto",Cedarbrae,43.773136,-79.239476


In [360]:
# Look up the coordinates of Toronto; they are used to centre the maps below.
address = 'Toronto, Canada'

geolocator = Nominatim(user_agent="Coursera")
location = geolocator.geocode(address)

# `geocode` returns None when the address cannot be resolved; report that
# directly instead of failing on `.latitude` below.
if location is None:
    raise ValueError("Nominatim returned no result for {!r}".format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


# Map of Toronto and Its Neighbourhoods

In [370]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add one marker per postal-code row
# (loop names differ from the earlier `borough` / `neighbourhood` lists so
# re-running earlier cells still sees those lists)
for lat, lng, borough_name, neighborhood_name in zip(
    df_last['Latitude'], df_last['Longitude'], df_last['Borough'], df_last['Neighborhood']
):
    # Popup text is "<neighbourhood>, <borough>" for this row only; formatting
    # the whole dataframe here would dump the full table into every popup.
    label = '{}, {}'.format(neighborhood_name, borough_name)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)

map_toronto

# K-Means Clustering and the Map of Clustered Neighbourhoods

In [372]:
# set number of clusters
kclusters = 5

# Cluster on what remains after removing the label columns
# (Latitude and Longitude in the table built above).
df_last_clustering = df_last.drop(columns=["PostalCode", "Borough", "Neighborhood"])

# run k-means clustering
# n_init is pinned because its default differs between scikit-learn releases;
# together with random_state this keeps the labels reproducible.
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(df_last_clustering)

# check cluster labels generated for the first ten rows
kmeans.labels_[0:10]

array([0, 0, 0, 0, 0, 0, 0, 2, 0, 2], dtype=int32)

In [378]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per
# cluster, converted to hex strings for folium
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(df_last['Latitude'], df_last['Longitude'], df_last['Neighborhood'], kmeans.labels_):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Labels run from 0 to kclusters - 1, so they index the palette directly
    # (no reliance on negative-index wrap-around for cluster 0).
    cluster_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters