### This Notebook will be mainly used for the Capstone Project for the IBM Data Science Professional Certificate. The project involves application of Data Science techniques for segmenting and clustring of neighborhoods.

In [2]:
#Importing pandas and numpy libraries 
import pandas as pd
import numpy as np

In [3]:
print("Hello Capstone Project Course")

Hello Capstone Project Course


# This part of this Notebook will be used for exploring, segmenting and clustering the neighbourhoods within the City of Toronto, Canada.

### The first step is to dowload all dependencies and libraries required to achieve the task

In [None]:
#Importing numpy to handle data in vectorised manner or format
import numpy as np

#Importing the pandas dataframe for data analysis
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

#Importing json to read json files
import json

#Installing geopy to enable the geocoding library which converts address to latitude and longitude
!conda install -c conda-forge geopy --yes

#Importing Nominatim to convert address into latitude and longitude
from geopy.geocoders import Nominatim

#Importing a library to handle requests
import requests
#Importing the required Matplotlib libraries
import matplotlib.cm as cm
import matplotlib.colors as colors
#Importing K-Means algorithm from sklearn for clustering
from sklearn.cluster import KMeans
#Installing folium library for map rendering
!conda install -c conda-forge folium=0.5.0 --yes
#Importing the installed folium library
print('Libraries installed!')

In [4]:
#Importing the json_normalise function to transform json file into a pandas dataframe
from pandas.io.json import json_normalize
print('Json normalize imported')

Json normalize imported


In [5]:
#Requesting the data from wikipedia and defining the url
wiki_url = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text

In [None]:
#Importing Beautiful Soup Framework to enable website data scraping
from bs4 import BeautifulSoup
soup = BeautifulSoup(wiki_url, 'html.parser')# parsing html and not xml
print(soup)

#### After analysing the html data extracted, it can be observed that all the Toronto Neighbourhoods are under table class 'wikitable sortable'

In [None]:
#Finding all attributes of wikitable sortable
table = soup.find('table',{'class':'wikitable sortable'})
print (table)

In [9]:
#Defining empty lists for each column of the table to be populated later
pst_code = [] # Post code
bro = [] # borough
neighb = [] # neighbourhood

#Populating the table as an array row by row using nested for loops
# 'tr' tag is for each row within the table
# 'td' tag is for each cell within the table
for row in table.findAll('tr'): 
    cells = row.findAll('td') 
    if len(cells)==3: # sanity check
        pst_code.append(cells[0].find(text = True))
        bro.append(cells[1].find(text = True))
        neighb.append(cells[2].find(text = True))        

In [10]:
#Now converting the Post code,borough, Neighbourhood lists into a dataframe
df = pd.DataFrame()
df['Post Code'] = pst_code
df['Borough'] = bro
df['Neighbourhood'] = neighb
df.head()


Unnamed: 0,Post Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [11]:
#Processing cells with only Borough assigned while ignoring all cells without Borough assigned.
df2 = df
wantd_rows = df2[df2['Borough'] != "Not assigned"]
wantd_rows.head()

Unnamed: 0,Post Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


In [30]:
#Replacing the "Not assigned" Neighbourhood with its assigned Borough
df3 = wantd_rows
df3['Neighbourhood'].replace('Not assigned',"Queen's Park", inplace = True)
df3.head()

Unnamed: 0,Post Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


In [13]:
#Checking for duplicate post codes but for different neighbourhoods
df3.groupby('Post Code')
df3.head()

Unnamed: 0,Post Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


In [14]:
#Combining Neighbourhoods with the same Post Code into one row but neighbourhood names comma separated
df4 = df3.groupby(['Post Code', 'Borough'])['Neighbourhood'].apply(', '.join).reset_index()
df4.columns = ['Post Code', 'Borough', 'Neighbourhood']
df4.head()


Unnamed: 0,Post Code,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood\n, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae\n


In [15]:
#Printing the final rows in the data frame.
df4.shape

(103, 3)

### The next step is to extract latitude and longitude data for each postcode and convert the data into a dataframe. 

In [21]:
#Reading the csv file with geolocation data for each postcode
df_geodata = pd.read_csv('https://cocl.us/Geospatial_data')
print('Geolocation data loaded!')

Geolocation data loaded!


In [22]:
#Displaying a few rows of the geo dataframe
df_geodata.columns = ['Post Code','Latitude', 'Longitude']
df_geodata.head()

Unnamed: 0,Post Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### The next step is to merge the geodata dataframe with the first data frame with neighbourhoods data.  This creates a single dataframe which will be used with the Foursquare map for segmenting and clustering neighbourhoods.

In [27]:
#Merging the two dataframes ON the ['Post Code'] column which is common in both dataframes
df_merg = pd.merge(df4, df_geodata, on = ['Post Code'], how = 'inner')
df_merg.head()

Unnamed: 0,Post Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood\n, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae\n,43.773136,-79.239476


In [28]:
#Sanity check to see if the rows are still the same number as in each individual dataframe before merger
df_merg.shape

(103, 5)