# IBM CAPSTONE PROJECT NOTEBOOK
### this notebook will be used mainly for the capstone project

## Segmenting and Clustering Neighborhoods in Toronto

In [1]:
### Importing the necessary packages

import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
##
import numpy as np
## for reading json files 
import json
### importing geopy package to convert adresses into latitude and longitude  
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

### library to handle requests 
import requests
## library to transform json files into pandas' dataframe
## json_normalize is exposed as pandas.json_normalize in current pandas; the
## pandas.io.json location is only kept as a fallback for older installations
try:
    from pandas import json_normalize
except ImportError:
    from pandas.io.json import json_normalize
### Import matplotlib related packages 
import matplotlib.cm as cm
import matplotlib.colors as colors

### importing the folium package for the geography data 
import folium

# The First Step is to Extract the Table on Wikipedia into a DataFrame Object
### In the following lines of code, we will be doing that

In [2]:
## importing a web scraping library
from bs4 import BeautifulSoup
import requests

In [50]:

## get the html text of the website
## the timeout keeps the cell from hanging on a stalled connection, and
## raise_for_status() stops here on an HTTP error instead of parsing an error page
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
websource = response.text

### parse the html with BeautifulSoup (lxml parser)
soup = BeautifulSoup(websource, 'lxml')
#print(soup.prettify()) #will show the beautified version of the web content as html file

In [51]:
# first <table> element found in the parsed page
# (despite the plural name, `tables` holds that single element)
tables = soup.find('table')
#print(tables)






In [55]:

## collect the text of every <td> cell, one list per table row
table_rows = tables.find_all('tr')

res = []
for tr in table_rows:
    td = tr.find_all('td')
    # keep every cell, blank ones included, so each value stays in its own column;
    # the comprehension variable is `cell` so it does not shadow the loop's `tr`
    row = [cell.text.strip() for cell in td]
    # skip rows that carry no text at all (e.g. rows without any <td> cell)
    if any(row):
        res.append(row)


df = pd.DataFrame(res, columns=["Postcode", "Borough", "Neighborhood"])
print(df.head(5))


  Postcode           Borough      Neighborhood
0      M1A      Not assigned      Not assigned
1      M2A      Not assigned      Not assigned
2      M3A        North York         Parkwoods
3      M4A        North York  Victoria Village
4      M5A  Downtown Toronto      Harbourfront


In [56]:
## drop the rows whose borough is 'Not assigned'
## .copy() makes df an independent frame, so the in-place edit of 'Neighborhood'
## further down does not write to a slice of the scraped table
df = df[df.Borough != 'Not assigned'].copy()
df.head(3)

Unnamed: 0,Postcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [57]:
### getting not assigned neighborhood
a = df[df.Neighborhood == 'Not assigned']
print(a) ### seems like the 9th element of our data frame is nonassigned 

  Postcode       Borough  Neighborhood
8      M7A  Queen's Park  Not assigned


In [58]:
### a neighborhood that is 'Not assigned' takes the name of its borough
### (rows are selected by value rather than by the hard-coded index label 8,
### so the cell keeps working if the scraped table changes)
unassigned = df['Neighborhood'] == 'Not assigned'
df.loc[unassigned, 'Neighborhood'] = df.loc[unassigned, 'Borough']

In [59]:
## (rows, columns) of df after the cleaning steps above
df.shape

(212, 3)

# From Here on is for the Second Point in the Submission

## At this step, we will be getting the latitudes and longitudes of the neighborhoods

In [8]:
#!conda install -c conda-forge geocoder --yes
#import geocoder

In [11]:
#postcode = df.Postcode
#postcode = pd.Series.tolist(postcode) ### will make it a list 
#### creating two lists (latitudes and longitudes) by passing the postcode

# initialize your variable to None
#lat_lng_coords = None

# loop until you get the coordinates
#while(lat_lng_coords is None):
  #g = geocoder.google('{}, Toronto, Ontario'.format(postcode))
  #lat_lng_coords = g.latlng

#latitude = lat_lng_coords[0]
#longitude = lat_lng_coords[1] 

In [60]:
## shorten the postal-code column name to 'Post', the key used for the merge further down
df = df.rename({'Postcode': 'Post'}, axis='columns')
df.head()

Unnamed: 0,Post,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


In [61]:
### load the latitude and longitude of each postal code from the csv file
### (the path is relative to the notebook's working directory)
latlon = pd.read_csv('Geospatial_Coordinates.csv')
latlon.head(2)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497


In [62]:
## give the postal-code column the same name ('Post') as the one in df
latlon = latlon.rename({'Postal Code': 'Post'}, axis='columns')
latlon.head(3)

Unnamed: 0,Post,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711


In [63]:
## left join on 'Post': every row of df is kept and receives the coordinates of its postal code
result = df.merge(latlon, how='left', on='Post')
result.head()

Unnamed: 0,Post,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M5A,Downtown Toronto,Regent Park,43.65426,-79.360636
4,M6A,North York,Lawrence Heights,43.718518,-79.464763


In [66]:
## use the full column name 'PostalCode' in the merged frame
result = result.rename({'Post': 'PostalCode'}, axis='columns')
result.head(3)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636


In [67]:
## (rows, columns) of the merged frame
result.shape

(212, 5)

# From Here on for the Third Point in the Submission

For this step of the submission, we will keep only the boroughs whose names contain 'Toronto' and cluster their neighborhoods, as we did with the New York data set.

In [82]:
## boolean helper column: True when the borough name contains 'Toronto'
result["contains"] = result["Borough"].str.contains('Toronto')
result.head(3)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,contains
0,M3A,North York,Parkwoods,43.753259,-79.329656,False
1,M4A,North York,Victoria Village,43.725882,-79.315572,False
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636,True


In [83]:
## keep the rows whose helper column is not False
Toronto = result.loc[result['contains'] != False]
Toronto.head(3)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,contains
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636,True
3,M5A,Downtown Toronto,Regent Park,43.65426,-79.360636,True
13,M5B,Downtown Toronto,Ryerson,43.657162,-79.378937,True


In [84]:
## the helper column has served its purpose; remove it from the filtered frame
Toronto = Toronto.drop(columns='contains')

In [85]:
## first rows of the filtered frame, now without the helper column
Toronto.head(3)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M5A,Downtown Toronto,Regent Park,43.65426,-79.360636
13,M5B,Downtown Toronto,Ryerson,43.657162,-79.378937


## Let's get the Latitudes and Longitudes of Toronto

In [86]:
address = 'Toronto'

# Nominatim needs an identifying user agent; recent geopy releases refuse to
# build the geocoder without one
geolocator = Nominatim(user_agent='toronto_neighborhoods_notebook')
location = geolocator.geocode(address)
# geocode() returns None when nothing matches; fail with a clear message
# instead of an AttributeError on the attribute access below
if location is None:
    raise ValueError('Could not geocode address: {}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))



The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [95]:
## neighborhoods is a second name for the Toronto frame (same object, not a copy)
neighborhoods = Toronto

## Let's cluster the neighborhoods of the boroughs whose names include 'Toronto'

In [115]:
# create a map centred on the Toronto coordinates obtained from the geocoder
toronto_map = folium.Map(location=[latitude, longitude], zoom_start=11)

# add one circle marker per neighborhood row, with a "<neighborhood>, <borough>" popup
marker_columns = neighborhoods[['Latitude', 'Longitude', 'Borough', 'Neighborhood']]
for lat, lng, borough, neighborhood in marker_columns.itertuples(index=False, name=None):
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(toronto_map)

# the last expression of the cell renders the map
toronto_map

In [118]:
import itertools
