# Part I - Segmenting and Clustering Neighborhoods in Toronto

## In this assignment we have to create a pandas DataFrame from the table of Toronto postal codes, boroughs and neighborhoods located on the Wikipedia web page

In [1]:
#first we have to import needed libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

### First we have to scrape the table from the web page with the requests and BeautifulSoup Python libraries

In [2]:
# Download the Wikipedia page that lists the postal codes and parse it with
# BeautifulSoup (lxml parser).
# The timeout keeps the notebook from hanging on a stalled connection, and
# raise_for_status() stops us from silently parsing an HTTP error page.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
source = response.text
soup = BeautifulSoup(source, 'lxml') #parser is lxml

In [3]:
#print(soup.prettify()) #the prettify method prints the markup with indentation, like the real HTML page

In [4]:
# Pick the postal-code table out of the parsed page: the first <table> whose
# class attribute is 'wikitable sortable'.
table = soup.find('table', attrs={'class': 'wikitable sortable'})

In [5]:
# Collect every <tr> (table row) element of the table and report how many there are.
table_rows = table.find_all(name='tr')
len(table_rows)

289

In [6]:
# Turn every table row into a list holding the stripped text of its <td> cells
# (strip() removes the trailing '\n'). A row without <td> cells yields an empty list.
toronto_list = [
    [cell.text.strip() for cell in row_tag.find_all('td')]
    for row_tag in table_rows
]
toronto_list

[[],
 ['M1A', 'Not assigned', 'Not assigned'],
 ['M2A', 'Not assigned', 'Not assigned'],
 ['M3A', 'North York', 'Parkwoods'],
 ['M4A', 'North York', 'Victoria Village'],
 ['M5A', 'Downtown Toronto', 'Harbourfront'],
 ['M5A', 'Downtown Toronto', 'Regent Park'],
 ['M6A', 'North York', 'Lawrence Heights'],
 ['M6A', 'North York', 'Lawrence Manor'],
 ['M7A', "Queen's Park", 'Not assigned'],
 ['M8A', 'Not assigned', 'Not assigned'],
 ['M9A', 'Etobicoke', 'Islington Avenue'],
 ['M1B', 'Scarborough', 'Rouge'],
 ['M1B', 'Scarborough', 'Malvern'],
 ['M2B', 'Not assigned', 'Not assigned'],
 ['M3B', 'North York', 'Don Mills North'],
 ['M4B', 'East York', 'Woodbine Gardens'],
 ['M4B', 'East York', 'Parkview Hill'],
 ['M5B', 'Downtown Toronto', 'Ryerson'],
 ['M5B', 'Downtown Toronto', 'Garden District'],
 ['M6B', 'North York', 'Glencairn'],
 ['M7B', 'Not assigned', 'Not assigned'],
 ['M8B', 'Not assigned', 'Not assigned'],
 ['M9B', 'Etobicoke', 'Cloverdale'],
 ['M9B', 'Etobicoke', 'Islington'],
 ['M

In [7]:
# Number of scraped entries, including any empty entry produced by a row without <td> cells.
len(toronto_list)

289

In [8]:
# Remove the empty entries produced by rows that contain no <td> cells (the
# table header). Filtering on emptiness instead of slicing off the first item
# keeps this cell idempotent: re-running it never discards a real data row.
toronto_list = [row for row in toronto_list if row]
toronto_list[:5]

[['M1A', 'Not assigned', 'Not assigned'],
 ['M2A', 'Not assigned', 'Not assigned'],
 ['M3A', 'North York', 'Parkwoods'],
 ['M4A', 'North York', 'Victoria Village'],
 ['M5A', 'Downtown Toronto', 'Harbourfront']]

### Now we have to do the following:
* create a pandas DataFrame from the list of lists that was scraped from the web page
* drop all rows where the value of the Borough is 'Not assigned'
* change the Neighborhood in any row that has the neighborhood value 'Not assigned' to the corresponding Borough name
* combine all rows that have the same PostalCode and Borough, like M5A and Downtown Toronto, into one row with 2 or more neighborhoods. For this we have to use the groupby method.

In [9]:
# Build the DataFrame from the scraped rows and give the positional columns
# (0, 1, 2) descriptive names.
column_names = {0: 'PostalCode', 1: 'Borough', 2: 'Neighborhood'}
df = pd.DataFrame(toronto_list).rename(columns=column_names)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [10]:
# Number of rows in the freshly built DataFrame.
len(df)

288

In [11]:
# Count the rows whose Borough is 'Not assigned'; these are the rows to be dropped.
unassigned_mask = df['Borough'].eq('Not assigned')
len(df[unassigned_mask])

77

In [12]:
# Keep only the rows whose Borough is assigned and report the remaining row count.
# The explicit .copy() makes the result an independent DataFrame, so the later
# .loc assignment to 'Neighborhood' does not raise pandas' SettingWithCopyWarning.
df = df.loc[df['Borough'] != 'Not assigned'].copy()
len(df)

211

In [13]:
# Code to drop all rows where the Borough or the Neighborhood is 'Not assigned'. There are 78 such rows.
#len(df.loc[(df['Borough'] == 'Not assigned') | (df['Neighborhood'] == 'Not assigned')])
#df = df.loc[(df['Borough'] != 'Not assigned') & (df['Neighborhood'] != 'Not assigned')]

In [14]:
# Wherever the Neighborhood is still 'Not assigned', use the Borough name of the
# same row. Copying the row's own Borough (instead of hard-coding one borough
# name) stays correct if more than one such row exists.
not_assigned = df['Neighborhood'] == 'Not assigned'
df.loc[not_assigned, 'Neighborhood'] = df.loc[not_assigned, 'Borough']

In [15]:
# Sanity check: show the rows of the "Queen's Park" borough to confirm the
# Neighborhood value was filled in.
df[df['Borough'].eq("Queen's Park")]

Unnamed: 0,PostalCode,Borough,Neighborhood
8,M7A,Queen's Park,Queen's Park


In [16]:
# Rows sharing the same PostalCode and Borough (e.g. M5A / Downtown Toronto)
# must be merged into one row listing all their neighborhoods.
# Show the M5A rows as an example of what needs combining.
df[df['PostalCode'].eq('M5A')]

Unnamed: 0,PostalCode,Borough,Neighborhood
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park


In [17]:
# Group by PostalCode and Borough and concatenate the neighborhood names of each
# group with ','. The result is a Series indexed by (PostalCode, Borough).
neighborhoods_by_code = df.groupby(['PostalCode', 'Borough'])['Neighborhood']
df_grouped = neighborhoods_by_code.agg(','.join)

In [18]:
# Preview the first 8 grouped entries.
df_grouped.head(8)

PostalCode  Borough    
M1B         Scarborough                                Rouge,Malvern
M1C         Scarborough         Highland Creek,Rouge Hill,Port Union
M1E         Scarborough              Guildwood,Morningside,West Hill
M1G         Scarborough                                       Woburn
M1H         Scarborough                                    Cedarbrae
M1J         Scarborough                          Scarborough Village
M1K         Scarborough    East Birchmount Park,Ionview,Kennedy Park
M1L         Scarborough                Clairlea,Golden Mile,Oakridge
Name: Neighborhood, dtype: object

In [19]:
# The grouped result is a pandas Series (here with a two-level index), not a DataFrame.
type(df_grouped)

pandas.core.series.Series

In [20]:
# Promote the grouped Series to a single-column DataFrame; the
# (PostalCode, Borough) index levels are kept as the index.
df_toronto = df_grouped.to_frame()
df_toronto.head(8)

Unnamed: 0_level_0,Unnamed: 1_level_0,Neighborhood
PostalCode,Borough,Unnamed: 2_level_1
M1B,Scarborough,"Rouge,Malvern"
M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
M1E,Scarborough,"Guildwood,Morningside,West Hill"
M1G,Scarborough,Woburn
M1H,Scarborough,Cedarbrae
M1J,Scarborough,Scarborough Village
M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"


In [21]:
# Flatten the (PostalCode, Borough) index into ordinary columns.
# Deriving the frame from df_grouped (instead of reassigning df_toronto from
# itself) keeps this cell idempotent: re-running it does not add a stray
# 'index' column.
df_toronto = df_grouped.reset_index()
df_toronto.head(8)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"


In [22]:
# Dimensions (rows, columns) of the flattened Toronto table.
df_toronto.shape

(103, 3)

# Part 2 - Segmenting and Clustering Neighborhoods in Toronto

Now that we have built a dataframe of the postal code of each neighborhood along with the borough name and neighborhood name, in order to utilize the Foursquare location data, we need to get the latitude and the longitude coordinates of each neighborhood.

### Geocoding with geocoder.google does not work; it works with geocoder.arcgis, but the coordinates are not the same as the ones in the csv file.
Here is the code used with geocoder.arcgis

In [23]:
import geocoder # import geocoder


def _geocode_postal_code(postal_code, max_attempts=5):
    """Return [latitude, longitude] for a Toronto postal code via ArcGIS.

    The lookup is retried up to ``max_attempts`` times because a single request
    may come back without coordinates.

    Raises:
        RuntimeError: if no attempt returned coordinates.
    """
    query = '{}, Toronto, Ontario'.format(postal_code)
    for _ in range(max_attempts):
        coords = geocoder.arcgis(query).latlng
        # A failed lookup yields no coordinates (None/empty); retry in that case.
        if coords:
            return coords
    raise RuntimeError(
        'No coordinates returned for {!r} after {} attempts'.format(query, max_attempts)
    )


latitude_list = []
longitude_list = []

# Geocode every postal code. Retrying happens per postal code, so a failed
# lookup can no longer crash on indexing a missing result, and an empty
# df_toronto simply produces empty lists instead of looping forever.
for postal_code in df_toronto['PostalCode']:
    lat_lng_coords = _geocode_postal_code(postal_code)
    latitude_list.append(lat_lng_coords[0])
    longitude_list.append(lat_lng_coords[1])


In [24]:
# Number of latitudes collected by the geocoding loop.
len(latitude_list)

103

In [25]:
# Number of longitudes collected by the geocoding loop.
len(longitude_list)

103

In [26]:
# Wrap the geocoded latitudes in a one-column DataFrame and preview the first 6 values.
df_latitude = pd.DataFrame(data=latitude_list)
df_latitude.iloc[:6]

Unnamed: 0,0
0,43.811525
1,43.785665
2,43.765815
3,43.768369
4,43.769688
5,43.743125


In [27]:
# Wrap the geocoded longitudes in a one-column DataFrame and preview the first 6 values.
df_longitude = pd.DataFrame(data=longitude_list)
df_longitude.iloc[:6]

Unnamed: 0,0
0,-79.195517
1,-79.158725
2,-79.175193
3,-79.21759
4,-79.23944
5,-79.23175


In [28]:
# Preview the first 6 rows of df_toronto.
df_toronto.head(6)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village


### I have also tested the coordinates with pgeocode; they are not the same as the ones in the csv file, nor as the ones that I get with geocoder.arcgis

In [29]:
import pgeocode

# Look up a handful of postal codes with pgeocode's Canadian ('ca') dataset so
# their coordinates can be compared with the other sources.
sample_codes = ["M1B", "M1C", "M1E", "M1G", "M1H"]
nomi = pgeocode.Nominatim('ca')
data = nomi.query_postal_code(sample_codes)
data

Unnamed: 0,postal_code,country code,place_name,state_name,state_code,county_name,county_code,community_name,community_code,latitude,longitude,accuracy
0,M1B,CA,Scarborough (Malvern / Rouge River),Ontario,ON,Scarborough,,,,43.8113,-79.193,6.0
1,M1C,CA,Scarborough (Rouge Hill / Port Union / Highlan...,Ontario,ON,Scarborough,,,,43.7878,-79.1564,6.0
2,M1E,CA,Scarborough (Guildwood / Morningside / Ellesmere),Ontario,ON,Scarborough,,,,43.7678,-79.1866,6.0
3,M1G,CA,Scarborough (Woburn),Ontario,ON,Scarborough,,,,43.7712,-79.2144,6.0
4,M1H,CA,Scarborough (Cedarbrae),Ontario,ON,Scarborough,,,,43.7686,-79.2389,6.0


### Due to the differences between values of the coordinates in the csv file and the values with geocoder and pgeocode I will make DF with the values from the csv file.

In [30]:
# Load the coordinates from the csv file and preview them.
coordinates_csv = 'Geospatial_Coordinates_Toronto_Neighbours.csv'
long_lat = pd.read_csv(coordinates_csv)
long_lat.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [31]:
# Drop the 'Postal Code' column so that only Latitude and Longitude remain.
long_lat = long_lat.drop(columns='Postal Code')
long_lat.head()

Unnamed: 0,Latitude,Longitude
0,43.806686,-79.194353
1,43.784535,-79.160497
2,43.763573,-79.188711
3,43.770992,-79.216917
4,43.773136,-79.239476


In [32]:
# Dimensions (rows, columns) of the coordinates table.
long_lat.shape

(103, 2)

In [33]:
# Dimensions (rows, columns) of df_toronto.
df_toronto.shape

(103, 3)

In [34]:
# Attach the coordinates to the Toronto table column-wise.
# pd.concat(axis=1) aligns rows on the index, so differing indexes would
# silently produce NaN-padded rows; fail loudly instead.
# NOTE(review): rows are paired by position only. This assumes the csv lists the
# postal codes in the same order as df_toronto - confirm.
assert df_toronto.index.equals(long_lat.index), \
    'df_toronto and long_lat must share the same index to be combined row by row'
df = pd.concat([df_toronto, long_lat], axis=1)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [38]:
# Dimensions (rows, columns) of the combined DataFrame.
df.shape

(103, 5)