## Segmenting and Clustering Neighborhoods in Toronto

For this assignment, you will be required to explore and cluster the neighborhoods in Toronto.

#### Importing needed Libraries

In [1]:
import pandas as pd # library to process data as dataframes

import matplotlib.pyplot as plt # plotting library
# backend for rendering plots within the browser
%matplotlib inline 

import requests
from bs4 import BeautifulSoup as bs

from sklearn.cluster import KMeans

print("Done importing libraries")

Done importing libraries


## PART 1 ---------------------------------------------------------------------------------------

#### Requesting data from the link 

In [2]:
# Fetch the Wikipedia page that lists the postal codes starting with "M".
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# A timeout keeps a stalled connection from hanging the kernel indefinitely.
req = requests.get(url, timeout=30)
print("Connection Status : ", req.status_code)
# Stop here with a clear HTTP error instead of failing later while parsing
# an error page that does not contain the expected table.
req.raise_for_status()
# Force UTF-8 decoding of the response body before reading it as text.
req.encoding = 'utf-8'
encoding = req.encoding
html_doc = req.text

Connection Status :  200


#### Creating the DataFrame 

In [3]:
# Column layout of the postal-code table; the frame starts with no rows.
columns = [
    "PostalCode",
    "Borough",
    "Neighborhood",
]
df = pd.DataFrame(columns=columns)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood


#### Scrape PostalCode, Borough, Neighborhood from HTML_doc using BeautifulSoup

In [4]:
# Parse the downloaded page and locate the first table with class "wikitable".
soup = bs(html_doc, 'html.parser')
table = soup.find('table', {'class' : 'wikitable'})
rows = table.tbody.find_all('tr')

# Collect one record per row and build the DataFrame once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies it on every iteration. Building from a list also makes this cell safe
# to re-run without duplicating rows.
records = []
for row in rows[1:]:  # the first row is skipped (table header)
    td = row.find_all('td')
    borough = td[1].get_text()
    # Rows without an assigned borough are ignored entirely.
    if borough == "Not assigned":
        continue

    # Strip the newline from the neighborhood cell BEFORE comparing it.
    # Comparing the raw text to "Not assigned" does not match when the cell
    # text ends with a newline, which is what previously forced a hard-coded
    # special case for postal code M7A (Queen's Park).
    neighborhood = td[2].get_text().strip("\n")
    if neighborhood == "Not assigned":
        # An unassigned neighborhood takes the name of its borough.
        neighborhood = borough

    records.append({
        "PostalCode": td[0].get_text(),
        "Borough": borough,
        "Neighborhood": neighborhood,
    })

# Explicit columns keep the expected layout even if no rows were scraped.
df = pd.DataFrame(records, columns=['PostalCode', 'Borough', 'Neighborhood'])
        


In [5]:
# Preview the first rows of the scraped table.
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


#### Verify that no Borough or Neighborhood is left as "Not assigned"

In [6]:
# True only when EVERY row has an assigned borough. The most frequent value
# of the comparison (describe().top) would be True even if a few rows were
# still "Not assigned", so all() is used instead.
(df["Borough"] != "Not assigned").all()

True

In [7]:
# True only when EVERY row has an assigned neighborhood. The most frequent
# value of the comparison (describe().top) would be True even if a few rows
# were still "Not assigned", so all() is used instead.
(df["Neighborhood"] != "Not assigned").all()

True

#### Combining the Neighborhoods that share the same PostalCode

In [8]:
# Collapse all rows that share a postal code into a single row whose
# Neighborhood is the comma-separated list of the original neighborhoods.
#
# One groupby replaces the manual loop, which dropped a row and rebuilt the
# index on every merge and only merged rows that happened to be adjacent.
#   - sort=False keeps postal codes in their order of first appearance.
#   - 'first' keeps the borough of the first row of each postal code.
#   - as_index=False returns PostalCode as a regular column with a fresh
#     0..n-1 index.
df = (
    df.groupby('PostalCode', sort=False, as_index=False)
      .agg({'Borough': 'first', 'Neighborhood': ', '.join})
)

df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park


In [9]:
# Verify the table's dimensions, shown as (rows, columns), after the
# neighborhoods have been combined.
df.shape  

(103, 3)

## PART 2  -------------------------------------------------------------------------------------

##### The geocoder approach was tried first, but it took too long and returned missing data, so the coordinates are loaded from a CSV file instead

In [13]:
# Disabled on purpose: this cell is kept only as a record of the geocoder
# attempt and is not executed.
# NOTE(review): the inner while loop has no retry limit, so it would never
# finish for a postal code whose lookup keeps returning None.
#import geocoder
#latitudes = []
#longitudes = []
#for code in df.PostalCode:
#   # initialize your variable to None
#    coordinations = None
#
#   # loop until you get the coordinates
#   while(coordinations is None):
#       g = geocoder.google('{}, Toronto, Ontario'.format(code))
#       coordinations = g.latlng
#       
#
#   latitudes.append(coordinations[0])
#   longitudes.append(coordinations[1])
#if(len(latitudes) == 103 and len(longitudes) == 103) : print("We got all coordinations")
#else : print("Some thing is wrong we have only this : ", len(latitudes))

In [16]:
# Download the CSV file with the coordinates of each postal code.
# `requests` (imported at the top of the notebook) replaces `urllib2`, which
# exists only on Python 2. The previous code also called .read() a second
# time on the already-read response body.
response = requests.get("https://cocl.us/Geospatial_data", timeout=30)
# Do not save an HTTP error page as if it were the CSV file.
response.raise_for_status()
# Write the raw bytes in binary mode; the context manager closes the file
# even if the write fails.
with open("Geospatial_Coordinates.csv", "wb") as download:
    download.write(response.content)
print('Data downloaded!')

Data downloaded!


In [17]:
# Load the downloaded coordinates and, in the same expression, rename the
# 'Postal Code' column to 'PostalCode' so it matches the key used in df.
Geo = (
    pd.read_csv("Geospatial_Coordinates.csv")
      .rename(columns={'Postal Code': 'PostalCode'})
)

In [18]:
# Attach the columns of Geo to the rows of df with the same PostalCode.
# The merge uses the default join type (inner), so only postal codes present
# in both frames are kept.
df = df.merge(Geo, on='PostalCode')
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Woodbine Gardens, Parkview Hill",43.706397,-79.309937
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937
