# Segmenting and Clustering Neighborhoods in Toronto

### 1. Let's import libraries

In [7]:
# library for data analysis
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# library to handle data in a vectorized manner
import numpy as np 
from bs4 import BeautifulSoup

# library to handle JSON files
import json
# transform JSON file into a pandas dataframe
from pandas import json_normalize  

# module to convert an address into latitude and longitude values
!pip install geopy
from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values

# library to handle requests
import requests

# map rendering library
!conda install -c conda-forge folium=0.5.0 --yes
import folium 

# import k-means from clustering stage
from sklearn.cluster import KMeans

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



### 2. Let's scrape the Wikipedia page to obtain the data, then convert it into a table with "PostalCode", "Borough", "Neighborhood" columns

In [72]:
# Download the Wikipedia page that lists the Canadian postal codes starting with "M".
WIKI_URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(WIKI_URL, timeout=30)
# Fail fast on 4xx/5xx so an error page is never parsed as if it were the data.
response.raise_for_status()

# NOTE: despite its name, `url` holds the page's HTML text, not the address.
# The name is kept so anything that already refers to it keeps working.
url = response.text

# Parse the HTML with BeautifulSoup using the lxml parser.
soup = BeautifulSoup(url, 'lxml')

In [73]:
# Build the PostalCode / Borough / Neighborhood table from the first <table> on the page.
NOT_ASSIGNED = "Not assigned"

table = soup.find("table")
if table is None:
    raise ValueError("No <table> element found in the downloaded page")

# Depending on the markup/parser there may be no <tbody>; fall back to the table itself.
table_body = table.tbody if table.tbody is not None else table
table_rows = table_body.find_all("tr")

res = []
for tr in table_rows:
    # Cell text is kept raw here (it may still carry a trailing newline).
    row = [cell.text for cell in tr.find_all("td")]

    # Rows without exactly three <td> cells (e.g. the header row, which has none)
    # cannot be mapped onto the three columns, so they are skipped.
    if len(row) != 3:
        continue

    borough, neighborhood = row[1], row[2]

    # We only need cells with a defined borough. Ignore rows whose borough is "Not assigned".
    # Comparing the stripped text makes the check independent of trailing whitespace.
    if borough.strip() == NOT_ASSIGNED:
        continue

    # If a row has a borough but a "Not assigned" neighborhood, the neighborhood
    # is assumed to be the same as the borough.
    if NOT_ASSIGNED in neighborhood:
        row[2] = borough

    res.append(row)

df = pd.DataFrame(res, columns = ["PostalCode", "Borough", "Neighborhood"])
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A\n,North York\n,Parkwoods\n
1,M4A\n,North York\n,Victoria Village\n
2,M5A\n,Downtown Toronto\n,"Regent Park, Harbourfront\n"
3,M6A\n,North York\n,"Lawrence Manor, Lawrence Heights\n"
4,M7A\n,Downtown Toronto\n,"Queen's Park, Ontario Provincial Government\n"


In [74]:
# Drop the newline characters left over from the HTML cell text in every column.
df = df.replace(to_replace='\n', value='', regex=True)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [75]:
# Collapse rows that share a postal code and borough into one row whose
# Neighborhood is the comma-separated list of that group's neighborhoods.
df = (
    df.groupby(["PostalCode", "Borough"])["Neighborhood"]
    .agg(", ".join)
    .reset_index()
)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [77]:
# Report the table's dimensions via the .shape attribute
# (after grouping, there is one row per PostalCode/Borough pair).
print("rows, columns: ", df.shape, sep=" ")

rows, columns:  (103, 3)


# End of the 1st part of the assignment

### 3. Let's get the latitude and the longitude coordinates for each neighborhood