# Segmentation and Clustering of Toronto Neighborhoods

## Part 1: Extract data from Toronto Neighborhoods Wiki Page

In [1]:
# import libraries
import pandas as pd
from bs4 import BeautifulSoup
from urllib.request import urlopen
import geocoder as geo
import requests as rqt

from sklearn.cluster import KMeans
import folium

# pd.set_option('display.max_rows', None) ## display all rows in dataframe

### About Dataset

This dataset will consist of postal codes in Toronto, Canada. To get the data, we will need to scrape a Wiki page utilizing BeautifulSoup. 

Wiki Page: [Toronto Postal Codes](https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M)

**Columns in dataset:**

| Field          | Description                                                                           |
|----------------|---------------------------------------------------------------------------------------|
| Postal Code    | First three characters (forward sortation area) of a Canadian postal code             |
| Borough        | A subdivision of a major city                                                         |
| Neighborhood   | A localised community within a borough                                                |

In [2]:
# Download the Wiki page and parse it with BeautifulSoup.
# The `with` block closes the HTTP response as soon as the body has been read,
# instead of leaving the connection open until garbage collection.
with urlopen('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M') as response:
    sauce = response.read()  # raw HTML bytes of the page
soup = BeautifulSoup(sauce, 'lxml')

In [3]:
# Build the neighborhood dataframe from the first table on the Wiki page
column_names = ['PostalCode', 'Borough', 'Neighborhood'] # Define columns

# Parse every table row after the header into a {column name: cell text} record
table_rows = soup.table.find_all('tr')[1:]
records = []
for row in table_rows:
    cells = row.find_all('td')
    # Collapse all whitespace (including the trailing newline of each cell) into
    # single spaces. The previous separator "/n" was a literal two-character string,
    # not a newline, and was injected between the text nodes of a cell.
    # zip() pairs cells with column names and ignores any unexpected extra cells.
    records.append({name: ' '.join(cell.get_text().split())
                    for name, cell in zip(column_names, cells)})

# Create the frame once from all records: DataFrame.append was removed in pandas 2.0,
# and appending row by row copies the whole frame on every iteration.
neighborhood_df = pd.DataFrame(records, columns = column_names)

Display the first five rows in the Toronto neighborhood dataframe

In [4]:
# Preview the parsed table; head() returns the first five rows by default
neighborhood_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [5]:
# Report the frame's dimensions; shape is a (rows, columns) tuple
print("Rows: {0}\nColumns: {1}".format(*neighborhood_df.shape))

Rows: 180
Columns: 3


Remove postal codes that were not assigned a borough.

In [6]:
# Keep only postal codes that have a borough assigned.
# .copy() makes the filtered result an independent frame, so the column assignments
# in later cells do not raise SettingWithCopyWarning.
neighborhood_df = neighborhood_df[neighborhood_df.Borough != "Not assigned"].copy()

# Display number of rows after removal
print("Rows: {0}\nColumns: {1}".format(neighborhood_df.shape[0], neighborhood_df.shape[1]))

Rows: 103
Columns: 3


Populate neighborhood with borough if row has an assigned borough but no neighborhood assigned.

In [7]:
# Where no neighborhood is assigned, fall back to the borough name.
# A new frame is built (rather than assigning through .loc and resetting the index
# in place) so the cell does not mutate a filtered frame and is safe to re-run.
neighborhood_df = neighborhood_df.assign(
    Neighborhood = neighborhood_df.Neighborhood.where(
        neighborhood_df.Neighborhood != "Not assigned", neighborhood_df.Borough)
).reset_index(drop = True) # reset index after row removal

# Display number of rows
print("Rows: {0}\nColumns: {1}".format(neighborhood_df.shape[0], neighborhood_df.shape[1]))

# Display 12 rows; this must be the cell's last expression, otherwise nothing is rendered
neighborhood_df.head(12)

Rows: 103
Columns: 3


## Part 2: Find latitude and longitude for Toronto Neighborhoods using Geocoder

In [8]:
# Add empty float64 columns that the geocoding step fills in afterwards.
# Assigning an empty Series aligns on the index, so every row starts out as NaN.
for coordinate in ('Latitude', 'Longitude'):
    neighborhood_df[coordinate] = pd.Series(dtype = 'float64')

Utilizing Geocoder to make requests to find the latitude and longitude for the neighborhoods.

In [17]:
# Look up the coordinates of every postal code through the ArcGIS geocoder,
# reusing one HTTP session for all requests.
# TODO: one request per row is not a viable solution for a larger dataset
with rqt.Session() as session:
    # Iterate the column by name; positional access such as row[0] on a
    # label-indexed Series is deprecated in pandas 2.x.
    for index, postal_code in neighborhood_df['PostalCode'].items():
        location = str(postal_code) + " Toronto, Ontario"
        geo_arcgis = geo.arcgis(location, session = session, timeout = 20.0)
        latlng = geo_arcgis.latlng
        if not latlng:
            # NOTE(review): presumably geocoder yields None/empty here when the lookup
            # fails -- confirm. Leave the row as NaN rather than aborting the whole loop.
            continue
        neighborhood_df.at[index, 'Latitude'] = latlng[0]
        neighborhood_df.at[index, 'Longitude'] = latlng[1]

In [18]:
# Inspect the first 12 rows, now including the Latitude and Longitude columns
neighborhood_df.head(n = 12)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.752935,-79.335641
1,M4A,North York,Victoria Village,43.728102,-79.31189
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.650964,-79.353041
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.723265,-79.451211
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66179,-79.38939
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667481,-79.528953
6,M1B,Scarborough,"Malvern, Rouge",43.808626,-79.189913
7,M3B,North York,Don Mills,43.7489,-79.35722
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.707193,-79.311529
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657491,-79.377529


## Part 3: Explore and cluster the neighborhoods in Toronto

In [19]:
# Map centre for Toronto as [latitude, longitude] (found through Google search)
toronto_latlng = [
    43.651070,   # latitude
    -79.347015,  # longitude
]

Create a map of Toronto with neighborhoods superimposed on top.

In [45]:
# Base map centred on Toronto
map_toronto = folium.Map(location = [toronto_latlng[0], toronto_latlng[1]],
                         tiles = 'cartodbpositron',
                         zoom_start = 11)

# Limit to boroughs whose name contains "Toronto"; na = False treats a missing borough as no match
toronto_df = neighborhood_df[neighborhood_df.Borough.str.contains(pat = 'Toronto', na = False)]

# Add one marker per postal code. Columns are read by name, because positional access
# such as row[3] on a label-indexed Series is deprecated in pandas 2.x.
# Rows without coordinates are skipped, since folium rejects NaN locations.
for row in toronto_df.dropna(subset = ['Latitude', 'Longitude']).itertuples(index = False):
    folium.CircleMarker(location = [row.Latitude, row.Longitude],
                        radius = 2,
                        tooltip = '{}, {}'.format(row.Borough, row.Neighborhood)).add_to(map_toronto)

map_toronto