# The first part of this code is copied from "Segmentation and Clustering of Neighborhoods - Part 1 - KR #

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# first step is to get the url
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
link = requests.get(url).text
toronto = BeautifulSoup(link,'lxml')

In [3]:
# extracting the information and putting it in the dataframe

# creating the dataframe
column_name = ['Postalcode', 'Borough', 'Neighborhood']
toronto_df = pd.DataFrame(columns = column_name)

# going through toronto to extract the Postal Code, Borough, and Neighborhood
info = toronto.find('div', class_='mw-parser-output')
table = info.table.tbody
postcode = 0
borough = 0
neighborhood = 0

for tr in table.find_all('tr'):
    i = 0
    for td in tr.find_all('td'):
        if i == 0:
            postcode = td.text
            i = i + 1
        elif i == 1:
            borough = td.text
            i = i + 1
        elif i == 2: 
            neighborhood = td.text.strip('\n').replace(']','')
    toronto_df = toronto_df.append({'Postalcode': postcode,'Borough': borough,'Neighborhood': neighborhood},ignore_index=True)

In [4]:
toronto_df.head()

Unnamed: 0,Postalcode,Borough,Neighborhood
0,0,0,0
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


Getting rid of Boroughs with a value of "Not assigned"

In [5]:
toronto_df = toronto_df[toronto_df.Borough != 'Not assigned']
toronto_df.head()

Unnamed: 0,Postalcode,Borough,Neighborhood
0,0,0,0
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park


Getting rid of the first row (all 0's) and re-naming Postalcode to Postal Code

In [6]:
toronto_df = toronto_df[toronto_df.Borough != 0]
toronto_df.rename(columns = {'Postalcode': 'Postal Code'}, inplace = True)
toronto_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights


Removing Neighborhoods that have a value of "Not assigned" and consolidating Borough's which have multiple Neighborhoods listed

In [7]:
toronto_df[toronto_df.Neighborhood == "Not assigned"]

Unnamed: 0,Postal Code,Borough,Neighborhood
9,M7A,Queen's Park,Not assigned


In [8]:
toronto_df[toronto_df.Borough == "Queen's Park"]

# Borough was erased in code that was previously run

Unnamed: 0,Postal Code,Borough,Neighborhood
9,M7A,Queen's Park,Not assigned


In [9]:
# Consolidating the neighborhoods

toronto_df = toronto_df.groupby(["Postal Code", "Borough"])['Neighborhood'].apply(', '.join).reset_index()

In [10]:
toronto_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


# Part 2 - Segmenting the neigborhoods by including latitude and longitude information #

In [12]:
import numpy as np

In [13]:
#!conda install -c conda-forge geopy --yes --> #used to install the geopy library

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... failed with initial frozen solve. Retrying with flexible solve.
Collecting package metadata (repodata.json): ...working... done
Solving environment: ...working... done

## Package Plan ##

  environment location: C:\Users\karun\AppData\Local\Continuum\anaconda3

  added / updated specs:
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    certifi-2019.6.16          |           py37_0         148 KB  conda-forge
    conda-4.7.12               |           py37_0         3.0 MB  conda-forge
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    geopy-1.20.0               |             py_0          57 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         3.2 MB

The 

In [14]:
from geopy.geocoders import Nominatim

In [15]:
toronto_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [16]:
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="ca_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [17]:
address1 = 'M1B, Toronto, Ontario'

location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude

print('{}, {}'.format(latitude, longitude))

43.653963, -79.387207


In [19]:
address2 = 'M6P, Toronto, Ontario'

location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude

print('{}, {}'.format(latitude, longitude))

43.653963, -79.387207


#### Installed the geocoder library as I kept getting the same coordinates regardless of postal code ####

In [23]:
#!conda install -c conda-forge geocoder --yes --> installed the geocoder library as I found it would allow me to use the Postal Code

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

## Package Plan ##

  environment location: C:\Users\karun\AppData\Local\Continuum\anaconda3

  added / updated specs:
    - geocoder


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geocoder-1.38.1            |             py_1          53 KB  conda-forge
    ratelim-0.1.6              |             py_2           6 KB  conda-forge
    ------------------------------------------------------------
                                           Total:          59 KB

The following NEW packages will be INSTALLED:

  geocoder           conda-forge/noarch::geocoder-1.38.1-py_1
  ratelim            conda-forge/noarch::ratelim-0.1.6-py_2



Downloading and Extracting Packages

geocoder-1.38.1      | 53 KB     |            |   0% 
geocoder-1.38.1      | 53 KB     | ###        |  30% 
geo

In [24]:
import geocoder

In [32]:
def get_coordinates(postal_code):
    # initialize your variable to None
    coordinates = None
    # loop until you get the coordinates
    while(coordinates is None):
        g = geocoder.arcgis('{}, Toronto, Ontario'.format(postal_code))
        coordinates = g.latlng
    return coordinates
    
get_coordinates('M6P')

[43.659935000000075, -79.46301926299998]

In [28]:
get_coordinates('M1B')

[43.811525000000074, -79.19551746399998]

#### Now that I got different coordinates, it's time to do this to the entire dataframe ####

In [35]:
pc = toronto_df['Postal Code']

lat_lng = [get_coordinates(codes) for codes in pc.tolist()]

In [37]:
df_latlong = pd.DataFrame(lat_lng, columns = ['Latitude', "Longitude"])
toronto_df['Latitude'] = df_latlong['Latitude']
toronto_df['Longitude'] = df_latlong['Longitude']

In [38]:
toronto_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.811525,-79.195517
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.785665,-79.158725
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.765815,-79.175193
3,M1G,Scarborough,Woburn,43.768369,-79.21759
4,M1H,Scarborough,Cedarbrae,43.769688,-79.23944
