# Peer-graded Assignment: Segmenting and Clustering Neighborhoods in Toronto

### Import Dependencies

In [1]:
import pandas as pd

In [2]:
#%pip install lxml # Run once when the notebook was created; lxml supports pandas read_html

### Read the HTML tables from the Wikipedia page into a list of dataframes.

In [3]:
# Source page: Wikipedia's list of Canadian postal codes beginning with "M".
# read_html returns a list of DataFrames parsed from the tables on the page.
WIKI_URL = 'http://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
html_data = pd.read_html(WIKI_URL)

In [4]:
type(html_data)

list

In [5]:
html_data

[    Postcode           Borough          Neighbourhood
 0        M1A      Not assigned           Not assigned
 1        M2A      Not assigned           Not assigned
 2        M3A        North York              Parkwoods
 3        M4A        North York       Victoria Village
 4        M5A  Downtown Toronto           Harbourfront
 ..       ...               ...                    ...
 283      M8Z         Etobicoke              Mimico NW
 284      M8Z         Etobicoke     The Queensway West
 285      M8Z         Etobicoke  Royal York South West
 286      M8Z         Etobicoke         South of Bloor
 287      M9Z      Not assigned           Not assigned
 
 [288 rows x 3 columns],
                                                   0   \
 0                                                NaN   
 1  NL NS PE NB QC ON MB SK AB BC NU/NT YT A B C E...   
 2                                                 NL   
 3                                                  A   
 
                          

#### Assign the table to a dataframe object.

In [6]:
# The first parsed table is the Postcode / Borough / Neighbourhood listing.
wiki_table = html_data[0]

In [7]:
type(wiki_table)

pandas.core.frame.DataFrame

In [8]:
wiki_table.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


#### The dataframe will consist of three columns: PostalCode, Borough, and Neighborhood

In [9]:
# Rename to the column names used in the rest of the notebook.
# rename() returns a new frame, so the scraped table in html_data[0] keeps its
# original headers (inplace=True would modify that same object).
wiki_table = wiki_table.rename(
    columns={'Postcode': 'PostalCode', 'Neighbourhood': 'Neighborhood'}
)

#### Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.

In [10]:
# Keep only rows whose borough is actually assigned.
# An exact comparison is used rather than str.contains: contains() does
# substring/regex matching and returns NaN for missing values, which makes the
# `~` negation fail. Rows with a missing borough are dropped as unassigned too.
has_borough = wiki_table['Borough'].notna() & wiki_table['Borough'].ne('Not assigned')
wiki_table = wiki_table[has_borough]

In [11]:
# Renumber the remaining rows from 0; drop=True discards the old index
# instead of keeping it as an extra column.
wiki_table = wiki_table.reset_index(drop=True)

In [12]:
wiki_table.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


#### More than one neighborhood can exist in one postal code area. For example, in the table on the Wikipedia page, you will notice that M5A is listed twice and has two neighborhoods: Harbourfront and Regent Park. These two rows will be combined into one row with the neighborhoods separated by a comma, as in the grouped table produced below (e.g. M1B becomes "Rouge,Malvern").

In [13]:
wiki_table.columns

Index(['PostalCode', 'Borough', 'Neighborhood'], dtype='object')

In [14]:
wiki_table.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


In [15]:
# One row per (PostalCode, Borough): the neighborhoods that share a postal
# code are concatenated into a single comma-separated string.
wiki_unique_postal = (
    wiki_table
    .groupby(['PostalCode', 'Borough'])['Neighborhood']
    .agg(','.join)
    .reset_index()
)

In [16]:
wiki_unique_postal.head(15)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff,Cliffside West"


#### If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough. So for the 9th cell in the table on the Wikipedia page, the value of the Borough and the Neighborhood columns will be Queen's Park.

In [17]:
wiki_unique_postal[wiki_unique_postal['Neighborhood'].str.contains('Not ass')]

Unnamed: 0,PostalCode,Borough,Neighborhood
85,M7A,Queen's Park,Not assigned


In [18]:
# Where the neighborhood is 'Not assigned', fall back to the borough name.
unassigned = wiki_unique_postal['Neighborhood'].eq('Not assigned')
wiki_unique_postal.loc[unassigned, 'Neighborhood'] = wiki_unique_postal.loc[unassigned, 'Borough']

In [19]:
wiki_unique_postal[wiki_unique_postal['Neighborhood'].str.contains('Not ass')]

Unnamed: 0,PostalCode,Borough,Neighborhood


In [20]:
wiki_unique_postal[wiki_unique_postal.PostalCode.str.contains('M7A')]

Unnamed: 0,PostalCode,Borough,Neighborhood
85,M7A,Queen's Park,Queen's Park


In [21]:
wiki_unique_postal.shape

(103, 3)

In [22]:
#%pip install geocoder
import geocoder # import geocoder

In [24]:
# Turned given while loop into a function.
def get_coords(postal_code, max_attempts=10, geocode=None):
    """Look up the latitude and longitude of a Toronto postal code.

    The geocoder is queried with '<postal_code>, Toronto, Ontario' and the
    lookup is repeated until it yields coordinates. The number of attempts is
    bounded so that a geocoder that never answers raises an error rather than
    looping forever.

    Parameters
    ----------
    postal_code : str
        Postal code to look up, e.g. 'M5A'.
    max_attempts : int, optional
        Maximum number of lookups before giving up (default 10).
    geocode : callable, optional
        Function taking the query string and returning an object with a
        ``latlng`` attribute. Defaults to ``geocoder.google``.

    Returns
    -------
    tuple
        ``(latitude, longitude)`` taken from the first successful lookup.

    Raises
    ------
    ValueError
        If ``max_attempts`` is smaller than 1.
    RuntimeError
        If no lookup returned coordinates within ``max_attempts`` tries.
    """
    if max_attempts < 1:
        raise ValueError('max_attempts must be at least 1')

    if geocode is None:
        # Resolved lazily so a caller-supplied geocode works without the package.
        geocode = geocoder.google

    query = '{}, Toronto, Ontario'.format(postal_code)

    for _ in range(max_attempts):
        lat_lng_coords = geocode(query).latlng
        # A failed lookup gives None (or an empty list); both count as a miss.
        if lat_lng_coords:
            return lat_lng_coords[0], lat_lng_coords[1]

    raise RuntimeError(
        'No coordinates returned for {!r} after {} attempts'.format(query, max_attempts)
    )

In [26]:
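# Could not get this to work: the run was interrupted manually (see the KeyboardInterrupt below), so the loop is left commented out.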
#for ind in wiki_unique_postal.index:
#    lat, long = get_coords(wiki_unique_postal['PostalCode'][ind])
#    wiki_unique_postal['Latitude'][ind] = lat
#    wiki_unique_postal['Longitude'][ind] = long

KeyboardInterrupt: 

#### Instead of geocoding with Google, the coordinates are read from the Geospatial_Coordinates.csv file.

In [28]:
import csv

In [30]:
# Map each postal code (first CSV column) to [second column, third column],
# kept as the raw strings read from the file. Rows whose first field contains
# 'Postal Code' (the header) are skipped.
coords_dict = {}
with open('Geospatial_Coordinates.csv', newline='') as f:
    coords_dict.update(
        (row[0], [row[1], row[2]])
        for row in csv.reader(f)
        if 'Postal Code' not in row[0]
    )

In [35]:
coords_dict['M1B'][0]

'43.8066863'

In [39]:
# Build one latitude and one longitude per row of wiki_unique_postal, in row
# order. A postal code missing from coords_dict gets NaN instead of being
# skipped, so both lists always have the same length as the frame and stay
# aligned with its rows. Values are converted to float so the resulting
# columns are numeric rather than text.
missing_coords = (float('nan'), float('nan'))
latitude = []
longitude = []
for postal_code in wiki_unique_postal['PostalCode']:
    lat, lng = coords_dict.get(postal_code, missing_coords)
    latitude.append(float(lat))
    longitude.append(float(lng))

In [40]:
# Add the Latitude column. Plain assignment is safe to re-run: it overwrites
# the column, whereas insert(..., allow_duplicates=True) would add a second
# "Latitude" column each time the cell is executed again. On the three-column
# frame built above, the new column lands in the same position (index 3).
wiki_unique_postal['Latitude'] = latitude

In [42]:
wiki_unique_postal.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude
0,M1B,Scarborough,"Rouge,Malvern",43.8066863
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.7845351
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.7635726
3,M1G,Scarborough,Woburn,43.7709921
4,M1H,Scarborough,Cedarbrae,43.773136


In [43]:
# Add the Longitude column after Latitude. Plain assignment is safe to re-run:
# it overwrites the column, whereas insert(..., allow_duplicates=True) would
# add a second "Longitude" column each time the cell is executed again.
wiki_unique_postal['Longitude'] = longitude

In [45]:
wiki_unique_postal.head(11)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.8066863,-79.1943534
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.7845351,-79.1604971
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.7635726,-79.1887115
3,M1G,Scarborough,Woburn,43.7709921,-79.2169174
4,M1H,Scarborough,Cedarbrae,43.773136,-79.2394761
5,M1J,Scarborough,Scarborough Village,43.7447342,-79.2394761
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park",43.7279292,-79.2620294
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge",43.7111117,-79.2845772
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West",43.716316,-79.2394761
9,M1N,Scarborough,"Birch Cliff,Cliffside West",43.692657,-79.2648481
