# For this assignment, you will be required to explore and cluster the neighborhoods in Toronto.

## Start by creating a new Notebook for this assignment.
### Use the Notebook to build the code to scrape the following Wikipedia page, https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M, in order to obtain the data that is in the table of postal codes and to transform the data into a pandas dataframe like the one shown below:

### 1 - Pre-Processing

In [1]:
# importing necessary libraries
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests

In [2]:
# getting data from internet
wikipedia_link='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
raw_wikipedia_page= requests.get(wikipedia_link).text

# using beautiful soup to parse the HTML/XML codes.
soup = BeautifulSoup(raw_wikipedia_page,'xml')
#print(soup.prettify())

### Pre-Processing (Part 1) : Extracting raw table (From Website)

In [3]:
# extracting the raw table inside that webpage
table = soup.find('table')

Postcode      = []
Borough       = []
Neighbourhood = []

# print(table)

# extracting a clean form of the table
for tr_cell in table.find_all('tr'):
    
    counter = 1
    Postcode_var      = -1
    Borough_var       = -1
    Neighbourhood_var = -1
    
    for td_cell in tr_cell.find_all('td'):
        if counter == 1: 
            Postcode_var = td_cell.text
        if counter == 2: 
            Borough_var = td_cell.text
            tag_a_Borough = td_cell.find('a')
            
        if counter == 3: 
            Neighbourhood_var = str(td_cell.text).strip()
            tag_a_Neighbourhood = td_cell.find('a')
            
        counter +=1
        
    if (Postcode_var == 'Not assigned' or Borough_var == 'Not assigned' or Neighbourhood_var == 'Not assigned'): 
        continue
    try:
        if ((tag_a_Borough is None) or (tag_a_Neighbourhood is None)):
            continue
    except:
        pass
    if(Postcode_var == -1 or Borough_var == -1 or Neighbourhood_var == -1):
        continue
        
    Postcode.append(Postcode_var)
    Borough.append(Borough_var)
    Neighbourhood.append(Neighbourhood_var)

### Pre-Processing (Part 2): Integrating Postal codes with more than 1 Neighbours.

In [4]:
unique_p = set(Postcode)
print('num of unique Postal codes:', len(unique_p))
Postcode_u      = []
Borough_u       = []
Neighbourhood_u = []


for postcode_unique_element in unique_p:
    p_var = ''; b_var = ''; n_var = ''; 
    for postcode_idx, postcode_element in enumerate(Postcode):
        if postcode_unique_element == postcode_element:
            p_var = postcode_element;
            b_var = Borough[postcode_idx]
            if n_var == '': 
                n_var = Neighbourhood[postcode_idx]
            else:
                n_var = n_var + ', ' + Neighbourhood[postcode_idx]
    Postcode_u.append(p_var)
    Borough_u.append(b_var)
    Neighbourhood_u.append(n_var)

num of unique Postal codes: 77


### Post-Processing: Creating Pandas Dataframe

In [5]:
toronto_dict = {'Postal Code':Postcode_u, 'Borough':Borough_u, 'Neighbourhood':Neighbourhood_u}
df_toronto = pd.DataFrame.from_dict(toronto_dict)
df_toronto.to_csv('toronto_part1.csv')
df_toronto.head(14)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M5P,Central Toronto,Forest Hill North
1,M1R,Scarborough,"Maryvale, Wexford"
2,M4W,Downtown Toronto,Rosedale
3,M6P,West Toronto,High Park
4,M4N,Central Toronto,Lawrence Park
5,M4L,East Toronto,India Bazaar
6,M4V,Central Toronto,"Deer Park, Rathnelly, South Hill"
7,M6L,North York,Downsview
8,M4A,North York,Victoria Village
9,M1B,Scarborough,"Rouge, Malvern"


In [6]:
df_toronto = df_toronto[df_toronto.Borough != "Not assigned"]

In [7]:
df_toronto = df_toronto.groupby(["Postal Code", "Borough"]).agg(lambda x: ', '.join(set(x))).reset_index()

In [8]:
df_toronto[df_toronto.Neighbourhood == 'Not assigned']

Unnamed: 0,Postal Code,Borough,Neighbourhood


In [9]:
df_toronto.loc[8, "Neighbourhood"] = df_toronto.loc[8, "Borough"]
df_toronto[df_toronto.Neighbourhood == 'Not assigned']

Unnamed: 0,Postal Code,Borough,Neighbourhood


In [10]:
df_toronto.shape

(77, 3)

### Part 2: Pre-processing - Installing Geocoder


In [11]:
!pip install geocoder
print('geocoder has been installed before.')
import geocoder
print('geocoder has been successfully imported.')

geocoder has been installed before.
geocoder has been successfully imported.


In [12]:
geo_df=pd.read_csv('http://cocl.us/Geospatial_data')

In [13]:
geo_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [14]:
geospatial_data = geo_df
geospatial_data.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [15]:
geospatial_data.shape

(103, 3)

In [16]:
df_toronto_lat = pd.merge(df_toronto, geospatial_data, left_index=True, right_index=True, how = 'inner', on = ['Postal Code'])
df_toronto_lat

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1J,Scarborough,Scarborough Village,43.773136,-79.239476
5,M1K,Scarborough,"Ionview, Kennedy Park",43.744734,-79.239476
6,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.727929,-79.262029
7,M1M,Scarborough,"Cliffcrest, Cliffside",43.711112,-79.284577
8,M1N,Scarborough,Scarborough,43.716316,-79.239476
9,M1P,Scarborough,"Dorset Park, Scarborough Town Centre, Wexford ...",43.692657,-79.264848


In [17]:
df_toronto_lat = df_toronto_lat.dropna()

In [18]:
df_toronto_lat

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1J,Scarborough,Scarborough Village,43.773136,-79.239476
5,M1K,Scarborough,"Ionview, Kennedy Park",43.744734,-79.239476
6,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.727929,-79.262029
7,M1M,Scarborough,"Cliffcrest, Cliffside",43.711112,-79.284577
8,M1N,Scarborough,Scarborough,43.716316,-79.239476
9,M1P,Scarborough,"Dorset Park, Scarborough Town Centre, Wexford ...",43.692657,-79.264848


In [19]:
df_toronto_lat.head(11)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1J,Scarborough,Scarborough Village,43.773136,-79.239476
5,M1K,Scarborough,"Ionview, Kennedy Park",43.744734,-79.239476
6,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.727929,-79.262029
7,M1M,Scarborough,"Cliffcrest, Cliffside",43.711112,-79.284577
8,M1N,Scarborough,Scarborough,43.716316,-79.239476
9,M1P,Scarborough,"Dorset Park, Scarborough Town Centre, Wexford ...",43.692657,-79.264848
