# TORONTO

## Import Data

In [1]:
import numpy as np

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes
import folium # map rendering library

In [2]:
# Parse every HTML table on the Wikipedia page, using each table's first row as its header.
webdata = pd.read_html(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    header=0,
)
# Work with the first table on the page and use the American spelling
# 'Neighborhood' for the column name in the rest of the notebook.
data_t = pd.DataFrame(webdata[0]).rename(columns={'Neighbourhood': 'Neighborhood'})
data_t.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Replace the Neighborhood name with the Borough name when a row has a borough but a 'Not assigned' neighborhood

In [3]:
# Rows whose Neighborhood is the placeholder 'Not assigned' inherit their Borough value.
# A single boolean-mask .loc assignment is used instead of a row-by-row loop because:
#   * the loop iterated over range(0, len - 1) and therefore never visited the last row;
#   * data_t['Neighborhood'].iloc[i] = ... is chained assignment, which raises
#     SettingWithCopyWarning and does not write through under pandas copy-on-write.
# Rows where Borough is itself 'Not assigned' stay 'Not assigned'.
unassigned_mask = data_t['Neighborhood'] == 'Not assigned'
data_t.loc[unassigned_mask, 'Neighborhood'] = data_t.loc[unassigned_mask, 'Borough']

### Remove the remaining rows where both Borough and Neighborhood are 'Not assigned'

In [4]:
# Keep only the rows whose Neighborhood is something other than the 'Not assigned' placeholder.
data_t = data_t.loc[data_t['Neighborhood'].ne('Not assigned')]

In [5]:
# Preview the first rows of the filtered table (DataFrame.head() shows 5 rows by default).
data_t.head()

Unnamed: 0,Postcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


### Merge Neighbourhood information from rows with same Postcode

In [6]:
# Collapse all rows that share a Postcode into a single row whose Neighborhood is the
# comma-separated list of every neighborhood recorded for that postcode.
#
# The previous pairwise loop only compared each row with the next one, so a postcode
# spanning three or more rows kept just its first two neighborhoods (later names were
# appended to rows that were subsequently dropped). It also wrote through chained
# indexing on a filtered frame, which is unreliable (SettingWithCopyWarning).
#
# transform() broadcasts the joined string back onto every row of its group, in the
# original row order; drop_duplicates(keep='first') then retains the first row of each
# postcode, preserving that row's index label and Borough value.
# Assumes every Neighborhood value is a string (str.join fails on NaN) - verify if the
# source table changes.
merged_names = data_t.groupby('Postcode', sort=False)['Neighborhood'].transform(', '.join)
data_t = (
    data_t.assign(Neighborhood=merged_names)
          .drop_duplicates(subset='Postcode', keep='first')
)

### SHAPE

In [7]:
# Report the table dimensions as (number of rows, number of columns).
data_t.shape

(103, 3)

### READ LAT/LONG

In [8]:
# Load the geospatial CSV and rename its 'Postal Code' column to 'Postcode'
# so that it shares a key name with data_t.
lat_long = pd.read_csv('http://cocl.us/Geospatial_data').rename(
    columns={'Postal Code': 'Postcode'}
)

In [9]:
# Attach the coordinate columns to each postcode row (default inner join on 'Postcode').
final = pd.merge(data_t, lat_long, on='Postcode')

In [10]:
# Preview the first rows of the merged table (DataFrame.head() shows 5 rows by default).
final.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
