# Capstone project

### This notebook will be used to complete the Coursera capstone project

In [1]:
import pandas as pd
import numpy as np
import requests # library to handle requests
from bs4 import BeautifulSoup

## Assignment in week 3

### Part 1: dowload and read postal codes in Toronto

Dowload data using request and BeautifulSoup

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
page = requests.get(url)
soup = BeautifulSoup(page.text, 'html.parser')

Manipulate data to easily read them in a dataframe

In [3]:
list_of_lists = []
for child in soup.find_all('table')[0].children:
    for td in child:
        if td=='\n': continue
        list_of_lists.append(td.text.strip().split('\n\n'))

In [4]:
df = pd.DataFrame({list_of_lists[0][0]:[list_of_lists[i+1][0] for i in range(len(list_of_lists)-1)], 
                  list_of_lists[0][1]:[list_of_lists[i+1][1] for i in range(len(list_of_lists)-1)],
                  list_of_lists[0][2]:[list_of_lists[i+1][2] for i in range(len(list_of_lists)-1)]})
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


Remove Not assigned Boroughs and print the shape of the dataset

In [5]:
print(len(df))
df = df[df.Borough!='Not assigned']
print(len(df))
df.reset_index(inplace=True)
df.head(11)

180
103


Unnamed: 0,index,Postal Code,Borough,Neighbourhood
0,2,M3A,North York,Parkwoods
1,3,M4A,North York,Victoria Village
2,4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,5,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,9,M1B,Scarborough,"Malvern, Rouge"
7,11,M3B,North York,Don Mills
8,12,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,13,M5B,Downtown Toronto,"Garden District, Ryerson"


In [6]:
df.shape

(103, 4)

### Part 2: dowload latitude and longitude

#### Tried with geocoder but it does not work

In [24]:
import geocoder # import geocoder

In [None]:
postal_code = df['Postal Code'].to_list()
latitude, longitude = [], []

for i in range(len(postal_code)-1):
    # initialize your variable to None
    lat_lng_coords = None

    # loop until you get the coordinates
    while(lat_lng_coords is None):
        g = geocoder.google('{}, Toronto, Ontario'.format(postal_code[i+1]))
        lat_lng_coords = g.latlng

    latitude = lat_lng_coords[0]
    longitude = lat_lng_coords[1]

#### Read data from .csv

In [7]:
df2 = pd.read_csv('Geospatial_Coordinates.csv')
postal_code = df2['Postal Code'].to_list()
latitude = df2['Latitude'].to_list()
longitude = df2['Longitude'].to_list()
df_dict = {postal_code[i]: (latitude[i], longitude[i]) for i in range(len(postal_code))}

In [13]:
postal_code = df['Postal Code'].to_list()
latitude, longitude = [], []
for i in range(len(postal_code)):
    latitude.append(df_dict[postal_code[i]][0])
    longitude.append(df_dict[postal_code[i]][1])

In [14]:
df['Latitude'] = latitude
df['Longitude'] = longitude
df.head()

Unnamed: 0,index,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,2,M3A,North York,Parkwoods,43.753259,-79.329656
1,3,M4A,North York,Victoria Village,43.725882,-79.315572
2,4,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,5,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


## Part 3: Neighborhoods in Toronto

In [11]:
import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Collecting package metadata (current_repodata.json): done
Solving environment: / ^C
failed with initial frozen solve. Retrying with flexible solve.

CondaError: KeyboardInterrupt

Collecting package metadata (current_repodata.json): / ^C
\ Libraries imported.


In [15]:
df[df.Borough=='Central Toronto'].Neighbourhood.unique()

array(['Lawrence Park', 'Roselawn', 'Davisville North',
       'Forest Hill North & West, Forest Hill Road Park',
       'North Toronto West,  Lawrence Park',
       'The Annex, North Midtown, Yorkville', 'Davisville',
       'Moore Park, Summerhill East',
       'Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park'],
      dtype=object)

In [16]:
latitude = np.mean(df['Latitude'])
longitude = np.mean(df['Longitude'])
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.70460773398059, -79.39715291165047.


In [19]:
# create map of Toronto using latitude and longitude values
map_Toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_Toronto)  
    
map_Toronto