<h1>Capstone project worksheet</h1>

This notebook is used mainly for the capstone project of the "Applied Data Science" course on Coursera.

<h2>Week 1</h2>

In [6]:
import pandas as pd
import numpy as np

In [3]:
# Smoke test: confirm that the kernel runs code.
print("Hello Capstone Project Course!")

Hello Capstone Project Course!


## Week 3

### Part 1

In [1]:
import requests
from bs4 import BeautifulSoup

In [2]:
# Download the Wikipedia page listing the "M" postal codes and parse its HTML.
URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
response = requests.get(URL, timeout=30)
response.raise_for_status()
res = response.text
soup = BeautifulSoup(res, 'lxml')

In [3]:
# Locate the first <table> element carrying the "wikitable" CSS class.
wikitable = soup.find("table", class_="wikitable")
# soup.find returns None when nothing matches; fail here with a clear message
# rather than with an AttributeError once the rows are read.
if wikitable is None:
    raise ValueError("No table with class 'wikitable' found in the downloaded page")

In [4]:
# Read the table: text of <th> cells becomes the column names, text of <td>
# cells becomes the data rows (trailing whitespace stripped in both cases).
rows = wikitable.find_all('tr')
header = [th.text.rstrip() for row in rows for th in row.find_all("th")]
cells_per_row = ([td.text.rstrip() for td in row.find_all("td")] for row in rows)
# Rows that contain no <td> cell yield an empty list and are left out.
data = [cells for cells in cells_per_row if len(cells) > 0]


In [7]:
# Assemble the scraped rows and column names into the raw Toronto table.
toronto = pd.DataFrame(data=data, columns=header)

In [8]:
# Preview the first five rows of the scraped table.
toronto.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [9]:
# Keep only the rows whose Borough is assigned; .copy() gives tr_df its own
# data so later column assignments do not touch `toronto`.
has_borough = toronto["Borough"] != "Not assigned"
tr_df = toronto.loc[has_borough].copy()

In [10]:
# A row with a borough but a 'Not assigned' neighbourhood takes the borough
# name as its neighbourhood; all other rows keep their value.
unnamed = tr_df["Neighbourhood"] == 'Not assigned'
tr_df["Neighbourhood"] = tr_df["Neighbourhood"].mask(unnamed, tr_df["Borough"])

In [11]:
# Replace each row's neighbourhood with the comma-separated list of all
# neighbourhoods that share its postcode.
by_postcode = tr_df.groupby("Postcode")["Neighbourhood"]
tr_df["Neighbourhood"] = by_postcode.transform(lambda names: ','.join(names))

In [12]:
# Drop the duplicated rows and renumber the index from 0.
# reset_index(drop=True) discards the old index directly, replacing the
# set_index("Postcode").reset_index() round trip. Postcode is already the
# first column, so the resulting frame is the same.
tr_df = tr_df.drop_duplicates().reset_index(drop=True)

In [13]:
# Preview the cleaned table: first five rows.
tr_df.head(n=5)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Downtown Toronto,Queen's Park


In [14]:
# Show the (rows, columns) size of the cleaned table.
tr_df.shape

(103, 3)

### Part 2

In [15]:
#alternative solution with pgeocode
import pgeocode

In [37]:
# Create the pgeocode postal-code lookup for Canada.
GeoCodes = pgeocode.Nominatim(country='ca')

In [17]:
# Query pgeocode for every postcode in the cleaned table and keep only the
# code and its latitude/longitude.
postcodes = tr_df["Postcode"].values
geo_lookup = GeoCodes.query_postal_code(postcodes)
coordinates = geo_lookup[["postal_code", "latitude", "longitude"]]

In [38]:
# Example lookup of a single postcode; the result includes latitude and longitude.
GeoCodes.query_postal_code("M7A")

postal_code                                              M7A
country code                                              CA
place_name        Queen's Park Ontario Provincial Government
state_name                                           Ontario
state_code                                                ON
county_name                                              NaN
county_code                                              NaN
community_name                                           NaN
community_code                                           NaN
latitude                                             43.6641
longitude                                           -79.3889
accuracy                                                 NaN
Name: 0, dtype: object

In [18]:
# pgeocode has no data for M7R: every field of the lookup below comes back NaN.
# The missing coordinates are therefore entered by hand further down; asking a
# geocoder for them would be the better solution.
GeoCodes.query_postal_code("M7R")

postal_code       M7R
country code      NaN
place_name        NaN
state_name        NaN
state_code        NaN
county_name       NaN
county_code       NaN
community_name    NaN
community_code    NaN
latitude          NaN
longitude         NaN
accuracy          NaN
Name: 0, dtype: object

In [19]:
# Left-join the coordinates onto the cleaned table by postcode, then drop the
# redundant key column that came from the coordinates frame.
merged = tr_df.merge(coordinates, how="left", left_on="Postcode", right_on="postal_code")
tr_coord = merged.drop(columns=["postal_code"])

In [22]:
# Preview the merged table: first five rows.
tr_coord.head(n=5)

Unnamed: 0,Postcode,Borough,Neighbourhood,latitude,longitude
0,M3A,North York,Parkwoods,43.7545,-79.33
1,M4A,North York,Victoria Village,43.7276,-79.3148
2,M5A,Downtown Toronto,Harbourfront,43.6555,-79.3626
3,M6A,North York,"Lawrence Heights,Lawrence Manor",43.7223,-79.4504
4,M7A,Downtown Toronto,Queen's Park,43.6641,-79.3889


In [23]:
# Show the M7R row, whose coordinates pgeocode could not supply.
tr_coord.loc[tr_coord["Postcode"] == "M7R"]

Unnamed: 0,Postcode,Borough,Neighbourhood,latitude,longitude
76,M7R,Mississauga,Canada Post Gateway Processing Centre,,


In [24]:
# Fill in the coordinates of M7R by hand, since pgeocode returned none for it.
# The row is selected by postcode rather than by the hard-coded index label 76,
# which is only correct for one particular state of the scraped table.
is_m7r = tr_coord["Postcode"] == "M7R"
tr_coord.loc[is_m7r, ["latitude", "longitude"]] = [43.6370, -79.6158]

In [25]:
# Show the M7R row again to confirm the coordinates are now set.
tr_coord.loc[tr_coord["Postcode"] == "M7R"]

Unnamed: 0,Postcode,Borough,Neighbourhood,latitude,longitude
76,M7R,Mississauga,Canada Post Gateway Processing Centre,43.637,-79.6158


In [26]:
# Number of distinct boroughs (dropna=False counts exactly what unique() returns).
tr_coord["Borough"].nunique(dropna=False)

11

### Part 3

In [27]:
# imports for maps and plotting
import folium
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib import colors
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

In [28]:
# Turn matplotlib's CSS4 colour dictionary into a one-column frame:
# index = colour name, column "code" = hex colour code.
CSScolors = pd.Series(colors.CSS4_COLORS).to_frame(name="code")

In [29]:
# Shuffle the colour table so that its first rows form an arbitrary palette.
# A fixed seed makes the shuffle, and therefore the borough colours,
# identical on every fresh run of the notebook.
rng = np.random.RandomState(42)
CSScolors = CSScolors.reindex(rng.permutation(CSScolors.index))

In [30]:
# Report how many distinct boroughs and how many postcode rows the table holds.
n_boroughs = len(tr_coord['Borough'].unique())
n_postcodes = tr_coord.shape[0]
print('The dataframe has {} boroughs and {} postcodes.'.format(n_boroughs, n_postcodes))

The dataframe has 11 boroughs and 103 postcodes.


In [31]:
# Assign each borough one colour code: the first len(boroughs) codes of the
# colour table, paired with the boroughs in order of first appearance.
boroughs = tr_coord['Borough'].unique()
palette = CSScolors.iloc[0:len(boroughs), 0].values
colorMap = dict(zip(boroughs, palette))

In [32]:
# Inspect the borough -> colour-code mapping.
colorMap

{'Central Toronto': '#F8F8FF',
 'Downtown Toronto': '#ADD8E6',
 'East Toronto': '#FFE4E1',
 'East York': '#4169E1',
 'Etobicoke': '#DAA520',
 'Mississauga': '#CD5C5C',
 'North York': '#008080',
 "Queen's Park": '#BC8F8F',
 'Scarborough': '#FFE4C4',
 'West Toronto': '#8B008B',
 'York': '#00008B'}

In [33]:
# Look up the coordinates of Toronto; they are used to centre the maps.
address = 'Toronto, CA'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
# geocode() returns None when the service finds no match; stop with a clear
# error instead of failing on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [34]:
# Create a map centred on Toronto with one circle marker per postcode.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# A marker cannot be placed without coordinates, and pgeocode returns NaN for
# postcodes it does not know, so rows lacking a position are left out.
located = tr_coord.dropna(subset=['latitude', 'longitude'])

# Add the markers; the popup shows "<borough>, <postcode>".
for lat, lng, borough, postcode in zip(located['latitude'], located['longitude'],
                                       located['Borough'], located['Postcode']):
    label = '{}, {}'.format(borough, postcode)
    popup = folium.Popup(label, parse_html=True)
    # parse_html belongs to Popup only; it is not a CircleMarker option and is
    # therefore no longer passed to the marker.
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color=colorMap[borough],  # outline colour identifies the borough
        fill=True,
        fill_color="black",
        fill_opacity=0.7,
    ).add_to(map_toronto)

map_toronto

In [35]:
# Average the postcode coordinates per borough to get one point per borough.
# Only the two coordinate columns are averaged: the text columns (Postcode,
# Neighbourhood) have no mean, and pandas >= 2.0 raises a TypeError when
# GroupBy.mean() is applied to them.
tr_borough = tr_coord.groupby("Borough")[["latitude", "longitude"]].mean().reset_index()
tr_borough

Unnamed: 0,Borough,latitude,longitude
0,Central Toronto,43.701933,-79.399644
1,Downtown Toronto,43.654426,-79.384774
2,East Toronto,43.6929,-79.3109
3,East York,43.70062,-79.33372
4,Etobicoke,43.659636,-79.542136
5,Mississauga,43.637,-79.6158
6,North York,43.750912,-79.429067
7,Queen's Park,43.6662,-79.5282
8,Scarborough,43.768,-79.247941
9,West Toronto,43.651733,-79.4473


In [36]:
# Create a map centred on Toronto with one circle marker per borough, placed at
# the borough's mean latitude/longitude.
map_torontoBorough = folium.Map(location=[latitude, longitude], zoom_start=10)

# A marker cannot be placed without coordinates, so boroughs whose mean
# position is NaN are left out.
located_boroughs = tr_borough.dropna(subset=['latitude', 'longitude'])

# Add the markers; the popup shows the borough name.
for lat, lng, borough in zip(located_boroughs['latitude'], located_boroughs['longitude'],
                             located_boroughs['Borough']):
    popup = folium.Popup(borough, parse_html=True)
    # parse_html belongs to Popup only; it is not a CircleMarker option and is
    # therefore no longer passed to the marker.
    folium.CircleMarker(
        [lat, lng],
        radius=4,
        popup=popup,
        color=colorMap[borough],  # outline colour identifies the borough
        fill=True,
        fill_color='black',
        fill_opacity=0.7,
    ).add_to(map_torontoBorough)

map_torontoBorough

In [315]:
# The colour coding could certainly be improved. The colours are drawn from the full list of CSS colours because the number of boroughs is not fixed, so a sufficiently long list of colours is needed.