# IBM Data Science Specialization - Final Project

This is the notebook for Mateus Ribeiro's capstone project for the IBM Data Science Specialization on Coursera.
   

In [5]:
import pandas as pd
import numpy as np

In [6]:
# Print a greeting message.
print("Hello Capstone Project Course!")

Hello Capstone Project Course!


### Scrape data
Scrape data using BeautifulSoup

In [68]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

# Download the Wikipedia page listing Canadian postal codes that start with "M".
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() turns an HTTP error response into an exception instead of
# letting an error page be parsed as if it were the article.
WIKI_URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
response = requests.get(WIKI_URL, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'lxml')

# Find the first element with class "wikitable"; fail with a clear message
# rather than an AttributeError on None if the page has no such element.
table = soup.find(class_='wikitable')
if table is None:
    raise ValueError("No element with class 'wikitable' was found on the page")

# Convert every table row into a list of its cell texts (header cells and data
# cells alike), with trailing whitespace removed.
rows = [
    [cell.text.rstrip() for cell in tr.find_all(['th', 'td'])]
    for tr in table.find_all('tr')
]

# The first row of the table is the header; every following row is data.
columns = rows[0] if rows else []
data = rows[1:]

# Convert the list of rows into a pandas DataFrame
toronto_df = pd.DataFrame(data=data, columns=columns)
toronto_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Cleaning Data

Dropping rows whose borough is 'Not assigned'.

In [69]:
# Keep only the rows whose borough has been assigned.  The explicit .copy()
# makes the result an independent DataFrame, so writing columns into it does
# not raise pandas' SettingWithCopyWarning.
toronto_df = toronto_df[toronto_df['Borough'] != 'Not assigned'].copy()
toronto_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


In [70]:
# Combine the neighbourhoods that share a postcode into one string, separated
# by ", ".  transform() keeps one value per original row, so every row of a
# postcode receives the same combined string.  assign() builds a new frame
# instead of writing a column into a possibly filtered slice, which avoids
# pandas' chained-assignment warning.
toronto_df = toronto_df.assign(
    Neighbourhood=toronto_df.groupby("Postcode")["Neighbourhood"].transform(
        lambda names: ', '.join(names)
    )
)
# Rows of the same postcode are now identical; keep only the first of each.
toronto_df = toronto_df.drop_duplicates()

In [71]:
# If `Neighbourhood` is 'Not assigned', use the row's `Borough` instead.
# mask() substitutes row by row (values aligned on the index) wherever the
# condition is True.  Series.replace() with a Series as the replacement value
# is not a row-aligned substitution, and inplace=True on a selected column is
# chained assignment that may never reach the DataFrame.
unassigned = toronto_df['Neighbourhood'] == 'Not assigned'
toronto_df = toronto_df.assign(
    Neighbourhood=toronto_df['Neighbourhood'].mask(unassigned, toronto_df['Borough'])
)
toronto_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,"Lawrence Heights, Lawrence Manor"
7,M7A,Downtown Toronto,Queen's Park


In [72]:
# Dataframe shape as (number of rows, number of columns)
toronto_df.shape


(103, 3)

In [93]:
# Build one "Postcode, Borough, Neighbourhood" string per row and store it in
# a new `address` column.
address_parts = toronto_df[['Postcode', 'Borough', 'Neighbourhood']]
toronto_df['address'] = address_parts.apply(', '.join, axis=1)
toronto_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,address
2,M3A,North York,Parkwoods,"M3A, North York, Parkwoods"
3,M4A,North York,Victoria Village,"M4A, North York, Victoria Village"
4,M5A,Downtown Toronto,Harbourfront,"M5A, Downtown Toronto, Harbourfront"
5,M6A,North York,"Lawrence Heights, Lawrence Manor","M6A, North York, Lawrence Heights, Lawrence Manor"
7,M7A,Downtown Toronto,Queen's Park,"M7A, Downtown Toronto, Queen's Park"


In [94]:
# Dataframe shape as (number of rows, number of columns)
toronto_df.shape


(103, 4)

## Question 2:

Importing the geospatial coordinates CSV from a URL

In [89]:
# Download the file quietly (-q) and save it locally as geo_coo.csv (-O)
!wget -q -O 'geo_coo.csv' http://cocl.us/Geospatial_data

In [90]:
# Read the local geo_coo.csv file into a DataFrame and preview the first rows
geo_coo_df = pd.read_csv('geo_coo.csv')
geo_coo_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [95]:
# Rename 'Postal Code' so both frames use the same key column name.
# rename() returns a new frame, so nothing is mutated in place.
geo_coo_df = geo_coo_df.rename(columns={'Postal Code': 'Postcode'})

# Merge both dataframes.  The join key is named explicitly: with on=None pandas
# joins on every column name the two frames happen to share, which would
# silently change the result if another common column were ever added.
# how='inner' keeps only postcodes present in both frames; sort=True orders
# the result by the join key.
df = pd.merge(toronto_df, geo_coo_df, how='inner', on='Postcode', sort=True)

df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,address,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern","M1B, Scarborough, Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union","M1C, Scarborough, Highland Creek, Rouge Hill, ...",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill","M1E, Scarborough, Guildwood, Morningside, West...",43.763573,-79.188711
3,M1G,Scarborough,Woburn,"M1G, Scarborough, Woburn",43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,"M1H, Scarborough, Cedarbrae",43.773136,-79.239476


In [116]:
# Each row of df is one postal code, and a single row can list several
# neighbourhoods in its `Neighbourhood` string, so the row count is reported
# as postal codes rather than neighbourhoods.
print('The dataframe has {} boroughs and {} postal codes.'.format(
        df['Borough'].nunique(),
        df.shape[0]
    )
)

The dataframe has 10 boroughs and 103 neighborhoods.


## Question 3:

import geopy

In [100]:
!pip install geopy



Getting the coordinates of a Toronto neighbourhood

In [103]:

from geopy.geocoders import Nominatim

# Nominatim needs an application-specific user agent: older geopy releases
# emit a deprecation warning without one and newer releases refuse to build
# the geocoder at all.
geolocator = Nominatim(user_agent="toronto_capstone_notebook")

location = geolocator.geocode("Toronto, North York, Parkwoods")

# geocode() returns None when the service finds no match, so guard before
# reading attributes of the result.
if location is None:
    print('No geocoding result found for the query.')
else:
    print(location.address)
    print('')
    print((location.latitude, location.longitude))
    print('')
    print(location.raw)

  app.launch_new_instance()


Parkwoods Village Drive, Parkway East, Don Valley East, North York, Toronto, Golden Horseshoe, Ontario, M3A 2X2, Canada

(43.7587999, -79.3201966)

{'place_id': 124974741, 'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. https://osm.org/copyright', 'osm_type': 'way', 'osm_id': 160406961, 'boundingbox': ['43.7576231', '43.761106', '-79.3239088', '-79.316215'], 'lat': '43.7587999', 'lon': '-79.3201966', 'display_name': 'Parkwoods Village Drive, Parkway East, Don Valley East, North York, Toronto, Golden Horseshoe, Ontario, M3A 2X2, Canada', 'class': 'highway', 'type': 'secondary', 'importance': 0.51}


In [121]:
!pip install folium


Collecting folium
[?25l  Downloading https://files.pythonhosted.org/packages/fd/a0/ccb3094026649cda4acd55bf2c3822bb8c277eb11446d13d384e5be35257/folium-0.10.1-py2.py3-none-any.whl (91kB)
[K     |████████████████████████████████| 92kB 6.0MB/s eta 0:00:011
Collecting branca>=0.3.0 (from folium)
  Downloading https://files.pythonhosted.org/packages/81/6d/31c83485189a2521a75b4130f1fee5364f772a0375f81afff619004e5237/branca-0.4.0-py3-none-any.whl
Installing collected packages: branca, folium
Successfully installed branca-0.4.0 folium-0.10.1


In [122]:
# Import pandas and folium, then print a confirmation message
import pandas as pd
import folium
print('imported pandas & folium')


imported pandas & folium


In [126]:
# Grab a random sample of at most 30 rows from df.  A fixed random_state makes
# the sample (and therefore the map) the same on every run, and capping n at
# len(df) avoids an error when df has fewer than 30 rows.
subset_of_df = df.sample(n=min(30, len(df)), random_state=42)

# Centre the map on the mean latitude/longitude of the sampled rows.
map_test = folium.Map(location=[subset_of_df['Latitude'].mean(),
                                subset_of_df['Longitude'].mean()],
                      zoom_start=10)

# Create a Marker for each sampled row; the popup shows the row's borough.
for row in subset_of_df.itertuples():
    map_test.add_child(folium.Marker(location=[row.Latitude, row.Longitude],
                                     popup=row.Borough))

# Save the map as map_test.html so it can be opened in a browser
map_test.save("map_test.html")


In [127]:
# Show the map object as the cell output
map_test

In [128]:
from folium.plugins import MarkerCluster

# Centre the map on the mean latitude/longitude of the sampled rows.
map_borough = folium.Map(location=[subset_of_df['Latitude'].mean(),
                                   subset_of_df['Longitude'].mean()],
                         zoom_start=10)
mc = MarkerCluster()

# Create a Marker for each sampled row and collect it in the cluster; the
# popup shows the row's borough.
for row in subset_of_df.itertuples():
    mc.add_child(folium.Marker(location=[row.Latitude, row.Longitude],
                               popup=row.Borough))

# Attach the cluster to the map once, after the loop, instead of re-adding the
# same cluster on every iteration.
map_borough.add_child(mc)

# Save the map as map_borough.html so it can be opened in a browser
map_borough.save("map_borough.html")

In [129]:
# Show the map object as the cell output
map_borough