In [1]:
# Before using this notebook make sure you have installed the following libraries.
# !pip3 install bs4
# !pip3 install geopy
# !pip3 install geocoder
# !pip3 install pandas
# !pip3 install numpy
# !pip3 install folium
# !pip3 install scikit-learn
import pandas as pd
import numpy as np

In [2]:
# Smoke test: confirm that the kernel is executing cells.
print("Hello Capstone Project Course!")

Hello Capstone Project Course!


In this notebook we are going to scrape a Wikipedia page to get data about Toronto neighbourhoods.
Uncomment the `!pip3 install bs4` line if you do not yet have bs4 installed in your environment.
**This notebook is using python3**

In [3]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

In [4]:
# Download the Wikipedia page that lists the Canadian postal codes starting
# with "M" and parse it with BeautifulSoup.
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"  # wiki link

# Use the response as a context manager so the HTTP connection is closed as
# soon as the document has been parsed.
with urlopen(url) as html:
    soup = BeautifulSoup(html, 'html.parser')

storage = []  # one sub-list of cell texts per table row, filled by the table-scraping loop


Now we are going to loop over each table present in the HTML document parsed into a BeautifulSoup instance. For each table, we go row by row and store the data of every cell of the row in a sublist, which is then appended to `storage`.

In [5]:
# Collect the text of every <td> cell, one sub-list per <tr> row, across all
# tables found in the parsed page. Rows without any <td> (e.g. header rows)
# yield empty sub-lists.
# Rebuilding `storage` from scratch keeps this cell idempotent when re-run,
# and the comprehension avoids binding a variable named `list`, which would
# shadow the builtin for the rest of the notebook session.
storage = [
    [cell.get_text() for cell in row.find_all('td')]
    for table in soup.find_all('table')
    for row in table.find_all('tr')
]

In [6]:
# Turn the scraped rows into a DataFrame, skipping the first row and keeping
# only the first three columns.
df = pd.DataFrame(storage).iloc[1:, [0, 1, 2]]
df.head()

Unnamed: 0,0,1,2
1,M1A\n,Not assigned\n,Not assigned\n
2,M2A\n,Not assigned\n,Not assigned\n
3,M3A\n,North York\n,Parkwoods\n
4,M4A\n,North York\n,Victoria Village\n
5,M5A\n,Downtown Toronto\n,"Regent Park, Harbourfront\n"


In [7]:
# Tidy the scraped table:
#   1. strip the newline characters left in the cell texts,
#   2. keep only the first 180 rows and the first three columns,
#   3. discard rows whose borough (column 1) is 'Not assigned',
#   4. renumber the index from 0 and give the columns meaningful names.
df = df.replace({'\n': ''}, regex=True)
# NOTE(review): 180 is a magic number, presumably the end of the postal-code
# table within the scraped rows; confirm against the page.
df = df.iloc[:180, [0, 1, 2]]
has_borough = df[1] != 'Not assigned'
df = df[has_borough].reset_index(drop=True)
df.columns = ['PostalCode', 'Borough', 'Neighbourhood']
df


Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [8]:
# Collapse rows that share a postal code and borough into a single row whose
# Neighbourhood is the comma-separated list of the original values.
neighbourhoods_by_code = df.groupby(['PostalCode', 'Borough'])['Neighbourhood']
df = neighbourhoods_by_code.apply(', '.join).reset_index()

In [9]:
# Show the DataFrame's dimensions as (rows, columns).
df.shape

(103, 3)

Now let's work with Foursquare. We will first write a function that looks up the coordinates for a given Toronto postal code. Then we will add those coordinates to the dataframe.

In [10]:
import time

import geocoder # import geocoder


def get_lat_long(postal_code, max_attempts=10, retry_delay=1.0):
    """Look up the coordinates of a Toronto postal code with geocoder.

    The query sent to ``geocoder.google`` is
    ``'<postal_code>, Toronto, Ontario'``. The lookup is retried while the
    service returns no coordinates, but only a bounded number of times, so a
    persistently failing service (missing API key, quota, no network) cannot
    hang the notebook.

    Parameters
    ----------
    postal_code : str
        A single postal code (not a whole Series/column).
    max_attempts : int or None, optional
        Maximum number of lookups before giving up. ``None`` retries
        indefinitely.
    retry_delay : float, optional
        Seconds to wait between two attempts, to avoid hammering the service.

    Returns
    -------
    The ``latlng`` value reported by the geocoder result
    (presumably ``[latitude, longitude]``; verify against the geocoder docs).

    Raises
    ------
    RuntimeError
        If no coordinates were obtained after ``max_attempts`` lookups.
    """
    query = '{}, Toronto, Ontario'.format(postal_code)

    attempt = 0
    while max_attempts is None or attempt < max_attempts:
        attempt += 1
        lat_lng_coords = geocoder.google(query).latlng
        if lat_lng_coords is not None:
            return lat_lng_coords
        # Pause only if another attempt is going to follow.
        if max_attempts is None or attempt < max_attempts:
            time.sleep(retry_delay)

    raise RuntimeError(
        'No coordinates returned for {!r} after {} attempt(s)'.format(query, attempt)
    )


In [14]:
# Add latitude and longitude columns to the dataframe.
# Geocoding every postal code with get_lat_long() can take a long time, so the
# CSV file provided by the labs (latitude and longitude for each postal code)
# is used instead. If you would rather use geocoder, uncomment the code below;
# it looks up each postal code individually and stores the results under the
# same column names that the later cells read.

#coords = df['PostalCode'].apply(get_lat_long)
#df['Latitude'] = coords.str[0]
#df['Longitude'] = coords.str[1]

lat_long = pd.read_csv('https://cocl.us/Geospatial_data')
# Rename the key column so that it matches df's 'PostalCode' column.
lat_long = lat_long.rename(columns={'Postal Code': 'PostalCode'})
lat_long.head()


Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [19]:
# Attach the coordinates to each postal code. No key is given, so pandas
# joins on the columns the two frames have in common.
df = df.merge(lat_long, how='inner')
df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [22]:
#Importing library for clustering and map displaying
from sklearn.cluster import KMeans
from geopy.geocoders import Nominatim
import folium

In [25]:
# Get the geographic coordinates of Toronto, used to centre the map.
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="http")
location = geolocator.geocode(address)
# geocode() returns None when nothing matches; fail with a clear message
# rather than an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto City are 43.6534817, -79.3839347.


In [27]:
# Draw a folium map centred on the Toronto coordinates, with one circle
# marker per row of df whose popup reads "<neighbourhoods>, <borough>".
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

marker_rows = zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighbourhood'])
for point_lat, point_lng, borough_name, neighbourhood_names in marker_rows:
    popup_text = f'{neighbourhood_names}, {borough_name}'
    marker = folium.CircleMarker(
        [point_lat, point_lng],
        radius=5,
        popup=folium.Popup(popup_text, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        # NOTE(review): parse_html looks like a Popup option; confirm that
        # CircleMarker actually uses it before relying on it here.
        parse_html=False,
    )
    marker.add_to(map_toronto)

map_toronto