# Capstone project.

## Section 1

Scraping from website, examining data, and pulling into dataframe:

In [1]:
import pandas as pd
import numpy as np

Get the table information from Wikipedia.

In [6]:
# Parse the HTML tables found on the Wikipedia page; the result is a list of DataFrames.
WIKI_POSTAL_CODES_URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
tables = pd.read_html(WIKI_POSTAL_CODES_URL)
tables

[    Postal Code  ...                                      Neighbourhood
 0           M1A  ...                                       Not assigned
 1           M2A  ...                                       Not assigned
 2           M3A  ...                                          Parkwoods
 3           M4A  ...                                   Victoria Village
 4           M5A  ...                          Regent Park, Harbourfront
 ..          ...  ...                                                ...
 175         M5Z  ...                                       Not assigned
 176         M6Z  ...                                       Not assigned
 177         M7Z  ...                                       Not assigned
 178         M8Z  ...  Mimico NW, The Queensway West, South of Bloor,...
 179         M9Z  ...                                       Not assigned
 
 [180 rows x 3 columns],
                                                   0   ...   17
 0                               

### Assigning postal code info to a dataframe

In [8]:
# The first parsed table is the postal-code listing; preview its first five rows.
df = tables[0]
df.head(5)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [9]:
# Keep only the rows whose Borough is something other than 'Not assigned'.
has_borough = df['Borough'].ne('Not assigned')
df_clean = df[has_borough]
df_clean.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


Ensuring that there is only one row per postal code:

In [10]:
# Every postal code appearing exactly once means one row per postal code.
postal_codes_unique = df_clean['Postal Code'].is_unique
message = (
    'There is no duplicate postal code value.'
    if postal_codes_unique
    else 'Duplicate values are present and further cleanup is required.'
)
print(message)

There is no duplicate postal code value.


In [11]:
# Select the rows whose Neighbourhood is still 'Not assigned' and count the
# non-null cells per column; all zeros means no such rows remain.
unassigned_neighbourhood = df_clean['Neighbourhood'].eq('Not assigned')
df_clean[unassigned_neighbourhood].count()

Postal Code      0
Borough          0
Neighbourhood    0
dtype: int64

In [12]:
# Dimensions of the cleaned frame as a (rows, columns) tuple.
df_clean.shape

(103, 3)

## Section 2

Load the coordinates from a CSV file into a new dataframe, as the Geocoder package is not reliable.

In [14]:
# Read the postal-code -> latitude/longitude lookup table from the hosted CSV.
GEOSPATIAL_CSV_URL = 'https://cocl.us/Geospatial_data'
coordinates = pd.read_csv(GEOSPATIAL_CSV_URL)
coordinates.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Joining both dataframes to get the final dataframe with all the desired information:

In [16]:
# Index both frames by 'Postal Code', join the coordinates onto the cleaned
# postal codes, then turn the key back into an ordinary column.
boroughs_by_code = df_clean.set_index('Postal Code')
coords_by_code = coordinates.set_index('Postal Code')
df_final = boroughs_by_code.join(coords_by_code).reset_index()
df_final.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


## Section 3

### Performing cluster analysis and plotting the results

In [17]:
import json # library to handle JSON files

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests

# Transform JSON into a pandas dataframe. `pandas.io.json.json_normalize` has been
# deprecated since pandas 1.0 in favour of the top-level `pandas.json_normalize`,
# so prefer the new location and only fall back to the old one on older pandas.
try:
    from pandas import json_normalize
except ImportError:  # pandas < 1.0 does not expose it at the top level
    from pandas.io.json import json_normalize

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library

print('Libraries imported.')

Libraries imported.


Selecting the data with boroughs that contain the word Toronto to restrict the data

In [19]:
# Restrict the data to boroughs whose name contains the word "Toronto".
is_toronto_borough = df_final['Borough'].str.contains("Toronto")
neighborhoods = df_final[is_toronto_borough]
neighborhoods.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031


In [20]:
# Report the number of distinct boroughs and the number of rows in the selection.
borough_count = len(neighborhoods['Borough'].unique())
row_count = len(neighborhoods)
summary = 'The dataframe has {} boroughs and {} neighborhoods.'.format(borough_count, row_count)
print(summary)

The dataframe has 4 boroughs and 39 neighborhoods.


Let's get the latitude and longitude of the city of Toronto.
 

In [21]:
# Look up the coordinates used to centre the map.
address = 'Toronto, TO'

geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(address)
# geocode() yields None when the service finds no match; fail with a clear
# message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Nominatim returned no result for address {!r}.'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto City are 43.65238435, -79.38356765.


Let's plot the different neighbourhoods on the map.

In [22]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=12)

# add markers to map
for lat, lng, borough, neighborhood in zip(neighborhoods['Latitude'], neighborhoods['Longitude'], neighborhoods['Borough'], neighborhoods['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

#### Define Foursquare Credentials and Version

In [23]:
import os


def mask_secret(value, visible=4):
    """Return a display-safe version of a credential.

    Parameters
    ----------
    value : str
        The credential to mask. An empty value yields ``'<not set>'``.
    visible : int, default 4
        Number of leading characters left readable. Values that are not
        longer than ``visible`` are masked completely.

    Returns
    -------
    str
        The leading ``visible`` characters followed by ``'*'`` padding up to
        the original length.
    """
    if not value:
        return '<not set>'
    shown = value[:visible] if len(value) > visible else ''
    return shown + '*' * (len(value) - len(shown))


# SECURITY: credentials must never be committed inside the notebook. They are read
# from environment variables instead; set these before starting the kernel.
# NOTE(review): the keys previously hardcoded in this cell were exposed and should be rotated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
ACCESS_TOKEN = os.environ.get('FOURSQUARE_ACCESS_TOKEN', '') # your FourSquare Access Token
VERSION = '20180605'
LIMIT = 100 # A default Foursquare API limit value

# Tell the reader exactly which variables still need to be provided.
missing_vars = [
    env_name
    for env_name, env_value in (
        ('FOURSQUARE_CLIENT_ID', CLIENT_ID),
        ('FOURSQUARE_CLIENT_SECRET', CLIENT_SECRET),
        ('FOURSQUARE_ACCESS_TOKEN', ACCESS_TOKEN),
    )
    if not env_value
]
if missing_vars:
    print('WARNING: missing environment variable(s): ' + ', '.join(missing_vars))

# Only masked values are printed so the saved notebook output cannot leak secrets.
print('Your credentails:')
print('CLIENT_ID: ' + mask_secret(CLIENT_ID))
print('CLIENT_SECRET:' + mask_secret(CLIENT_SECRET, visible=0))

Your credentails:
CLIENT_ID: <redacted>
CLIENT_SECRET:<redacted>
