# Machine Learning Capstone - Clustering

## Question 1 - Getting the data from Wikipedia

### Installing libraries

In [65]:
!pip install BeautifulSoup4
!pip install requests



### Importing important libraries

In [69]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

### Fetching data from wikipedia

In [72]:
# Wikipedia page listing the Canadian postal codes that start with "M".
path = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Bound the wait time and fail loudly on an HTTP error, so an error page is
# never handed to the parser as if it were the real article.
response = requests.get(path, timeout=30)
response.raise_for_status()
rawpage = response.text

# Parse the downloaded markup with BeautifulSoup so the table can be searched.
# NOTE(review): the page is HTML but the 'xml' parser is requested; kept as-is
# because the following cell was run against this parser - confirm whether
# 'html.parser' would be the better choice.
soup = BeautifulSoup(rawpage, 'xml')


### Extracting raw table from wikipedia

In [78]:
# Locate the postal-code table and stop with a clear message if the page
# layout changed and the table can no longer be found.
table = soup.find(class_='wikitable sortable')
if table is None:
    raise ValueError("No element with class 'wikitable sortable' found in the downloaded page")

# One list of right-stripped cell texts per table row; header cells (<th>)
# and data cells (<td>) are collected alike.
rows = [
    [cell.text.rstrip() for cell in tr.find_all(['th', 'td'])]
    for tr in table.find_all('tr')
]

# The first row of the table is the header; every following row is data.
columns = rows[0] if rows else []
data = rows[1:]

# Convert the collected rows into a pandas DataFrame and preview it.
canada_df = pd.DataFrame(data=data, columns=columns)
canada_df.head()



Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


### Clean the data
#### Remove boroughs that are not assigned

In [79]:
# Keep only the rows whose borough has been assigned. The explicit copy turns
# the filtered result into an independent frame, so the column assignments in
# later cells do not raise pandas' SettingWithCopyWarning.
canada_df = canada_df[canada_df['Borough'] != 'Not assigned'].copy()
canada_df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


### More than one neighborhood can exist in one postal code area.
#### Such rows are combined into one row, with the neighborhoods separated by a comma.

In [None]:
# Give every row the comma-separated list of all neighbourhoods that share its
# postal code, then collapse the now-identical rows into one.
# The column scraped from the page is named "Postal Code" (not "Postcode"), so
# that is the key to group on. It is kept as a regular column rather than
# moved into the index, because the later merge joins on the "Postal Code"
# column.
canada_df = canada_df.assign(
    Neighbourhood=canada_df.groupby("Postal Code")["Neighbourhood"].transform(
        lambda names: ', '.join(names)
    )
).drop_duplicates()

canada_df.head()

### If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.

In [76]:
# Where the neighbourhood is "Not assigned", fall back to the borough of the
# same row. Series.mask swaps values row by row wherever the condition holds
# and returns a new Series, so no in-place edit of a column selection is
# needed and re-running the cell gives the same result.
canada_df = canada_df.assign(
    Neighbourhood=canada_df['Neighbourhood'].mask(
        canada_df['Neighbourhood'] == 'Not assigned',
        canada_df['Borough'],
    )
)
canada_df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [97]:
# Save the cleaned table using a path relative to the working directory, so
# the notebook also runs on machines that do not have the original author's
# folder layout.
canada_df.to_csv('canada_df.csv')

# Last expression of the cell, so the preview is actually displayed.
canada_df.head()

### Print dataframe shape

In [77]:
# (number of rows, number of columns) of the cleaned table
canada_df.shape

(103, 3)

## Question 2 Getting Geocode

In [86]:
pip install geopy

Collecting geopy
  Downloading geopy-2.0.0-py3-none-any.whl (111 kB)
Collecting geographiclib<2,>=1.49
  Downloading geographiclib-1.50-py3-none-any.whl (38 kB)
Installing collected packages: geographiclib, geopy
Successfully installed geographiclib-1.50 geopy-2.0.0
Note: you may need to restart the kernel to use updated packages.


In [98]:
# Load the latitude/longitude of every postal code from the CSV that sits next
# to the notebook.
df_geo = pd.read_csv("./Geospatial_Coordinates.csv")

# Keep a copy of the loaded coordinates; written with a relative path so the
# cell does not depend on one particular user's directory.
df_geo.to_csv('df_geo.csv')
df_geo.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [100]:
# Attach the coordinates to each postal code by joining both frames on their
# shared 'Postal Code' column (inner join, which is the merge default).
df_combined = canada_df.merge(df_geo, how='inner', on='Postal Code')

### Merge the two dataframes canada_df and df_geo

In [101]:
# Preview the first rows of the merged frame
df_combined.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


## Question 3

### Fetch latitude and longitude for Toronto

In [104]:
# Nominatim is used below but was never imported anywhere in the notebook, so
# the cell failed on a fresh kernel; import it right where it is needed.
from geopy.geocoders import Nominatim

address = "Toronto, ON"

# Look the address up through the Nominatim geocoder.
geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)

# geocode() yields None when nothing matches; stop with a clear message
# instead of failing on the attribute access below.
if location is None:
    raise ValueError(f"Could not geocode address {address!r}")

latitude = location.latitude
longitude = location.longitude
print(latitude, longitude)

43.6534817 -79.3839347


### Map for Toronto using fetched Longitude and Latitude

In [112]:
import folium

# Centre the base map on the coordinates obtained for Toronto.
toronto_center = [latitude, longitude]
toronto_map = folium.Map(location=toronto_center, zoom_start=12)
toronto_map

### Adding markers

In [119]:
# Draw one circle marker per row of df_combined; clicking a marker shows
# "<neighbourhood>, <borough>" in a popup.
marker_columns = df_combined[['Latitude', 'Longitude', 'Borough', 'Neighbourhood']]
for lat, lng, borough, neighborhood in marker_columns.itertuples(index=False, name=None):
    popup_text = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(popup_text, parse_html=True)
    # NOTE(review): parse_html is passed to CircleMarker as well as to Popup;
    # kept unchanged - confirm whether CircleMarker makes any use of it.
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,  # size of the circle marker
        popup=label,
        color='yellow',
        fill=True,
        fill_color='blue',
        fill_opacity=0.6,
        parse_html=False,
    )
    marker.add_to(toronto_map)
toronto_map


###  Working with only boroughs that contain the word Toronto, drop the rest!

In [124]:
# Boolean mask marking the rows whose borough name contains "Toronto".
in_toronto = df_combined['Borough'].str.contains("Toronto")

# Keep only those rows and renumber them from 0.
toronto_df = df_combined.loc[in_toronto].reset_index(drop=True)
toronto_df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031


### Explore neighbourhoods in Toronto

In [126]:
import os

# Foursquare credentials are read from the environment instead of being
# written into the notebook, so they are not leaked when the notebook is
# shared. Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before running
# this cell; a missing variable raises KeyError.
CLIENT_ID = os.environ['FOURSQUARE_CLIENT_ID']
CLIENT_SECRET = os.environ['FOURSQUARE_CLIENT_SECRET']
VERSION = '20200816'

# explore first neighbourhood
neighborhood_name = toronto_df.loc[0, 'Neighbourhood']
print(f"The first Neighbourhood is '{neighborhood_name}'.")

The first Neighbourhood is 'Regent Park, Harbourfront'.


In [128]:
# Look up the neighbourhood stored in the row labelled 1 of toronto_df and
# report it.
neighborhood_name = toronto_df['Neighbourhood'].loc[1]
message = f"The second Neighbourhood is '{neighborhood_name}'."
print(message)

The second Neighbourhood is 'Queen's Park, Ontario Provincial Government'.


### Exploring up to 50 venues around the first neighbourhood

In [136]:
# Coordinates of the first neighbourhood in toronto_df.
neighborhood_latitude = toronto_df.loc[0, 'Latitude']
neighborhood_longitude = toronto_df.loc[0, 'Longitude']

LIMIT = 50  # maximum number of venues requested from the Foursquare API
radius = 300  # search radius sent to the API (presumably metres - confirm against the Foursquare docs)
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID,
    CLIENT_SECRET,
    VERSION,
    neighborhood_latitude,
    neighborhood_longitude,
    radius,
    LIMIT)

# Send the request with a bounded wait and fail loudly on an HTTP error, so an
# error response is not mistaken for venue data.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Decode the JSON body of the response into Python objects.
results = response.json()