# Coursera Assignment- Segmenting and Clustering Neighborhoods in Toronto

# Part 1

## Let's start with importing important libraries

In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

## We scrape the data from the Wikipedia page and store it in a dataframe.

In [2]:
# Parse the HTML tables found on the Wikipedia page; header=0 takes the first row as
# column names and cells reading 'Not assigned' are loaded as NaN.
data = pd.read_html(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    header=0,
    na_values='Not assigned',
)
# Only the first parsed table is kept for the analysis.
df = data[0]
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,,
1,M2A,,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


## We rename the headings of the dataframe: Postcode to PostalCode and Neighbourhood to Neighborhood.

In [3]:
# Rename the two headings in place; every other column keeps its name.
df.rename(
    {'Postcode': 'PostalCode', 'Neighbourhood': 'Neighborhood'},
    axis='columns',
    inplace=True,
)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,,
1,M2A,,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


## We need to ignore rows with NaN in Borough, so let's drop them.

In [4]:
# Remove, in place, every row whose 'Borough' value is NaN.
df.dropna(subset=['Borough'], axis='index', inplace=True)
df.head(n=10)

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


## Replace NaN values in the Neighborhood column with the corresponding value of the Borough column

In [5]:
# Where the neighbourhood name is missing, fall back to the borough name of the same row.
# The filled Series is assigned back to the column rather than calling
# fillna(inplace=True) on `df.Neighborhood`: that chained in-place form operates on an
# intermediate Series and no longer updates `df` when pandas Copy-on-Write is active.
df['Neighborhood'] = df['Neighborhood'].fillna(df['Borough'])
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Queen's Park
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


## Grouping Neighborhoods having same PostalCode

In [6]:
# Collapse the table to one row per (PostalCode, Borough) pair; the neighbourhood
# names belonging to a pair are joined into a single comma-separated string.
df = (
    df.groupby(['PostalCode', 'Borough'])['Neighborhood']
    .apply(lambda names: ','.join(names))
    .reset_index()
)
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff,Cliffside West"


## Checking the shape of the processed data

In [7]:
# Display the (rows, columns) tuple of df.
df.shape

(103, 3)

## This concludes the Part 1. We have shape (103,3). Now, let's move to the second part of the assignment.

# Part 2

## Since the Geocoder Python package is very unreliable, we will use a direct link to the CSV file that has the geographical coordinates of each postal code.

In [8]:
# Load the CSV of postal codes with their latitude and longitude straight from the URL.
coordinates = pd.read_csv(filepath_or_buffer='https://cocl.us/Geospatial_data')
coordinates.head(n=5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


## Let's rename the column Postal Code to PostalCode

In [9]:
# Rename 'Postal Code' to 'PostalCode' in place, so the key column carries the same
# name in both dataframes.
coordinates.rename({'Postal Code': 'PostalCode'}, axis='columns', inplace=True)
coordinates.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


## Now we will check the shape of the dataframe. This will tell us whether both dataframes have the same number of rows before we merge them.

In [10]:
# Display the (rows, columns) tuple of coordinates.
coordinates.shape

(103, 3)

## Since both dataframes have 103 rows, we can merge them. The common column in both dataframes is PostalCode. Hence, we merge the dataframes based on PostalCode.

In [11]:
# Inner join on 'PostalCode': only postal codes present in both dataframes are kept.
df_coordinate = df.merge(coordinates, on='PostalCode', how='inner')
df_coordinate.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


## This completes the Part 2 of the assignment.

# Part 3

## In this part we will explore the venues around each neighborhood to get the most common venue categories, and then use this feature to group the neighborhoods into clusters. After this we will visualize the clusters.

## Let's import important libraries first.

In [None]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Solving environment: done

# All requested packages already installed.

Solving environment: \ 

## Use geopy library to get the latitude and longitude values of Toronto, Ontario City.

In [None]:
# Address whose coordinates are used as the centre of the maps below.
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)

# geocode() yields None when the service finds no match; stop with a clear message
# instead of failing on `None.latitude` below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

##  Creating map of Toronto using latitude and longitude values.

In [None]:
# Base map centred on the latitude/longitude obtained for Toronto.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# Draw one circle marker per row of df_coordinate; the popup reads "<neighborhood>, <borough>".
marker_rows = df_coordinate[['Latitude', 'Longitude', 'Borough', 'Neighborhood']]
for lat, lng, borough, neighborhood in marker_rows.itertuples(index=False):
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        location=[lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

map_toronto

## Initializing Foursquare ID, Secret and version

In [None]:
import os

# Foursquare credentials are read from environment variables so that they never appear
# in the notebook source or in its saved output.
# NOTE(review): the key pair that used to be hard-coded in this cell should be treated
# as leaked and rotated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
radius = 500 # sent as the `radius` query parameter of the venue request
LIMIT = 100 # sent as the `limit` query parameter of the venue request

# Report only whether the credentials are available; never echo their values.
if CLIENT_ID and CLIENT_SECRET:
    print('Foursquare credentials loaded from the environment.')
else:
    print('Foursquare credentials missing: set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET.')

## Using a for loop to collect the venues returned by the Foursquare API into a new list, venues_list.

In [None]:
# Collect one tuple per venue returned by the Foursquare "explore" endpoint, queried
# once for every row of df_coordinate.
venues_list=[]
for lat, lng, postal, borough, neighborhood in zip(df_coordinate['Latitude'], df_coordinate['Longitude'],
                                                   df_coordinate['PostalCode'], df_coordinate['Borough'],
                                                   df_coordinate['Neighborhood']):

    # query parameters of the API request; requests takes care of URL-encoding them
    params = {
        'client_id': CLIENT_ID,
        'client_secret': CLIENT_SECRET,
        'v': VERSION,
        'll': '{},{}'.format(lat, lng),
        'radius': radius,
        'limit': LIMIT,
    }

    # make the GET request; the timeout keeps a stalled connection from hanging the
    # notebook and raise_for_status() surfaces HTTP errors (e.g. bad credentials)
    # instead of a KeyError on the JSON payload
    response = requests.get('https://api.foursquare.com/v2/venues/explore', params=params, timeout=30)
    response.raise_for_status()

    # a location without recommendations contributes no rows
    groups = response.json().get('response', {}).get('groups', [])
    results = groups[0].get('items', []) if groups else []

    # return only relevant information for each nearby venue
    for v in results:
        venue = v['venue']
        # a venue may come without any category; record None rather than fail on [0]
        categories = venue.get('categories') or []
        venues_list.append((
            postal,
            borough,
            neighborhood,
            lat,
            lng,
            venue['name'],
            venue['location']['lat'],
            venue['location']['lng'],
            categories[0]['name'] if categories else None))

## Storing all the data into new dataframe **vdf** and updating the column name.

In [None]:
# Build the venue dataframe with its column names in a single step. Passing `columns=`
# to the constructor also works when venues_list is empty, whereas assigning nine names
# to a column-less frame afterwards raises a length-mismatch error.
# The first five columns repeat the df_coordinate values the venue was requested for.
vdf = pd.DataFrame(
    venues_list,
    columns=['PostalCode', 'Borough', 'Neighborhood', 'BoroughLatitude',
             'BoroughLongitude', 'VenueName', 'VenueLatitude', 'VenueLongitude', 'VenueCategory'],
)
vdf.head()

## Checking the count with respect to Borough.

In [None]:
# Per borough, count the non-null entries of every other column.
vdf.groupby(by='Borough').count()

## Finding how many venues are present with respect to neighborhoods.

In [None]:
# one hot encoding: one indicator column per venue category
Borough_df = pd.get_dummies(vdf[['VenueCategory']], prefix="", prefix_sep="")

# If a venue category is itself called "Neighborhood", its indicator column would clash
# with the label column added below, so give it a distinct name first.
if 'Neighborhood' in Borough_df.columns:
    Borough_df = Borough_df.rename(columns={'Neighborhood': 'Neighborhood (category)'})

# add the neighborhood of each venue as the first column; the values come from vdf
# (Borough_df has no such column of its own, so reading it from itself raises KeyError)
Borough_df.insert(0, 'Neighborhood', vdf['Neighborhood'])

Borough_df.head()

## Let's use Kmeans clustering technique. We will cluster different venues corresponding to Boroughs.

In [None]:
# set number of clusters
kclusters = 5

# K-means needs numeric input: keep only the numeric/boolean indicator columns so that
# a text column such as 'Neighborhood' is not passed to the estimator
clustering_features = Borough_df.select_dtypes(include=['number', 'bool'])

# run k-means clustering (random_state fixed so the labels are reproducible)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(clustering_features)

# check cluster labels generated for the first ten rows of the dataframe
kmeans.labels_[0:10]

## Let's visualize clusters of venues using Folium maps.

In [None]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi in zip(vdf['BoroughLatitude'], vdf['BoroughLongitude'], vdf['Neighborhood']):
    label = folium.Popup(str(poi), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
       # color=rainbow[cluster-1],
        fill=True,
        #fill_color=rainbow[],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## This concludes the Part 3 as well.

# Thank You!