# Complete notebook for segmenting and clustering neighborhoods in Toronto.

## 1. Initial libraries to be imported:

In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis

import requests # library to handle requests

! conda install -c anaconda beautifulsoup4 --yes  #install the python package to parse the html page (wikipedia)
from bs4 import BeautifulSoup

print('Libraries imported.')

Solving environment: done


  current version: 4.5.11
  latest version: 4.7.12

Please update conda by running

    $ conda update -n base -c defaults conda



## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs: 
    - beautifulsoup4


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    numpy-base-1.15.4          |   py36h81de0dd_0         4.2 MB  anaconda
    numpy-1.15.4               |   py36h1d66e8a_0          35 KB  anaconda
    certifi-2019.9.11          |           py36_0         154 KB  anaconda
    beautifulsoup4-4.8.1       |           py36_0         153 KB  anaconda
    openssl-1.1.1              |       h7b6447c_0         5.0 MB  anaconda
    soupsieve-1.9.5            |           py36_0          61 KB  anaconda
    mkl_fft-1.0.6              |   py36h7dd41cf_0         150 KB  anaconda
    blas-1.0                   |          

## 2. Scrapping the wikipedia webpage:

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
r = requests.get(url)

soup = BeautifulSoup(r.content, 'html5lib')

table = soup.find('div', attrs = {'id':'container'})

print('Page scrapped.')

Page scrapped.


### 2.1. Extracting table contents from wikipedia page:

In [3]:
postalCodes = []
boroughs = []
neighborhoods = []
colNum = 1
passVal = False

for row in soup.find_all('td'):
    for cell in row:
        if cell.string and cell.string[0].isalpha() and len(cell.string)>2:
            passVal = False
            if colNum == 1:
                if passVal == False and cell.string[1].isdigit():
                    postalCodes.append(cell.string)
                    colNum = 2
                else:
                    continue
            elif colNum == 2:
                if cell.string == 'Not assigned':
                    passVal = True
                    del postalCodes[-1]
                    colNum = 1
                    continue
                else:
                    boroughs.append(cell.string)
                    colNum = 3
            elif colNum == 3:
                if cell.string == 'Not assigned\n':
                    neighborhoods.append(boroughs[-1])
                else:
                    neighborhoods.append(cell.string)
                colNum = 1
                
print('Data Collected.')

Data Collected.


### 2.2. Define column names and create empty dataframe:

In [4]:
col_names = ['PostalCode','Borough','Neighborhood']

tordf = pd.DataFrame(columns = col_names)

tordf

Unnamed: 0,PostalCode,Borough,Neighborhood


### 2.3. Add extracted data to the dataframe columns:

In [5]:
for data in range(len(neighborhoods)):
    code = postalCodes[data]
    borough = boroughs[data]
    neighborhood_name = neighborhoods[data]
    tordf = tordf.append({'PostalCode':code, 'Borough':borough, 'Neighborhood':neighborhood_name},
                                ignore_index = True)

In [6]:
tordf.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor
5,M7A,Queen's Park,Queen's Park
6,M9A,Queen's Park,Queen's Park
7,M1B,Scarborough,Rouge
8,M1B,Scarborough,Malvern
9,M3B,North York,Don Mills North\n


#### Note that there is no 'Regent Park' neighborhood for M5A postal code in the wikipedia page table.

### 2.4. Now, group neighborhoods (comma separated) that belong to the same postal code:

In [7]:
tordf = tordf.groupby(['PostalCode','Borough'], as_index=False, sort=False).agg(lambda x:', '.join(x))

In [8]:
tordf.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park


#### Note the number of rows is 103 and columns is 3 of consolidated dataframe:

In [9]:
tordf.shape

(103, 3)

## 3. Now, moving on to the second part of the assignment...Adding latitude and logitude columns to the dataframe.

### 3.1. First, fetch the lat,long values from the Geospatial_data csv file:

In [10]:
dfll = pd.read_csv('https://cocl.us/Geospatial_data')
dfll.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### 3.2. Rename the Postal Code column (remove space) to later merge the dataframes:

In [11]:
dfll.rename(columns={'Postal Code':'PostalCode'}, inplace=True)

In [12]:
dfll.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### 3.3. Merge the neighborhood dataframe (tordf) with latlong dataframe (dfll) on the 'PostalCode' column, to get lat long values only for required postal codes:

In [13]:
tordfll = tordf.merge(dfll, on=['PostalCode'])

In [14]:
tordfll.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494


## Moving on to the third part of the assignment...Segmenting and Clustering the neighborhoods of Toronto, and plotting them on the map of Toronto.

## 4. Import plotting libraries:

In [15]:
import matplotlib.cm as cm # matplotlib module
import matplotlib.colors as colors # matplotlib colors module
from sklearn.cluster import KMeans # k-means from clustering stage
!conda install -c conda-forge folium=0.5.0 --yes # install folium
import folium # map rendering library

print('Libraries imported.')

Solving environment: done


  current version: 4.5.11
  latest version: 4.7.12

Please update conda by running

    $ conda update -n base -c defaults conda



## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    certifi-2019.11.28         |           py36_0         149 KB  conda-forge

The following packages will be UPDATED:

    certifi: 2019.9.11-py36_0 anaconda --> 2019.11.28-py36_0 conda-forge

The following packages will be DOWNGRADED:

    openssl: 1.1.1-h7b6447c_0 anaconda --> 1.1.1d-h516909a_0 conda-forge


Downloading and Extracting Packages
certifi-2019.11.28   | 149 KB    | ##################################### | 100% 
Preparing transaction: done
Verifying transaction: done
Executing transaction: done
Libraries imported.


## 5. Create a map of Toronto with blue circle markers for neighborhoods:

In [16]:
# create map of Toronto using latitude and longitude values
trmap = folium.Map(location=[43.6532, -79.3832], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(tordfll['Latitude'], tordfll['Longitude'], tordfll['Borough'], 
                                           tordfll['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(trmap)  
    
trmap

## 6. Here, I have decided to work with the borough Scarborough.
### 6.1. So extracting only Scarborough data to a new dataframe:

In [17]:
sb_data = tordfll[tordfll['Borough'] == 'Scarborough'].reset_index(drop=True)
sb_data

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood\n, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae\n,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park\n, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West\n",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West\n",43.692657,-79.264848


In [18]:
sb_data.shape

(17, 5)

### 6.2. Create a map of Scarborough with blue circle markers for all its neighborhoods.

In [19]:
# create map of Scarborough using latitude and longitude values
sbmap = folium.Map(location=[43.7764, -79.2318], zoom_start=11)

# add markers to map
for lat, lng, label in zip(sb_data['Latitude'], sb_data['Longitude'], sb_data['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(sbmap)  
    
sbmap

## 7. Define Foursquare credentials and version:

In [20]:
CLIENT_ID = 'NCZ4TYNDUCFUKDGFRVLSSWGH0GREFDJ2MHMASMVXPMQN0XNZ' # my Foursquare ID
CLIENT_SECRET = 'JUZMZA4ADJYHBOU1QMGZLJ0OBVAJH3JONQIVEV3QB52OM4X5' # my Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('My credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

LIMIT = 100

My credentails:
CLIENT_ID: NCZ4TYNDUCFUKDGFRVLSSWGH0GREFDJ2MHMASMVXPMQN0XNZ
CLIENT_SECRET:JUZMZA4ADJYHBOU1QMGZLJ0OBVAJH3JONQIVEV3QB52OM4X5


### 7.1. In Foursquare, all the information is in the items key. So we define the get_category_type function to extract the categories of the venues:

In [21]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

### 7.2. Following is a function to explore the neighborhoods in Scarborough and retrieve top 100 venues for all neighborhoods within 500 metres radius.  Done using API request through Foursquare:

In [22]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

## 8. List of Scarborough venues:

In [23]:
sb_venues = getNearbyVenues(names=sb_data['Neighborhood'],
                                   latitudes=sb_data['Latitude'],
                                   longitudes=sb_data['Longitude']
                                  )

Rouge, Malvern
Highland Creek, Rouge Hill, Port Union
Guildwood
, Morningside, West Hill
Woburn
Cedarbrae

Scarborough Village
East Birchmount Park
, Ionview, Kennedy Park
Clairlea, Golden Mile, Oakridge
Cliffcrest, Cliffside, Scarborough Village West

Birch Cliff, Cliffside West

Dorset Park, Scarborough Town Centre, Wexford Heights
Maryvale, Wexford
Agincourt
Clarks Corners
, Sullivan
, Tam O'Shanter
Agincourt North, L'Amoreaux East
, Milliken, Steeles East

L'Amoreaux West

Upper Rouge


### 8.1. Size of the venues dataframe and a view of what the dataframe looks like with venue names, locations and category included:

In [24]:
print(sb_venues.shape)
sb_venues.head()

(96, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Rouge, Malvern",43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant
1,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
2,"Guildwood\n, Morningside, West Hill",43.763573,-79.188711,Swiss Chalet Rotisserie & Grill,43.767697,-79.189914,Pizza Place
3,"Guildwood\n, Morningside, West Hill",43.763573,-79.188711,G & G Electronics,43.765309,-79.191537,Electronics Store
4,"Guildwood\n, Morningside, West Hill",43.763573,-79.188711,Marina Spa,43.766,-79.191,Spa


### 8.2. Following checks how many venues were returned for each neighborhood:

In [25]:
sb_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Agincourt,5,5,5,5,5,5
"Agincourt North, L'Amoreaux East\n, Milliken, Steeles East\n",2,2,2,2,2,2
"Birch Cliff, Cliffside West\n",4,4,4,4,4,4
Cedarbrae\n,8,8,8,8,8,8
"Clairlea, Golden Mile, Oakridge",9,9,9,9,9,9
"Clarks Corners\n, Sullivan\n, Tam O'Shanter",12,12,12,12,12,12
"Cliffcrest, Cliffside, Scarborough Village West\n",2,2,2,2,2,2
"Dorset Park, Scarborough Town Centre, Wexford Heights",7,7,7,7,7,7
"East Birchmount Park\n, Ionview, Kennedy Park",7,7,7,7,7,7
"Guildwood\n, Morningside, West Hill",8,8,8,8,8,8


### 8.3. Of this list of venue categories, number of unique categories can be found using the .unique() function:

In [26]:
print('There are {} uniques categories.'.format(len(sb_venues['Venue Category'].unique())))

There are 55 uniques categories.


## 9. Analyze each neighborhood in Scarborough.

### 9.1. Perform one-hot encoding:

In [27]:
# one hot encoding
sb_onehot = pd.get_dummies(sb_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
sb_onehot['Neighborhood'] = sb_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [sb_onehot.columns[-1]] + list(sb_onehot.columns[:-1])
sb_onehot = sb_onehot[fixed_columns]

sb_onehot.head()

Unnamed: 0,Neighborhood,American Restaurant,Athletics & Sports,Auto Garage,Bakery,Bank,Bar,Breakfast Spot,Brewery,Bubble Tea Shop,...,Pharmacy,Pizza Place,Playground,Rental Car Location,Sandwich Place,Skating Rink,Soccer Field,Spa,Thai Restaurant,Vietnamese Restaurant
0,"Rouge, Malvern",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Highland Creek, Rouge Hill, Port Union",0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Guildwood\n, Morningside, West Hill",0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,"Guildwood\n, Morningside, West Hill",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Guildwood\n, Morningside, West Hill",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [28]:
sb_onehot.shape

(96, 56)

### 9.2. Group rows by neighborhood and take the mean of the frequency of occurrence of each category:

In [29]:
sb_grouped = sb_onehot.groupby('Neighborhood').mean().reset_index()
sb_grouped

Unnamed: 0,Neighborhood,American Restaurant,Athletics & Sports,Auto Garage,Bakery,Bank,Bar,Breakfast Spot,Brewery,Bubble Tea Shop,...,Pharmacy,Pizza Place,Playground,Rental Car Location,Sandwich Place,Skating Rink,Soccer Field,Spa,Thai Restaurant,Vietnamese Restaurant
0,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0
1,"Agincourt North, L'Amoreaux East\n, Milliken, ...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Birch Cliff, Cliffside West\n",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0
3,Cedarbrae\n,0.0,0.125,0.0,0.125,0.125,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0
4,"Clairlea, Golden Mile, Oakridge",0.0,0.0,0.0,0.222222,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.0,0.0
5,"Clarks Corners\n, Sullivan\n, Tam O'Shanter",0.0,0.0,0.0,0.0,0.083333,0.0,0.0,0.0,0.0,...,0.166667,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.083333,0.0
6,"Cliffcrest, Cliffside, Scarborough Village West\n",0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,"Dorset Park, Scarborough Town Centre, Wexford ...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142857
8,"East Birchmount Park\n, Ionview, Kennedy Park",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,"Guildwood\n, Morningside, West Hill",0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,...,0.0,0.125,0.0,0.125,0.0,0.0,0.0,0.125,0.0,0.0


In [30]:
sb_grouped.shape

(16, 56)

### 9.3. Print each neighborhood set with top 5 most common venues:

In [31]:
num_top_venues = 5

for hood in sb_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = sb_grouped[sb_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Agincourt----
                       venue  freq
0             Clothing Store   0.2
1               Skating Rink   0.2
2             Breakfast Spot   0.2
3  Latin American Restaurant   0.2
4                     Lounge   0.2


----Agincourt North, L'Amoreaux East
, Milliken, Steeles East
----
                       venue  freq
0                 Playground   0.5
1                       Park   0.5
2        American Restaurant   0.0
3  Middle Eastern Restaurant   0.0
4          Indian Restaurant   0.0


----Birch Cliff, Cliffside West
----
                   venue  freq
0        College Stadium  0.25
1  General Entertainment  0.25
2           Skating Rink  0.25
3                   Café  0.25
4    American Restaurant  0.00


----Cedarbrae
----
                  venue  freq
0  Caribbean Restaurant  0.12
1       Thai Restaurant  0.12
2                Bakery  0.12
3                  Bank  0.12
4      Hakka Restaurant  0.12


----Clairlea, Golden Mile, Oakridge----
          venue  freq
0  

### 9.4. Sort the venues in descending order using the sort_values() function:

In [32]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

### 9.5. Create new dataframe and display top 10 venues for each neighborhood:

In [42]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = sb_grouped['Neighborhood']

for ind in np.arange(sb_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(sb_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Agincourt,Clothing Store,Skating Rink,Breakfast Spot,Latin American Restaurant,Lounge,Vietnamese Restaurant,Fried Chicken Joint,Fast Food Restaurant,Electronics Store,Discount Store
1,"Agincourt North, L'Amoreaux East\n, Milliken, ...",Playground,Park,Vietnamese Restaurant,Caribbean Restaurant,Fried Chicken Joint,Fast Food Restaurant,Electronics Store,Discount Store,Department Store,Cosmetics Shop
2,"Birch Cliff, Cliffside West\n",General Entertainment,Skating Rink,College Stadium,Café,Chinese Restaurant,Gas Station,Fried Chicken Joint,Fast Food Restaurant,Electronics Store,Discount Store
3,Cedarbrae\n,Thai Restaurant,Athletics & Sports,Gas Station,Bakery,Bank,Hakka Restaurant,Fried Chicken Joint,Caribbean Restaurant,Coffee Shop,Fast Food Restaurant
4,"Clairlea, Golden Mile, Oakridge",Bakery,Bus Line,Metro Station,Soccer Field,Fast Food Restaurant,Bus Station,Park,Convenience Store,Coffee Shop,College Stadium


## 10. Clustering step using K-means algorithm.

### 10.1. Run k-means algorithm to cluster the neighborhoods into 6 clusters:

In [43]:
# set number of clusters
kclusters = 6  # 6 clusters gives best cluster output for me for Scarborough.

sb_grouped_clustering = sb_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(sb_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 3, 0, 0, 0, 0, 4, 0, 0, 0], dtype=int32)

### 10.2. Create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood:

In [44]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

sb_merged = sb_data

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
sb_merged = sb_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')
pd.options.display.float_format = '{:,.0f}'.format # display cluster labels in whole number format, without decimal points.
sb_merged.drop(sb_merged.tail(1).index,inplace=True) # drop the last row of dataframe since it contains all NaN values for all columns.
sb_merged

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Scarborough,"Rouge, Malvern",44,-79,1,Fast Food Restaurant,Vietnamese Restaurant,Chinese Restaurant,Gas Station,Fried Chicken Joint,Electronics Store,Discount Store,Department Store,Cosmetics Shop,Convenience Store
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",44,-79,2,Bar,Vietnamese Restaurant,Chinese Restaurant,Gas Station,Fried Chicken Joint,Fast Food Restaurant,Electronics Store,Discount Store,Department Store,Cosmetics Shop
2,M1E,Scarborough,"Guildwood\n, Morningside, West Hill",44,-79,0,Mexican Restaurant,Rental Car Location,Intersection,Breakfast Spot,Medical Center,Pizza Place,Electronics Store,Spa,Department Store,Cosmetics Shop
3,M1G,Scarborough,Woburn,44,-79,5,Coffee Shop,Korean Restaurant,Pharmacy,Vietnamese Restaurant,Chinese Restaurant,Fried Chicken Joint,Fast Food Restaurant,Electronics Store,Discount Store,Department Store
4,M1H,Scarborough,Cedarbrae\n,44,-79,0,Thai Restaurant,Athletics & Sports,Gas Station,Bakery,Bank,Hakka Restaurant,Fried Chicken Joint,Caribbean Restaurant,Coffee Shop,Fast Food Restaurant
5,M1J,Scarborough,Scarborough Village,44,-79,3,Spa,Playground,Convenience Store,Vietnamese Restaurant,Caribbean Restaurant,Fried Chicken Joint,Fast Food Restaurant,Electronics Store,Discount Store,Department Store
6,M1K,Scarborough,"East Birchmount Park\n, Ionview, Kennedy Park",44,-79,0,Discount Store,Chinese Restaurant,Department Store,Convenience Store,Bus Station,Coffee Shop,Vietnamese Restaurant,Gas Station,Fried Chicken Joint,Fast Food Restaurant
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",44,-79,0,Bakery,Bus Line,Metro Station,Soccer Field,Fast Food Restaurant,Bus Station,Park,Convenience Store,Coffee Shop,College Stadium
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West\n",44,-79,4,American Restaurant,Motel,Chinese Restaurant,Gas Station,Fried Chicken Joint,Fast Food Restaurant,Electronics Store,Discount Store,Department Store,Cosmetics Shop
9,M1N,Scarborough,"Birch Cliff, Cliffside West\n",44,-79,0,General Entertainment,Skating Rink,College Stadium,Café,Chinese Restaurant,Gas Station,Fried Chicken Joint,Fast Food Restaurant,Electronics Store,Discount Store


## 11. Finally, visualize the resulting clusters using folium map.

In [36]:
# create map
map_clusters = folium.Map(location=[43.7764, -79.2318], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(sb_merged['Latitude'], sb_merged['Longitude'], sb_merged['Neighborhood'], sb_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[int(cluster)-1],
        fill=True,
        fill_color=rainbow[int(cluster)-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### We can see that the neighborhoods in Scarborough have been segmented and clustered into 6 clusters, which are represented by the 6 different color markers in the map above.

### It can be observed that most neighborhoods fall under Cluster 0 and very few fall under other clusters.

# End of notebook!