In [7]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
from geopy.geocoders import Nominatim
import folium
from pandas.io.json import json_normalize
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

### 1. Data Preperation:

In [8]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

In [9]:
website_text = requests.get(url).text

In [10]:
soup  = BeautifulSoup(website_text, "lxml")

##### Lets  convert table in html into dataframe:

In [11]:
matrix = []
table = soup.table
headings = [heading.text.strip() for heading in table.findAll('th')]
# print("headings : \n", headings)    
for row in table.find_all('tr'):
    redefined_row = [col.text.strip() for col in row.findAll('td')]
    if redefined_row and redefined_row[2]:
        matrix.append([redefined_row[0], redefined_row[1], redefined_row[2]])
# print("We have total samples :  " , len(matrix))

In [12]:
df_data = pd.DataFrame(matrix,columns=["PostalCode", "Borough", "Neighborhood"])

##### Lets remove '/' from in between more than 1 neighborhood:

In [7]:
def combine_neighborhood(df_row):
    if '/' in df_row['Neighborhood']:
        df_row['Neighborhood'] = ", ".join(df_row['Neighborhood'].split('/'))
    return df_row
df_data = df_data.apply(combine_neighborhood, axis=1)

### 2. Lets Get Latitude & Longitude of Neighborhood:

In [8]:
address = 'New York City, NY'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of New York City are {}, {}.'.format(latitude, longitude))

  This is separate from the ipykernel package so we can avoid doing imports until


The geograpical coordinate of New York City are 40.7127281, -74.0060152.


In [9]:
df_coordinates = pd.read_csv("Geospatial_Coordinates.csv")

In [10]:
Latitude, Longitude = [], []
for row in df_data.iterrows():
    try:
        temporary_row = df_coordinates[df_coordinates["Postal Code"]==row[1][0]]
        Latitude.append(temporary_row.iloc[0, 1])
        Longitude.append(temporary_row.iloc[0,2])
    except:
        print("No Latitude or Longitude for Code : " , row[1][0])

df_data.insert(3, "Latitude", Latitude)
df_data.insert(4, "Longitude",Longitude)

#### Lets Visualize all Neighbors on Map:

In [11]:
# create map of Manhattan using latitude and longitude values
visual_map = folium.Map(zoom_start=11)

# add markers to map
for lat, lng, label in zip(df_data['Latitude'], df_data['Longitude'], df_data['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(visual_map)  
    
# visual_map

### 3. Now lets visualize Neighborhood with "Toronot" in Borough:

In [12]:
def check_toronto_keyword(row): 
    if "Toronto" in row["Borough"].strip().split(" "): 
        return row 
    else: 
        return pd.Series()

df_toronto = df_data.apply(check_toronto_keyword, axis =1).dropna().reset_index(drop=True)

  """


In [13]:
# create map of Manhattan using latitude and longitude values
visual_map_toronto = folium.Map(zoom_start=11)

# add markers to map
for lat, lng, label in zip(df_toronto['Latitude'], df_toronto['Longitude'], df_toronto['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(visual_map_toronto)  
    
# visual_map_toronto

### 4. Now lets get all Venues Info around Each & Every Neighborhood of Toronto:

In [14]:

def get_all_neighboring_venues_around_toronto(df_toronto):
    
    CLIENT_ID = 'QVVZFBOIM23WXTHS0QNXXXQRKDR2Y0RZ1IVUKQ5SXP013CWI' 
    CLIENT_SECRET = 'AQNG4XNAKC025K3OASFTQR4WVQGP4LGFRLJ5FPNC4W1E0VEK' 
    VERSION = '20181018'
    LIMIT = 100
    radius = 500
    
    df = pd.DataFrame()
    
    for index, row in df_toronto.iterrows():
#         print("index of row of df_toronto ===========>  ", index)
        
        try:
            neighborhood_name = row['Neighborhood']
            neighbor_latitude = row["Latitude"]
            neighbor_longitude = row["Longitude"]

            url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},\
                            {}&radius={}&limit={}'.format( CLIENT_ID, CLIENT_SECRET, VERSION, \
                                                          neighbor_latitude, neighbor_longitude, radius, LIMIT)
            results = requests.get(url).json()

            venues = results['response']['groups'][0]['items']

            df_venues = json_normalize(venues)

            df_temp = pd.DataFrame(columns=['Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude',\
                                           'Venue', 'Venue Category', 'Venue Latitude', 'Venue Longitude'])

            for ind, r in df_venues.iterrows():

                series = r[['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']]
                series["venue.categories"] = series['venue.categories'][0]['name']
                series["Neighborhood"] = neighborhood_name
                series["Neighborhood Latitude"] = neighbor_latitude
                series["Neighborhood Longitude"] = neighbor_longitude

                series.rename({'venue.name': 'Venue',
                                          'venue.categories':'Venue Category',
                                          'venue.location.lat' : 'Venue Latitude',
                                          'venue.location.lng': 'Venue Longitude'}, inplace=True)

                df_temp.loc[ind] = series
            
            df  = pd.concat([df, df_temp])
        except Exception as e:
            print("Exception :  ", e)
            pass
    return df

df_data = get_all_neighboring_venues_around_toronto(df_items)

NameError: name 'df_items' is not defined

In [68]:
df_all_venues_around_toronto.Neighborhood.unique().shape

(39,)

### 5. Lets analyse the Final Data Prepared:

In [87]:
df_preprocessed = df_all_venues_around_toronto

# lets do onehot encoding:
df_preprocessed = pd.get_dummies(df_preprocessed["Venue Category"], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
df_preprocessed['Neighborhood'] = df_all_venues_around_toronto['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [df_preprocessed.columns[-1]] + list(df_preprocessed.columns[:-1])
df_preprocessed = df_preprocessed[fixed_columns]

df_toronto_grouped = df_preprocessed.groupby('Neighborhood').mean().reset_index()


In [89]:
num_top_venues = 5

for hood in df_toronto_grouped['Neighborhood']:
#     print("----"+hood+"----")
    temp = df_toronto_grouped[df_toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
#     print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
#     print('\n')

In [90]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [95]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = df_toronto_grouped['Neighborhood']

for ind in np.arange(df_toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(df_toronto_grouped.iloc[ind, :], num_top_venues)

# neighborhoods_venues_sorted

### 5. Clustering Neighborhood:

In [99]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = df_toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

df_toronto_merged = df_toronto

# add clustering labels
df_toronto_merged['Cluster Labels'] = kmeans.labels_

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
df_toronto_merged = df_toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

df_toronto_merged.head() # check the last columns!

Unnamed: 0,Borough,Latitude,Longitude,Neighborhood,PostalCode,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Downtown Toronto,43.65426,-79.360636,"Regent Park , Harbourfront",M5A,4,Coffee Shop,Pub,Park,Bakery,Breakfast Spot,Theater,Café,Mexican Restaurant,Restaurant,Distribution Center
1,Downtown Toronto,43.662301,-79.389494,"Queen's Park , Ontario Provincial Government",M7A,0,Coffee Shop,Diner,Creperie,Beer Bar,Boutique,Sandwich Place,Burger Joint,Burrito Place,Café,Park
2,Downtown Toronto,43.657162,-79.378937,"Garden District, Ryerson",M5B,4,Clothing Store,Coffee Shop,Cosmetics Shop,Café,Middle Eastern Restaurant,Japanese Restaurant,Bubble Tea Shop,Theater,Bookstore,Diner
3,Downtown Toronto,43.651494,-79.375418,St. James Town,M5C,4,Coffee Shop,Café,Italian Restaurant,Restaurant,Cocktail Bar,Clothing Store,Gastropub,Bakery,Beer Bar,Diner
4,East Toronto,43.676357,-79.293031,The Beaches,M4E,4,Coffee Shop,Trail,Health Food Store,Pub,Deli / Bodega,Electronics Store,Eastern European Restaurant,Dumpling Restaurant,Donut Shop,Doner Restaurant


In [106]:
# create map
map_clusters = folium.Map(zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(df_toronto_merged['Latitude'], df_toronto_merged['Longitude'], df_toronto_merged['Neighborhood'], df_toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
    
map_clusters