<h1 align=center><font size = 6>Capstone Project</font></h1>
<h1 align=center><font size = 5>Finding Location for a Beauty Salon</font></h1>

In [1]:
import os

import numpy as np
import pandas as pd
import requests
import folium
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim
from pandas.io.json import json_normalize
from sklearn.cluster import KMeans

---
## Part 1. Web Scraping

The goal of this part is to get available apartments from the <a href=https://www.spacelist.ca>Spacelist</a> service and put them into a dataframe.

In [2]:
# search parameters: CITY/COUNTRY feed the geocoder queries, the rest fill the listing URL
COUNTRY = 'CA'
CITY = 'toronto'
TYPE = 'retail-other' # categories of the apartments
PURPOSE = 'for-lease'
MIN_SIZE = '300' # in square feet
MAX_SIZE = '800' # in square feet

In [3]:
# saving city center coordinates as default
geolocator = Nominatim(user_agent="capstone-project")
loc_def = geolocator.geocode(CITY +',' + COUNTRY)
# geocode() yields None when nothing matches; stop with a clear message here
# instead of failing with an AttributeError on the attribute access below
if loc_def is None:
    raise ValueError("Could not geocode the default location '{},{}'".format(CITY, COUNTRY))
lat_def = loc_def.latitude
lng_def = loc_def.longitude
print("Default latitude (city center) is {}".format(lat_def))
print("Default longitude (city center) is {}".format(lng_def))

Default latitude (city center) is 43.653963
Default longitude (city center) is -79.387207


The following functions parse results from the site and save them into a dataframe.

In [4]:
# columns of the apartments dataframe; the order must match the row
# assembled for each listing in load_apartments_from_list
column_names = ['id', 'address', 'suite', 'square', 'type', 'price', 'lat', 'lng'] 

In [5]:
def load_all_apartments(df):
    """Scrape every result page of the Spacelist search into a dataframe.

    Result pages are requested one after another, starting with page 1,
    until a page contains no listing cards.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe the parsed listings are added to.

    Returns
    -------
    pandas.DataFrame
        ``df`` extended with one row per listing found.

    Raises
    ------
    requests.HTTPError
        If the site answers with an error status (a 404 after the first
        page is treated as the end of the results instead).
    """
    url_template = ("https://www.spacelist.ca/listings/on/{}/{}/{}/page/{}"
                    "?s%5Bmax_size%5D={}&s%5Bmin_size%5D={}")
    page_num = 1
    while True:
        url = url_template.format(CITY, TYPE, PURPOSE, page_num, MAX_SIZE, MIN_SIZE)
        # without a timeout a stalled connection would block the notebook forever
        response = requests.get(url, timeout=30)
        if response.status_code == 404 and page_num > 1:
            # ran past the last page
            break
        # do not mistake an error page for "no results"
        response.raise_for_status()
        apart_list = BeautifulSoup(response.text, 'lxml').find_all(
            'div', class_="listing-result cell shrink")
        if not apart_list:
            # no more results
            break
        # parse results from the page into the dataframe
        df = load_apartments_from_list(apart_list, df)
        page_num += 1
    return df

In [6]:
def load_apartments_from_list(apart_list, df):
    """Parse listing cards into rows and add them to ``df``.

    Parameters
    ----------
    apart_list : iterable of bs4 tags
        The ``listing-result`` elements of one result page.
    df : pandas.DataFrame
        Dataframe with the ``column_names`` columns to extend.

    Returns
    -------
    pandas.DataFrame
        ``df`` followed by one row per card, indexed by the listing id.
        ``df`` itself is returned when ``apart_list`` is empty.
    """
    rows = []
    for apart in apart_list:
        card = apart.find('a', class_="listing-card")
        id_ = card['data-listing']
        meta = card.find('div', class_="meta-card")
        about = meta.find('div', class_="cell auto about")
        square_ = about.find('div', class_="default-font").text.split(' ')[0]
        type_ = about.find('div', class_="heavy-font").text
        price_cell = meta.find('div', class_="cell shrink display-price")
        price_cell = price_cell.find('div', class_="default-font")
        price_ = price_cell.find('div', class_="rent_per_month").text
        # the card title is either "<suite> - <address>" or just "<address>"
        title_parts = meta.find('h2').find('div').text.split(' - ')
        if len(title_parts) > 1:
            suite_, address_ = title_parts[0], title_parts[1]
        else:
            suite_, address_ = "-", title_parts[0]
        loc = geolocator.geocode(address_ + ',' + CITY + ',' + COUNTRY)
        # if address is not correct (e.g. missing letters)
        # geocode can't determine latitude and longitude
        # since it is a very rare situation, just use default coordinates
        if loc is None:
            lat_, lng_ = lat_def, lng_def
        else:
            lat_, lng_ = loc.latitude, loc.longitude
        rows.append([id_, address_, suite_, square_, type_, price_, lat_, lng_])

    if not rows:
        return df
    # build the page's rows in one go: DataFrame.append was removed in
    # pandas 2.0 and appending row by row copies the whole frame every time
    page_df = pd.DataFrame(rows, columns=column_names, index=[row[0] for row in rows])
    return pd.concat([df, page_df])

Call functions and save results into a dataframe.

In [12]:
aparts_df = pd.DataFrame(columns = column_names)
aparts_df = load_all_apartments(aparts_df)
# The former `aparts_df.set_index('id')` call was dropped: its result was never
# assigned, so it had no effect. Rows are already indexed by listing id while
# parsing, and the 'id' column has to stay because later cells select on it.
print(aparts_df.shape)

(42, 8)


In [13]:
# preview the first ten scraped listings
aparts_df.head(10)

Unnamed: 0,id,address,suite,square,type,price,lat,lng
93705,93705,717 Queen St E,202B,400,Office / Retail,"$1,500/mo",43.658853,-79.349325
196010,196010,1918 A Queen Street East,M,450,Retail,"$3,600/mo",43.669264,-79.304059
226221,226221,1489 Dundas Street West,-,600,Office / Retail / Other,Contact,43.6495,-79.431684
189358,189358,36 Toronto St,108G,363,Retail,Contact,43.650687,-79.376692
198168,198168,98 Ossington Avenue,Store Front,500,Office / Retail,"$2,800/mo",43.646342,-79.419797
202368,202368,717 Queen St E,A,"140-7,800",Office / Retail / Medical,"$1,200/mo",43.658853,-79.349325
195694,195694,41 Dovercourt Rd,1,"750-5,658",Retail,"$3,924-29,609/mo",43.641559,-79.421492
195692,195692,41 Dovercourt Rd,5,533,Retail,"$2,788/mo",43.641559,-79.421492
195693,195693,41 Dovercourt Rd,7,719,Retail,"$3,762/mo",43.641559,-79.421492
194676,194676,20 Toronto St,137,635,Retail,Contact,43.650264,-79.376402


Map all found apartments.

In [118]:
# base map centred on the default (city centre) coordinates
map_all_aparts = folium.Map(location=[lat_def, lng_def], zoom_start=11)

# one blue circle per listing; the popup reads "<address>, <price>"
marker_style = dict(radius=5, color='blue', fill=True, fill_color='#3186cc',
                    fill_opacity=0.7, parse_html=False)
listing_points = zip(aparts_df['lat'], aparts_df['lng'], aparts_df['address'], aparts_df['price'])
for lat, lng, addr, pr in listing_points:
    popup = folium.Popup('{}, {}'.format(addr, pr), parse_html=True)
    folium.CircleMarker([lat, lng], popup=popup, **marker_style).add_to(map_all_aparts)

map_all_aparts

-----
## Part 2. Filter apartments using Foursquare

In [15]:
# Foursquare credentials are read from the environment so they are never stored
# in the notebook: set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before
# starting the kernel.
# NOTE(review): the key pair that used to be hardcoded in this cell was saved in
# plain text; treat it as compromised and revoke it.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')
if not (CLIENT_ID and CLIENT_SECRET):
    print('Warning: Foursquare credentials are not set; API requests will fail.')
VERSION = '20190915'
LIMIT = 50

In [16]:
# search terms sent to the Foursquare venue search
bus_query = 'Bus Stop'
parking_query = 'Parking'
shopping_query = 'Shopping'
mall_query = 'Shopping Mall'
salon_query = 'Salon / Barbershop'
# small radius is used for filtering from other salons
# (presumably meters, as defined by the Foursquare API -- TODO confirm)
s_radius = 150
# big radius is used for bus stops, parking, and shopping centers
b_radius = 600
# category filters for different requests (shopping queries are not filtered)
bus_category_filter = ['Bus Stop', 'Light Rail Station']
parking_category_filter = ['Parking']
salon_category_filter = ['Salon / Barbershop']
# columns kept from the Foursquare response
filtered_columns = ['id', 'name', 'categories', 'location.lat', 'location.lng']
# resulting columns
result_columns = ['id', 'name', 'categories', 'lat', 'lng', 'apart']

In [18]:
# one empty result frame per kind of Foursquare search, all sharing result_columns
bus_df, parking_df, shopping_df, salon_df = (
    pd.DataFrame(columns = result_columns) for _ in range(4)
)
salon_df

Unnamed: 0,id,name,categories,lat,lng,apart


In [19]:
def get_category_type(row):
    """Return the name of a venue's first category.

    Parameters
    ----------
    row : mapping (dict or pandas Series)
        Holds the category list under ``'categories'`` or, failing that,
        under ``'venue.categories'``.

    Returns
    -------
    str or None
        ``name`` of the first category, or ``None`` when the venue has no
        category list (empty list or a missing value such as NaN).

    Raises
    ------
    KeyError
        If neither key is present in ``row``.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # only a missing key triggers the fallback; other errors must surface
        categories_list = row['venue.categories']
    # a missing value (e.g. NaN after normalising the JSON) has no length
    if not isinstance(categories_list, (list, tuple)) or len(categories_list) == 0:
        return None
    return categories_list[0]['name']

In [20]:
def get_query_results(apart_id, lat, lng, query, radius, cat_filter):
    """Search Foursquare for venues matching ``query`` around one apartment.

    Parameters
    ----------
    apart_id : listing id stored in the ``apart`` column of the result
    lat, lng : coordinates the search is centred on
    query : search term sent to the venues/search endpoint
    radius : search radius passed to the API
    cat_filter : list of category names to keep; an empty list keeps all venues

    Returns
    -------
    pandas.DataFrame or None
        One row per venue with the columns ``id, name, categories, lat, lng,
        apart``, or ``None`` when the search returned no venues. The frame
        can be empty when ``cat_filter`` removes every venue.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    # let requests build the query string so that terms such as
    # 'Salon / Barbershop' are URL-encoded correctly
    params = {
        'client_id': CLIENT_ID,
        'client_secret': CLIENT_SECRET,
        'll': '{},{}'.format(lat, lng),
        'v': VERSION,
        'query': query,
        'radius': radius,
        'limit': LIMIT,
    }
    response = requests.get('https://api.foursquare.com/v2/venues/search',
                            params=params, timeout=30)
    response.raise_for_status()
    # assign relevant part of JSON to venues
    venues = response.json()['response']['venues']
    if not venues:
        # process the situation, when there is no results
        return None
    # pd.json_normalize replaces pandas.io.json.json_normalize (removed in pandas 2.0)
    df = pd.json_normalize(venues).loc[:, filtered_columns]
    df['categories'] = df.apply(get_category_type, axis=1)
    # keep only relevant rows
    if len(cat_filter) > 0:
        # copy so the column assignment below does not write into a slice
        df = df[df['categories'].isin(cat_filter)].copy()
    df['apart'] = apart_id
    # clean column names ('location.lat' -> 'lat')
    df.columns = [column.split('.')[-1] for column in df.columns]
    return df

We are looking for an apartment that has a bus stop, parking, and a shop or a mall nearby, and that has no other salon within the small radius.

In [21]:
# for each found apartment run all queries:
# search for bus
# search for parking
# search for shopping and malls
# search for salons
# Results are collected in lists and concatenated once at the end:
# DataFrame.append was removed in pandas 2.0 and copies the frame on every call.
bus_frames = [bus_df]
parking_frames = [parking_df]
shopping_frames = [shopping_df]
salon_frames = [salon_df]
for index, apart in aparts_df.iterrows():
    apart_id = apart['id']
    bus_frames.append(get_query_results(apart_id, apart['lat'], apart['lng'], bus_query, b_radius, bus_category_filter))
    parking_frames.append(get_query_results(apart_id, apart['lat'], apart['lng'], parking_query, b_radius, parking_category_filter))
    shopping_frames.append(get_query_results(apart_id, apart['lat'], apart['lng'], shopping_query, b_radius, []))
    shopping_frames.append(get_query_results(apart_id, apart['lat'], apart['lng'], mall_query, b_radius, []))
    salon_frames.append(get_query_results(apart_id, apart['lat'], apart['lng'], salon_query, s_radius, salon_category_filter))

# get_query_results returns None when a search has no hits; skip those entries
bus_df = pd.concat([frame for frame in bus_frames if frame is not None])
parking_df = pd.concat([frame for frame in parking_frames if frame is not None])
shopping_df = pd.concat([frame for frame in shopping_frames if frame is not None])
salon_df = pd.concat([frame for frame in salon_frames if frame is not None])

In [22]:
# ids of the apartments that got at least one hit for each kind of search
with_bus_stop, with_parking, with_shops, with_salons = (
    frame['apart'].unique()
    for frame in (bus_df, parking_df, shopping_df, salon_df)
)

In [23]:
# report how many apartments matched each search
for template, matched_ids in (
    ('{} apartments have bus stop nearby', with_bus_stop),
    ('{} apartments have parking nearby', with_parking),
    ('{} apartments have shops or malls nearby', with_shops),
    ('{} apartments have other salons nearby', with_salons),
):
    print(template.format(len(matched_ids)))

36 apartments have bus stop nearby
33 apartments have parking nearby
27 apartments have shops or malls nearby
30 apartments have other salons nearby


In [24]:
def intersection(first, second):
    """Return the items of ``first`` that also occur in ``second``.

    Order and repeats of ``first`` are preserved; ``second`` is only used
    for membership checks, so its items must be hashable.
    """
    allowed = set(second)
    common = []
    for item in first:
        if item in allowed:
            common.append(item)
    return common

In [25]:
def difference(first, second):
    """Return the items of ``first`` that do not occur in ``second``.

    Order and repeats of ``first`` are preserved; ``second`` is only used
    for membership checks, so its items must be hashable.
    """
    excluded = set(second)
    remaining = []
    for item in first:
        if item not in excluded:
            remaining.append(item)
    return remaining

Filtering apartments, which satisfy all criteria.

In [26]:
# start from the apartments near a bus stop, then require parking and shops,
# and finally drop the ones with a competing salon; print the count after each step
result_aparts = with_bus_stop
for apply_step, step_ids in (
    (intersection, with_parking),
    (intersection, with_shops),
    (difference, with_salons),
):
    result_aparts = apply_step(result_aparts, step_ids)
    print(len(result_aparts))

29
23
5


Apartments that satisfy all criteria:

In [122]:
# listings whose id passed every filter
passed_all = aparts_df['id'].isin(result_aparts)
final_aparts = aparts_df[passed_all]
final_aparts.head()

Unnamed: 0,id,address,suite,square,type,price,lat,lng
196010,196010,1918 A Queen Street East,M,450,Retail,"$3,600/mo",43.669264,-79.304059
198168,198168,98 Ossington Avenue,Store Front,500,Office / Retail,"$2,800/mo",43.646342,-79.419797
195694,195694,41 Dovercourt Rd,1,"750-5,658",Retail,"$3,924-29,609/mo",43.641559,-79.421492
195692,195692,41 Dovercourt Rd,5,533,Retail,"$2,788/mo",43.641559,-79.421492
195693,195693,41 Dovercourt Rd,7,719,Retail,"$3,762/mo",43.641559,-79.421492


Show on map

1. Apartments that satisfy all criteria are shown in green
2. All other apartments are shown in black
3. Shops, bus stops and parking are shown in blue
4. Salons are shown in red

In [121]:
# listings that failed at least one filter
failed_any = ~aparts_df['id'].isin(result_aparts)
black = aparts_df[failed_any]
black.head()

Unnamed: 0,id,address,suite,square,type,price,lat,lng
93705,93705,717 Queen St E,202B,400,Office / Retail,"$1,500/mo",43.658853,-79.349325
226221,226221,1489 Dundas Street West,-,600,Office / Retail / Other,Contact,43.6495,-79.431684
189358,189358,36 Toronto St,108G,363,Retail,Contact,43.650687,-79.376692
202368,202368,717 Queen St E,A,"140-7,800",Office / Retail / Medical,"$1,200/mo",43.658853,-79.349325
194676,194676,20 Toronto St,137,635,Retail,Contact,43.650264,-79.376402


In [120]:
# all "positive" venues (bus stops, parking, shops) are drawn in one colour;
# pd.concat is used because DataFrame.append was removed in pandas 2.0
blue = pd.concat([bus_df, parking_df, shopping_df])
blue.head()

Unnamed: 0,id,name,categories,lat,lng,apart
1,4fcfd817e4b071f98b98e3a2,TTC Stop #11895,Bus Stop,43.660751,-79.350139,93705
3,4cd5aefd886cb60c3e938589,TTC Stop #3033,Light Rail Station,43.659136,-79.349712,93705
8,531fca76498e81a83abb1327,TTC Stop 3047,Light Rail Station,43.660156,-79.34417,93705
9,4c1696e18aedd13a6c0b5237,TTC Stop #01081,Light Rail Station,43.665284,-79.352964,93705
10,51d82cb3498ef5f350eea3dc,TTC Stop #9317,Light Rail Station,43.662584,-79.351606,93705


In [123]:
def _add_circle_markers(target_map, points, color, fill_opacity):
    """Add one circle marker per (lat, lng, label) triple to ``target_map``."""
    for lat, lng, label in points:
        folium.CircleMarker(
            [lat, lng],
            radius=5,
            # parse_html=True makes folium treat the label as plain text, so
            # quotes or markup in a venue name cannot break the popup
            popup=folium.Popup(str(label), parse_html=True),
            color=color,
            fill=True,
            fill_color=color,
            fill_opacity=fill_opacity
        ).add_to(target_map)


def _apartment_points(aparts):
    """Yield (lat, lng, "<address>, <price>") for every row of ``aparts``."""
    for lat, lng, addr, pr in zip(aparts['lat'], aparts['lng'], aparts['address'], aparts['price']):
        yield lat, lng, '{}, {}'.format(addr, pr)


aparts_map = folium.Map(location=[lat_def, lng_def], zoom_start=12)

# add bus stops, parking and shops as blue circle markers
venue_points = (
    (lat, lng, '{} ({})'.format(name, cat))
    for lat, lng, name, cat in zip(blue['lat'], blue['lng'], blue['name'], blue['categories'])
)
_add_circle_markers(aparts_map, venue_points, 'blue', 0.4)

# add other salons as red circle markers
_add_circle_markers(aparts_map, zip(salon_df['lat'], salon_df['lng'], salon_df['name']), 'red', 0.4)

# add failed apartments as black markers
_add_circle_markers(aparts_map, _apartment_points(black), 'black', 1)

# add the apartments that satisfy all criteria as green markers (drawn last, on top)
_add_circle_markers(aparts_map, _apartment_points(final_aparts), 'green', 1)

# display map
aparts_map

----
## Part 3. Clustering Neighborhoods

Download Toronto neighborhoods with latitude and longitude coordinates (saved from the previous lab).

In [34]:
# load the saved neighborhoods and drop the index column written by the earlier export
neighborhoods = pd.read_csv(r'NeighToronto.csv').drop(columns=['Unnamed: 0'])
neighborhoods.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494


Since the apartments are in the city center, let's limit the analysis to the boroughs with "Toronto" in their name.

In [35]:
# keep only the boroughs whose name contains 'Toronto' and renumber the rows
in_toronto = neighborhoods['Borough'].str.contains('Toronto')
toronto = neighborhoods[in_toronto].reset_index(drop = True)
toronto.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
1,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
3,M4E,East Toronto,The Beaches,43.676357,-79.293031
4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306


In [38]:
# sanity check: expected (38, 5), i.e. 38 neighborhoods with 5 columns
toronto.shape

(38, 5)

Explore all the selected neighborhoods for venues nearby and form a dataframe.

In [41]:
def getNearbyVenues(names, latitudes, longitudes):
    """Collect the venues Foursquare recommends around each neighborhood.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Name and coordinates of every neighborhood to explore.

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns ``Neighborhood, Neighborhood Lat,
        Neighborhood Lng, Venue, Venue Lat, Venue Lng, Venue Category``.
        ``Venue Category`` is ``None`` for a venue without categories; the
        frame is empty (but has all columns) when nothing was found.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    columns = ['Neighborhood', 'Neighborhood Lat', 'Neighborhood Lng',
               'Venue', 'Venue Lat', 'Venue Lng', 'Venue Category']
    rows = []

    for n, (name, lat, lng) in enumerate(zip(names, latitudes, longitudes), start=1):
        # progress output: running number and neighborhood name
        print('{}: {}'.format(n, name))

        # let requests assemble and encode the query string
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': b_radius,
            'limit': LIMIT,
        }
        response = requests.get('https://api.foursquare.com/v2/venues/explore',
                                params=params, timeout=30)
        response.raise_for_status()

        groups = response.json()['response'].get('groups') or []
        items = groups[0]['items'] if groups else []

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            # some venues carry no category at all; do not fail on them
            categories = venue.get('categories') or []
            category = categories[0]['name'] if categories else None
            rows.append((name, lat, lng,
                         venue['name'], venue['location']['lat'], venue['location']['lng'],
                         category))

    # passing the columns here keeps an empty result well-formed
    return pd.DataFrame(rows, columns=columns)

In [42]:
# explore every selected neighborhood; one progress line is printed per
# neighborhood, so 38 lines are expected
toronto_venues = getNearbyVenues(
    toronto['Neighborhood'],
    toronto['Latitude'],
    toronto['Longitude'],
)

1: Harbourfront, Regent Park
2: Ryerson, Garden District
3: St. James Town
4: The Beaches
5: Berczy Park
6: Central Bay Street
7: Christie
8: Adelaide, King, Richmond
9: Dovercourt Village, Dufferin
10: Harbourfront East, Toronto Islands, Union Station
11: Little Portugal, Trinity
12: The Danforth West, Riverdale
13: Design Exchange, Toronto Dominion Centre
14: Brockton, Exhibition Place, Parkdale Village
15: The Beaches West, India Bazaar
16: Commerce Court, Victoria Hotel
17: Studio District
18: Lawrence Park
19: Roselawn
20: Davisville North
21: Forest Hill North, Forest Hill West
22: High Park, The Junction South
23: North Toronto West
24: The Annex, North Midtown, Yorkville
25: Parkdale, Roncesvalles
26: Davisville
27: Harbord, University of Toronto
28: Runnymede, Swansea
29: Moore Park, Summerhill East
30: Chinatown, Grange Park, Kensington Market
31: Deer Park, Forest Hill SE, Rathnelly, South Hill, Summerhill West
32: CN Tower, Bathurst Quay, Island airport, Harbourfront West, 

Have a look at the results.

In [53]:
# size of the venue table, followed by a preview of its first rows
print(toronto_venues.shape)
toronto_venues.head()

(1377, 7)


Unnamed: 0,Neighborhood,Neighborhood Lat,Neighborhood Lng,Venue,Venue Lat,Venue Lng,Venue Category
0,"Harbourfront, Regent Park",43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,"Harbourfront, Regent Park",43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,"Harbourfront, Regent Park",43.65426,-79.360636,Toronto Cooper Koo Family Cherry St YMCA Centre,43.653191,-79.357947,Gym / Fitness Center
3,"Harbourfront, Regent Park",43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa
4,"Harbourfront, Regent Park",43.65426,-79.360636,Impact Kitchen,43.656369,-79.35698,Restaurant


In [54]:
# number of distinct venue categories among all collected venues
category_count = len(toronto_venues['Venue Category'].unique())
print('There are {} uniques categories'.format(category_count))

There are 229 uniques categories


Have a closer look at all these categories and select only categories of interest.

In [45]:
# list every distinct category so the relevant ones can be picked by hand
print(toronto_venues['Venue Category'].unique())

['Bakery' 'Coffee Shop' 'Gym / Fitness Center' 'Spa' 'Restaurant'
 'Breakfast Spot' 'Park' 'Pub' 'Historic Site' 'Farmers Market'
 'Chocolate Shop' 'Dessert Shop' 'Performing Arts Venue'
 'Mediterranean Restaurant' 'Mexican Restaurant' 'French Restaurant'
 'Café' 'Italian Restaurant' 'Liquor Store' 'Tech Startup' 'Event Space'
 'Yoga Studio' 'Greek Restaurant' 'Theater' 'Ice Cream Shop' 'Shoe Store'
 'Art Gallery' 'Animal Shelter' 'Asian Restaurant' 'Cosmetics Shop'
 'Brewery' 'Thai Restaurant' 'Clothing Store' 'Pizza Place' 'Comic Shop'
 'Plaza' 'Tea Room' 'Burrito Place' 'Ramen Restaurant' 'Burger Joint'
 'Sandwich Place' 'Movie Theater' 'Diner' 'Steakhouse' 'Hotel'
 'Japanese Restaurant' 'American Restaurant' 'Gastropub'
 'Fast Food Restaurant' 'Vegetarian / Vegan Restaurant'
 'Modern European Restaurant' 'Beer Bar' 'Sporting Goods Shop'
 'Shopping Mall' 'Bookstore' 'Miscellaneous Shop' 'Bar'
 'Middle Eastern Restaurant' 'Seafood Restaurant' 'College Rec Center'
 'Creperie' 'Gym' 'F

In [46]:
# categories that are close in some way to 'salon'; used to filter the venues below
cat_interest = ['Spa', 'Cosmetics Shop', 'Salon / Barbershop']

So let's focus only on these categories.

In [57]:
# keep only the venues whose category is one of the categories of interest
is_of_interest = toronto_venues['Venue Category'].isin(cat_interest)
toronto_filtered_cat = toronto_venues[is_of_interest]

In [59]:
# number of venues left after the category filter
toronto_filtered_cat.shape

(17, 7)

In [60]:
# preview of the filtered venues
toronto_filtered_cat.head()

Unnamed: 0,Neighborhood,Neighborhood Lat,Neighborhood Lng,Venue,Venue Lat,Venue Lng,Venue Category
3,"Harbourfront, Regent Park",43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa
47,"Harbourfront, Regent Park",43.65426,-79.360636,The Abnormal Beauty Company,43.649892,-79.361005,Cosmetics Shop
66,"Ryerson, Garden District",43.657162,-79.378937,Elmwood Spa,43.657759,-79.382586,Spa
82,"Ryerson, Garden District",43.657162,-79.378937,SEPHORA,43.653688,-79.38012,Cosmetics Shop
86,"Ryerson, Garden District",43.657162,-79.378937,LUSH,43.653557,-79.3804,Cosmetics Shop


Cluster all neighborhoods on these selected categories.

In [61]:
# one indicator column per venue category
toronto_onehot = pd.get_dummies(
    toronto_filtered_cat[['Venue Category']],
    prefix = "",
    prefix_sep = "",
)

# attach the neighborhood of each venue, then rotate that last column to the front
toronto_onehot['Neighborhood'] = toronto_filtered_cat['Neighborhood']
ordered_columns = list(toronto_onehot.columns)
toronto_onehot = toronto_onehot[ordered_columns[-1:] + ordered_columns[:-1]]

toronto_onehot.head()

Unnamed: 0,Neighborhood,Cosmetics Shop,Salon / Barbershop,Spa
3,"Harbourfront, Regent Park",0,0,1
47,"Harbourfront, Regent Park",1,0,0
66,"Ryerson, Garden District",0,0,1
82,"Ryerson, Garden District",1,0,0
86,"Ryerson, Garden District",1,0,0


In [62]:
# sanity check: expected (17, 4), one row per filtered venue
toronto_onehot.shape

(17, 4)

In [81]:
# share of each category among the selected venues of a neighborhood
category_share = toronto_onehot.groupby('Neighborhood').mean()
toronto_grouped = category_share.reset_index()
toronto_grouped.head()

Unnamed: 0,Neighborhood,Cosmetics Shop,Salon / Barbershop,Spa
0,Central Bay Street,0.0,0.0,1.0
1,Church and Wellesley,0.0,1.0,0.0
2,"Harbourfront, Regent Park",0.5,0.0,0.5
3,"Little Portugal, Trinity",0.0,1.0,0.0
4,North Toronto West,0.0,0.5,0.5


In [82]:
# sanity check: expected (11, 4), one row per neighborhood that has a selected venue
toronto_grouped.shape

(11, 4)

Run k-means clustering on these neighborhoods.

In [83]:
# set number of clusters
kclusters = 2

# drop(columns=...) instead of the positional axis argument, which pandas 2.0 no longer accepts
toronto_grouped_clustering = toronto_grouped.drop(columns='Neighborhood')
# n_init is pinned to the historical default of 10 so the clustering does not
# change with the scikit-learn version (newer releases default to 'auto')
kmeans = KMeans(n_clusters = kclusters, random_state = 0, n_init = 10).fit(toronto_grouped_clustering)

In [85]:
# work on a copy: inserting into an alias would also add 'Cluster Labels' to
# toronto_grouped, which makes this cell fail when it is run a second time and
# would feed the labels back into the clustering input on a re-run
toronto_cluster = toronto_grouped.copy()
toronto_cluster.insert(0, 'Cluster Labels', kmeans.labels_)

In [88]:
# preview of the neighborhoods with their cluster label
toronto_cluster.head()

Unnamed: 0,Cluster Labels,Neighborhood,Cosmetics Shop,Salon / Barbershop,Spa
0,1,Central Bay Street,0.0,0.0,1.0
1,1,Church and Wellesley,0.0,1.0,0.0
2,0,"Harbourfront, Regent Park",0.5,0.0,0.5
3,1,"Little Portugal, Trinity",0.0,1.0,0.0
4,1,North Toronto West,0.0,0.5,0.5


Have a look at the clusters.

In [89]:
# neighborhoods assigned to cluster 0
toronto_cluster.loc[toronto_cluster['Cluster Labels'] == 0]

Unnamed: 0,Cluster Labels,Neighborhood,Cosmetics Shop,Salon / Barbershop,Spa
2,0,"Harbourfront, Regent Park",0.5,0.0,0.5
6,0,"Ryerson, Garden District",0.666667,0.0,0.333333
7,0,St. James Town,1.0,0.0,0.0
8,0,Stn A PO Boxes 25 The Esplanade,1.0,0.0,0.0
9,0,"The Annex, North Midtown, Yorkville",1.0,0.0,0.0
10,0,"The Danforth West, Riverdale",0.5,0.0,0.5


In [90]:
# neighborhoods assigned to cluster 1
toronto_cluster.loc[toronto_cluster['Cluster Labels'] == 1]

Unnamed: 0,Cluster Labels,Neighborhood,Cosmetics Shop,Salon / Barbershop,Spa
0,1,Central Bay Street,0.0,0.0,1.0
1,1,Church and Wellesley,0.0,1.0,0.0
3,1,"Little Portugal, Trinity",0.0,1.0,0.0
4,1,North Toronto West,0.0,0.5,0.5
5,1,Roselawn,0.0,0.0,1.0


It looks like cluster 0 is against salons and prefers cosmetics shops.

Let's visualize all these:

1. Red marks the neighborhoods where none of the selected categories (spas, cosmetics shops, salons) were found.
2. Blue marks the neighborhoods that lean towards cosmetics shops (cluster 0).
3. Green marks the neighborhoods that lean towards salons and spas (cluster 1).
4. Black marks the apartments that satisfy all criteria.

In [124]:
# neighborhood names per cluster: cluster 0 is drawn blue, cluster 1 green
cluster_labels = toronto_cluster['Cluster Labels']
blue_list = toronto_cluster.loc[cluster_labels == 0, 'Neighborhood']
green_list = toronto_cluster.loc[cluster_labels == 1, 'Neighborhood']

In [125]:
# neighborhoods that belong to neither cluster
in_any_cluster = toronto['Neighborhood'].isin(blue_list) | toronto['Neighborhood'].isin(green_list)
red = toronto[~in_any_cluster]
red.shape

(27, 5)

In [126]:
# neighborhoods of cluster 0 (note: this reuses the name of the earlier venue frame)
in_blue = toronto['Neighborhood'].isin(blue_list)
blue = toronto[in_blue]
blue.shape

(6, 5)

In [127]:
# neighborhoods of cluster 1
in_green = toronto['Neighborhood'].isin(green_list)
green = toronto[in_green]
green.shape

(5, 5)

In [128]:
map_clusters = folium.Map(location=[lat_def, lng_def], zoom_start=11)

# neighborhood markers, one colour per group, drawn in the order red, green, blue
for group, shade in ((red, 'red'), (green, 'green'), (blue, 'blue')):
    for lat, lng, poi in zip(group['Latitude'], group['Longitude'], group['Neighborhood']):
        neighborhood_marker = folium.CircleMarker(
            [lat, lng],
            radius=5,
            popup=folium.Popup(str(poi), parse_html=True),
            color=shade,
            fill=True,
            fill_color=shade,
            fill_opacity=0.7)
        neighborhood_marker.add_to(map_clusters)

# apartments that satisfy all criteria are drawn last, as black markers
apartment_style = dict(radius=5, color='black', fill=True, fill_color='black',
                       fill_opacity=1, parse_html=False)
for lat, lng, addr, pr in zip(final_aparts['lat'], final_aparts['lng'], final_aparts['address'], final_aparts['price']):
    popup = folium.Popup('{}, {}'.format(addr, pr), parse_html=True)
    folium.CircleMarker([lat, lng], popup=popup, **apartment_style).add_to(map_clusters)

map_clusters

In [116]:
# the apartments that satisfy all criteria, for the final comparison
final_aparts

Unnamed: 0,id,address,suite,square,type,price,lat,lng
196010,196010,1918 A Queen Street East,M,450,Retail,"$3,600/mo",43.669264,-79.304059
198168,198168,98 Ossington Avenue,Store Front,500,Office / Retail,"$2,800/mo",43.646342,-79.419797
195694,195694,41 Dovercourt Rd,1,"750-5,658",Retail,"$3,924-29,609/mo",43.641559,-79.421492
195692,195692,41 Dovercourt Rd,5,533,Retail,"$2,788/mo",43.641559,-79.421492
195693,195693,41 Dovercourt Rd,7,719,Retail,"$3,762/mo",43.641559,-79.421492


Since we are looking for a black point (a suitable apartment) close to a green one (a salon-oriented neighborhood) with the lowest price, our winner is

__98 Ossington Avenue with $2,800/mo__