In [279]:
# Import all the library
import pandas as pd
import numpy as np
from requests import get
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim
import folium
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

In [2]:
headers = ({'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'})

### Collect data from Daft

In [4]:
house_links = []
#set the initial link
next_link = "https://www.daft.ie/dublin-city/houses-for-sale/"
# collect links for dublin city
for i in range(1,139):
    if i % 10 == 0:
        print(str(i) + " Records completed")
        
    #get the page and store links of houses in house links
    response = get(next_link, headers=headers)
    html_soup = BeautifulSoup(response.text, 'html.parser')
    class_container = html_soup.find_all(class_='PropertyImage__mainImageContainer')  
    house_links.append([y.find_all('a',href=True)[0]['href'] for y in class_container])
    
    response = get(next_link, headers=headers)
    html_soup = BeautifulSoup(response.text, 'html.parser')
    class_container = html_soup.find_all(class_='PropertyImage__mainImageContainerStandard')  
    house_links.append([y.find_all('a',href=True)[0]['href'] for y in class_container])
    
    #find next page
    z = html_soup.find_all(class_='next_page')
    next_link = 'https://www.daft.ie' + z[0].find_all('a',href=True)[0]['href']

10 Records completed
20 Records completed
30 Records completed
40 Records completed
50 Records completed
60 Records completed
70 Records completed
80 Records completed
90 Records completed
100 Records completed
110 Records completed
120 Records completed
130 Records completed


In [5]:
# flatten the 2d array into 1D array
daft_links = []
for link_array in house_links:
    for link in link_array:
        daft_links.append( 'https://www.daft.ie' + link)

In [10]:
# Collect data from daft lik address,eircode,square meter,price,lat and long into array
final_daft_data = []
for i in range(0,len(daft_links)):
    response = get(daft_links[i], headers=headers)
    html_soup = BeautifulSoup(response.text, 'html.parser')
    
    if i % 100 == 0:
        print(str(i) + " Records completed")

    #get address
    address = html_soup.find(class_='PropertyMainInformation__address').text
    
    #get eircode
    temp = html_soup.find(class_='PropertyMainInformation__eircode')
    if temp != None:
        eircode = ''.join(temp.find_all(text=True, recursive=False)).strip()  
        eircode = eircode.replace(u'\xa0', u' ')
    else:
        eircode = ''
    
    #get square meter
    temp = html_soup.find(class_='PropertyOverview__propertyOverviewDetails')
    if temp != None:
        sq_m = ''.join(temp.find_all(text=True, recursive=False)).strip().split(' ')[0]
    else:
        sq_m = ''

    #get price
    temp = html_soup.find(class_='PropertyInformationCommonStyles__propertyPrice')   
    if temp != None:    
        amount = html_soup.find(class_='PropertyInformationCommonStyles__propertyPrice').text.strip()
    else:
        amount = ''
    
    #get lat and long
    z = html_soup.find(class_='MapButton withoutBackground',href=True)
    lat_lon = (str(z).split(' ')[3].split('loc:')[1].split('+'))
    lat = lat_lon[0]
    lon = lat_lon[1]    
    
    final_daft_data.append([eircode,address,lat,lon,sq_m,amount])

0 Records completed
100 Records completed
200 Records completed
300 Records completed
400 Records completed
500 Records completed
600 Records completed
700 Records completed
800 Records completed
900 Records completed
1000 Records completed
1100 Records completed
1200 Records completed
1300 Records completed
1400 Records completed
1500 Records completed
1600 Records completed
1700 Records completed
1800 Records completed
1900 Records completed
2000 Records completed
2100 Records completed
2200 Records completed
2300 Records completed
2400 Records completed
2500 Records completed
2600 Records completed
2700 Records completed


In [17]:
#convert the data to dataframe and store in excel
daft_df = pd.DataFrame(columns=['eircode','address','lat','lon','sq_m','amount'])
for z in final_daft_data:
    daft_df = daft_df.append(pd.Series(z, index=['eircode','address','lat','lon','sq_m','amount']), ignore_index=True)

In [18]:
daft_df

Unnamed: 0,eircode,address,lat,lon,sq_m,amount
0,,"St Pancras , Terenure, Dublin 6w, South Dublin...",53.31612749992591,"-6.281567215919495""",,"From\n€390,000\n \n to €890,000"
1,,"Proby Place, Carysfort Avenue, Blackrock, Sout...",53.29187888892909,"-6.176945611571938""",,"From\n€1,250,000"
2,K36 V663,"Kalamunda, Grove Lawn, Malahide, North Co. Dublin",53.445769,"-6.148386""",250.8,"€1,850,000"
3,,"40 Commons Road, Clondalkin, Dublin 22, West C...",53.3149378,"-6.3996281""",,"€279,000"
4,,"Taylor Hill, Off Clonard Road, Balbriggan, Nor...",53.608167037476484,"-6.210913989352321""",,"From\n€270,000\n \n to €450,000"
...,...,...,...,...,...,...
2755,,"South Shore Road, Rush, North Co. Dublin",53.516258,"-6.106439""",383,"€850,000"
2756,K34 H985,"Dun An Oir, Loughshinny, North Co. Dublin",53.54698719938854,"-6.085864175755205""",,"€650,000"
2757,,"104 Fortlawn Avenue, Blanchardstown, Dublin 15...",53.390806,"-6.4031859""",,"€270,000"
2758,D18 H9R2,"The Bawn, Kerrymount Avenue, Foxrock, Dublin 1...",53.260881,"-6.176414""",520,"€4,800,000"


In [19]:
#Save the data for further analysis
daft_df.to_excel('daft_data.xlsx')

### Data Cleaning

In [180]:
daft_df = pd.read_excel('daft_data.xlsx')

In [181]:
daft_df.head(5)

Unnamed: 0,eircode,address,lat,lon,sq_m,amount
0,,"St Pancras , Terenure, Dublin 6w, South Dublin...",53.316127,"-6.281567215919495""",,"From\n€390,000\n \n to €890,000"
1,,"Proby Place, Carysfort Avenue, Blackrock, Sout...",53.291879,"-6.176945611571938""",,"From\n€1,250,000"
2,K36 V663,"Kalamunda, Grove Lawn, Malahide, North Co. Dublin",53.445769,"-6.148386""",250.8,"€1,850,000"
3,,"40 Commons Road, Clondalkin, Dublin 22, West C...",53.314938,"-6.3996281""",,"€279,000"
4,,"Taylor Hill, Off Clonard Road, Balbriggan, Nor...",53.608167,"-6.210913989352321""",,"From\n€270,000\n \n to €450,000"


In [182]:
daft_df.shape

(2760, 6)

In [183]:
#Check for missing values
daft_df = daft_df.replace(r'^\s*$', np.NaN, regex=True)
daft_df.dropna(axis=0,how='any',inplace=True)

In [184]:
daft_df

Unnamed: 0,eircode,address,lat,lon,sq_m,amount
2,K36 V663,"Kalamunda, Grove Lawn, Malahide, North Co. Dublin",53.445769,"-6.148386""",250.8,"€1,850,000"
8,D15 C1XR,"1 Summerfield Close, Blanchardstown, Blanchard...",53.387244,"-6.386496""",130.0,"€385,000"
11,D09 HX84,"68 Lindsay Road, Glasnevin, Glasnevin, Dublin ...",53.365188,"-6.267152""",183.0,"€1,195,000"
12,D08 X7FY,"5 New Ireland Road, Rialto, Dublin 8, South Du...",53.337218,"-6.299548""",87.0,"€475,000"
13,K45 DT62,"14 Dun Emer Gardens, Lusk, North Co. Dublin",53.525009,"-6.179869""",101.0,"€299,500"
...,...,...,...,...,...,...
2750,K67 NW20,"Forest Road, Swords, North Co. Dublin",53.437883,"-6.250311""",186.0,"€550,000"
2751,D18 V0V8,"Springmount, Ferndale Road, Rathmichael, South...",53.221594,"-6.134844""",433.0,"€2,500,000"
2752,D11 N8X8,"159 Jamestown Road, Finglas, Dublin 11, North ...",53.395988,"-6.293423""",105.0,"€325,000"
2758,D18 H9R2,"The Bawn, Kerrymount Avenue, Foxrock, Dublin 1...",53.260881,"-6.176414""",520.0,"€4,800,000"


In [185]:
#clean the amount column
daft_df['amount_num'] = daft_df.amount.str.split('€')
daft_df['remove_len'] = daft_df.amount_num.apply(lambda x: len(x))
daft_df.drop(daft_df[daft_df['remove_len'] != 2].index,inplace = True)
daft_df['value_1'] = daft_df.amount_num.map(lambda x: x[0])
daft_df['actual_amount'] = daft_df.amount_num.map(lambda x: x[1])
daft_df.drop(['amount','amount_num','remove_len','value_1'], axis=1,inplace = True)
daft_df['lat'] = daft_df.lat.apply(lambda x: str(x).split('"')[0])
daft_df['lon'] = daft_df.lon.apply(lambda x: str(x).split('"')[0])

## cluster analysis of the houses in dublin 1

#### Use geopy library to get the latitude and longitude values of Dublin,Ireland.

In [186]:

address = 'Dublin 1,Ireland'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Dublin are {}, {}.'.format(latitude, longitude))


The geograpical coordinate of Dublin are 53.3524881, -6.256645689721826.


#### Create a map of Dublin and plot the houses on top of it using the folium

In [188]:
# create map of Toranto using latitude and longitude values
dublin_map = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, eircode in zip(daft_df['lat'], daft_df['lon'], daft_df['eircode']):
    label = '{}'.format(eircode)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [float(lat), float(lng)],
        radius=3,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(dublin_map)  
    
dublin_map

#### Lets see how many number of houses we have

In [192]:
print('The dataframe has {} houses.'.format(daft_df.shape[0]))

The dataframe has 1875 houses.


#### Define the Foursquare credentials and versions

In [227]:
CLIENT_ID = '4V4LTLKW5FCOMLV1RRTBDMG4RRZLNFSV4JCMF5HRP2AA4YJW' # your Foursquare ID
CLIENT_SECRET = 'O54ZYFJSH11NXBTRTPK0QXCT2H0I3B3RXKSRHTTTCTZFPWIB' # your Foursquare Secret
VERSION = '20200702' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 4V4LTLKW5FCOMLV1RRTBDMG4RRZLNFSV4JCMF5HRP2AA4YJW
CLIENT_SECRET:O54ZYFJSH11NXBTRTPK0QXCT2H0I3B3RXKSRHTTTCTZFPWIB


### Define a function to get the nearby venues of the

In [234]:
import requests
def getNearbyVenues(names, latitudes, longitudes, radius=200):
    LIMIT = 100
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        try:
            results = requests.get(url).json()["response"]['groups'][0]['items']
        except KeyError:
            nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
            nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
            return(nearby_venues)
    
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [235]:
venues = getNearbyVenues(names=daft_df['eircode'],
                                   latitudes=daft_df['lat'],
                                   longitudes=daft_df['lon']
                                  )

K36 V663
D15 C1XR
D09 HX84
D08 X7FY
K45 DT62
D01 E0A2
K67 F8R2
A94 F4E4
D6W CK11
D07 E9VH
D12 WE27
D07 E899
D06 FK44
K36 A039
K36 W273
D15 WK7V
A96 EN80
A96 XR91
D16 T9V9
D16 K8X3
D6W DW73
D09 HW08
D09 N8R6
K56 DE93
D07 P2H9
K67 VP82
A94 HE03
D14 W7C1
D08 NVH5
D15 R9DX
D08 APN7
K36 H732
K36 FW98
K78 T0V9
A96 WY27
D09 RY98
D08 H2A4
A94 C9C2
A94 P991
D11 AY9X
A96 D5N7
D15 RRW3
K36 H564
K36 HX01
K36 VA47
K36 X668
D24 V2W0
A96 YX22
K34 Y927
K78 HY75
D04 V0Y3
D14 WR22
D05 F6F4
A96 Y1K7
D15 E0YC
K34 TR50
A94 TH68
D11 P027
D6W YR28
D15 P5KH
D14 Y2W7
A94 NW10
D11 Y2E1
D09 H9K6
D15 CDH4
D08 NW6F
A96 NW94
A96 Y592
D09 CY68
D18 RD21
D09 EK71
D05 CY93
D18 ED35
A94 YW44
D07 F788
D07 PF72
D07 E7Y8
K36 X778
D13 TD34
D09 EC52
D15 N8CF
A94 EY63
K36 A240
D18 C1X4
D04 P2K1
D07 W3V1
K36 AW64
D07 C5Y0
D22 F6K0
D14 V279
D14 E6X8
D13 RD98
D15 CF22
D14 N232
D06 A091
D14 YE03
D15 W7W0
K36 R966
K34 WK18
D07 TW30
D15 X6N1
D17 PW97
D13 K021
D16 K7W3
D07 K7K7
K67 AW90
D12 X4Y7
D12 HH39
D07 H5N3
D06 XT96
D11 FK58
D

In [239]:
print(venues.shape)
venues.head()

(1020, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,D09 HX84,53.365188,-6.267152,Maples House Hotel,53.366265,-6.265293,Hotel
1,D09 HX84,53.365188,-6.267152,Egan's Townhouse Hotel,53.366907,-6.266402,Hotel
2,D08 X7FY,53.33721800000001,-6.299548,The Bird Flanagan,53.336561,-6.299196,Pub
3,D08 X7FY,53.33721800000001,-6.299548,Mace,53.338736,-6.300121,Grocery Store
4,K45 DT62,53.525009,-6.179869,Gino,53.525144,-6.180755,Fast Food Restaurant


### use only 355 records due to max capicity of foresquare API

In [245]:
daft_filtered_df = daft_df[daft_df.eircode.isin(venues.Neighborhood)]

In [247]:
daft_filtered_df.shape

(362, 6)

### plot the filtered data on folium

In [250]:
# create map of Toranto using latitude and longitude values
dublin_map = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, eircode in zip(daft_filtered_df['lat'], daft_filtered_df['lon'], daft_filtered_df['eircode']):
    label = '{}'.format(eircode)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [float(lat), float(lng)],
        radius=3,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(dublin_map)  
    
dublin_map

### lets check how many venues are returned for each eircode

In [253]:
venues.groupby('Neighborhood').count().sort_values('Neighborhood Latitude',ascending=False)

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
K36 H564,28,28,28,28,28,28
D02 HD83,26,26,26,26,26,26
D06 E5R3,14,14,14,14,14,14
A96 RW92,12,12,12,12,12,12
D10 VY92,12,12,12,12,12,12
...,...,...,...,...,...,...
D15 AD86,1,1,1,1,1,1
D15 CDH4,1,1,1,1,1,1
D15 E7W2,1,1,1,1,1,1
D15 H5F6,1,1,1,1,1,1


#### Analyze Each Neighborhood

In [254]:
# one hot encoding
hot_enc = pd.get_dummies(venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
hot_enc['Neighborhood'] = venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [hot_enc.columns[-1]] + list(hot_enc.columns[:-1])
hot_enc = hot_enc[fixed_columns]

hot_enc.head()

Unnamed: 0,Neighborhood,ATM,American Restaurant,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Bakery,Bar,Beach,Bed & Breakfast,...,Tram Station,Tunnel,Vegetarian / Vegan Restaurant,Video Store,Warehouse Store,Waterfront,Weight Loss Center,Wine Bar,Wine Shop,Yoga Studio
0,D09 HX84,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,D09 HX84,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,D08 X7FY,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,D08 X7FY,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,K45 DT62,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Next, let's group rows by eircode and by taking the mean of the frequency of occurrence of each category

In [257]:
venue_grouped = hot_enc.groupby('Neighborhood').mean().reset_index()
venue_grouped

Unnamed: 0,Neighborhood,ATM,American Restaurant,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Bakery,Bar,Beach,Bed & Breakfast,...,Tram Station,Tunnel,Vegetarian / Vegan Restaurant,Video Store,Warehouse Store,Waterfront,Weight Loss Center,Wine Bar,Wine Shop,Yoga Studio
0,A94 A260,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,A94 A6F5,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0
2,A94 AD78,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,A94 D9V0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,A94 EY63,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
350,K78 ED89,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
351,K78 F862,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
352,K78 NT67,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
353,K78 WC91,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [258]:
venue_grouped.shape

(355, 167)

In [259]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [261]:
### check common venues for each eircode

In [262]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = venue_grouped['Neighborhood']

for ind in np.arange(venue_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(venue_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,A94 A260,Home Service,Spa,Bakery,Yoga Studio,Event Service,Food Court,Food & Drink Shop,Fish & Chips Shop,Fast Food Restaurant,Farmers Market
1,A94 A6F5,Wine Shop,Pub,Gas Station,Thai Restaurant,Chinese Restaurant,Event Service,Food & Drink Shop,Fish & Chips Shop,Fast Food Restaurant,Farmers Market
2,A94 AD78,Playground,Shoe Store,Yoga Studio,Event Service,Food Court,Food & Drink Shop,Fish & Chips Shop,Fast Food Restaurant,Farmers Market,Farm
3,A94 D9V0,Deli / Bodega,Spa,Bakery,Yoga Studio,Falafel Restaurant,Food Court,Food & Drink Shop,Fish & Chips Shop,Fast Food Restaurant,Farmers Market
4,A94 EY63,Pub,Fast Food Restaurant,Shopping Mall,Yoga Studio,English Restaurant,Food & Drink Shop,Fish & Chips Shop,Farmers Market,Farm,Falafel Restaurant


In [269]:
# set number of clusters
kclusters = 10

grouped_clustering = venue_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([7, 7, 6, 7, 1, 2, 7, 8, 7, 7])

In [270]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)


In [273]:
neighborhoods_venues_sorted

Unnamed: 0,Cluster Labels,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,7,A94 A260,Home Service,Spa,Bakery,Yoga Studio,Event Service,Food Court,Food & Drink Shop,Fish & Chips Shop,Fast Food Restaurant,Farmers Market
1,7,A94 A6F5,Wine Shop,Pub,Gas Station,Thai Restaurant,Chinese Restaurant,Event Service,Food & Drink Shop,Fish & Chips Shop,Fast Food Restaurant,Farmers Market
2,6,A94 AD78,Playground,Shoe Store,Yoga Studio,Event Service,Food Court,Food & Drink Shop,Fish & Chips Shop,Fast Food Restaurant,Farmers Market,Farm
3,7,A94 D9V0,Deli / Bodega,Spa,Bakery,Yoga Studio,Falafel Restaurant,Food Court,Food & Drink Shop,Fish & Chips Shop,Fast Food Restaurant,Farmers Market
4,1,A94 EY63,Pub,Fast Food Restaurant,Shopping Mall,Yoga Studio,English Restaurant,Food & Drink Shop,Fish & Chips Shop,Farmers Market,Farm,Falafel Restaurant
...,...,...,...,...,...,...,...,...,...,...,...,...
350,7,K78 ED89,Park,Convenience Store,Pub,Supermarket,Café,Event Service,Food & Drink Shop,Fish & Chips Shop,Fast Food Restaurant,Farmers Market
351,7,K78 F862,Park,Bus Stop,Fast Food Restaurant,Event Service,Food Court,Food & Drink Shop,Fish & Chips Shop,Farmers Market,Farm,Falafel Restaurant
352,4,K78 NT67,Bus Stop,Yoga Studio,Falafel Restaurant,Frame Store,Food Court,Food & Drink Shop,Fish & Chips Shop,Fast Food Restaurant,Farmers Market,Farm
353,7,K78 WC91,Park,Convenience Store,Pub,Supermarket,Café,Event Service,Food & Drink Shop,Fish & Chips Shop,Fast Food Restaurant,Farmers Market


In [370]:
final_result = daft_filtered_df.merge(neighborhoods_venues_sorted,how = 'left',left_on='eircode', right_on='Neighborhood')
final_result = final_result[['eircode','address','lat','lon','sq_m','actual_amount', 'Cluster Labels']]

In [371]:
final_result

Unnamed: 0,eircode,address,lat,lon,sq_m,actual_amount,Cluster Labels
0,D09 HX84,"68 Lindsay Road, Glasnevin, Glasnevin, Dublin ...",53.365188,-6.267152,183.00,1195000,7
1,D08 X7FY,"5 New Ireland Road, Rialto, Dublin 8, South Du...",53.33721800000001,-6.299548,87.00,475000,1
2,K45 DT62,"14 Dun Emer Gardens, Lusk, North Co. Dublin",53.525009,-6.179869,101.00,299500,7
3,D01 E0A2,"450 North Circular Road, Dublin 1, Dublin City...",53.359407,-6.26075,280.00,735000,7
4,A94 F4E4,"6 Anglesea Avenue, Blackrock, South Co. Dublin",53.29804,-6.181226,144.00,1150000,2
...,...,...,...,...,...,...,...
357,D24 WD0F,"16 Kilcarrig Crescent, Fettercairn, Tallaght, ...",53.292526,-6.392476,90.00,209000,7
358,D12 DC44,"25 Clonard Road, Crumlin, Crumlin, Dublin 12, ...",53.326814,-6.305686,68.33,240000,1
359,D08 A4E9,"24 Brabazon Square, The Coombe, Dublin 8, Sout...",53.340729,-6.280172,38.00,250000,2
360,A94 YW44,"10 Priory Avenue, Stillorgan, South Co. Dublin",53.29617,-6.197744,140.00,995000,7


In [280]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(final_result['lat'], final_result['lon'], final_result['eircode'], final_result['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[int(cluster)-1],
        fill=True,
        fill_color=rainbow[int(cluster)-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [381]:
final_result.to_excel('final_result.xlsx')

In [383]:
final_result = pd.read_excel('final_result.xlsx')

In [388]:

grouped = final_result.groupby(['Cluster Labels']).sum()[['sq_m','actual_amount']]

In [390]:
#price per square km
grouped['avg_price'] = round(grouped['actual_amount'] / grouped ['sq_m'],2)

In [391]:
grouped


Unnamed: 0_level_0,sq_m,actual_amount,avg_price
Cluster Labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1329.7,5370000,4038.5
1,3832.53,16623950,4337.59
2,4509.27,24568000,5448.33
3,2141.88,13308000,6213.23
4,1736.1,7525000,4334.43
5,1581.4,7870000,4976.6
6,1117.6,4440000,3972.8
7,25371.97,113257600,4463.89
8,1219.8,6549950,5369.69
9,1390.3,5342950,3843.02
