# Property comparison using clustering

In [1]:
from sklearn_extra.cluster import KMedoids

In [3]:
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Clustering
import gower
from numpy import savetxt
from sklearn.cluster import DBSCAN

# Specify the figure layout: seaborn "talk" context with fonts scaled by 1.5
sns.set_context("talk", font_scale=1.5)

ModuleNotFoundError: No module named 'gower'

In [13]:
# Read the two CSV exports: the cleaned listings and the room-feature table
df_listings, df_room_features = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/all_listings_cleaned_20210723.csv',
        '../data/room_features20210716.csv',
    )
)

In [5]:
# Report (rows, columns) of both datasets, one tuple per line
print(df_listings.shape, df_room_features.shape, sep='\n')

(27679, 45)
(30227, 153)


In [16]:
# Print the frequency table of every listings column, in column order
for _, column_values in df_listings.items():
    print(column_values.value_counts())

0        1
17117    1
25337    1
27384    1
4855     1
        ..
21920    1
11663    1
9614     1
13708    1
2047     1
Name: Unnamed: 0, Length: 27679, dtype: int64
e9c4b745-e7f1-5855-b4d6-634afc469720    1
7d1dceed-915b-52ee-a92b-13130d4cf030    1
cd339b61-9606-50db-909b-e33d54205681    1
ca729fa9-2b29-5955-afbf-02c748083456    1
24df7448-3202-5e4c-b02c-7df64de45242    1
                                       ..
d0e64fe5-38d9-59a7-a8c0-702baa3bc838    1
5772908c-8093-5140-bfb2-3ca6fc3495ff    1
6c4b7d1f-185d-547c-a75f-65e98500ff43    1
3ec5b4e9-9553-5632-983e-5d29bf84cbb1    1
f6ee338d-7fb4-5244-b7cf-385e34b1e5f4    1
Name: listing_id, Length: 27679, dtype: int64
2de3ec57-2814-5d5b-90b3-1970c8078357    59
070c7650-7e59-53ea-8d86-0b43d73737dc    50
95575bf2-e8e9-5d40-ac13-6a4e3e1144ce    41
9df16b81-6697-56a0-9aaa-862252e94e88    15
42fa2884-62d4-5546-9e56-78fe5c8ac38d    14
                                        ..
dda734ae-412d-5a82-bcbd-3ed56a880b8f     1
4c1bc862-041e-59f8-bb7c-

## 1. Try with merging listings and room features

In [None]:
# Merge both datasets on the shared listing identifier; the inner join keeps
# only listings that appear in both tables
df_cluster = pd.merge(df_listings, df_room_features, on='listing_id', how='inner')
print(df_cluster.shape)

# Drop the index columns left over from the CSV exports. If both inputs carry
# an 'Unnamed: 0' column, the merge renames them to 'Unnamed: 0_x' and
# 'Unnamed: 0_y', and deleting the bare 'Unnamed: 0' label raises KeyError.
# Matching on the prefix removes whichever variants are actually present.
leftover_index_cols = [
    col for col in df_cluster.columns if str(col).startswith('Unnamed: 0')
]
df_cluster = df_cluster.drop(columns=leftover_index_cols)
df_cluster.head()

In [None]:
# Show every column name of the merged frame as a plain Python list
df_cluster.columns.tolist()

In [None]:
# Three features exist in both source tables, so the merge produced an '_x'
# copy (from the listings) and a '_y' copy (from the room features).
# Keep the '_x' value unless it equals 0; in that case take the '_y' value.
for feature in ('dishwasher', 'dryer', 'terrace'):
    left, right = df_cluster[f'{feature}_x'], df_cluster[f'{feature}_y']
    df_cluster[f'{feature}_x'] = np.where(left == 0, right, left)

# The '_y' copies are redundant now
df_cluster.drop(columns=['dishwasher_y', 'dryer_y', 'terrace_y'], inplace=True)

# Strip the merge suffix and tidy up column names that contain trailing
# underscores, slashes, quotes or capital letters
column_renames = {
    'dishwasher_x': 'dishwasher',
    'dryer_x': 'dryer',
    'terrace_x': 'terrace',
    'sun_umbrella_': 'sun_umbrella',
    'colouring_book_/_pencils': 'colouring_book_pencils',
    "child's_bed": 'childs_bed',
    'awning_': 'awning',
    'air_conditioning_': 'air_conditioning',
    'CDs/_DVDs': 'CDs_DVDs',
    'living_/_dining_room': 'living_dining_room',
    'living_/_bedroom': 'living_bedroom',
    'children`s_room': 'childrens_room',
    'Library': 'library',
}
df_cluster.rename(columns=column_renames, inplace=True)

In [None]:
# Does any column from 'option_allergic' onwards have a column sum of at
# most 10 (i.e. is the feature set for 10 listings or fewer)?
feature_totals = df_cluster.loc[:, 'option_allergic':].sum(axis=0)
(feature_totals <= 10).any()

In [None]:
# Drop features that are unnecessary for clustering (comparing properties).
# errors='ignore' keeps the cell re-runnable: without it a second execution
# raises KeyError because the columns are already gone.
df_cluster.drop(
    columns=['state', 'contract_end', 'subscription', 'binding_inquiry'],
    errors='ignore',
    inplace=True,
)

In [None]:
# (rows, columns) of the merged frame at this point of the notebook
df_cluster.shape

### Distance measures for mixed data: Gower’s dissimilarity

Clustering algorithms rely on distance measures to decide whether objects are considered similar or not, so a distance must be defined between any two objects before a clustering algorithm can be used. Defining distances becomes a problem when a data set consists of mixed data, for instance numeric, binary, nominal and ordinal data (section 2.2.3). For example, how do you measure the similarity between a red car that weighs 1400 kg and a blue car that weighs 1200 kg? One solution is Gower’s dissimilarity measure (GD), which can calculate the distance between two entities whose attributes are a mix of categorical and numerical values.

In [None]:
# Calculate the pairwise Gower distance matrix over all columns of df_cluster
# NOTE(review): the merge key 'listing_id' is not dropped in the cells shown
# above, so it is passed in as a feature here — confirm that identifier
# columns are meant to contribute to the distance.
distance_matrix = gower.gower_matrix(df_cluster)

In [None]:
# Shape of the distance matrix (presumably one row and one column per listing — verify)
distance_matrix.shape

In [None]:
# Save distance matrix in CSV file
#savetxt('../data/distance_matrix.csv', distance_matrix, delimiter=',')

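# Load the distance matrix back from the CSV file written above
#from numpy import loadtxt
# (same path and variable as the savetxt call, so df_cluster is not overwritten)
#distance_matrix = loadtxt('../data/distance_matrix.csv', delimiter=',')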

### DBSCAN

To understand DBSCAN in more detail, let’s dive into it. The main concept of the DBSCAN algorithm is to locate regions of high density that are separated from one another by regions of low density.

Density at a point P: the number of points within a circle of radius Eps (ϵ) around P.
Dense region: for each point in the cluster, the circle with radius ϵ contains at least the minimum number of points (MinPts).

In [None]:
# DBSCAN parameters: neighbourhood radius (eps) and the number of samples
# required in that neighbourhood for a point to count as a core point
eps, min_samples = 0.01, 100

# The Gower distances are supplied directly, hence metric="precomputed"
dbscan_cluster = DBSCAN(eps=eps, min_samples=min_samples, metric="precomputed")

# Fitting the clustering algorithm
dbscan_cluster.fit(distance_matrix)

# Store the labels in a column named after the parameters,
# cluster_<eps without the dot>_<min_samples> (here: cluster_001_100), so the
# column name cannot drift away from the settings that produced it
label_column = f"cluster_{str(eps).replace('.', '')}_{min_samples}"
df_cluster[label_column] = dbscan_cluster.labels_

# Export new CSV
# NOTE(review): the index is written as well and comes back as an
# 'Unnamed: 0' column when the file is read again — confirm this is wanted.
df_cluster.to_csv('../data/clustering_20210723.csv')

# Show head of new dataset (only the last expression of a cell is displayed)
df_cluster.head()

In [None]:
# Compare DBSCAN runs on the merged data for several min_samples values.
# The label columns follow the notebook's naming scheme
# cluster_<eps without the dot>_<min_samples>, so cluster_01_5 is read as
# eps=0.1, min_samples=5.
# NOTE(review): eps=0.1 is inferred from that naming scheme — confirm.
# A run whose column is missing (e.g. on a fresh kernel) is fitted first, so
# the cell no longer depends on columns created by since-deleted cells.
for min_samples in (5, 4, 6, 3):
    column = f"cluster_01_{min_samples}"
    if column not in df_cluster.columns:
        run = DBSCAN(eps=0.1, min_samples=min_samples, metric="precomputed")
        df_cluster[column] = run.fit(distance_matrix).labels_
    # Number of distinct labels, then the labels themselves
    print(df_cluster[column].nunique())
    print(df_cluster[column].unique())

# Cluster sizes for the min_samples=5 run
df_cluster.cluster_01_5.value_counts()

In [None]:
# Preview the merged frame, which now also holds the DBSCAN label column(s)
df_cluster.head()

## 2. Try: only listings

In [None]:
# Preview the listings-only dataset
df_listings.head()

In [None]:
# Drop features that are unnecessary for clustering (comparing properties).
# errors='ignore' keeps the cell re-runnable: without it a second execution
# raises KeyError because the columns are already gone.
df_listings.drop(
    columns=['Unnamed: 0', 'state', 'contract_end', 'subscription', 'binding_inquiry'],
    errors='ignore',
    inplace=True,
)

In [None]:
# Calculate the pairwise Gower distance matrix over all columns of df_listings
# NOTE(review): 'listing_id' is not among the columns dropped in the previous
# cell, so it is passed in as a feature here — confirm that identifier
# columns are meant to contribute to the distance.
distance_matrix_listings = gower.gower_matrix(df_listings)

In [None]:
# Save distance matrix in CSV file
#savetxt('../data/distance_matrix_listings.csv', distance_matrix_listings, delimiter=',')

In [None]:
# DBSCAN parameters: neighbourhood radius (eps) and the number of samples
# required in that neighbourhood for a point to count as a core point
eps, min_samples = 0.1, 7

# The Gower distances are supplied directly, hence metric="precomputed"
dbscan_cluster = DBSCAN(eps=eps, min_samples=min_samples, metric="precomputed")

# Fitting the clustering algorithm
dbscan_cluster.fit(distance_matrix_listings)

# Store the labels in a column named after the parameters,
# cluster_<eps without the dot>_<min_samples> (here: cluster_01_7), so the
# column name cannot drift away from the settings that produced it
label_column = f"cluster_{str(eps).replace('.', '')}_{min_samples}"
df_listings[label_column] = dbscan_cluster.labels_

# Export new CSV (disabled)
# NOTE(review): the disabled line writes df_cluster, not df_listings, to the
# file used by the merged-data run — check before enabling it.
#df_cluster.to_csv('../data/clustering_20210723.csv')

# Show head of new dataset
df_listings.head()

In [None]:
# Compare DBSCAN runs on the listings-only data for several min_samples
# values. The label columns follow the notebook's naming scheme
# cluster_<eps without the dot>_<min_samples>, so cluster_01_10 is read as
# eps=0.1, min_samples=10.
# NOTE(review): eps=0.1 is inferred from that naming scheme — confirm.
# A run whose column is missing (e.g. on a fresh kernel) is fitted first, so
# the cell no longer depends on columns created by since-deleted cells.
for min_samples in (5, 10, 7):
    column = f"cluster_01_{min_samples}"
    if column not in df_listings.columns:
        run = DBSCAN(eps=0.1, min_samples=min_samples, metric="precomputed")
        df_listings[column] = run.fit(distance_matrix_listings).labels_
    # Number of distinct labels, the labels themselves, then the cluster sizes
    print(df_listings[column].nunique())
    print(df_listings[column].unique())
    print(df_listings[column].value_counts())

In [None]:
# Remove the 'cluster_02_10' label column if it is present. No visible cell
# creates that column, so a plain `del` raises KeyError on a fresh kernel
# and on every re-run; errors='ignore' makes the clean-up safe to repeat.
df_listings.drop(columns=['cluster_02_10'], errors='ignore', inplace=True)

In [None]:
# List the columns currently held by the listings frame
df_listings.columns

In [None]:
# Distinct regions among the listings labelled 0 in the cluster_01_7 run
in_cluster_zero = df_listings['cluster_01_7'] == 0
df_listings.loc[in_cluster_zero, 'region'].unique()

In [None]:
# Number of listings per region across the whole listings frame
df_listings['region'].value_counts()

## 3. Try only on room_features

The Jaccard similarity coefficient can be used when the variables are qualitative in nature. It is particularly suited to variables represented in binary form, such as (0, 1) or (Yes, No). Note that the cells below compute Gower's distance on the room features rather than the Jaccard coefficient.

In [6]:
# Preview the room-feature table
df_room_features.head()

Unnamed: 0,listing_id,1_room_apartment,Library,children`s_room,corridor,dining_room,en_suite_bathroom,gallery,heating_room,kitchen,...,underfloor_heating,vacuum_cleaner,walk-in_shower,wall_bed,wardrobe,wash_basin,water_bed,windbreak,window,cooking
0,00005de7-6ed8-541b-be83-2573a46cd307,0,0,0,0,0,0,0,0,1,...,0,1,0,0,1,1,0,0,0,1
1,0002653c-7e59-59a2-89f2-f6574ba14a0a,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
2,0003f153-450a-5837-8a46-bd5db5b536b1,1,0,0,0,0,0,0,0,1,...,0,1,0,0,0,1,0,0,0,1
3,00058d0d-579c-5b84-814b-8920fb5f1a27,0,0,0,0,0,0,0,0,1,...,0,1,0,0,1,1,0,0,0,1
4,000cd267-f826-50f4-aa55-cec9aaa079ae,0,0,0,0,0,0,0,0,1,...,1,1,1,0,1,1,0,0,1,1


How to find the optimal number of clusters

One of the challenging tasks in agglomerative clustering is finding the optimal number of clusters. The silhouette score is one of the popular approaches for deciding on that number. It is a way to measure how close each point in a cluster is to the points in its neighboring clusters.

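Let $a_i$ be the mean distance between observation $i$ and the other points in the cluster to which observation $i$ is assigned.

Let $b_i$ be the minimum mean distance between observation $i$ and the points in any other cluster. The silhouette value of observation $i$ is then $s_i = \frac{b_i - a_i}{\max(a_i, b_i)}$, and the silhouette score is the mean of $s_i$ over all observations.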

In [5]:
from sklearn.cluster import AgglomerativeClustering
import scipy.cluster.hierarchy as sch

In [7]:
# Work on a copy: the next cell recodes the values in place, and a plain
# alias (X = df_room_features) would overwrite the original frame as well
X = df_room_features.copy()

In [8]:
# Recode the 0/1 flags as the strings 'no'/'yes', modifying X in place.
# NOTE(review): the replacement applies to every column of X, so 0/1 values
# in non-feature columns are recoded too — confirm this is intended.
X.replace({0: 'no', 1: 'yes'}, inplace=True)

In [10]:
# Calculate the pairwise Gower distance matrix over all columns of X
# (gower is imported again here although the import cell at the top of the
# notebook already requests it)
# NOTE(review): X is passed in whole; if it still contains 'listing_id',
# that identifier takes part in the distance — confirm.
import gower
distance_matrix_room_features = gower.gower_matrix(X)

In [7]:
# scipy's linkage() interprets a 2-D array as raw observations (one row per
# sample), not as a distance matrix. Passing the square Gower matrix directly
# would therefore cluster on Euclidean distances between its rows. Convert it
# to the condensed 1-D form, which linkage() treats as pairwise distances.
from scipy.spatial.distance import squareform

# checks=False skips the symmetry / zero-diagonal validation, which small
# floating-point deviations in the computed matrix could otherwise trip
condensed_distances = squareform(distance_matrix_room_features, checks=False)
linkage_matrix = sch.linkage(condensed_distances, method='average')

# Drawing one leaf per listing is presumably what kept the figure rendering
# until it was interrupted; show only the last 30 merged clusters instead
dendrogram = sch.dendrogram(linkage_matrix, truncate_mode='lastp', p=30)

Error in callback <function flush_figures at 0x127ae00d0> (for post_execute):


KeyboardInterrupt: 

In [None]:
# scikit-learn renamed the `affinity` argument of AgglomerativeClustering to
# `metric` (deprecated in 1.2, removed in 1.4). Pick whichever keyword the
# installed version accepts so the cell runs on old and new releases alike.
import inspect

_distance_arg = (
    'metric'
    if 'metric' in inspect.signature(AgglomerativeClustering.__init__).parameters
    else 'affinity'
)

# Average-linkage clustering into 5 groups on the precomputed Gower distances
model = AgglomerativeClustering(
    n_clusters=5,
    linkage='average',
    **{_distance_arg: 'precomputed'},
)
model.fit(distance_matrix_room_features)

# One cluster label per row of the distance matrix
labels = model.labels_