# Property comparison using cluster modeling

## 1. Load the datasets

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Clustering
import gower
from numpy import savetxt
from sklearn.cluster import DBSCAN

# Specifying figure layout: seaborn "talk" context with fonts scaled by 1.5
sns.set_context("talk", font_scale=1.5)

In [None]:
# Read the cleaned listings table and the room-features table from CSV
listings_csv = '../data/all_listings_cleaned_20210723.csv'
room_features_csv = '../data/room_features20210716.csv'

df_listings = pd.read_csv(listings_csv)
df_room_features = pd.read_csv(room_features_csv)

In [None]:
# Report (rows, columns) for each input table, listings first
for frame in (df_listings, df_room_features):
    print(frame.shape)

## 2. Try with merging listings and room features

In [None]:
# Inner-join the listings with their room features on the shared listing id
df_cluster = df_listings.merge(df_room_features, on='listing_id', how='inner')

# The shape is reported before the leftover column is removed
print(df_cluster.shape)

# 'Unnamed: 0' is removed here; presumably a saved CSV index -- TODO confirm
df_cluster = df_cluster.drop(columns=['Unnamed: 0'])
df_cluster.head()

In [None]:
# Show every column name of the merged table as a plain list
df_cluster.columns.tolist()

In [None]:
# Columns present in both source tables arrive as '_x' / '_y' pairs after the merge.
# For each pair, take the '_y' value wherever the '_x' value is 0 and keep the
# '_x' value otherwise.
duplicated_pairs = (
    ('dishwasher_x', 'dishwasher_y'),
    ('dryer_x', 'dryer_y'),
    ('terrace_x', 'terrace_y'),
)
for kept_col, twin_col in duplicated_pairs:
    kept_values = df_cluster[kept_col]
    df_cluster[kept_col] = np.where(kept_values == 0, df_cluster[twin_col], kept_values)

# The '_y' twins are redundant once folded into their '_x' counterparts
redundant_cols = [twin_col for _, twin_col in duplicated_pairs]
df_cluster = df_cluster.drop(columns=redundant_cols)

# Strip the merge suffixes and tidy up awkward column names
# (trailing underscores, slashes, apostrophes/backticks, capitalisation)
column_renames = {
    'dishwasher_x': 'dishwasher',
    'dryer_x': 'dryer',
    'terrace_x': 'terrace',
    'sun_umbrella_': 'sun_umbrella',
    'colouring_book_/_pencils': 'colouring_book_pencils',
    "child's_bed": 'childs_bed',
    'awning_': 'awning',
    'air_conditioning_': 'air_conditioning',
    'CDs/_DVDs': 'CDs_DVDs',
    'living_/_dining_room': 'living_dining_room',
    'living_/_bedroom': 'living_bedroom',
    'children`s_room': 'childrens_room',
    'Library': 'library',
}
df_cluster = df_cluster.rename(columns=column_renames)

In [None]:
# Sum each column from 'option_allergic' onwards and report whether any of
# those column totals is 10 or lower
feature_totals = df_cluster.loc[:, 'option_allergic':].sum(axis=0)
(feature_totals <= 10).any()

In [None]:
# Remove columns that are not needed for comparing properties
df_cluster = df_cluster.drop(
    columns=['state', 'contract_end', 'subscription', 'binding_inquiry']
)

### Distance measures for mixed data: Gower’s dissimilarity

Clustering algorithms are based on distance measures to define if objects are considered similar or not. Distances need to be defined between two objects in order to use clustering algorithms. A problem with defining distances can occur when a data set consists of mixed data, for instance, numeric, binary, nominal and ordinal data. A solution is to use Gower’s dissimilarity measure (GD) that can calculate the distance between two entities whose attributes have a mix of categorical and numerical values.

In [None]:
# Calculate the Gower distance matrix on the descriptive features only.
# - 'listing_id' is the merge key (a row identifier), not a property attribute,
#   so it must not contribute to the dissimilarity between two listings.
# - Columns starting with 'cluster_' are label columns written by later cells;
#   excluding them keeps this cell re-runnable without feeding earlier cluster
#   assignments back into the distances.
# Row order and row count are unchanged, so the labels still align with df_cluster.
feature_columns = [
    column for column in df_cluster.columns
    if column != 'listing_id' and not str(column).startswith('cluster_')
]
distance_matrix = gower.gower_matrix(df_cluster[feature_columns])

In [None]:
# Dimensions of the pairwise distance matrix
np.shape(distance_matrix)

In [None]:
# Optional caching (disabled): save the distance matrix to a CSV file
#savetxt('../data/distance_matrix.csv', distance_matrix, delimiter=',')

# Optional caching (disabled): load the saved numpy array back from the CSV file
#from numpy import loadtxt
# NOTE(review): if re-enabled, assign the result to distance_matrix and read from
# '../data/distance_matrix.csv' -- as written, this line would overwrite
# df_cluster and read from a different path than the save above
#df_cluster = loadtxt('distance_matrix.csv', delimiter=',')

### DBSCAN

The main concept of DBSCAN algorithm is to locate regions of high density that are separated from one another by regions of low density. 

Density at a point P: Number of points within a circle of Radius Eps (ϵ, eps) from point P.
Dense Region: For each point in the cluster, the circle with radius ϵ contains at least a minimum number of points (MinPts, min_samples).

In [None]:
# Configuring the parameters of the clustering algorithm.
# metric="precomputed" tells DBSCAN that the input is already a pairwise
# distance matrix rather than a feature table.
dbscan_cluster = DBSCAN(eps=0.1,
                        min_samples=5,
                        metric="precomputed")

# Fitting the clustering algorithm
dbscan_cluster.fit(distance_matrix)

# Adding the results to a new column in the dataframe.
# The column name encodes the parameters: cluster_01_5 --> eps=0.1; min_samples=5
# (DBSCAN assigns the label -1 to noise points)
df_cluster["cluster_01_5"] = dbscan_cluster.labels_

# Export new CSV
df_cluster.to_csv('../data/clustering_20210723.csv')

# Show head of new dataset.
# This has to be the last expression of the cell: a bare expression in the
# middle of a cell is evaluated but never displayed.
df_cluster.head()

In [None]:
# Number of distinct cluster labels, followed by the labels themselves
cluster_labels = df_cluster["cluster_01_5"]
print(cluster_labels.nunique())
print(cluster_labels.unique())

In [None]:
# Number of rows assigned to each cluster label
df_cluster["cluster_01_5"].value_counts()

## 3. Try with only the listing dataset

In [None]:
# Remove columns that are not needed for comparing properties
df_listings = df_listings.drop(
    columns=['Unnamed: 0', 'state', 'contract_end', 'subscription', 'binding_inquiry']
)

In [None]:
# Calculate the Gower distance matrix on the descriptive listing features only.
# - 'listing_id' is a row identifier, not a property attribute, so it must not
#   contribute to the dissimilarity between two listings.
# - Columns starting with 'cluster_' are label columns written by later cells;
#   excluding them keeps this cell re-runnable without feeding earlier cluster
#   assignments back into the distances.
# Row order and row count are unchanged, so the labels still align with df_listings.
listing_feature_columns = [
    column for column in df_listings.columns
    if column != 'listing_id' and not str(column).startswith('cluster_')
]
distance_matrix_listings = gower.gower_matrix(df_listings[listing_feature_columns])

In [None]:
# Optional caching (disabled): save the listings distance matrix to a CSV file
#savetxt('../data/distance_matrix_listings.csv', distance_matrix_listings, delimiter=',')

In [None]:
# DBSCAN on a precomputed distance matrix with eps=0.1 and min_samples=5
dbscan_cluster = DBSCAN(metric="precomputed", eps=0.1, min_samples=5)

# fit_predict fits the model and returns the per-row cluster labels in one call
listing_labels = dbscan_cluster.fit_predict(distance_matrix_listings)

# The column name encodes the parameters: cluster_01_5 --> eps=0.1; min_samples=5
df_listings["cluster_01_5"] = listing_labels

# Display the first rows including the new label column
df_listings.head()

# Export stays disabled for this experiment.
# NOTE(review): the disabled line below references df_cluster, not df_listings,
# and the same file name as the merged-data export -- check before re-enabling.
#df_cluster.to_csv('../data/clustering_20210723.csv')

In [None]:
print(df_listings.cluster_01_5.nunique())
print(df_listings.cluster_01_5.unique())

In [None]:
df_listings.cluster_01_5.value_counts()

## Conclusion

Both attempts produce clusters, but unfortunately the clusters are neither clearly separated nor evenly sized. With this method it is hard to comprehend how the clusters are formed and what each cluster represents.