In [12]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.cluster import AgglomerativeClustering, HDBSCAN
from sklearn.metrics import silhouette_score, davies_bouldin_score
import umap

%matplotlib qt5
sns.set_style('whitegrid')
sns.set_context('paper')
plt.rcParams["axes.edgecolor"] = "black"
plt.rcParams["axes.linewidth"] = 0.5

In [2]:
from typing import Optional
import numbers

def auto_opt_pd_dtypes(df_: pd.DataFrame, inplace=False) -> Optional[pd.DataFrame]:
    """ Automatically downcast Number dtypes for minimal possible,
        will not touch other (datetime, str, object, etc)
        :param df_: dataframe
        :param inplace: if False, will return a copy of input dataset
        :return: `None` if `inplace=True` or dataframe if `inplace=False`
    """
    df_temp = df_ if inplace else df_.copy()
    print(df_temp.info())

    for col in df_temp.columns:
        # integers
        if issubclass(df_temp[col].dtypes.type, numbers.Integral):
            # unsigned integers
            if df_temp[col].min() >= 0:
                df_temp[col] = pd.to_numeric(df_temp[col], downcast='unsigned')
            # signed integers
            else:
                df_temp[col] = pd.to_numeric(df_temp[col], downcast='integer')
        # other real numbers
        elif issubclass(df_temp[col].dtypes.type, numbers.Real):
            df_temp[col] = pd.to_numeric(df_temp[col], downcast='float')

        elif issubclass(df_temp[col].dtypes.type, np.object_):
            df_temp[col] = pd.Categorical(df_temp[col])

    print(df_temp.info())
    if not inplace:
        return df_temp

In [3]:
df = pd.read_csv('kc_house_data.csv', header='infer', delimiter=',', parse_dates=['date'])
df = auto_opt_pd_dtypes(df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   id             21613 non-null  int64         
 1   date           21613 non-null  datetime64[ns]
 2   price          21613 non-null  float64       
 3   bedrooms       21613 non-null  int64         
 4   bathrooms      21613 non-null  float64       
 5   sqft_living    21613 non-null  int64         
 6   sqft_lot       21613 non-null  int64         
 7   floors         21613 non-null  float64       
 8   waterfront     21613 non-null  int64         
 9   view           21613 non-null  int64         
 10  condition      21613 non-null  int64         
 11  grade          21613 non-null  int64         
 12  sqft_above     21613 non-null  int64         
 13  sqft_basement  21613 non-null  int64         
 14  yr_built       21613 non-null  int64         
 15  yr_renovated   2161

In [4]:
df['sqft_basement'] = df['sqft_basement'].replace(0, np.nan)
df['yr_renovated'] = df['yr_renovated'].replace(0, np.nan)
df['yr_renovated'] = df['yr_renovated'].replace(1, np.nan)

In [5]:
cols = [col for col in df.columns if col not in ('id', 'date')]
cols_log = ['price', 'sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement', 'sqft_living15', 'sqft_lot15']

# view is mostly 0
cols_pairplot = [col for col in cols if col not in ('sqft_basement', 'zipcode', 'lat', 'long', 'view', 'grade', 'sqft_above', 'sqft_living15', 'sqft_lot15', 'floors', 'waterfront', 'condition')]

In [6]:
df = df.loc[df['bedrooms'] < 30]

In [7]:
df_log = df.copy()
df_log[cols_log] = np.log10(df_log[cols_log])
df_log['yr_renovated_new'] = 2025 - df_log['yr_renovated']
df_log['yr_built_new'] = 2025 - df_log['yr_built']

In [8]:
df_modelling = df_log.copy()

df_modelling = df_modelling[[col for col in df_modelling.columns if col not in (
'zipcode', 'yr_built', 'yr_renovated', 'bathrooms', 'sqft_living', 'grade', 'sqft_living15', 'sqft_lot15')]]

df_modelling = df_modelling.sort_values('date').groupby('id').agg('last').reset_index()
df_modelling.loc[df_modelling['yr_renovated_new'].isna(), 'yr_renovated_new'] = df_modelling['yr_built_new']
df_modelling['sqft_basement'] = df_modelling['sqft_basement'].fillna(0)

In [9]:
from sklearn.preprocessing import MinMaxScaler

cols_norm = [col for col in df_modelling.columns if col not in ('id', 'date')]
scaler = MinMaxScaler()
df_normalized = df_modelling.copy()
df_normalized[cols_norm] = scaler.fit_transform(df_modelling[cols_norm])

In [10]:
# 
# num_data = df_modelling[cols_norm]
# 
# minmaxscaler = MinMaxScaler()
# standardscaler = StandardScaler()
# 
# x = pd.DataFrame(data=standardscaler.fit_transform(num_data[cols_norm]), columns=cols_norm)
# 
# pca_input = x.dropna()
# model = PCA(n_components=5)
# pca_data = model.fit_transform(pca_input)
# 
# num_data[['PCA 1', 'PCA 2', 'PCA 3', 'PCA 4', 'PCA 5']] = pca_data
# 
# sns.scatterplot(data=num_data.sort_values(by='price', ascending=True), x='PCA 1', y='PCA 2', hue='price', palette='RdYlGn')
# plt.xlabel('PCA component 1')
# plt.ylabel('PCA component 2')
# plt.title('PCA')
# plt.tight_layout()
# plt.show()
# 
# print(f'Explained variance by first 5 compononents: {sum(model.explained_variance_ratio_):.3f}')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  num_data[['PCA 1', 'PCA 2', 'PCA 3', 'PCA 4', 'PCA 5']] = pca_data


Explained variance by first 5 compononents: 0.726


In [37]:
from sklearn.cluster import HDBSCAN

reducer = umap.UMAP(n_components=6)
data = df_modelling[[col for col in df_modelling.columns if col not in ('id', 'date')]].values
scaled_data = StandardScaler().fit_transform(data)
embedding = reducer.fit_transform(scaled_data)
X = embedding
mcs = round(len(df_modelling) * 0.025)
min_samples = round(mcs * 0.25)
clusterer = HDBSCAN(
    min_cluster_size=mcs,
    min_samples=min_samples,
    metric="euclidean"
)
labels = clusterer.fit_predict(X)

## 2D UMAP Clusters

In [38]:
data = df_modelling[cols_norm].values
scaled_data = StandardScaler().fit_transform(data)
reducer = umap.UMAP()
embedding = reducer.fit_transform(scaled_data)
df_results = pd.DataFrame(data=embedding)
df_results = pd.concat([df_results, df_modelling], axis=1)
df_results['label'] = labels

In [40]:
col = 'label'
results_sorted = df_results.sort_values(col)
results_sorted = results_sorted.loc[results_sorted['label'] != -1]
ax = sns.scatterplot(x=results_sorted.iloc[:, 0], y=results_sorted.iloc[:, 1], s=10, linewidth=0, hue=results_sorted[col], palette='tab10', alpha=0.2)

legend = ax.get_legend()
for handle in legend.legend_handles:
    handle.set_alpha(1)
    
# plt.title('Hierarchical Clustering on projected Data (n=6, linkage=ward)')
plt.xlabel('UMAP 1')
plt.ylabel('UMAP 2')
plt.tight_layout()
# plt.savefig('hierarchical_n6_ward_projected')

In [32]:
x = results_sorted['lat']
y = results_sorted['long']

ax = sns.scatterplot(x=x, y=y, hue=results_sorted['label'].astype(str), s=5, alpha=0.5, palette='tab10_r', linewidth=0)

legend = ax.get_legend()
for handle in legend.legend_handles:
    handle.set_alpha(1)
    
plt.title('Hierarchical Clustering on projected Data (n=6, linkage=ward)')
plt.xlabel('Lat.')
plt.ylabel('Long.')
plt.tight_layout()
# plt.savefig('hierarchical_n6_ward_coords')

In [19]:
df_settings = pd.read_csv('HDBSCAN_parameter_results.csv')

In [45]:
# cols_log.remove('sqft_living')
# cols_log.remove('sqft_living15')
# cols_log.remove('sqft_lot15')
# df_results[cols_log] = 10**df_results[cols_log]
df_results['renovated'] = df_results['yr_built_new'] != df_results['yr_renovated_new']
cluster_data = df_results.groupby('label').agg(['mean'])
cluster_data

Unnamed: 0_level_0,0,1,id,date,price,bedrooms,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,lat,long,yr_renovated_new,yr_built_new,renovated
Unnamed: 0_level_1,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2
-1,-2.056331,3.706224,5522237000.0,2014-10-31 22:11:40.884955648,1356166.0,3.0,19733.924779,1.480088,0.721239,2.716814,3.39823,2249.22998,514.517699,47.58276,-122.212082,43.725664,51.672566,0.185841
0,11.473857,-6.403739,4429368000.0,2014-10-27 23:30:43.253234688,506462.2,2.837338,1604.131238,2.990758,0.0,0.007394,3.0,1547.105347,28.295749,47.665104,-122.339432,18.009242,18.009242,0.0
1,8.305052,11.449243,4391379000.0,2014-10-25 23:45:25.362517248,439441.2,3.151573,16175.130506,1.315321,0.0,0.010944,4.187962,1630.372925,1.218057,47.531063,-122.212463,65.915185,66.749384,0.019973
2,16.338198,5.948574,4610391000.0,2014-11-01 02:24:56.605122048,484778.1,3.297915,15217.201072,1.663311,0.0,0.005837,2.9838,2031.234741,6.483502,47.544739,-122.171043,41.569744,43.652412,0.038952
3,0.749989,7.643003,4533159000.0,2014-10-21 08:13:39.430950912,561219.8,3.593685,12860.260583,1.146773,0.0,0.031228,4.269951,1401.443726,779.975017,47.57761,-122.256195,71.692575,72.32755,0.012838
4,2.525029,-0.548284,4613020000.0,2014-11-03 19:31:48.370044160,554501.6,3.549437,12967.360744,1.35022,0.0,0.036711,3.006853,1558.687256,674.802496,47.5881,-122.245598,48.889623,53.641214,0.071953
5,7.865779,2.358121,4774841000.0,2014-10-28 08:28:48.000000000,891321.6,3.613333,25574.193333,1.530909,0.0,2.480606,3.491515,2197.38916,648.890303,47.565762,-122.239136,53.442424,58.107879,0.08303


In [47]:
from scipy.cluster.hierarchy import dendrogram

def plot_dendrogram(model, **kwargs):
    # Create linkage matrix and then plot the dendrogram

    # create the counts of samples under each node
    counts = np.zeros(model.children_.shape[0])
    n_samples = len(model.labels_)
    for i, merge in enumerate(model.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1  # leaf node
            else:
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count

    linkage_matrix = np.column_stack(
        [model.children_, model.distances_, counts]
    ).astype(float)

    # Plot the corresponding dendrogram
    dendrogram(linkage_matrix, **kwargs)

In [121]:
reducer = umap.UMAP(n_components=10)
data = df_modelling[[col for col in df_modelling.columns if col not in ('id', 'date')]].values
scaled_data = StandardScaler().fit_transform(data)
embedding = reducer.fit_transform(scaled_data)
X = embedding

# setting distance_threshold=0 ensures we compute the full tree.
model = AgglomerativeClustering(distance_threshold=None, n_clusters=13, linkage='average')

model = model.fit(X)
labels = model.labels_

In [122]:
silhouette_score(X, labels)

0.6222167611122131

In [123]:
davies_bouldin_score(X, labels)

0.4743468575263468

In [56]:
plt.title("Hierarchical Clustering Dendrogram")
# plot the top three levels of the dendrogram
plot_dendrogram(model, truncate_mode="level", p=5)
plt.xlabel("Number of points in node (or index of point if no parenthesis).")
plt.show()

In [124]:
data = df_modelling[cols_norm].values
scaled_data = StandardScaler().fit_transform(data)
reducer = umap.UMAP()
embedding = reducer.fit_transform(scaled_data)
df_results = pd.DataFrame(data=embedding)
df_results = pd.concat([df_results, df_modelling], axis=1)
df_results['label'] = labels

In [127]:
col = 'label'
results_sorted = df_results.sort_values(col)
# results_sorted = results_sorted.loc[results_sorted['label'] != -1]
ax = sns.scatterplot(x=results_sorted.iloc[:, 0], y=results_sorted.iloc[:, 1], s=5, linewidth=0, hue=results_sorted[col],
                     palette='tab20', alpha=0.5)

legend = ax.get_legend()
for handle in legend.legend_handles:
    handle.set_alpha(1)

# plt.title('Hierarchical Clustering on projected Data (n=6, linkage=ward)')
plt.xlabel('UMAP 1')
plt.ylabel('UMAP 2')
plt.tight_layout()
# plt.savefig('hierarchical_n6_ward_projected')

In [99]:
x = results_sorted['lat']
y = results_sorted['long']

ax = sns.scatterplot(x=x, y=y, hue=results_sorted['label'].astype(str), s=5, alpha=0.5, palette='tab10_r', linewidth=0)

legend = ax.get_legend()
for handle in legend.legend_handles:
    handle.set_alpha(1)

plt.title('Hierarchical Clustering on projected Data (n=6, linkage=ward)')
plt.xlabel('Lat.')
plt.ylabel('Long.')
plt.tight_layout()
# plt.savefig('hierarchical_n6_ward_coords')


In [131]:
# df_results[cols_log] = 10**df_results[cols_log]
df_results['renovated'] = df_results['yr_built_new'] != df_results['yr_renovated_new']
df_results['years_until_renovation'] = df_results['yr_built_new'] - df_results['yr_renovated_new']
df_results['years_until_renovation'].replace(0, None, inplace=True)
cluster_data = df_results.groupby('label').agg(['mean'])
cluster_data

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_results['years_until_renovation'].replace(0, None, inplace=True)


Unnamed: 0_level_0,0,1,id,date,price,bedrooms,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,lat,long,yr_renovated_new,yr_built_new,renovated,years_until_renovation
Unnamed: 0_level_1,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2
0,16.842068,0.284823,4676087000.0,2014-10-27 10:30:50.250536704,634813.5,4.16786,13043.787044,2.000716,0.0,0.004653,2.999284,2824.104492,1.0,47.53677,-122.126015,24.815676,25.469936,0.015032,43.52381
1,0.890852,7.861189,4526826000.0,2014-10-21 07:39:34.468085248,563812.1,3.594372,12755.583734,1.157344,0.0,0.032258,4.268016,1408.672241,778.636925,47.578732,-122.256241,71.361359,71.999314,0.01304,48.921053
2,8.301653,11.485629,4393369000.0,2014-10-26 00:50:26.600985344,438437.9,3.151067,16184.016694,1.315271,0.0,0.010947,4.188287,1629.560181,1.316366,47.530827,-122.21244,65.933224,66.762999,0.019978,41.534247
3,18.025635,5.212155,4908143000.0,2014-10-30 04:08:32.995484160,455802.5,2.886603,8443.377321,1.990718,0.0,0.005018,2.998996,2024.756104,1.039137,47.534725,-122.136116,27.009533,27.635725,0.015554,40.258065
4,15.019461,9.889189,4381424000.0,2014-11-05 19:26:42.919708160,383734.9,2.899854,21674.041752,1.181168,0.0,0.007299,2.961168,1443.831299,1.151533,47.55365,-122.219154,64.965547,69.177226,0.074161,56.791339
5,10.217995,-6.13478,4426111000.0,2014-11-11 17:57:59.329608960,481916.6,2.307263,1263.709497,2.972067,0.0,0.011173,3.0,1347.195435,83.497207,47.655861,-122.342659,19.256983,19.256983,0.0,
6,12.790297,-7.042036,4430978000.0,2014-10-20 16:14:35.138121472,518599.5,3.099448,1772.461326,3.0,0.0,0.005525,3.0,1645.955811,1.0,47.669674,-122.337845,17.392265,17.392265,0.0,
7,-6.336211,6.841465,4490512000.0,2014-10-30 22:49:19.509202432,1661876.0,3.300613,25371.828221,1.641104,1.0,3.766871,3.533742,2473.042969,701.042945,47.537365,-122.281601,51.791411,62.809816,0.257669,42.761905
8,0.784772,0.387335,4575616000.0,2014-11-08 07:03:15.227765760,470402.1,3.477946,13130.343818,1.060557,0.0,0.034707,2.984816,1326.625,677.953362,47.583527,-122.250847,61.531815,62.824657,0.029646,43.609756
9,7.662488,2.338885,4774841000.0,2014-10-28 08:28:48.000000000,891321.6,3.613333,25574.193333,1.530909,0.0,2.480606,3.491515,2197.38916,648.890303,47.565762,-122.239136,53.442424,58.107879,0.08303,56.189781


In [129]:
cluster_data.to_csv('results_hierarchical_ncomp10_nclusters13_linkage_avg.csv', index=False)

In [21]:
from tqdm import tqdm

linkage_methods = ['average', 'complete', 'ward']
df_results = pd.DataFrame(columns=['linkage', 'n_components', 'n_clusters', 'silhouette_score', 'dbi_score'])

for n in tqdm((4, 6, 8, 10)):
    reducer = umap.UMAP(n_components=n)
    data = df_modelling[[col for col in df_modelling.columns if col not in ('id', 'date')]].values
    scaled_data = StandardScaler().fit_transform(data)
    embedding = reducer.fit_transform(scaled_data)
    X = embedding
    for linkage in linkage_methods:
        print(linkage)
        for n_clusters in range(8, 21, 2):
            model = AgglomerativeClustering(distance_threshold=None, n_clusters=n_clusters, linkage='average')

            model = model.fit(X)
            labels = model.labels_

            temp = pd.DataFrame(data={'linkage': linkage,
                                      'n_components': n,
                                      'n_clusters': n_clusters,
                                      'silhouette_score': silhouette_score(X, labels),
                                      'dbi_score': davies_bouldin_score(X, labels)
                                      },
                                index=[0])
            df_results = pd.concat([df_results, temp]).reset_index(drop=True)

  0%|          | 0/4 [00:00<?, ?it/s]

average


  df_results = pd.concat([df_results, temp]).reset_index(drop=True)


complete
ward


 25%|██▌       | 1/4 [08:26<25:20, 506.83s/it]

average
complete
ward


 50%|█████     | 2/4 [17:26<17:32, 526.21s/it]

average
complete
ward


 75%|███████▌  | 3/4 [24:19<07:54, 474.56s/it]

average
complete
ward


100%|██████████| 4/4 [30:44<00:00, 461.06s/it]


In [22]:
df_results.to_csv('param_search_hierarchical_new.csv', index=False)

In [28]:
df_results = pd.read_csv('param_search_hierarchical_new.csv')
for linkage in linkage_methods:
    temp = df_results.loc[df_results['linkage'] == linkage]
    for score in ('silhouette_score', 'dbi_score'):
        sns.lineplot(temp, x='n_clusters', y=score, hue='n_components', palette='coolwarm')
        plt.title(f'linkage: {linkage}')
        plt.tight_layout()
        plt.show(block=True)