In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

# %matplotlib qt5
sns.set_style('whitegrid')
sns.set_context('paper')
plt.rcParams["axes.edgecolor"] = "black"
plt.rcParams["axes.linewidth"] = 0.5

In [None]:
from typing import Optional
import numbers

def auto_opt_pd_dtypes(df_: pd.DataFrame, inplace=False) -> Optional[pd.DataFrame]:
    """ Automatically downcast Number dtypes for minimal possible,
        will not touch other (datetime, str, object, etc)
        :param df_: dataframe
        :param inplace: if False, will return a copy of input dataset
        :return: `None` if `inplace=True` or dataframe if `inplace=False`
    """
    df_temp = df_ if inplace else df_.copy()
    print(df_temp.info())

    for col in df_temp.columns:
        # integers
        if issubclass(df_temp[col].dtypes.type, numbers.Integral):
            # unsigned integers
            if df_temp[col].min() >= 0:
                df_temp[col] = pd.to_numeric(df_temp[col], downcast='unsigned')
            # signed integers
            else:
                df_temp[col] = pd.to_numeric(df_temp[col], downcast='integer')
        # other real numbers
        elif issubclass(df_temp[col].dtypes.type, numbers.Real):
            df_temp[col] = pd.to_numeric(df_temp[col], downcast='float')

        elif issubclass(df_temp[col].dtypes.type, np.object_):
            df_temp[col] = pd.Categorical(df_temp[col])

    print(df_temp.info())
    if not inplace:
        return df_temp

In [None]:
df = pd.read_csv('kc_house_data.csv', header='infer', delimiter=',', parse_dates=['date'])
df = auto_opt_pd_dtypes(df)

df

In [None]:
df.describe()

In [None]:
df['id'].value_counts()

In [None]:
df.loc[df['id'] == 795000620]

replace 0s with actual nan values

In [None]:
df['sqft_basement'] = df['sqft_basement'].replace(0, np.nan)
df['yr_renovated'] = df['yr_renovated'].replace(0, np.nan)
df['yr_renovated'] = df['yr_renovated'].replace(1, np.nan)

label specific columns

In [None]:
cols = [col for col in df.columns if col not in ('id', 'date')]
cols_log = ['price', 'sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement', 'sqft_living15', 'sqft_lot15']

# view is mostly 0
cols_pairplot = [col for col in cols if col not in ('sqft_basement', 'zipcode', 'lat', 'long', 'view', 'grade', 'sqft_above', 'sqft_living15', 'sqft_lot15', 'floors', 'waterfront', 'condition')]

In [None]:
df = df.loc[df['bedrooms'] < 30]

In [None]:
df_log = df.copy()
df_log[cols_log] = np.log10(df_log[cols_log])

df_log

In [None]:
df_num = df[cols]
for k,v in df_num.items():
    q1 = v.quantile(0.25)
    q3 = v.quantile(0.75)
    iqr = q3 - q1
    v_outliers = v[(v <= q1 - 1.5 * iqr) | (v >= q3 + 1.5 * iqr)]
    print(f'outliers in {k}: {len(v_outliers)/len(v)*100:.2f}%')

In [None]:
# df = df[df['sqft_lot'] <= 50_000]

In [None]:
# df_num = df[cols]
# for k,v in df_num.items():
#     q1 = v.quantile(0.25)
#     q3 = v.quantile(0.75)
#     iqr = q3 - q1
#     v_outliers = v[(v <= q1 - 1.5 * iqr) | (v >= q3 + 1.5 * iqr)]
#     print(f'outliers in {k}: {len(v_outliers)/len(v)*100:.2f}%')
# 
# df

boxplots

In [None]:
fig, axs = plt.subplots(ncols=5, nrows=4, figsize=(15, 8))

index = 0
axs = axs.flatten()
for k,v in df[cols].items():
    sns.boxplot(y=k, data=df, ax=axs[index])
    index += 1
plt.tight_layout()
plt.show()

ecdfplot

In [None]:
fig, axs = plt.subplots(ncols=5, nrows=4, figsize=(15, 8))

index = 0
axs = axs.flatten()
for k,v in df[cols].items():
    sns.ecdfplot(x=k, data=df, ax=axs[index])
    index += 1
plt.tight_layout()
plt.show()

histogram/distribution

In [None]:
fig, axs = plt.subplots(ncols=5, nrows=4, figsize=(15, 8))

index = 0
axs = axs.flatten()
for k,v in df[cols].items():
    if k in cols_log:
        sns.histplot(x=k, data=df, ax=axs[index], kde=True, log_scale=True)
    else:
        sns.histplot(x=k, data=df, ax=axs[index], kde=True, log_scale=False)
    index += 1
plt.tight_layout()
plt.show()

In [None]:
fig, axs = plt.subplots(ncols=4, nrows=3, figsize=(15, 8))

index = 0
axs = axs.flatten()
for k,v in df[list(set(cols) - set(cols_log))].items():
    sns.histplot(x=k, data=df, ax=axs[index], kde=True, log_scale=False)
    index += 1
plt.tight_layout()
plt.show()

In [None]:
sns.pairplot(df[cols_pairplot])

In [None]:
sns.heatmap(df[cols].corr('spearman')**2, annot=True)
plt.title('r^2 using Pearson')
plt.tight_layout()
plt.show()

In [None]:
df[cols].corr()

In [None]:
df[cols].corr('pearson')

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from pca import pca

num_data = df[cols].fillna(0)

minmaxscaler = MinMaxScaler()
standardscaler = StandardScaler()

cols_sel = [col for col in cols if col not in ('price', 'zipcode', 'lat', 'long')]
x = pd.DataFrame(data=standardscaler.fit_transform(num_data[cols_sel]), columns=cols_sel)

pca_input = x.dropna()
model = PCA(n_components=2)
pca_data = model.fit_transform(pca_input)

num_data[['PCA 1', 'PCA 2']] = pca_data

sns.scatterplot(data=num_data.sort_values(by='price', ascending=True), x='PCA 1', y='PCA 2', hue='price', palette='RdYlGn')
plt.xlabel('PCA component 1')
plt.ylabel('PCA component 2')
plt.title('PCA')
plt.tight_layout()
plt.show()

print(f'Explained variance by first 2 compononents: {sum(model.explained_variance_ratio_):.3f}')

# model = pca(n_components=2)
# pca_data = model.fit_transform(pca_input)

In [None]:
# model.biplot(alpha=0.2, s=30, color_arrow='r', figsize=(14, 8), arrowdict={'alpha': 1, 'color_weak': '#008808', 'scale_factor': 1}, dpi=80)

In [None]:
for i in range(model.components_.shape[0]):
    arr = np.abs(model.components_[i])
    indices = np.argpartition(arr, -5)[-5:]
    indices = indices[np.argsort(arr[indices])[::-1]]
    input_cols = x.columns[indices]
    values = model.components_[i][indices]
    
    print(f'PCA component {i}:')
    for j in range(5):
        print(f'{input_cols[j]}:\t{values[j]:.3f}')
    print()

In [None]:
plt.scatter(df['sqft_above']+df['sqft_basement'], df['sqft_living'])
plt.xlabel('basement + above')
plt.ylabel('sqft_living')
plt.tight_layout()

In [None]:
from matplotlib.colors import LogNorm
import numpy as np

fig, axs = plt.subplots(ncols=5, nrows=4, figsize=(15, 8), sharex=True, sharey=True)

index = 0
axs = axs.flatten()

for k, v in df[cols].items():
    data = df.sort_values(k)

    # Determine normalization
    if k in cols_log:
        # handle nonpositive values
        cvals = data[k].to_numpy()
        vmin = np.min(cvals[cvals > 0]) if np.any(cvals > 0) else 1e-6
        norm = LogNorm(vmin=vmin, vmax=np.max(cvals))
    else:
        norm = None

    sc = axs[index].scatter(
        data['lat'],
        data['long'],
        c=data[k],
        cmap='RdYlGn',
        norm=norm,
        alpha=0.6,
        s=1,
        linewidth=0
    )

    axs[index].set_title(k)
    plt.colorbar(sc, ax=axs[index])

    index += 1

plt.tight_layout()
plt.show()

In [None]:
df.info()

In [None]:
plt.scatter(df['sqft_living'], df['sqft_living15'])

standardize df

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df_standardized = df.copy()
df_standardized[cols] = scaler.fit_transform(df[cols])

df_standardized

normalize df

In [None]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
df_normalized = df.copy()
df_normalized[cols] = scaler.fit_transform(df[cols])

df_normalized

In [None]:
fig, axs = plt.subplots(ncols=5, nrows=4, figsize=(15, 8))

index = 0
axs = axs.flatten()
for k,v in df_normalized[cols].items():
    sns.boxplot(y=k, data=df_normalized, ax=axs[index])
    index += 1
plt.tight_layout()
plt.show()

In [None]:
# Add suffixes to numeric columns
std_df_renamed = df_standardized[cols].add_suffix("_std")
norm_df_renamed = df_normalized[cols].add_suffix("_norm")

# Combine into one DataFrame
df_pair = pd.concat([std_df_renamed, norm_df_renamed], axis=1)

# sns.pairplot(df_pair)
# plt.tight_layout()

In [33]:
df_log['yr_renovated_new'] = 2025 - df_log['yr_renovated']
df_log['yr_built_new'] = 2025 - df_log['yr_built']
df_modelling = df_log.copy()

In [34]:
df_modelling = df_modelling[[col for col in df_modelling.columns if col not in ('zipcode', 'yr_built', 'yr_renovated', 'bathrooms', 'sqft_living', 'grade', 'sqft_living15', 'sqft_lot15')]]
df_modelling

Unnamed: 0,id,date,price,bedrooms,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,lat,long,yr_renovated_new,yr_built_new
0,7129300520,2014-10-13,5.346157,3,3.752048,1.0,0,0,3,3.071882,,47.511200,-122.257004,,70
1,6414100192,2014-12-09,5.730783,3,3.859859,2.0,0,0,3,3.336460,2.602060,47.721001,-122.319000,34.0,74
2,5631500400,2015-02-25,5.255272,2,4.000000,1.0,0,0,3,2.886491,,47.737900,-122.233002,,92
3,2487200875,2014-12-09,5.781037,4,3.698970,1.0,0,0,5,3.021189,2.959041,47.520802,-122.392998,,60
4,1954400510,2015-02-18,5.707570,3,3.907411,1.0,0,0,3,3.225309,,47.616798,-122.044998,,38
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21608,263000018,2014-05-21,5.556303,3,3.053463,3.0,0,0,3,3.184691,,47.699299,-122.346001,,16
21609,6600060120,2015-02-23,5.602060,4,3.764400,2.0,0,0,3,3.363612,,47.510700,-122.362000,,11
21610,1523300141,2014-06-23,5.604335,2,3.130334,2.0,0,0,3,3.008600,,47.594398,-122.299004,,16
21611,291310100,2015-01-16,5.602060,3,3.378034,2.0,0,0,3,3.204120,,47.534500,-122.069000,,21


In [35]:
df_modelling = df_modelling.sort_values('date').groupby('id').agg('last').reset_index()
df_modelling.loc[df_modelling['yr_renovated_new'].isna(), 'yr_renovated_new'] = df_modelling['yr_built_new']
df_modelling['sqft_basement'] = df_modelling['sqft_basement'].fillna(0)
df_modelling

Unnamed: 0,id,date,price,bedrooms,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,lat,long,yr_renovated_new,yr_built_new
0,1000102,2015-04-22,5.477121,6,3.971879,2.0,0,0,3,3.380211,0.000000,47.326199,-122.213997,34.0,34
1,1200019,2014-05-08,5.811240,4,4.415574,1.0,0,0,4,3.064458,2.954243,47.444401,-122.350998,78.0,78
2,1200021,2014-08-11,5.602060,3,4.633468,1.0,0,0,3,3.164353,0.000000,47.443401,-122.347000,73.0,73
3,2800031,2015-04-01,5.371068,3,3.880756,1.5,0,0,4,3.004321,2.623249,47.478298,-122.264999,95.0,95
4,3600057,2015-03-19,5.604766,4,3.544564,1.0,0,0,3,2.880814,2.949390,47.580299,-122.293999,12.0,74
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21430,9842300095,2014-07-25,5.562293,5,3.619928,1.5,0,0,3,3.204120,0.000000,47.529701,-122.380997,98.0,98
21431,9842300485,2015-03-11,5.579783,2,3.867585,1.0,0,0,5,2.924279,2.301030,47.528500,-122.377998,86.0,86
21432,9842300540,2014-06-24,5.530200,3,3.615740,1.0,0,0,4,2.857332,2.579784,47.529598,-122.378998,83.0,83
21433,9895000040,2014-07-03,5.601952,2,3.002166,1.5,0,0,3,2.954242,2.707570,47.544601,-122.017998,14.0,14


In [36]:
from sklearn.preprocessing import MinMaxScaler

cols_norm = [col for col in df_modelling.columns if col not in ('id', 'date')]
scaler = MinMaxScaler()
df_normalized = df_modelling.copy()
df_normalized[cols_norm] = scaler.fit_transform(df_modelling[cols_norm])

df_normalized

Unnamed: 0,id,date,price,bedrooms,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,lat,long,yr_renovated_new,yr_built_new
0,1000102,2015-04-22,0.299320,0.545455,0.358633,0.4,0.0,0.0,0.50,0.607344,0.000000,0.273925,0.253324,0.208696,0.208696
1,1200019,2014-05-08,0.465429,0.363636,0.485337,0.0,0.0,0.0,0.75,0.398401,0.802119,0.464053,0.139535,0.591304,0.591304
2,1200021,2014-08-11,0.361434,0.272727,0.547560,0.0,0.0,0.0,0.50,0.464504,0.000000,0.462445,0.142855,0.547826,0.547826
3,2800031,2015-04-01,0.246594,0.272727,0.332612,0.2,0.0,0.0,0.75,0.358607,0.712250,0.518576,0.210963,0.739130,0.739130
4,3600057,2015-03-19,0.362779,0.363636,0.236607,0.0,0.0,0.0,0.50,0.276878,0.800802,0.682645,0.186877,0.017391,0.556522
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21430,9842300095,2014-07-25,0.341663,0.454545,0.258129,0.2,0.0,0.0,0.50,0.490819,0.000000,0.601258,0.114619,0.765217,0.765217
21431,9842300485,2015-03-11,0.350359,0.181818,0.328851,0.0,0.0,0.0,1.00,0.305640,0.624763,0.599325,0.117109,0.660870,0.660870
21432,9842300540,2014-06-24,0.325708,0.272727,0.256933,0.0,0.0,0.0,0.75,0.261340,0.700448,0.601092,0.116279,0.634783,0.634783
21433,9895000040,2014-07-03,0.361380,0.181818,0.081718,0.2,0.0,0.0,0.50,0.325468,0.735144,0.625225,0.416114,0.034783,0.034783


In [126]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from pca import pca

num_data = df_modelling[cols_norm]

minmaxscaler = MinMaxScaler()
standardscaler = StandardScaler()

x = pd.DataFrame(data=standardscaler.fit_transform(num_data[cols_norm]), columns=cols_norm)

pca_input = x.dropna()
model = PCA(n_components=5)
pca_data = model.fit_transform(pca_input)

num_data[['PCA 1', 'PCA 2', 'PCA 3', 'PCA 4', 'PCA 5']] = pca_data

sns.scatterplot(data=num_data.sort_values(by='price', ascending=True), x='PCA 1', y='PCA 2', hue='price', palette='RdYlGn')
plt.xlabel('PCA component 1')
plt.ylabel('PCA component 2')
plt.title('PCA')
plt.tight_layout()
plt.show()

print(f'Explained variance by first 5 compononents: {sum(model.explained_variance_ratio_):.3f}')

# model = pca(n_components=2)
# pca_data = model.fit_transform(pca_input)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  num_data[['PCA 1', 'PCA 2', 'PCA 3', 'PCA 4', 'PCA 5']] = pca_data


Explained variance by first 5 compononents: 0.726


In [39]:
from sklearn.metrics import silhouette_score, davies_bouldin_score
from tqdm import tqdm

In [59]:
import numpy as np
from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram
from sklearn.cluster import AgglomerativeClustering


def plot_dendrogram(model, **kwargs):
    # Create linkage matrix and then plot the dendrogram

    # create the counts of samples under each node
    counts = np.zeros(model.children_.shape[0])
    n_samples = len(model.labels_)
    for i, merge in enumerate(model.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1  # leaf node
            else:
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count

    linkage_matrix = np.column_stack(
        [model.children_, model.distances_, counts]
    ).astype(float)

    # Plot the corresponding dendrogram
    dendrogram(linkage_matrix, **kwargs)

linkage_methods = ['ward', 'average', 'complete']
for l in range(len(linkage_methods)):
    method = linkage_methods[l]
    results_silhouette = {}
    results_dbi = {}
    
    for i in tqdm(range(3, 11)):
        X = df_normalized[cols_norm]
        
        # setting distance_threshold=0 ensures we compute the full tree.
        model = AgglomerativeClustering(distance_threshold=None, n_clusters=i, linkage=method)
        
        model = model.fit(X)
        labels = model.labels_
        results_silhouette[i] = silhouette_score(X, labels)
        results_dbi[i] = davies_bouldin_score(X, labels)
        
    plt.plot(results_silhouette.keys(), results_silhouette.values())
    plt.title(f"Silhouette Score - Reduced Data ('{method}')")
    plt.xlabel('n Clusters')
    plt.ylabel('Silhouette Score')
    plt.tight_layout()
    plt.savefig(f'silhouette_reduced_{method}')
    plt.clf()
    
    plt.plot(results_dbi.keys(), results_dbi.values())
    plt.title(f"Davis-Bouldin Score - Reduced Data ('{method}')")
    plt.xlabel('n Clusters')
    plt.ylabel('Davis-Bouldin Score')
    plt.tight_layout()
    plt.savefig(f'dbi_reduced_{method}')
    plt.clf()


100%|██████████| 8/8 [04:29<00:00, 33.64s/it]
100%|██████████| 8/8 [04:18<00:00, 32.30s/it]
100%|██████████| 8/8 [03:55<00:00, 29.45s/it]


<Figure size 640x480 with 0 Axes>

In [113]:
temp1=pd.DataFrame(data={'min_cluster_size_pct': 1,
                   'min_cluster_size': 2,
                   'min_samples': 3,
                   'metric': 4,
                   'cluster_selection': 5}, 
             index=[0])
temp2=pd.DataFrame(data={'min_cluster_size_pct': 5,
                   'min_cluster_size': 2,
                   'min_samples': 3,
                   'metric': 4,
                   'cluster_selection': 5}, 
             index=[0])
pd.concat([temp1, temp2]).reset_index(drop=True)

Unnamed: 0,min_cluster_size_pct,min_cluster_size,min_samples,metric,cluster_selection
0,1,2,3,4,5
1,5,2,3,4,5


In [132]:
import numpy as np
from sklearn.cluster import HDBSCAN
import umap

setting_cols = ['n_components', 'min_cluster_size_pct', 'min_cluster_size', 'min_samples', 'n_clusters',
                'noise_pct', 'silhouette_score', 'dbi_score']
df_settings = pd.DataFrame(columns=setting_cols)


for n_components in (4, 6, 8, 10):
    reducer = umap.UMAP(n_components=n_components)
    data = df_modelling[[col for col in df_modelling.columns if col not in ('id', 'date')]].values
    scaled_data = StandardScaler().fit_transform(data)
    embedding = reducer.fit_transform(scaled_data)
    X = embedding
    for mcs_pct in tqdm((0.05, 0.025, 0.01, 0.005)):
        mcs = round(len(df_modelling) * mcs_pct)
        min_samples_list = [round(i * mcs) for i in [0.5, 0.25, 0.1, 0.05]]
        for min_samples in min_samples_list:
            clusterer = HDBSCAN(
                min_cluster_size=mcs,
                min_samples=min_samples,
                metric="euclidean"
            )
            labels = clusterer.fit_predict(X)
    
            n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
            noise_frac = np.mean(labels == -1)
    
            temp = pd.DataFrame(data={'n_components': n_components,
                                      'min_cluster_size_pct': mcs_pct,
                                      'min_cluster_size': mcs,
                                      'min_samples': min_samples,
                                      'n_clusters': n_clusters,
                                      'noise_pct': noise_frac,
                                      'silhouette_score': silhouette_score(X, labels),
                                      'dbi_score': davies_bouldin_score(X, labels)
                                      },
                                index=[0])
            df_settings = pd.concat([df_settings, temp]).reset_index(drop=True)

  df_settings = pd.concat([df_settings, temp]).reset_index(drop=True)
100%|██████████| 4/4 [02:43<00:00, 40.89s/it]
100%|██████████| 4/4 [02:41<00:00, 40.49s/it]
100%|██████████| 4/4 [02:47<00:00, 41.94s/it]
100%|██████████| 4/4 [02:50<00:00, 42.63s/it]


In [133]:
df_settings.to_csv('HDBSCAN_parameter_results.csv', index=False)

In [134]:
sns.pairplot(df_settings)

<seaborn.axisgrid.PairGrid at 0x1eddf8c2e00>

In [131]:
sns.boxplot(df_settings, x='cluster_selection', y='dbi_score')

<Axes: xlabel='cluster_selection', ylabel='dbi_score'>

In [90]:
# plt.plot(n_clusters_list, results_silhouette.values())
plt.title(f"Silhouette Score - Reduced Data (HDBSCAN)")
plt.xlabel('n Clusters')
plt.ylabel('Silhouette Score')
plt.tight_layout()
plt.savefig(f'silhouette_reduced_HDBSCAN_clusters')
plt.clf()

plt.plot(results_silhouette.keys(), results_silhouette.values())
plt.title(f"Silhouette Score - Reduced Data (HDBSCAN)")
plt.xlabel('min_clusters/N [%]')
plt.ylabel('Silhouette Score')
plt.tight_layout()
plt.savefig(f'silhouette_reduced_HDBSCAN_percentage')
plt.clf()

plt.plot(n_clusters_list, results_dbi.values())
plt.title(f"Davis-Bouldin Score - Reduced Data (HDBSCAN)")
plt.xlabel('n Clusters')
plt.ylabel('Davis-Bouldin Score')
plt.tight_layout()
plt.savefig(f'dbi_reduced_HDBSCAN_clusters')
plt.clf()

plt.plot(results_dbi.keys(), results_dbi.values())
plt.title(f"Davis-Bouldin Score - Reduced Data (HDBSCAN)")
plt.xlabel('min_clusters/N [%]')
plt.ylabel('Davis-Bouldin Score')
plt.tight_layout()
plt.savefig(f'dbi_reduced_HDBSCAN_percentage')
plt.clf()


In [91]:
mcs, n_clusters, noise_frac, stability = zip(*results)

plt.figure()
plt.plot(mcs, n_clusters)
plt.xlabel("min_cluster_size")
plt.ylabel("Number of clusters")
plt.show()

plt.figure()
plt.plot(mcs, noise_frac)
plt.xlabel("min_cluster_size")
plt.ylabel("Noise fraction")
plt.show()

plt.figure()
plt.plot(mcs, stability)
plt.xlabel("min_cluster_size")
plt.ylabel("Mean cluster stability")
plt.show()


In [82]:
X = df_normalized[cols_norm]

# setting distance_threshold=0 ensures we compute the full tree.
model = AgglomerativeClustering(distance_threshold=None, n_clusters=6, linkage='ward')

model = model.fit(X)
labels = model.labels_

In [None]:
plt.title("Hierarchical Clustering Dendrogram")
# plot the top three levels of the dendrogram
plot_dendrogram(model, truncate_mode="level", p=3)
plt.xlabel("Number of points in node (or index of point if no parenthesis).")
plt.show()

In [144]:
reducer = umap.UMAP(n_components=6)
data = df_modelling[[col for col in df_modelling.columns if col not in ('id', 'date')]].values
scaled_data = StandardScaler().fit_transform(data)
embedding = reducer.fit_transform(scaled_data)
X = embedding
mcs = round(len(df_modelling) * 0.025)
min_samples = round(mcs * 0.25)
clusterer = HDBSCAN(
    min_cluster_size=mcs,
    min_samples=min_samples,
    metric="euclidean"
)
labels = clusterer.fit_predict(X)

In [64]:
import umap

In [137]:
from sklearn.preprocessing import StandardScaler

data = df_modelling[cols_norm].values
scaled_data = StandardScaler().fit_transform(data)

In [147]:
reducer = umap.UMAP()

In [148]:
embedding = reducer.fit_transform(scaled_data)
embedding.shape

(21435, 2)

In [149]:
df_results = pd.DataFrame(data=embedding)
df_results = pd.concat([df_results, df_modelling], axis=1)
df_results['label'] = labels
# df_results

In [152]:
col = 'label'
results_sorted = df_results.sort_values(col)
plt.scatter(x=results_sorted.iloc[:, 0], y=results_sorted.iloc[:, 1], s=3, c=results_sorted[col], cmap='tab10_r', alpha=0.7)
plt.colorbar()
plt.title('Hierarchical Clustering on projected Data (n=6, linkage=ward)')
plt.xlabel('UMAP 1')
plt.ylabel('UMAP 2')
plt.tight_layout()
# plt.savefig('hierarchical_n6_ward_projected')

In [153]:
df_results['price_real'] = 10**df_results['price']
cluster_data = df_results.groupby('label')['price_real'].agg(['mean', 'median', 'std', 'max', 'min'])
cluster_data

Unnamed: 0_level_0,mean,median,std,max,min
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
-1,1340663.0,949974.96875,1091034.0,7062500.5,285000.0625
0,506462.2,474800.125,182980.8,1850000.5,204999.953125
1,439347.4,375000.0625,257129.4,2719999.75,88999.984375
2,484777.8,415000.09375,280515.6,4489000.5,74999.960938
3,563659.3,517999.9375,292055.0,3300001.75,133999.984375
4,553947.4,468999.75,350639.0,5570001.5,100000.0
5,891311.7,749999.625,575498.7,7700001.5,153999.921875


In [154]:
sns.boxplot(df_results, x='label', y='price_real')

[14-01-2026 02:49:09] [matplotlib.category] [INFO] Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
[14-01-2026 02:49:09] [matplotlib.category] [INFO] Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.


<Axes: xlabel='label', ylabel='price_real'>

In [155]:
sns.boxplot(df_results, x='label', y='price')

[14-01-2026 02:49:13] [matplotlib.category] [INFO] Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.
[14-01-2026 02:49:13] [matplotlib.category] [INFO] Using categorical units to plot a list of strings that are all parsable as floats or dates. If these strings should be plotted as numbers, cast to the appropriate data type before plotting.


<Axes: xlabel='label', ylabel='price'>

In [156]:
plt.scatter(results_sorted['lat'], results_sorted['long'], c=results_sorted['label'], s=4, alpha=0.6, cmap='tab10_r')
plt.colorbar()
plt.title('Hierarchical Clustering on projected Data (n=6, linkage=ward)')
plt.xlabel('Lat.')
plt.ylabel('Long.')
plt.tight_layout()
plt.savefig('hierarchical_n6_ward_coords')

In [76]:
%matplotlib qt5