In [None]:
from pprint import pprint

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.cluster import AgglomerativeClustering, HDBSCAN
from sklearn.metrics import silhouette_score, davies_bouldin_score
import umap

%matplotlib qt5
sns.set_style('whitegrid')
sns.set_context('paper')
plt.rcParams["axes.edgecolor"] = "black"
plt.rcParams["axes.linewidth"] = 0.5

custom_12 = [
    "#1f77b4",  # blue
    "#ff7f0e",  # orange
    "#2ca02c",  # green
    "#d62728",  # red
    "#9467bd",  # purple
    "#8c564b",  # brown
    "#e377c2",  # pink
    "#000000",  # black
    "#bcbd22",  # olive
    "#17becf",  # cyan
    "#aec7e8",  # light blue
    "#ffbb78",  # light orange
]

labels = list(range(16))  # labels 0â€“15
tab20 = plt.get_cmap("tab20")

label_palette = {label: tab20(label) for label in labels}

In [None]:
from typing import Optional
import numbers

def auto_opt_pd_dtypes(df_: pd.DataFrame, inplace=False) -> Optional[pd.DataFrame]:
    """ Automatically downcast Number dtypes for minimal possible,
        will not touch other (datetime, str, object, etc)
        :param df_: dataframe
        :param inplace: if False, will return a copy of input dataset
        :return: `None` if `inplace=True` or dataframe if `inplace=False`
    """
    df_temp = df_ if inplace else df_.copy()
    print(df_temp.info())

    for col in df_temp.columns:
        # integers
        if issubclass(df_temp[col].dtypes.type, numbers.Integral):
            # unsigned integers
            if df_temp[col].min() >= 0:
                df_temp[col] = pd.to_numeric(df_temp[col], downcast='unsigned')
            # signed integers
            else:
                df_temp[col] = pd.to_numeric(df_temp[col], downcast='integer')
        # other real numbers
        elif issubclass(df_temp[col].dtypes.type, numbers.Real):
            df_temp[col] = pd.to_numeric(df_temp[col], downcast='float')

        elif issubclass(df_temp[col].dtypes.type, np.object_):
            df_temp[col] = pd.Categorical(df_temp[col])

    print(df_temp.info())
    if not inplace:
        return df_temp

In [None]:
df = pd.read_csv('kc_house_data.csv', header='infer', delimiter=',', parse_dates=['date'])
df = auto_opt_pd_dtypes(df)

In [None]:
df.describe()

In [None]:
df['sqft_basement'] = df['sqft_basement'].replace(0, np.nan)
df['yr_renovated'] = df['yr_renovated'].replace(0, np.nan)
df['yr_renovated'] = df['yr_renovated'].replace(1, np.nan)

In [None]:
cols = [col for col in df.columns if col not in ('id', 'date')]
cols_log = ['price', 'sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement', 'sqft_living15', 'sqft_lot15']

# view is mostly 0
cols_pairplot = [col for col in cols if col not in ('sqft_basement', 'zipcode', 'lat', 'long', 'view', 'grade', 'sqft_above', 'sqft_living15', 'sqft_lot15', 'floors', 'waterfront', 'condition')]

In [None]:
df = df.loc[df['bedrooms'] < 30]

In [None]:
df_log = df.copy()
df_log[cols_log] = np.log10(df_log[cols_log])
df_log['time_since_renovation'] = 2016 - df_log['yr_renovated']
df_log['age'] = 2016 - df_log['yr_built']

In [112]:
df_modelling = df_log.copy()

df_modelling = df_modelling[[col for col in df_modelling.columns if col not in ('zipcode', 'yr_built', 'yr_renovated', 'bathrooms', 'sqft_living', 'grade', 'sqft_living15', 'sqft_lot15')]]

# df_modelling.drop(columns=['zipcode'], inplace=True)
df_modelling = df_modelling.sort_values('date').groupby('id').agg('last').reset_index()
df_modelling.loc[df_modelling['time_since_renovation'].isna(), 'time_since_renovation'] = df_modelling['age']
df_modelling = df_modelling.fillna(0)

In [113]:
from sklearn.preprocessing import MinMaxScaler

cols_norm = [col for col in df_modelling.columns if col not in ('id', 'date')]
scaler = MinMaxScaler()
df_normalized = df_modelling.copy()
df_normalized[cols_norm] = scaler.fit_transform(df_modelling[cols_norm])

In [None]:
num_data = df_modelling[cols_norm]

minmaxscaler = MinMaxScaler()
standardscaler = StandardScaler()

x = pd.DataFrame(data=standardscaler.fit_transform(num_data[cols_norm]), columns=cols_norm)

pca_input = x.dropna()
model = PCA()
pca_data = model.fit_transform(pca_input)

num_data = pd.concat([num_data, pd.DataFrame(pca_data, columns=[str(x) for x in range(len(cols_norm))])], axis=1)

In [None]:
pca_data.shape

In [None]:
sns.scatterplot(data=num_data.sort_values(by='price', ascending=True), x='0', y='1', hue='price', palette='RdYlGn')
plt.xlabel('PCA component 1')
plt.ylabel('PCA component 2')
plt.title('PCA')
plt.tight_layout()
plt.show()

print(f'Explained variance by first 5 compononents: {sum(model.explained_variance_ratio_[:5]):.3f}')

In [None]:
plt.plot(range(13), model.explained_variance_ratio_.cumsum())
plt.xlabel('PCA components')
plt.ylabel('Explained Variance')
plt.tight_layout()

In [None]:
from sklearn.cluster import HDBSCAN

reducer = umap.UMAP(n_components=4, random_state=42)
data = df_modelling[[col for col in df_modelling.columns if col not in ('id', 'date')]].values
scaled_data = StandardScaler().fit_transform(data)
embedding = reducer.fit_transform(scaled_data)
X = embedding
mcs = 150
min_samples = round(mcs * 0.5)
clusterer = HDBSCAN(
    min_cluster_size=mcs,
    min_samples=min_samples,
    metric="euclidean"
)
labels = clusterer.fit_predict(X)

In [None]:
df_hdbscan = pd.DataFrame()
df_hdbscan['label'] = labels
for i in range(X.shape[1]):
    df_hdbscan[f'UMAP {i+1}'] = X[:, i]

In [None]:
df_hdbscan.head()

In [None]:
df_hdbscan['label'].nunique()

In [None]:
df_hdbscan = df_hdbscan.loc[df_hdbscan['label'] != -1]

In [None]:
silhouette_score(df_hdbscan.iloc[:, 1:], df_hdbscan['label'])

In [None]:
davies_bouldin_score(X, labels)

## 2D UMAP Clusters

In [None]:
data = df_modelling[cols_norm].values
scaled_data = StandardScaler().fit_transform(data)
reducer = umap.UMAP(random_state=2)
embedding = reducer.fit_transform(scaled_data)
df_results = pd.DataFrame(data=embedding)
df_results = pd.concat([df_results, df_modelling], axis=1)
df_results['label'] = labels

In [None]:
col = 'label'
results_sorted = df_results.sort_values(col)
# results_sorted = results_sorted.loc[results_sorted['label'] != -1]
ax = sns.scatterplot(x=results_sorted.iloc[:, 0], y=results_sorted.iloc[:, 1], s=10, linewidth=0, hue=results_sorted[col], palette='tab10', alpha=0.2)

legend = ax.get_legend()
for handle in legend.legend_handles:
    handle.set_alpha(1)
    
# plt.title('Hierarchical Clustering on projected Data (n=6, linkage=ward)')
plt.xlabel('UMAP 1')
plt.ylabel('UMAP 2')
plt.tight_layout()
# plt.savefig('hierarchical_n6_ward_projected')

In [None]:
x = results_sorted['lat']
y = results_sorted['long']

ax = sns.scatterplot(x=x, y=y, hue=results_sorted['label'].astype(str), s=5, alpha=0.5, palette='tab10_r', linewidth=0)

legend = ax.get_legend()
for handle in legend.legend_handles:
    handle.set_alpha(1)
    
plt.title('Hierarchical Clustering on projected Data (n=6, linkage=ward)')
plt.xlabel('Lat.')
plt.ylabel('Long.')
plt.tight_layout()
# plt.savefig('hierarchical_n6_ward_coords')

In [None]:
df_settings = pd.read_csv('HDBSCAN_parameter_results.csv')

In [None]:
# cols_log.remove('sqft_living')
# cols_log.remove('sqft_living15')
# cols_log.remove('sqft_lot15')
# df_results[cols_log] = 10**df_results[cols_log]
df_results['renovated'] = df_results['yr_built_new'] != df_results['yr_renovated_new']
cluster_data = df_results.groupby('label').agg(['mean'])
cluster_data

In [None]:
from scipy.cluster.hierarchy import dendrogram

def plot_dendrogram(model, **kwargs):
    # Create linkage matrix and then plot the dendrogram

    # create the counts of samples under each node
    counts = np.zeros(model.children_.shape[0])
    n_samples = len(model.labels_)
    for i, merge in enumerate(model.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1  # leaf node
            else:
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count

    linkage_matrix = np.column_stack(
        [model.children_, model.distances_, counts]
    ).astype(float)

    # Plot the corresponding dendrogram
    dendrogram(linkage_matrix, **kwargs)

# Hierarchical clustering

In [114]:
reducer = umap.UMAP(n_components=12, random_state=42)
data = df_modelling[[col for col in df_modelling.columns if col not in ('id', 'date')]].values
scaled_data = StandardScaler().fit_transform(data)
embedding = reducer.fit_transform(scaled_data)
X = embedding

# setting distance_threshold=0 ensures we compute the full tree.
model = AgglomerativeClustering(distance_threshold=None, n_clusters=16, linkage='average')

model = model.fit(X)
labels = model.labels_


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [None]:
plot_dendrogram(model, p=5, truncate_mode='level')
plt.tight_layout()

In [115]:
silhouette_score(X, labels)

0.6470334529876709

In [116]:
davies_bouldin_score(X, labels)

0.4271051303726358

In [137]:
data = df_modelling[cols_norm].values
scaled_data = StandardScaler().fit_transform(data)
reducer = umap.UMAP(n_components=2, random_state=42)
embedding = reducer.fit_transform(scaled_data)
df_results = pd.DataFrame(data=embedding, columns=['UMAP 1', 'UMAP 2'])
df_results = pd.concat([df_results, df_modelling], axis=1)
df_results['label'] = labels
df_results['grade'] = df_log['grade']
df_results = df_results.loc[df_results['label'] != 13]


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [138]:
# cols_log_post = list(set(cols_log) & set(df_modelling.columns))
# df_results[cols_log_post] = 10**df_results[cols_log_post]
# df_results[cols_log_post] = df_results[cols_log_post].replace(1, 0)

df_results['renovated'] = df_results['age'] != df_results['time_since_renovation']
df_results['years_until_renovation'] = df_results['age'] - df_results['time_since_renovation']
df_results['years_until_renovation_none'] = df_results['years_until_renovation'].replace(0, None)
# cluster_data = df_results.groupby('label').agg(['mean', 'median', 'std'])
cluster_data = df_results.groupby('label').agg(['mean'])
cluster_sizes = df_results.groupby('label')['id'].agg(['count'])
cluster_data['cluster_size'] = cluster_sizes
cluster_data.reset_index(inplace=True)
cluster_data.fillna(0, inplace=True)


Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`



In [139]:
final_category_mapping = {
    "New Suburban (Family)": [0, 1, 7, 12],
    "Established Housing (w/o Basement)": [2, 5],
    "Established Housing (with Basement)": [4, 9, 11],
    "Luxurious": [6, 15],
    "Compact Urban Housing": [8, 10, 14],
    "Modern/Renovated": [3]
}

cluster_to_category = {
    cluster: cat
    for cat, clusters in final_category_mapping.items()
    for cluster in clusters
}

df_results["final_category"] = df_results["label"].map(cluster_to_category)
cluster_data["final_category"] = cluster_data["label"].map(cluster_to_category)

In [132]:
final_category_mapping.keys()

dict_keys(['New Suburban (Family)', 'Established Housing (w/o Basement)', 'Established Housing (with Basement)', 'Luxurious', 'Compact Urban Housing', 'Modern/Renovated'])

In [125]:
cluster_data.to_csv('results_hierarchical_ncomp12_nclusters16_linkage_avg_final_log.csv', index=False, sep=';', decimal=',')

In [126]:
df_results.to_csv('full_clustered_data_log.csv', index=False, sep=';', decimal=',')

In [127]:
tab10 = plt.get_cmap("tab10")
clusters_palette = {list(final_category_mapping.keys())[cluster_id]: tab10(cluster_id) for cluster_id in range(len(final_category_mapping.keys()))}

In [128]:
df_results.columns

Index(['UMAP 1', 'UMAP 2', 'id', 'date', 'price', 'bedrooms', 'sqft_lot',
       'floors', 'waterfront', 'view', 'condition', 'sqft_above',
       'sqft_basement', 'lat', 'long', 'time_since_renovation', 'age', 'label',
       'grade', 'renovated', 'years_until_renovation',
       'years_until_renovation_none', 'final_category'],
      dtype='object')

In [129]:
df_results['renovated'] = df_results['renovated'].astype(int)

In [134]:
# feature = 'renovated'
for feature in df_results.iloc[:, 4:].columns:
    if feature == 'label':
        continue
    if feature in ('bedrooms', 'waterfront', 'view', 'condition', 'renovated', 'age'):
        discrete = True
    else:
        discrete = False
    try:
        x_delta = (df_results[feature].max() - df_results[feature].min()) * 0.05
        x_lims = (df_results[feature].min() - x_delta, df_results[feature].max() + x_delta)
    
        fig, axes = plt.subplots(2, 3, sharex=True, sharey=True, figsize=(15, 8))
        axes = axes.flatten()
    
        for i, cluster in enumerate(final_category_mapping.keys()):
            ax = axes[i]
            sns.histplot(df_results.loc[df_results['final_category'] == cluster], x=feature, hue='label', multiple='stack', palette=label_palette, linewidth=0, alpha=0.8, ax=ax, stat='percent', discrete=discrete)
            # sns.boxplot(df_results.loc[df_results['final_category'] == cluster], x='label', y=feature, hue='label', palette=label_palette, ax=ax)
    
            ax.set_title(f'{cluster}')
    
        plt.tight_layout()
        plt.savefig(f'plots/superclusters/{feature}_hist.png')
        # plt.savefig(f'plots/superclusters/{feature}_box.png')
        # plt.show(block=True)
        plt.clf()
    except ValueError as e:
        print(e)
    except TypeError as e:
        pass

In [None]:
plot_type = 'kde'
# plot_type = 'box'

# cluster_col = 'label'
cluster_col = 'final_category'
other_col = list({'label', 'final_category'} - {cluster_col})[0]

palette = 'tab20' if cluster_col == 'label' else 'tab10'

if plot_type == 'box':
    fig, axs = plt.subplots(ncols=2, nrows=4, figsize=(18, 10), sharex=True)
elif plot_type == 'kde':
    fig, axs = plt.subplots(ncols=2, nrows=4, figsize=(18, 10))
else:
    pass

index = 0
axs = axs.flatten()

cols_remove = ['bedrooms', 'floors', 'condition', 'view', 'renovated', 'waterfront', 'sqft_basement', 'time_since_renovation', 'years_until_renovation', other_col]
cols_kde = list(set(df_results.iloc[:, 4:].columns) - set(cols_remove))

for k,v in df_results[cols_kde].items():
    try:
        if k in ['bedrooms', 'floors', 'condition', 'view', 'renovated']:
            bw = 5
        else:
            bw = 1
        scale = True if k in cols_log else False
        
        if k == cluster_col:
            n_clusters = df_results[cluster_col].nunique()
            axs[-1].bar(
                range(n_clusters),
                df_results.groupby(cluster_col)['id'].agg(['count'])['count'].values,
                color=sns.color_palette(palette)[:n_clusters],
                edgecolor="none"
            )
            axs[-1].set_xticks(range(n_clusters))
            axs[-1].set_xticklabels(range(n_clusters))
            axs[-1].set_xlabel('Cluster ID')
            axs[-1].set_ylabel('Cluster size')
            index -= 1
        else:
            if plot_type == 'kde':
                sns.kdeplot(x=k, data=df_results, ax=axs[index], log_scale=scale, hue=cluster_col, common_norm=False, palette=palette, multiple='fill', legend=False, linewidth=0, fill=True, bw_adjust=bw)
            elif plot_type == 'box':
                sns.boxplot(df_results, x=cluster_col, y=k, ax=axs[index], hue=cluster_col, palette=palette, log_scale=scale, legend=False)
                axs[index].set_xlabel('Cluster ID')
            else:
                pass
    except ValueError as e:
        pass
    index += 1
    
plt.tight_layout()
plt.show()

In [None]:
plt.title("Hierarchical Clustering Dendrogram")
# plot the top three levels of the dendrogram
plot_dendrogram(model, truncate_mode="level", p=5)
plt.xlabel("Number of points in node (or index of point if no parenthesis).")
plt.show()

In [143]:
df_results_csv = pd.read_csv('full_clustered_data_log.csv', sep=';', decimal=',')

In [144]:
df_results_csv['sqft_lot'].describe()

count    21374.000000
mean         3.904884
std          0.393040
min          2.716003
25%          3.703291
50%          3.881955
75%          4.030154
max          6.217841
Name: sqft_lot, dtype: float64

In [140]:
col = 'final_category'
palette = 'tab10'

results_sorted = df_results.sort_values(col)
# results_sorted[col] = results_sorted[col].astype(str)
ax = sns.scatterplot(data=results_sorted, x='UMAP 1', y='UMAP 2', s=10, linewidth=0, hue=col, alpha=0.3, palette=clusters_palette)

ax.legend(ncol=2, loc='upper left')
legend = ax.get_legend()
for handle in legend.legend_handles:
    handle.set_alpha(1)

plt.title('Hierarchical Clustering on projected Data')
plt.xlabel('UMAP 1')
plt.ylabel('UMAP 2')
plt.tight_layout()
plt.savefig('umap_final_clusters.png', dpi=500)

In [None]:
df_results.columns

In [None]:
col = 'age'
palette = 'RdYlGn_r'

# col_filter = df_results[col] == 1
# results_sorted = df_results.loc[col_filter].sort_values(col)
results_sorted = df_results.sort_values(col, ascending=True)

sc = plt.scatter(
    results_sorted['UMAP 1'],
    results_sorted['UMAP 2'],
    c=results_sorted[col],
    cmap=palette,
    s=10,
    alpha=0.7,
    linewidths=0
)

plt.colorbar(sc, label=col)
# plt.title('Renovated Houses')
plt.xlabel('UMAP 1')
plt.ylabel('UMAP 2')
plt.tight_layout()

In [None]:
y_delta = (df_results['lat'].max() - df_results['lat'].min()) * 0.01
y_lims = (df_results['lat'].min() - y_delta, df_results['lat'].max() + y_delta)
x_delta = (df_results['long'].max() - df_results['long'].min()) * 0.01
x_lims = (df_results['long'].min() - x_delta, df_results['long'].max() + x_delta)

for i, cluster in enumerate(sorted(df_results['final_category'].unique())):
    ax = sns.scatterplot(df_results.loc[df_results['final_category'] == cluster], x='long', y='lat', color=sns.color_palette('tab10')[i], s=5, alpha=0.7, linewidth=0)
    
    # ax.legend(ncol=3, title='Cluster ID', loc='upper left')
    # legend = ax.get_legend()
    # for handle in legend.legend_handles:
    #     handle.set_alpha(1)
    
    plt.title(f'Hierarchical Clustering on Coordinate Data (Cluster: {cluster})')
    plt.xlabel('Lat.')
    plt.ylabel('Long.')
    plt.xlim(*x_lims)
    plt.ylim(*y_lims)
    plt.tight_layout()
    plt.show(block=True)

In [None]:
ax = sns.scatterplot(results_sorted, x='long', y='lat', hue='final_category', s=7, alpha=0.5, palette='tab10', linewidth=0)

ax.legend(ncol=3, title='Cluster ID', loc='lower right')
legend = ax.get_legend()
for handle in legend.legend_handles:
    handle.set_alpha(1)

plt.title('Hierarchical Clustering on Coordinate Data')
plt.xlabel('Lat.')
plt.ylabel('Long.')
plt.tight_layout()

In [51]:
import geopandas as gpd
import matplotlib.pyplot as plt
import contextily as ctx

gdf = gpd.GeoDataFrame(
    df_results,
    geometry=gpd.points_from_xy(df_results['long'], df_results['lat']),
    crs="EPSG:4326"  # lat/long
)

gdf = gdf.to_crs(epsg=3857)

fig, ax = plt.subplots(figsize=(8, 8))

gdf.plot(
    ax=ax,
    column='final_category',
    categorical=True,
    legend=True,
    markersize=3,
    alpha=0.5
)

ctx.add_basemap(ax, source=ctx.providers.OpenStreetMap.Mapnik)

ax.set_title('Hierarchical Clustering on Coordinate Data')
ax.set_axis_off()
plt.tight_layout()

In [44]:
df_results['final_category'].unique().tolist()

['New Suburban (Family)',
 'Old Houses',
 'Old Suburban',
 'Renovated',
 'Compact Urban Housing',
 'Luxurious']

In [72]:
import plotly.io as pio
pio.renderers.default = "browser"

In [74]:
df_results.info()

<class 'pandas.core.frame.DataFrame'>
Index: 21374 entries, 0 to 21434
Data columns (total 22 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   UMAP 1                       21374 non-null  float32       
 1   UMAP 2                       21374 non-null  float32       
 2   id                           21374 non-null  uint64        
 3   date                         21374 non-null  datetime64[ns]
 4   price                        21374 non-null  float32       
 5   bedrooms                     21374 non-null  uint8         
 6   sqft_lot                     21374 non-null  float64       
 7   floors                       21374 non-null  float32       
 8   waterfront                   21374 non-null  uint8         
 9   view                         21374 non-null  uint8         
 10  condition                    21374 non-null  uint8         
 11  sqft_above                   21374 non-null  f

In [88]:
def rgba_tuple_to_str(rgba):
    r, g, b, a = rgba
    return f"rgba({int(r*255)}, {int(g*255)}, {int(b*255)}, {a})"

clusters_palette_plotly = {
    k: rgba_tuple_to_str(v)
    for k, v in clusters_palette.items()
}

In [95]:
df_results.columns

Index(['UMAP 1', 'UMAP 2', 'id', 'date', 'price', 'bedrooms', 'sqft_lot',
       'floors', 'waterfront', 'view', 'condition', 'sqft_above',
       'sqft_basement', 'lat', 'long', 'time_since_renovation', 'age', 'label',
       'renovated', 'years_until_renovation', 'years_until_renovation_none',
       'final_category'],
      dtype='object')

In [96]:
import plotly.express as px

df_map = df_results.copy()   # [['id', 'lat', 'long', 'price', 'final_category']] 

fig = px.scatter_map(
    df_map,
    lat="lat",
    lon="long",
    color="final_category",
    color_discrete_map=clusters_palette_plotly,
    size_max=20,
    zoom=12,
    center=dict(
        lat=df_map.lat.mean(),
        lon=df_map.long.mean()
    ),
    hover_name="final_category",
    hover_data=['id', 'price', 'age', 'renovated'],
    map_style="open-street-map"
)

fig.update_traces(marker=dict(opacity=0.6))
# 
# fig.update_layout(
#     mapbox_style="carto-darkmatter",
#     margin={"r":0,"t":0,"l":0,"b":0}
# )

fig.show()


In [50]:
import folium
import matplotlib.cm as cm
import matplotlib.colors as mcolors

clusters = df_results['final_category'].unique()
cmap = cm.get_cmap('tab10', len(clusters))

cluster_colors = {
    cluster: mcolors.rgb2hex(cmap(i))
    for i, cluster in enumerate(clusters)
}

m = folium.Map(
    location=[df_results.lat.mean(), df_results.long.mean()],
    zoom_start=6
)

for _, row in df_results.loc[df_results['final_category'].isin(['Compact Urban Housing'])].iterrows():
    folium.CircleMarker(
        location=[row.lat, row.long],
        radius=row.price,
        color=cluster_colors[row.final_category],
        fill=True,
        fill_color=cluster_colors[row.final_category],
        fill_opacity=0.4,
        tooltip=row.final_category,
        linewidth=0
    ).add_to(m)

m

  cmap = cm.get_cmap('tab10', len(clusters))


In [None]:
from tqdm import tqdm

linkage_methods = ['average', 'complete', 'ward']
df_settings = pd.DataFrame(columns=['linkage', 'n_components', 'n_clusters', 'silhouette_score', 'dbi_score'])

for n in tqdm((6, 8, 10, 12)):
    reducer = umap.UMAP(n_components=n)
    data = df_modelling[[col for col in df_modelling.columns if col not in ('id', 'date')]].values
    scaled_data = StandardScaler().fit_transform(data)
    embedding = reducer.fit_transform(scaled_data)
    X = embedding
    for linkage in linkage_methods:
        print(linkage)
        for n_clusters in range(6, 19, 2):
            model = AgglomerativeClustering(distance_threshold=None, n_clusters=n_clusters, linkage=linkage)

            model = model.fit(X)
            labels = model.labels_

            temp = pd.DataFrame(data={'linkage': linkage,
                                      'n_components': n,
                                      'n_clusters': n_clusters,
                                      'silhouette_score': silhouette_score(X, labels),
                                      'dbi_score': davies_bouldin_score(X, labels)
                                      },
                                index=[0])
            df_settings = pd.concat([df_settings, temp]).reset_index(drop=True)

In [None]:
df_settings.to_csv('param_search_hierarchical_standardized.csv', index=False, sep=';', decimal=',')

In [None]:
df_settings = pd.read_csv('param_search_hierarchical_fixed_age.csv', sep=';', decimal=',')
fig, axes = plt.subplots(2, 3, sharex=True, figsize=(15, 8))

for col, linkage in enumerate(('average', 'complete', 'ward')):
    temp = df_settings.loc[df_settings['linkage'] == linkage]
    for row, score in enumerate(('silhouette_score', 'dbi_score')):
        sns.lineplot(temp, x='n_clusters', y=score, hue='n_components', palette='coolwarm', ax=axes[row, col])
        if row == 0:
            axes[row, col].set_title(f'linkage: {linkage}')
        if score == 'silhouette_score':
            axes[row, col].set_ylim(0.45, 0.7)
        else:
            axes[row, col].set_ylim(0.4, 1)
        plt.tight_layout()
plt.show(block=True)