In [None]:
import geopandas as gpd
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import preprocessing
import numpy as np
from sklearn.mixture import GaussianMixture

In [None]:
# Input parquet file that is loaded into `data`.
path = "files/contextual.parquet"

In [None]:
# Load the input table. Every column is later scaled and used as a
# clustering feature, so the frame is expected to be fully numeric.
data = pd.read_parquet(path=path)

In [None]:
# normalise data

# Standardise every column (StandardScaler: zero mean, unit variance) and
# write the scaled values back into `data`, so all later cells operate on
# the normalised features. `scaler` is kept so the transform can be
# inverted or re-applied.
scaler = preprocessing.StandardScaler()
cols = list(data.columns)
data[cols] = scaler.fit_transform(data[cols])

The data are now normalised; let's save them.

In [None]:
# Persist the normalised table next to the input file.
normalised_path = 'files/contex_data_norm.parquet'
data.to_parquet(normalised_path)

In [None]:
# Search grid for the BIC sweep: candidate component counts (2..39) and the
# number of independent fits per candidate.
gmmruns = 3
n_components_range = range(2, 40)

# Result table for the sweep (one row per fit) and the label of the next
# row to be written.
ix = 0
bic = pd.DataFrame(columns=['n', 'bic', 'run'])

Measure the Bayesian information criterion (BIC) over a range of component counts to estimate the optimal number of clusters.

In [None]:
# BIC sweep: for every candidate component count, fit `gmmruns` Gaussian
# mixtures and record the BIC of each fit as one row of `bic`
# (columns: n, bic, run).
sample = data  # currently the full dataset; the model is fitted AND scored on this frame
records = []   # rebuilt from scratch so re-running the cell never duplicates rows
try:
    for n_components in n_components_range:
        for i in range(gmmruns):
            # A distinct fixed seed per run keeps the repeated fits different
            # from one another yet reproducible across kernel restarts.
            gmm = GaussianMixture(
                n_components=n_components,
                covariance_type="full",
                max_iter=200,
                n_init=1,
                random_state=i,
                verbose=1,
            )
            gmm.fit(sample)
            # Score on the same data the model was fitted to; scoring on a
            # different frame would not be the BIC of this fit.
            bicnum = gmm.bic(sample)
            records.append((n_components, bicnum, i))

            print(n_components, i, "BIC:", bicnum)
finally:
    # Build the table once instead of growing it row by row. Doing it in
    # `finally` keeps the fits that did complete if the sweep is interrupted.
    bic = pd.DataFrame(records, columns=['n', 'bic', 'run'])
    ix = len(bic)  # row counter kept in sync with the table

In [None]:
# Persist the BIC sweep results as CSV (the frame's index is written as the
# first column).
bic_csv = 'files/complete_BIC.csv'
bic.to_csv(bic_csv)

Plot BIC values

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# BIC against number of components. With several runs per `n`, seaborn's
# lineplot aggregates them into one line with its default error band.
fig, ax = plt.subplots(figsize=(16, 16))
sns.lineplot(data=bic, x='n', y='bic', ax=ax)
fig.savefig('files/complete_BIC.pdf')

## Clustering

In [None]:
# Number of mixture components for the final model
# (presumably read off the BIC curve above — TODO confirm).
n = 30

# Fit on the feature columns only: a later cell adds a `cluster` column to
# `data`, and without the drop a re-run of this cell would silently treat
# the previous labels as a feature. On a fresh run the drop is a no-op.
features = data.drop(columns='cluster', errors='ignore')
# Fixed seed so the labels written to disk are reproducible; n_init=5 keeps
# the best of five initialisations.
gmm = GaussianMixture(n_components=n, covariance_type="full", max_iter=200, n_init=5, random_state=0, verbose=1)
fitted = gmm.fit(features)  # fit() returns the estimator itself

In [None]:
# Label every row with its most likely mixture component. Any existing
# `cluster` column is dropped before predicting so the cell can be re-run:
# otherwise the labels from the previous run would be passed to the model
# as an extra feature. On a fresh run the drop is a no-op.
data['cluster'] = gmm.predict(data.drop(columns='cluster', errors='ignore'))

In [None]:
# Export the cluster label of every `uID` (reset_index turns the index into
# a column; presumably the index is named `uID` — confirm against the input).
# The component count in the file name is taken from the fitted model so the
# name cannot go stale when the number of components is changed; for 30
# components this is the same path as before.
data.reset_index()[['cluster', 'uID']].to_csv(
    'files/200309_clusters_complete_n{}.csv'.format(gmm.n_components)
)

## Dendrogram

In [None]:
from scipy.cluster import hierarchy
import matplotlib.pyplot as plt

In [None]:
# Two-column table of cluster label per `uID`; reset_index exposes the index
# as a regular column (presumably the index is named `uID` — confirm).
clusters = data.reset_index().loc[:, ['cluster', 'uID']]

Plot the dendrogram of the cluster means and save it to PDF.

In [None]:
# Mean of every column within each cluster: one row per cluster label.
group = data.groupby('cluster').mean()

# Agglomerative (Ward) linkage over those per-cluster means.
Z = hierarchy.linkage(group, method='ward')

# Draw the tree on a wide canvas, labelling leaves with the cluster ids.
# Links below a distance of 30 are coloured per branch
# (NOTE(review): 30 is a distance threshold, unrelated to the number of clusters — confirm the value).
tree_fig = plt.figure(figsize=(25, 10))
dn = hierarchy.dendrogram(Z, labels=group.index, color_threshold=30)

tree_fig.savefig('tree.pdf')