In [None]:
from pathlib import Path

import pandas as pd
import numpy as np

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_samples
from sklearn.preprocessing import StandardScaler

from matplotlib import use
use('Qt5Agg')
try:
    import matplotlib.pyplot as plt
except ImportError:
    use('TkAgg')
    import matplotlib.pyplot as plt
import matplotlib.patheffects as PathEffects
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import animation
import seaborn as sns

In [None]:
# Folder holding the hackathon input file and the exported results,
# resolved relative to the current user's home directory.
path = Path.home().joinpath("Google Drive/DDMA Hackathon")

In [None]:
# Load the raw hackathon dataset from the data folder.
df = pd.read_csv(path.joinpath('complete_hackathon_dataset.csv'))

### Prepare data

In [None]:
# Columns kept for the clustering: the row identifier plus every feature.
fields = [
    'id',
    'indexed_price',
    'build_year',
    'use_surface',
    'parcel_surface',
    'score_totaal_2018',
    'number_of_objects',
    'huishoudens_met_kinderen',
    'gemiddelde_huishoudensgrootte',
    '0_tot_15',
    '15_tot_25',
    '25_tot_45',
    '45_tot_65',
    '65_+',
    'omgevingsadressendichtheid',
    'koopwoningen',
    'build_type',
    'energy_label',
    'migratieachtergrond',
    'elektriciteitsverbruik',
    'aardgasverbruik',
    'percentage_meergezinswoning',
    'oppervlakte_land',
    'personenautos_huishouden',
]

# Categorical features; everything else except the identifier is numeric.
cat = ['build_type', 'energy_label']
num = [name for name in fields if name != 'id' and name not in cat]

In [None]:
# Standardise the numeric columns of `df` in place; the fitted scaler is kept
# in `scalar` so the same transformation can be reused later.
scalar = StandardScaler()
scalar.fit(df[num])
df[num] = scalar.transform(df[num])

In [None]:
# Independent copy restricted to the selected columns, so that adding the
# cluster labels later does not touch `df`.
new_df = df.loc[:, fields].copy()

In [None]:
# Plot colour for each cluster label; the number of entries also determines
# how many clusters are requested below.
colors = dict(enumerate(
    ['pink', 'red', 'green', 'yellow', 'black', 'blue', 'orange', 'gray']
))

### Create clusters

In [None]:
class Clusters:
    """Attach cluster labels to a DataFrame with K-Means or K-Prototypes.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to cluster. The frame is stored as-is (no copy), so
        ``k_means_func`` adds its ``clusters`` column to the caller's object.
    categorical : list of str
        Names of the categorical columns in ``df``.
    n_clusters : int
        Number of clusters to fit.
    random_state : int or None, optional
        Seed forwarded to the estimators so that results can be reproduced.
        The default ``None`` leaves them unseeded.

    Attributes
    ----------
    numerical : list of str
        Every column of ``df`` that is neither categorical nor ``'id'``,
        determined once at construction time.
    """

    def __init__(self, df, categorical, n_clusters, random_state=None):
        self.df = df
        self.categorical = categorical
        self.numerical = [col for col in self.df.columns if col not in self.categorical and col != 'id']
        self.n_clusters = n_clusters
        self.random_state = random_state

    def k_means_func(self):
        """Fit K-Means on the numerical columns and store the labels.

        The labels are written to ``self.df['clusters']``.

        Returns
        -------
        Clusters
            ``self``, so the call can be chained (e.g. ``.k_means_func().df``).
        """
        model = KMeans(n_clusters=self.n_clusters, random_state=self.random_state)
        self.df['clusters'] = model.fit(np.array(self.df[self.numerical])).labels_
        return self

    def k_prototype_func(self):
        """Fit K-Prototypes on categorical + numerical columns and store the labels.

        ``self.df`` is replaced by a frame holding only ``'id'``, the
        categorical and the numerical columns, with incomplete rows dropped;
        the labels are written to its ``clusters`` column.

        Returns
        -------
        Clusters
            ``self``, so the call can be chained.
        """
        # Imported here because the name was never imported at file level
        # (calling this method used to raise NameError) and because the
        # K-Means path should keep working when kmodes is not installed.
        from kmodes.kprototypes import KPrototypes

        self.df = self.df[['id', *self.categorical, *self.numerical]].dropna()
        # Once 'id' is dropped the categorical columns come first, so their
        # positional indices are 0 .. len(categorical) - 1.
        features = self.df.drop(columns='id')
        model = KPrototypes(n_clusters=self.n_clusters, init='Cao', random_state=self.random_state)
        self.df['clusters'] = model.fit_predict(
            features, categorical=list(range(len(self.categorical)))
        )
        return self

In [None]:
# One cluster per available plot colour.
cluster = Clusters(
    df=new_df,
    categorical=cat,
    n_clusters=len(colors),
)

# K-Means clustering

In [None]:
# Fit K-Means, then pick up the frame that now carries the 'clusters' column.
cluster.k_means_func()
new_df = cluster.df

## Principal Component Analysis (PCA) vs. t-distributed Stochastic Neighbor Embedding (t-SNE)

### Create visualization function

In [None]:
def plot3D(x, clusters, colors, gif_name):
    """Save a rotating 3-D scatter plot of clustered points as an animated GIF.

    Parameters
    ----------
    x : numpy.ndarray
        2-D array; columns 0, 1 and 2 are used as the x, y and z coordinates.
    clusters : pandas.Series
        Cluster label of each row of ``x``; translated to point colours with
        ``clusters.map(colors)``.
    colors : dict
        Mapping from cluster label to a matplotlib colour.
    gif_name : str or path-like
        Destination of the GIF. Missing parent folders are created.
    """
    # Saving does not create folders, so a missing output directory would
    # otherwise fail only after the whole figure has been built.
    Path(gif_name).parent.mkdir(parents=True, exist_ok=True)

    fig = plt.figure(figsize=(10, 10))
    try:
        ax = fig.add_subplot(111, projection='3d')
        ax.scatter(xs=x[:,0], ys=x[:,1], zs=x[:,2], s=10, c=clusters.map(colors))
        ax.axis('off')
        ax.axis('tight')

        def rotate(angle):
            # Only the azimuth changes between frames.
            ax.view_init(azim=angle)

        # One full turn in steps of 3 degrees.
        angle = 3
        ani = animation.FuncAnimation(fig, rotate, frames=np.arange(0, 360, angle), interval=50)
        ani.save(gif_name, writer=animation.PillowWriter(fps=10))
    finally:
        # The figure is only a vehicle for the GIF; release it so repeated
        # calls do not pile up open figures in the kernel.
        plt.close(fig)

### PCA

In [None]:
# Project a fixed 10,000-row sample of the numeric features onto three
# principal components.
pca = PCA(n_components=3)
pca_result = pca.fit_transform(new_df[num].sample(10_000, random_state=1))

# One named column per component.
pca_df = pd.DataFrame(pca_result, columns=['pca1', 'pca2', 'pca3'])

# Despite its name, this holds all three principal components.
top_two_comp = pca_df[['pca1', 'pca2', 'pca3']]

In [None]:
# Visualise the PCA output. Sampling with the same size and seed as the PCA
# cell selects the same rows, so labels and points line up.
plot3D(
    x=top_two_comp.values,
    clusters=new_df.sample(10_000, random_state=1)['clusters'],
    colors=colors,
    gif_name='../output/pca_kmeans.gif',
)

### t-SNE

In [None]:
# Embed the same fixed 10,000-row sample in three dimensions. t-SNE is
# stochastic, so it is seeded to make the embedding (and the resulting GIF)
# reproducible across runs.
tsne = TSNE(n_components=3, random_state=1).fit_transform(
    new_df[num].sample(10_000, random_state=1)
)

In [None]:
# Visualise the t-SNE output. Sampling with the same size and seed as the
# t-SNE cell selects the same rows, so labels and points line up.
plot3D(
    x=tsne,
    clusters=new_df.sample(10_000, random_state=1)['clusters'],
    colors=colors,
    gif_name='../output/tsne_kmeans.gif',
)

## Model evaluation

In [None]:
# Silhouette coefficient of every row, then the overall average.
# The imported function is `silhouette_samples`; the previous spelling
# `silhouette_sample` raised a NameError.
# NOTE: this scores every row of new_df, not the 10,000-row sample used for
# the plots.
silhouette_per_sample = silhouette_samples(new_df[num], new_df['clusters'])
silhouette_avg = np.mean(silhouette_per_sample)
silhouette_avg

In [None]:
# Export with the cluster labels replaced by their colour names.
# The mapping is applied to a temporary copy: overwriting new_df['clusters']
# made this cell non-repeatable (a second run mapped colour names through
# `colors` again and wrote NaN) and broke re-running the plot cells.
new_df.assign(clusters=new_df['clusters'].map(colors)).to_csv(
    path / "clusters_netherlands_kmeans.csv", index=False
)