In [3]:
%matplotlib qt

import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, Normalizer, StandardScaler

from kmodes.kprototypes import KPrototypes
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

import matplotlib.pyplot as plt
import matplotlib.patheffects as PathEffects
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import animation
import seaborn as sns

In [2]:
# Load the complete hackathon dataset from the author's local Google Drive folder.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
df = pd.read_csv(
    filepath_or_buffer=r'C:\Users\TSmeitink\Google Drive\DDMA Hackathon\complete_hackathon_dataset.csv',
)

## Principal Component Analysis (PCA) vs t-Distributed Stochastic Neighbor Embedding (t-SNE)

### Create visualization function

In [4]:
def plot3D(x, clusters, colors, gif_name, angle_step=3, interval=50, fps=10):
    """Render a rotating 3-D scatter plot and save it as an animated GIF.

    Parameters
    ----------
    x : numpy.ndarray
        2-D array indexed as ``x[:, 0]``, ``x[:, 1]`` and ``x[:, 2]``; these
        three columns are used as the point coordinates.
    clusters : pandas.Series
        Cluster label of every point; each label is looked up in ``colors``.
    colors : dict
        Mapping from cluster label to a matplotlib colour.
    gif_name : str or path-like
        Destination of the GIF. Missing parent directories are created.
    angle_step : int, default 3
        Azimuth increment (degrees) between two frames; the frames cover the
        range 0..360.
    interval : int, default 50
        Delay between frames in milliseconds, passed to ``FuncAnimation``.
    fps : int, default 10
        Frames per second used by the ``PillowWriter``.

    Raises
    ------
    ValueError
        If a label in ``clusters`` has no entry in ``colors``.
    """
    from pathlib import Path

    # Resolve the colours first: a label without a colour would otherwise turn
    # into NaN and only fail later, inside matplotlib, with an unclear message.
    point_colors = clusters.map(colors)
    unmapped = point_colors.isna()
    if unmapped.any():
        missing = clusters[unmapped].unique().tolist()
        raise ValueError(f'No colour defined for cluster label(s): {missing}')

    # Saving into a directory that does not exist fails, so create it up front.
    Path(gif_name).parent.mkdir(parents=True, exist_ok=True)

    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(xs=x[:, 0], ys=x[:, 1], zs=x[:, 2], s=10, c=point_colors)
    ax.axis('off')
    ax.axis('tight')

    def rotate(angle):
        # Only the azimuth changes from frame to frame.
        ax.view_init(azim=angle)

    ani = animation.FuncAnimation(
        fig, rotate, frames=np.arange(0, 360, angle_step), interval=interval
    )
    ani.save(gif_name, writer=animation.PillowWriter(fps=fps))

### Prepare data

In [8]:
# Columns taken from the raw dataset, in the order they are kept in new_df.
fields = [
    'id',
    'indexed_price',
    'build_year',
    'use_surface',
    'parcel_surface',
    'score_totaal_2018',
    'number_of_objects',
    'huishoudens_met_kinderen',
    'gemiddelde_huishoudensgrootte',
    '0_tot_15',
    '15_tot_25',
    '25_tot_45',
    '45_tot_65',
    '65_+',
    'omgevingsadressendichtheid',
    'koopwoningen',
    'build_type',
    'energy_label',
    'migratieachtergrond',
    'elektriciteitsverbruik',
    'aardgasverbruik',
    'percentage_meergezinswoning',
    'oppervlakte_land',
    'personenautos_huishouden',
]

# Columns treated as categorical.
cat = ['build_type', 'energy_label']

# Everything else, apart from the identifier, is treated as numerical.
num = [name for name in fields if name not in ('id', *cat)]

In [6]:
# Standardise the numerical columns of df; the scaled values overwrite the
# original ones in place.
scalar = StandardScaler()
scalar.fit(df[num])
df[num] = scalar.transform(df[num])

In [None]:
# Independent copy restricted to the selected columns, so later edits to
# new_df do not write back into df.
new_df = df.loc[:, fields].copy()

In [9]:
# Plot colour per cluster label (label i -> i-th colour); the number of entries
# is also used as the number of clusters.
colors = dict(enumerate(
    ('blue', 'red', 'green', 'yellow', 'black', 'pink', 'orange', 'gray')
))

### Create clusters

In [10]:
class Clusters:
    """Attach cluster labels to a DataFrame with K-Means or K-Prototypes.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is copied, so the caller's frame is never modified.
    categorical : list of str
        Names of the categorical columns.
    n_clusters : int
        Number of clusters to fit.
    random_state : int or None, default None
        Seed forwarded to ``KMeans`` / ``KPrototypes``. ``None`` keeps the
        unseeded behaviour.

    Attributes
    ----------
    df : pandas.DataFrame
        Working frame; receives a ``'clusters'`` column after fitting.
    numerical : list of str
        Every column that is not categorical, not ``'id'`` and not a
        ``'clusters'`` column left over from a previous run.
    """

    def __init__(self, df, categorical, n_clusters, random_state=None):
        # Work on a private copy: adding the 'clusters' column must not
        # mutate the frame owned by the caller.
        self.df = df.copy()
        self.categorical = categorical
        # Exclude 'clusters' as well; otherwise re-creating the object from an
        # already-labelled frame would feed the old labels in as a feature.
        self.numerical = [
            col for col in self.df.columns
            if col not in self.categorical and col not in ('id', 'clusters')
        ]
        self.n_clusters = n_clusters
        self.random_state = random_state

    def k_means_func(self):
        """Fit K-Means on the numerical columns and store the labels.

        Rows with missing values are not dropped here (unlike
        ``k_prototype_func``). Returns ``self`` so calls can be chained.
        """
        model = KMeans(n_clusters=self.n_clusters, random_state=self.random_state)
        self.df['clusters'] = model.fit(np.array(self.df[self.numerical])).labels_
        return self

    def k_prototype_func(self):
        """Fit K-Prototypes on categorical + numerical columns and store the labels.

        ``self.df`` is first reduced to id, categorical and numerical columns
        (in that order) and rows containing missing values are dropped.
        Returns ``self`` so calls can be chained.
        """
        self.df = self.df[['id', *self.categorical, *self.numerical]].dropna()
        model = KPrototypes(
            n_clusters=self.n_clusters, init='Cao', random_state=self.random_state
        )
        # Once 'id' is dropped the categorical columns come first, so their
        # positions are 0 .. len(categorical) - 1.
        self.df['clusters'] = model.fit_predict(
            self.df.drop(columns='id'),
            categorical=list(range(len(self.categorical))),
        )
        return self

In [11]:
# One cluster per entry in the colour table.
cluster = Clusters(new_df, cat, len(colors))

# K-Means clustering

In [11]:
# k_means_func() returns the Clusters instance itself; the labelled frame is
# then read back from its .df attribute.
cluster.k_means_func()
new_df = cluster.df

In [12]:
# Number of rows assigned to each cluster label.
new_df['clusters'].value_counts()

1    68364
7    37019
3    20268
2    18651
5    16752
0    11250
6     7475
4     6191
Name: clusters, dtype: int64

In [13]:
# Preview the first rows of the frame, including the new 'clusters' column.
new_df.head()

Unnamed: 0,id,indexed_price,build_year,use_surface,parcel_surface,score_totaal_2018,number_of_objects,huishoudens_met_kinderen,gemiddelde_huishoudensgrootte,0_tot_15,...,koopwoningen,build_type,energy_label,migratieachtergrond,elektriciteitsverbruik,aardgasverbruik,percentage_meergezinswoning,oppervlakte_land,personenautos_huishouden,clusters
0,3595,0.3969,-1.746591,3.229399,-0.139897,-0.332839,-0.27028,-1.350204,-1.350984,-1.280426,...,-1.077246,Rijwoning hoek,G,-0.804508,-1.087831,0.096262,0.444671,-0.180873,-0.691604,2
1,8647,-0.769353,-2.032465,-0.784834,-0.485777,-0.332839,-0.27028,-1.350204,-1.350984,-1.280426,...,-1.077246,Vrijstaande woning,G,-0.804508,-1.087831,0.096262,0.444671,-0.180873,-0.691604,2
2,8648,-0.079121,-1.226822,2.481292,0.446028,-0.332839,-0.27028,-1.350204,-1.350984,-1.280426,...,-1.077246,Vrijstaande woning,G,-0.804508,-1.087831,0.096262,0.444671,-0.180873,-0.691604,2
3,11133,-1.15017,-2.11043,3.941013,-0.039431,-0.332839,-0.27028,-1.350204,-1.350984,-1.280426,...,-1.077246,Vrijstaande woning,G,-0.804508,-1.087831,0.096262,0.444671,-0.180873,-0.691604,2
4,11134,-1.15017,1.372025,0.492422,-0.072408,-0.332839,-0.27028,-1.350204,-1.350984,-1.280426,...,-1.077246,Vrijstaande woning,G,-0.804508,-1.087831,0.096262,0.444671,-0.180873,-0.691604,2


### PCA

In [None]:
# Reduce a fixed-seed sample of 10,000 rows (numerical columns only) to three
# principal components.
pca = PCA(n_components=3)
pca_result = pca.fit_transform(new_df[num].sample(10_000, random_state=1))

# One named column per component.
pca_df = pd.DataFrame(pca_result, columns=['pca1', 'pca2', 'pca3'])

# NOTE: despite its name, this holds all three principal components.
top_two_comp = pca_df[['pca1', 'pca2', 'pca3']]

In [16]:
# Visualizing the PCA output. The labels are drawn with the same sample size
# and seed as the PCA input sample.
plot3D(
    x=top_two_comp.values,
    clusters=new_df.sample(10_000, random_state=1)['clusters'],
    colors=colors,
    gif_name='../output/pca_kmeans.gif',
)

### T-SNE

In [5]:
# Reload the clustered frame that the export cell of this notebook writes to disk.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
new_df = pd.read_csv(
    filepath_or_buffer=r'C:\Users\TSmeitink\Google Drive\DDMA Hackathon\clusters_netherlands_kmeans.csv',
)

In [12]:
# Invert the colour table (colour name -> cluster label) and use it to turn the
# stored colour names back into integer labels.
label_by_color = dict(zip(colors.values(), colors.keys()))
new_df['clusters'] = new_df['clusters'].map(label_by_color)

In [14]:
# t-SNE is stochastic: without a fixed random_state every run produces a
# different embedding, so the saved GIF could not be reproduced. The input is
# a fixed-seed sample of 10,000 rows of the numerical columns.
tsne = TSNE(n_components=3, random_state=1).fit_transform(
    new_df[num].sample(10_000, random_state=1)
)

In [45]:
# Animate the t-SNE embedding. The labels are drawn with the same sample size
# and seed as the t-SNE input sample.
plot3D(
    x=tsne,
    clusters=new_df.sample(10_000, random_state=1)['clusters'],
    colors=colors,
    gif_name='../output/tsne_kmeans.gif',
)

# K-Prototypes (a mix of numerical and categorical data)

In [None]:
# k_prototype_func() returns the Clusters instance itself; the labelled frame
# is then read back from its .df attribute.
cluster.k_prototype_func()
new_df = cluster.df

In [10]:
# Number of rows assigned to each cluster label.
new_df['clusters'].value_counts()

4    4214
1    2567
2    1398
0    1102
3     719
Name: clusters, dtype: int64

### PCA

In [24]:
# Reduce the numerical columns of new_df to three principal components.
pca = PCA(n_components=3)
pca_result = pca.fit_transform(new_df[num])

# One named column per component.
pca_df = pd.DataFrame(pca_result, columns=['pca1', 'pca2', 'pca3'])

# NOTE: despite its name, this holds all three principal components.
top_two_comp = pca_df[['pca1', 'pca2', 'pca3']]

# Visualizing the PCA output.
plot3D(
    x=top_two_comp.values,
    clusters=new_df['clusters'],
    colors=colors,
    gif_name='pca_kprototype.gif',
)

build_type                       9801
energy_label                     9801
indexed_price                    9801
build_year                       9801
use_surface                      9801
parcel_surface                   9801
score_totaal_2018                9801
number_of_objects                9801
huishoudens_met_kinderen         9801
gemiddelde_huishoudensgrootte    9801
0_tot_15                         9801
15_tot_25                        9801
25_tot_45                        9801
45_tot_65                        9801
65_+                             9801
omgevingsadressendichtheid       9801
koopwoningen                     9801
migratieachtergrond              9801
elektriciteitsverbruik           9801
aardgasverbruik                  9801
percentage_meergezinswoning      9801
oppervlakte_land                 9801
personenautos_huishouden         9801
clusters                         9801
dtype: int64


### T-SNE

In [25]:
# t-SNE is stochastic: without a fixed random_state every run produces a
# different embedding, so the saved GIF could not be reproduced.
tsne = TSNE(n_components=3, random_state=1).fit_transform(new_df[num])

In [26]:
# Animate the t-SNE embedding, coloured by cluster label, then show the figure.
plot3D(
    x=tsne,
    clusters=new_df['clusters'],
    colors=colors,
    gif_name='tsne_kprototype.gif',
)
plt.show()

In [18]:
# Replace the integer cluster labels by their colour names, then export.
# Caution: this overwrites new_df['clusters'] in place. Running the cell a
# second time looks the colour names up in `colors` (keyed by integer label)
# and turns every value into NaN.
color_names = new_df['clusters'].map(colors)
new_df['clusters'] = color_names
new_df.to_csv(
    path_or_buf=r'C:\Users\TSmeitink\Google Drive\DDMA Hackathon\clusters_netherlands_kmeans.csv',
    index=False,
)