In [1]:
%matplotlib qt

import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, Normalizer, StandardScaler

from kmodes.kprototypes import KPrototypes
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

import matplotlib.pyplot as plt
import matplotlib.patheffects as PathEffects
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import animation
import seaborn as sns

In [2]:
df = pd.read_csv(r'C:\Users\TSmeitink\Google Drive\DDMA Hackathon\complete_hackathon_dataset.csv').sample(10_000)

In [3]:
df.head()

Unnamed: 0,gwb_code_10,gwb_code_8,regio,gm_naam,gwb_code,aantal_inwoners,mannen,vrouwen,0_tot_15,15_tot_25,...,energy_label,number_of_objects,buurt2019,gemeente2019,pc5,id,mun_code,buurt_code,indexed_price,indexed_date
149581,9570302,9570302,Tegelarijeveld-Broekhin,Roermond,BU09570302,-0.243506,0.49734,0.5,0.132979,0.106383,...,D,0.0,9570302,957,6042X,144266,957,9570302,-0.560205,2019-Q3
13445,1185006,1185006,Venesluis,Hoogeveen,BU01185006,-0.253247,0.483784,0.516216,0.105405,0.097297,...,D,0.0,1185006,118,7907A,68349,118,1185006,0.183122,2019-Q3
76750,4000600,4000600,Julianadorp-Oost,Den Helder,BU04000600,-0.547078,0.444444,0.550265,0.142857,0.068783,...,B,24.0,4000600,400,1787A,135542,400,4000600,-0.733479,2019-Q3
18024,1530202,1530202,Pathmos,Enschede,BU01530202,-0.185065,0.497573,0.502427,0.177184,0.123786,...,F,0.0,1530202,153,7545T,118424,153,1530202,-0.288062,2019-Q3
127738,7580200,7580200,Brabantpark,Breda,BU07580200,2.423701,0.47053,0.52947,0.132244,0.189203,...,D,3.0,7580200,758,4817J,149179,758,7580200,-0.253086,2019-Q3


## PCA vs T-SNE

### Create visualization function

In [4]:
def plot3D(x, clusters, colors, gif_name):
  
    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(xs=x[:,0], ys=x[:,1], zs=x[:,2], s=40, c=clusters.map(colors))
    ax.axis('off')
    ax.axis('tight')
    
    def rotate(angle):
        ax.view_init(azim=angle)

    angle = 3
    ani = animation.FuncAnimation(fig, rotate, frames=np.arange(0, 360, angle), interval=1000)
    ani.save(gif_name, writer=animation.PillowWriter(fps=10))

### Prepare data

In [5]:
fields = ['id', 'indexed_price', 'build_year', 'use_surface', 'parcel_surface', 'score_totaal_2018', 'number_of_objects',
         'huishoudens_met_kinderen', 'gemiddelde_huishoudensgrootte', '0_tot_15', '15_tot_25',
         '25_tot_45', '45_tot_65', '65_+', 'omgevingsadressendichtheid', 'koopwoningen',
         'build_type', 'energy_label', 'migratieachtergrond', 'elektriciteitsverbruik', 'aardgasverbruik',
         'percentage_meergezinswoning', 'oppervlakte_land', 'personenautos_huishouden']
cat = ['build_type', 'energy_label']
num = [col for col in fields if col not in cat and col != 'id']

In [6]:
scalar = StandardScaler()
df[num] = scalar.fit_transform(df[num])

In [7]:
new_df = df[fields].copy()
colors = {0: 'blue', 1: 'red', 2: 'green', 3: 'yellow', 4: 'black'}

### Create clusters

In [20]:
class Clusters:
    def __init__(self, df, categorical, n_clusters):
        self.df = df
        self.categorical = categorical
        self.numerical = [col for col in self.df.columns if col not in self.categorical]
        self.n_clusters = n_clusters
    
    def k_means_func(self):
        self.df['clusters'] = KMeans(n_clusters=self.n_clusters).fit(np.array(self.df[self.numerical])).labels_
        return self
    
    def k_prototype_func(self):
        self.df = self.df[[*self.categorical, *self.numerical]].dropna()
        self.df['clusters'] = KPrototypes(n_clusters=self.n_clusters, init='Cao'). \
            fit_predict(self.df, categorical=list(range(len(self.categorical))))
        return self

In [23]:
cluster = Clusters(df=new_df, categorical=cat, n_clusters=5)

# K-Means

In [9]:
new_df = cluster.k_means_func().df

In [10]:
new_df['clusters'].value_counts()

4    4214
1    2567
2    1398
0    1102
3     719
Name: clusters, dtype: int64

### PCA

In [12]:
pca = PCA(n_components=3)
pca_result = pca.fit_transform(new_df[num])

pca_df = pd.DataFrame(columns=['pca1','pca2', 'pca3'])

pca_df['pca1'] = pca_result[:,0]
pca_df['pca2'] = pca_result[:,1]
pca_df['pca3'] = pca_result[:,2]

top_two_comp = pca_df[['pca1','pca2', 'pca3']] # taking first and second principal component

plot3D(top_two_comp.values, new_df['clusters'], colors, 'pca_kmeans.gif') # Visualizing the PCA output

### T-SNE

In [13]:
tsne = TSNE(n_components=3).fit_transform(new_df[num])

In [14]:
plot3D(tsne, new_df['clusters'], colors, 'tsne_kmeans.gif')
plt.show();

# K-Prototypes

In [None]:
new_df = cluster.k_prototype_func().df

In [10]:
new_df['clusters'].value_counts()

4    4214
1    2567
2    1398
0    1102
3     719
Name: clusters, dtype: int64

### PCA

In [24]:
pca = PCA(n_components=3)
pca_result = pca.fit_transform(new_df[num])

pca_df = pd.DataFrame(columns=['pca1','pca2', 'pca3'])

pca_df['pca1'] = pca_result[:,0]
pca_df['pca2'] = pca_result[:,1]
pca_df['pca3'] = pca_result[:,2]

top_two_comp = pca_df[['pca1','pca2', 'pca3']] # taking first and second principal component

plot3D(top_two_comp.values, new_df['clusters'], colors, 'pca_kprototype.gif') # Visualizing the PCA output

build_type                       9801
energy_label                     9801
indexed_price                    9801
build_year                       9801
use_surface                      9801
parcel_surface                   9801
score_totaal_2018                9801
number_of_objects                9801
huishoudens_met_kinderen         9801
gemiddelde_huishoudensgrootte    9801
0_tot_15                         9801
15_tot_25                        9801
25_tot_45                        9801
45_tot_65                        9801
65_+                             9801
omgevingsadressendichtheid       9801
koopwoningen                     9801
migratieachtergrond              9801
elektriciteitsverbruik           9801
aardgasverbruik                  9801
percentage_meergezinswoning      9801
oppervlakte_land                 9801
personenautos_huishouden         9801
clusters                         9801
dtype: int64


## T-SNE

In [25]:
tsne = TSNE(n_components=3).fit_transform(new_df[num])

In [26]:
plot3D(tsne, new_df['clusters'], colors, 'tsne_kprototype.gif')
plt.show();