<a href="https://colab.research.google.com/github/pawel0508/MachineLearning_UcznieNienadzorowane/blob/main/clustering_comparison.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import plotly.express as px


Generowanie danych

In [3]:
from sklearn.datasets import make_blobs
blobs_data = make_blobs(n_samples=1000, cluster_std=0.7, random_state=24, center_box=(-4.0, 4.0))[0]
blobs = pd.DataFrame(data = blobs_data, columns = ['x1', 'x2'])
print(blobs.head())
px.scatter(data_frame = blobs, x = 'x1', y = 'x2', template = 'simple_white', 
           width = 950, height = 500, title = 'Blobs')

         x1        x2
0  5.531036 -2.805156
1  3.590178  2.185291
2 -1.141389  2.706887
3 -0.974927  0.116028
4  4.117031 -1.707794


In [6]:
from sklearn.datasets import make_circles
circles_data = make_circles(n_samples=1000, factor = 0.5, noise = 0.05)[0]
circles = pd.DataFrame(data = circles_data, columns = ['x1', 'x2'])
print(circles.head())
px.scatter(data_frame = circles, x = 'x1', y = 'x2', template = 'simple_white', 
           width = 950, height = 500, title = 'Circles')

         x1        x2
0  0.173990  0.499537
1  0.974548 -0.205456
2 -0.195038  0.540015
3 -0.848091  0.426460
4 -0.697502 -0.634476


In [7]:
from sklearn.datasets import make_moons
moons_data = make_circles(n_samples=1000, noise = 0.05)[0]
moons = pd.DataFrame(data = moons_data, columns = ['x1', 'x2'])
print(moons.head())
px.scatter(data_frame = moons, x = 'x1', y = 'x2', template = 'simple_white', 
           width = 950, height = 500, title = 'Moons')

         x1        x2
0  0.965322  0.063871
1 -0.677241 -0.380720
2 -0.696210  0.651151
3  0.675098 -0.453518
4  0.151889 -0.819335


In [9]:
random_data = np.random.rand(1500, 2)
random = pd.DataFrame(data = random_data, columns = ['x1', 'x2'])
print(random.head())
px.scatter(data_frame = random, x = 'x1', y = 'x2', width = 950, height = 500, title = 'Random', 
           template = 'simple_white')

         x1        x2
0  0.395594  0.130885
1  0.533977  0.749413
2  0.545137  0.660118
3  0.548977  0.791912
4  0.682114  0.083761


Porównanie algorytmów - blobs data - 3 klastry

In [20]:
from plotly.subplots import make_subplots

fig = make_subplots(rows = 1, cols = 3, shared_yaxes = True, horizontal_spacing = 0.01)

from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters = 3)
kmeans.fit(blobs_data)
blobs['cluster'] = kmeans.predict(blobs_data)
trace1 = px.scatter(data_frame = blobs, x = 'x1', y = 'x2', color = 'cluster',
                    width=800, height=500)['data'][0]
fig.add_trace(trace1, row=1, col=1)

from sklearn.cluster import AgglomerativeClustering
aglo = AgglomerativeClustering(n_clusters = 3, affinity = 'euclidean')
aglo.fit(blobs_data)
blobs['cluster'] = aglo.fit_predict(blobs_data)
trace2 = px.scatter(data_frame = blobs, x = 'x1', y = 'x2', color = 'cluster',
                    width=800, height=500)['data'][0]
fig.add_trace(trace2, row=1, col=2)

from sklearn.cluster import DBSCAN
dbscan = DBSCAN(eps = 0.5, min_samples = 5)
dbscan.fit(blobs_data)
blobs['cluster'] = dbscan.labels_
trace3 = px.scatter(data_frame = blobs, x = 'x1', y = 'x2', color = 'cluster',
                    width=800, height=500)['data'][0]
fig.add_trace(trace3, row=1, col=3)

fig.update_layout(title='KMeans vs. Agglomerative Clustering vs. DBSCAN - circle data', template='simple_white',
                  coloraxis = {'colorscale': px.colors.diverging.BrBG})

In [21]:
fig = make_subplots(rows = 1, cols = 3, shared_yaxes = True, horizontal_spacing = 0.01)

kmeans = KMeans(n_clusters = 2)
kmeans.fit(circles_data)
circles['cluster'] = kmeans.predict(circles_data)
trace1 = px.scatter(data_frame = circles, x = 'x1', y = 'x2', color = 'cluster',
                    width=800, height=500)['data'][0]
fig.add_trace(trace1, row=1, col=1)


aglo = AgglomerativeClustering(n_clusters = 2, affinity = 'euclidean')
aglo.fit(circles_data)
circles['cluster'] = aglo.fit_predict(circles_data)
trace2 = px.scatter(data_frame = circles, x = 'x1', y = 'x2', color = 'cluster',
                    width=800, height=500)['data'][0]
fig.add_trace(trace2, row=1, col=2)

dbscan = DBSCAN(eps = 0.1, min_samples = 5)
dbscan.fit(circles_data)
circles['cluster'] = dbscan.labels_
trace3 = px.scatter(data_frame = circles, x = 'x1', y = 'x2', color = 'cluster',
                    width=800, height=500)['data'][0]
fig.add_trace(trace3, row=1, col=3)
fig.update_layout(title='KMeans vs. Agglomerative Clustering vs. DBSCAN - circles data', template='simple_white',
                  coloraxis = {'colorscale': px.colors.diverging.BrBG})

In [23]:
fig = make_subplots(rows=1, cols=3, shared_yaxes=True, horizontal_spacing=0.01)

# KMeans
kmeans = KMeans(n_clusters=5)
kmeans.fit(random_data)
clusters = kmeans.predict(random_data)
random['cluster'] = clusters
trace1 = px.scatter(random, 'x1', 'x2', 'cluster', width=800, height=500)['data'][0]
fig.add_trace(trace1, row=1, col=1)

# AgglomerativeClustering
agglo = AgglomerativeClustering(n_clusters=5, affinity='euclidean')
clusters = agglo.fit_predict(random_data)
random['cluster'] = clusters
trace2 = px.scatter(random, 'x1', 'x2', 'cluster', width=800, height=500)['data'][0]
fig.add_trace(trace2, row=1, col=2)

# DBSCAN
dbscan = DBSCAN(eps=0.05, min_samples=5)
dbscan.fit(random_data)
clusters = dbscan.labels_
random['cluster'] = clusters
trace3 = px.scatter(random, 'x1', 'x2', 'cluster', width=800, height=500)['data'][0]
fig.add_trace(trace3, row=1, col=3)

fig.update_layout(title='KMeans vs. Agglomerative Clustering vs.DBSCAN - random data', template='simple_white',
                  coloraxis = {'colorscale': px.colors.diverging.BrBG})