In [15]:
# import statements
from sklearn.datasets import make_blobs
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import pairwise_distances
from statistics import mode

# import hierarchical clustering libraries
from sklearn.model_selection import StratifiedKFold, train_test_split
import scipy.cluster.hierarchy as sch
from sklearn.cluster import AgglomerativeClustering, KMeans
from sklearn.cluster import DBSCAN
from sklearn_extra.cluster import KMedoids

In [2]:
# Read the raw table; the file has no header row, so columns are numbered.
df = pd.read_table('../datos/seeds_dataset.txt', header=None)

# The last column is used as the class label, every other column as a feature.
columna_clase = df.columns[-1]
y = df[columna_clase]
X = df.drop(columns=columna_clase)

# Standardise the features. The scaler is fitted on the complete dataset.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)


In [3]:
# Stratified hold-out split: one third of the samples go to the test set.
# A fixed random_state keeps the partition identical across kernel restarts.
# NOTE(review): X_scaled was standardised using the full dataset, so the test
# rows already influenced the scaling statistics.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=1/3, stratify=y, random_state=42
)

# KMEDIAS ENCAPSULADO

In [4]:
def kmedias(X_train=np.array([]), y_train=np.array([]),
            X_test=np.array([]), y_test=np.array([]), n_clusters=2, init=None,
            random_state=None):
    """Fit k-means and score it as a classifier via per-cluster majority vote.

    Each cluster is mapped to the most frequent label among the training
    samples assigned to it; a sample is counted as correct when its cluster's
    majority label equals its true label.

    Parameters
    ----------
    X_train : array-like
        Samples used to fit the clustering.
    y_train : array-like
        Labels of ``X_train``, aligned by position.
    X_test, y_test : array-like, optional
        Hold-out samples and labels. When ``X_test`` is empty the accuracy is
        computed on the training data instead.
    n_clusters : int
        Number of clusters.
    init : str, array-like or None
        Forwarded to ``KMeans(init=...)``. ``None`` is resolved to
        ``"k-means++"`` because ``None`` is not a valid ``init`` for KMeans.
    random_state : int or None
        Forwarded to ``KMeans`` so a run can be reproduced.

    Returns
    -------
    float
        Fraction of evaluated samples whose true label matches the majority
        label of their cluster.
    """
    if init is None:
        init = "k-means++"

    # Plain arrays make masking and comparison purely positional, whatever
    # index a pandas Series passed by the caller may carry.
    y_train = np.asarray(y_train)

    km = KMeans(n_clusters=n_clusters, init=init, random_state=random_state)
    labels_train = km.fit_predict(X_train)

    # moda[i] is the majority class among the training samples of cluster i.
    moda = np.array([mode(y_train[labels_train == i]) for i in range(n_clusters)])

    if np.size(X_test) == 0:
        # No hold-out set supplied: score on the training samples.
        return np.mean(moda[labels_train] == y_train)

    labels_test = km.predict(X_test)
    return np.mean(moda[labels_test] == np.asarray(y_test))

## Kmedias con cálculo de centros iniciales

In [5]:
# Over-cluster first: 30 clusters = the 12 barycentres wanted plus 12 * 1.5 = 18 extra ones.
# random_state is fixed because every later step (Ward merge, K_init) is derived
# from these centres and would otherwise change on each run.
kmedias_init = KMeans(n_clusters=30, random_state=42)
kmedias_init.fit(X_scaled)
kmedias_init_centros = kmedias_init.cluster_centers_


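Tomamos los 30 centros obtenidos con k-medias y les aplicamos AgglomerativeClustering para agruparlos en 12 clusters.
Después se calcula el baricentro de cada uno de esos 12 grupos y, para cada baricentro, nos quedamos con el centro de k-medias más cercano (argmin por columnas de la matriz de distancias).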

## Ward

In [6]:

# Merge the 30 k-means centres into 12 groups with agglomerative clustering.
ward = AgglomerativeClustering(n_clusters=12)
ward_labels = ward.fit_predict(kmedias_init_centros)

# Barycentre (mean) of the k-means centres that fall in each of the 12 groups.
ward_centros = np.vstack([
    kmedias_init_centros[ward_labels == grupo].mean(axis=0)
    for grupo in range(12)
])

# Distance matrix has one row per k-means centre and one column per barycentre;
# the column-wise argmin picks, for every barycentre, the closest k-means centre.
distancias_ward = pairwise_distances(kmedias_init_centros, ward_centros)
K_init = kmedias_init_centros[distancias_ward.argmin(axis=0)]


### Kmedias final

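No usamos K_init directamente porque sus filas son centros de k-medias (puntos ficticios que no pertenecen al conjunto de datos); en su lugar tomamos, para cada uno, la muestra real más cercana.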

In [7]:
# Sanity check: picking the sample nearest to each K_init row must yield one row per centre.
np.shape(X_scaled[pairwise_distances(X_scaled, K_init).argmin(axis=0)])

(12, 7)

In [8]:
# Initial centres: for every K_init row, the real sample of X_scaled closest to it.
centros_reales = X_scaled[pairwise_distances(X_scaled, K_init).argmin(axis=0)]

# No test set is passed, so the accuracy is measured on the same data used to fit.
kmedias(X_train=X_scaled, y_train=y, n_clusters=12, init=centros_reales)

0.9047619047619048

## Kmedias sin HOLD-OUT

In [9]:
# Baseline for comparison: same 12 clusters, but with the k-means++ initialisation
# instead of the Ward-derived centres. Scored on the full dataset (no hold-out).
kmedias(X_scaled, y, n_clusters=12, init="k-means++")

0.9142857142857143

## Kmedoids

In [14]:
# K-medoids with 12 clusters. random_state is fixed so the k-medoids++ seeding,
# and therefore the reported accuracy, is the same on every run.
kmedoides = KMedoids(n_clusters=12, init="k-medoids++", random_state=42)
labels_kmedoides = kmedoides.fit_predict(X_scaled)

# Majority class of the samples assigned to each cluster; the cluster count is
# read back from the estimator instead of repeating the literal.
moda_kmedoides = np.array([
    mode(y[labels_kmedoides == c]) for c in range(kmedoides.n_clusters)
])

# Fraction of samples whose true class equals their cluster's majority class.
accuracy_kmedoides = np.mean(moda_kmedoides[labels_kmedoides] == y)
accuracy_kmedoides

0.919047619047619

## DBScan

In [26]:
# Density-based clustering of the standardised samples.
dbscan = DBSCAN(eps=0.60, min_samples=2).fit(X_scaled)

# Samples labelled -1 are the isolated points; count them.
(dbscan.labels_ == -1).sum()

70

### Eliminar puntos aislados

In [27]:
# Keep only the samples DBSCAN assigned to a cluster (label -1 marks an isolated point).
# A single boolean mask is built once and applied to features, classes and
# cluster labels, so the three arrays are guaranteed to stay aligned.
no_aislado = dbscan.labels_ != -1
X_sin_aislados = X_scaled[no_aislado]
# np.asarray makes the selection positional regardless of the Series index.
y_sin_aislados = np.asarray(y)[no_aislado]
etiquetas_sin_aislados = dbscan.labels_[no_aislado]

# Number of clusters found once the isolated points are left out.
np.unique(etiquetas_sin_aislados).shape[0]

23

In [32]:
# Majority class of every DBSCAN cluster, ordered by cluster label.
ids_cluster = np.unique(etiquetas_sin_aislados)
moda_dbscan = np.array([
    mode(y_sin_aislados[etiquetas_sin_aislados == c]) for c in ids_cluster
])

# A sample is a hit when its true class equals its cluster's majority class.
# Only the non-isolated samples take part in this score.
aciertos_dbscan = moda_dbscan[etiquetas_sin_aislados] == y_sin_aislados
accuracy_dbscan = np.mean(aciertos_dbscan)
accuracy_dbscan

0.9714285714285714

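La mayor precisión (0.971) se obtiene con DBSCAN, aunque se calcula solo sobre los puntos que quedan tras descartar los 70 que DBSCAN marca como aislados, por lo que no es directamente comparable con las de k-medias y k-medoides, que se evalúan sobre todas las muestras.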