In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv('./datasets/training_r21d_all_10_mean.csv')       # data training con el target
X_train = df.iloc[:,1:-2]
Y_train = df.iloc[:, -2]

In [7]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.random_projection import GaussianRandomProjection
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import FactorAnalysis
import umap

#escalar la data
scaler = StandardScaler()
data_scaled = scaler.fit_transform(X_train)
X_train_scaled = data_scaled

pca = PCA(n_components=135)
X_pca = pca.fit_transform(X_train_scaled)

svd = TruncatedSVD(n_components=135, n_iter=5, random_state=42) #cambiar numero de componentes
X_svd = svd.fit_transform(X_train_scaled)

lda = LinearDiscriminantAnalysis()
X_lda = lda.fit_transform(X_train_scaled, Y_train)

rng = np.random.RandomState(42)
gauss_rand = GaussianRandomProjection(random_state=rng, eps=0.99)
X_gauss_rand = gauss_rand.fit_transform(X_train_scaled)

transformer = FactorAnalysis(n_components=7, random_state=0)
X_fa = transformer.fit_transform(X_train_scaled)

reducer = umap.UMAP() 
X_umap = pd.DataFrame(reducer.fit_transform(X_train_scaled))

dim_reductions = [X_pca, X_svd, X_lda, X_gauss_rand, X_fa, X_umap]
reductions = ["PCA", "SVD", "LDA", "GaussRand", "FA", "UMAP"]
for i in range(len(dim_reductions)):
    print(reductions[i],": ", dim_reductions[i].shape)

PCA :  (5424, 135)
SVD :  (5424, 135)
LDA :  (5424, 9)
GaussRand :  (5424, 206)
FA :  (5424, 7)
UMAP :  (5424, 2)


In [8]:
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from scipy.stats import mode

def apply_kmeans_and_evaluate(X_train, y_train, X_test, y_test, n_clusters):
    # Encodificar las etiquetas
    label_encoder = LabelEncoder()
    y_train_encoded = label_encoder.fit_transform(y_train)
    y_test_encoded = label_encoder.transform(y_test)

    # Aplicar Kmeans
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(X_train)
    y_train_pred = kmeans.predict(X_train)
    y_test_pred = kmeans.predict(X_test)

    labels_map = np.zeros_like(y_train_pred)
    for i in range(n_clusters):
        mask = (y_train_pred == i)
        if np.any(mask):
            labels_map[mask] = mode(y_train_encoded[mask])[0]

    # Mapear las predicciones
    y_test_pred_mapped = np.zeros_like(y_test_pred)
    for i in range(n_clusters):
        mask = (y_test_pred == i)
        if np.any(mask):
            y_test_pred_mapped[mask] = mode(y_train_encoded[(y_train_pred == i)])[0]

    # Calcular accuracy
    accuracy = accuracy_score(y_test_encoded, y_test_pred_mapped)
    print(f'Accuracy: {accuracy * 100:.2f}%')

n_clusters = len(np.unique(Y_train))


In [9]:
from sklearn.model_selection import train_test_split
for i in range(len(dim_reductions)):
    x_train, x_test, y_train, y_test  = train_test_split(dim_reductions[i],Y_train, test_size=0.2, random_state=42)
    print(reductions[i])
    apply_kmeans_and_evaluate(x_train, y_train, x_test, y_test, n_clusters)


PCA
Accuracy: 93.92%
SVD
Accuracy: 94.01%
LDA
Accuracy: 90.51%
GaussRand
Accuracy: 90.14%
FA
Accuracy: 88.20%
UMAP
Accuracy: 96.31%
