In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from sklearn import metrics
from sklearn.mixture import GaussianMixture
from sklearn import preprocessing
from sklearn.model_selection import KFold
from sklearn.cluster import KMeans
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import LabelEncoder

import statistics
import numpy as np
import pandas as pd

# Fix the global RNG state of both libraries used below so that a fresh
# "Restart & Run All" reproduces the same numbers: NumPy's legacy global
# generator and PyTorch's default generator are seeded independently.
np.random.seed(123)
torch.manual_seed(123)

In [2]:
# Pick the first CUDA device when PyTorch can see one, otherwise the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# NOTE(review): this assignment unconditionally replaces the detection result
# above, so the notebook always reports (and would always use) the CPU.
# Presumably a deliberate pin -- confirm before removing it.
device = "cpu"

print(device)

cpu


In [3]:
class Autoencoder(nn.Module):
    """Shallow autoencoder: one ``Linear + ReLU`` encoder, one ``Linear + ReLU`` decoder.

    Args:
        input_dim: Number of features in each input row.
        latent_dim: Width of the encoded (bottleneck) representation.

    NOTE(review): both the latent code and the reconstruction pass through a
    ReLU, so neither can contain negative values.
    """

    def __init__(self, input_dim, latent_dim):
        super().__init__()

        # Build the layer lists first, then wrap them; the encoder is created
        # (and therefore randomly initialised) before the decoder.
        encoder_layers = [nn.Linear(input_dim, latent_dim), nn.ReLU()]
        decoder_layers = [nn.Linear(latent_dim, input_dim), nn.ReLU()]

        self.encoder = nn.Sequential(*encoder_layers)
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Return ``(reconstruction, latent_code)`` for the batch ``x``."""
        latent_code = self.encoder(x)
        return self.decoder(latent_code), latent_code
    

In [4]:
class Deep_Autoencoder(nn.Module):
    """Symmetric fully connected autoencoder with three hidden layers per side.

    The encoder maps ``input_dim -> 500 -> 500 -> 2000 -> latent_dim`` and the
    decoder mirrors it back to ``input_dim``. Every ``Linear`` layer, including
    the bottleneck and the final reconstruction layer, is followed by a ReLU.

    Args:
        input_dim: Number of features in each input row.
        latent_dim: Width of the encoded (bottleneck) representation.

    NOTE(review): because the last decoder layer is a ReLU, reconstructions
    are never negative.
    """

    def __init__(self, input_dim, latent_dim):
        super().__init__()

        hidden_widths = (500, 500, 2000)
        encoder_widths = [input_dim, *hidden_widths, latent_dim]
        decoder_widths = encoder_widths[::-1]

        # Encoder is built before the decoder so parameter creation order
        # (and hence seeded initialisation) is encoder-first.
        self.encoder = self._linear_relu_stack(encoder_widths)
        self.decoder = self._linear_relu_stack(decoder_widths)

    @staticmethod
    def _linear_relu_stack(widths):
        """Chain ``Linear(w[i], w[i+1]) -> ReLU`` for each consecutive width pair."""
        layers = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(nn.ReLU())
        return nn.Sequential(*layers)

    def forward(self, x):
        """Return ``(reconstruction, latent_code)`` for the batch ``x``."""
        latent_code = self.encoder(x)
        reconstruction = self.decoder(latent_code)
        return reconstruction, latent_code

In [5]:
def load_breast_cancer_data():
    """Load the scikit-learn breast-cancer dataset as aligned NumPy arrays.

    Rows containing any missing feature value are dropped from the feature
    matrix, and the same rows are dropped from the target vector so that
    ``X[i]`` and ``y[i]`` always describe the same sample.

    Returns:
        tuple: ``(X, y)`` where ``X`` is the 2-D feature array of the complete
        rows and ``y`` is the 1-D array of their targets, in original order.
    """
    data = load_breast_cancer()
    features = pd.DataFrame(data.data, columns=data.feature_names)

    # Build one boolean mask and apply it to both features and targets;
    # calling dropna() on the features alone would leave the labels
    # misaligned as soon as a single row is removed.
    complete_rows = features.notna().all(axis=1).to_numpy()
    X = features.loc[complete_rows].to_numpy()
    y_actual = np.asarray(data.target)[complete_rows]

    return X, y_actual

In [6]:
def _clustering_accuracy(y_true, y_pred):
    """Accuracy of a clustering under the best one-to-one cluster/class matching.

    Cluster ids returned by KMeans / GaussianMixture are arbitrary, so they are
    first matched to the ground-truth classes with the Hungarian algorithm and
    the fraction of samples that agree under that matching is returned.
    """
    from scipy.optimize import linear_sum_assignment

    true_ids, true_idx = np.unique(np.asarray(y_true).ravel(), return_inverse=True)
    pred_ids, pred_idx = np.unique(np.asarray(y_pred).ravel(), return_inverse=True)
    if true_idx.size == 0:
        return 0.0

    # overlap[c, k] = number of samples placed in cluster c whose class is k.
    overlap = np.zeros((pred_ids.size, true_ids.size), dtype=np.int64)
    np.add.at(overlap, (pred_idx, true_idx), 1)

    # linear_sum_assignment minimises cost, so negate to maximise agreement.
    rows, cols = linear_sum_assignment(-overlap)
    return overlap[rows, cols].sum() / true_idx.size


def _metrics_row(dataset, fold, y_true, y_pred):
    """Build one result row (same column order as the CSV header) as percentages."""

    def as_percent(score):
        # Scale before rounding so the string carries no float noise
        # (e.g. '85.965' rather than '85.96499999999999').
        return str(np.round(score * 100, 3))

    return [
        dataset,
        fold,
        '-',
        as_percent(_clustering_accuracy(y_true, y_pred)),
        as_percent(metrics.normalized_mutual_info_score(y_true, y_pred)),
        as_percent(metrics.adjusted_rand_score(y_true, y_pred)),
        '-',
    ]


def _train_and_encode(data_tensor, latent_dim, n_epochs):
    """Train a Deep_Autoencoder on ``data_tensor`` and return its latent codes.

    Training is full-batch Adam (lr=1e-4) on the MSE reconstruction loss for
    ``n_epochs`` steps. The latent codes are computed from the final weights in
    a separate no-grad forward pass, so the result is defined even when
    ``n_epochs`` is 0.
    """
    model = Deep_Autoencoder(input_dim=data_tensor.size()[1], latent_dim=latent_dim)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

    model.train()
    for _ in range(n_epochs):
        recon, _ = model(data_tensor)
        loss = criterion(recon, data_tensor)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    model.eval()
    with torch.no_grad():
        _, latent = model(data_tensor)
    return latent.numpy()


def kmean_gmm_on_x_and_z(dataset, latent_dim, epoch):
    """Compare KMeans and GMM on raw features (x) and on autoencoder codes (z).

    For each of 5 shuffled folds the held-out part is standardised (scaler
    fitted on the remaining part) and clustered four ways: KMeans on x, GMM on
    x, KMeans on z, GMM on z, where z comes from a Deep_Autoencoder trained on
    that same held-out part. Accuracy (after optimal cluster-to-class
    matching), NMI and ARI are written as percentages to four CSV files under
    ``Results/``.

    Args:
        dataset: Label written into the result rows and used in the output
            file names. The data itself always comes from
            ``load_breast_cancer_data()``.
        latent_dim: Width of the autoencoder bottleneck.
        epoch: Number of full-batch training steps per fold (the same count is
            used for every fold).

    Returns:
        None. Results are written to disk.
    """
    import os

    X, y_actual = load_breast_cancer_data()
    n_clusters = len(np.unique(y_actual))

    # The header is stored as the first data row of every table; the files
    # therefore keep pandas' default 0..6 column header and an index column.
    csv_columns = ["Dataset","fold","gamma","Accuracy","NMI","ARI","iter_num_max_acc"]
    kmeans_to_csv = [csv_columns]
    ae_kmeans_to_csv = [csv_columns]
    gmm_to_csv = [csv_columns]
    ae_gmm_to_csv = [csv_columns]

    kfold = KFold(n_splits=5, random_state=123, shuffle=True)

    for fold_counter, (gamma_index, test_index) in enumerate(kfold.split(X), start=1):

        print("fold "+str(fold_counter))

        X_test, X_gamma = X[test_index], X[gamma_index]
        y_test = y_actual[test_index]

        # Standardise: statistics come from the gamma part only. That part is
        # the one reserved for gamma selection and is not clustered here.
        scaler = preprocessing.StandardScaler()
        scaler.fit(X_gamma)
        X_test = scaler.transform(X_test)

        data_tensor = torch.from_numpy(X_test).float()

        # method 1: KMeans on the raw (standardised) features
        kmeans = KMeans(n_clusters=n_clusters, init='k-means++', random_state=123)
        y_pred = kmeans.fit_predict(X_test)
        kmeans_to_csv.append(_metrics_row(dataset, fold_counter, y_test, y_pred))

        # method 2: GMM on the raw (standardised) features
        skl_gmm = GaussianMixture(n_components=n_clusters, random_state=123)
        y_pred = skl_gmm.fit_predict(X_test)
        gmm_to_csv.append(_metrics_row(dataset, fold_counter, y_test, y_pred))

        # methods 3 and 4 share one autoencoder: train it for `epoch` steps,
        # then cluster the resulting latent codes.
        latent = _train_and_encode(data_tensor, latent_dim, epoch)

        # method 3: KMeans on the latent codes
        kmeans = KMeans(n_clusters=n_clusters, init='k-means++', random_state=123)
        y_pred = kmeans.fit_predict(latent)
        ae_kmeans_to_csv.append(_metrics_row(dataset, fold_counter, y_test, y_pred))

        # method 4: GMM on the latent codes
        skl_gmm = GaussianMixture(n_components=n_clusters, random_state=123)
        y_pred = skl_gmm.fit_predict(latent)
        ae_gmm_to_csv.append(_metrics_row(dataset, fold_counter, y_test, y_pred))

    # to_csv does not create missing directories.
    os.makedirs('Results', exist_ok=True)

    pd.DataFrame(kmeans_to_csv).to_csv('Results/kmeans_on_x_summary_'+ dataset +".csv")
    pd.DataFrame(ae_kmeans_to_csv).to_csv('Results/kmeans_on_z_summary_'+ dataset +".csv")
    pd.DataFrame(gmm_to_csv).to_csv('Results/gmm_on_x_summary_'+ dataset +".csv")
    pd.DataFrame(ae_gmm_to_csv).to_csv('Results/gmm_on_z_summary_'+ dataset +".csv")


In [None]:
# Datasets to evaluate; each name is echoed and then passed to the experiment
# as the label used in its result rows and output file names.
dataset_list = ['breast_cancer']

for dataset in dataset_list:
    print(dataset)
    # 10-dimensional bottleneck, 5000 full-batch training steps per fold.
    kmean_gmm_on_x_and_z(
        dataset=dataset,
        latent_dim=10,
        epoch=5000,
    )
   