<a href="https://colab.research.google.com/github/midnightripper/IDEC/blob/main/DEC_Working.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import scipy.io
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import StandardScaler
from keras import backend as K
from keras.layers import Layer, InputSpec
def dataloader(path):
    data = scipy.io.loadmat(path)
    AF = data['AF']
    modified_rows1 = AF[:-2]
    last_rows1 = AF[-2:]
    CF = data['CF']
    concatenated_array = np.concatenate((modified_rows1, CF,last_rows1), axis=0)
    new=concatenated_array
    u, count = np.unique(new[-1], return_counts=True)
    a = u[np.logical_or(count < 2, count > 2)]
    c = new[:, np.isin(new[-1], a, invert=True)]
    x = c[0:-2]; y = c[-2]; w = c[-1];
    return x.T, y.T, w.T, data['CF_info']

def normalization(feats):
    df = pd.DataFrame(feats)
    scaler = StandardScaler()
    x_new = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
    return x_new

In [19]:
import numpy as np
from keras.layers import Dense, Input
from keras.models import Model
from keras.optimizers import SGD
from sklearn.cluster import KMeans
from sklearn.metrics import normalized_mutual_info_score, adjusted_rand_score
from scipy.optimize import linear_sum_assignment
import csv, os

def cluster_acc(y_true, y_pred):
    y_true = y_true.astype(np.int64)
    assert y_pred.size == y_true.size
    D = max(y_pred.max(), y_true.max()) + 1
    w = np.zeros((D, D), dtype=np.int64)
    for i in range(y_pred.size):
        w[y_pred[i], y_true[i]] += 1
    _, col_ind = linear_sum_assignment(w.max() - w)
    return sum([w[i, j] for i, j in zip(range(w.shape[0]), col_ind)]) * 1.0 / y_pred.size

def autoencoder(dims, act='relu'):
    n_stacks = len(dims) - 1
    x = Input(shape=(dims[0],), name='input')
    h = x
    for i in range(n_stacks-1):
        h = Dense(dims[i + 1], activation=act, name='encoder_%d' % i)(h)
    h = Dense(dims[-1], name='encoder_%d' % (n_stacks - 1))(h)
    for i in range(n_stacks-1, 0, -1):
        h = Dense(dims[i], activation=act, name='decoder_%d' % i)(h)
    h = Dense(dims[0], name='decoder_0')(h)
    return Model(inputs=x, outputs=h)

class ClusteringLayer(Dense):
    def __init__(self, n_clusters, alpha=1.0, **kwargs):
        self.n_clusters = n_clusters
        self.alpha = alpha
        super(ClusteringLayer, self).__init__(n_clusters, **kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 2
        input_dim = input_shape[1]
        self.input_spec = InputSpec(dtype=tf.float32, shape=(None, input_dim))
        self.clusters = self.add_weight(shape=(self.n_clusters, input_dim),
                                        initializer='glorot_uniform', name='clusters')
        self.built = True

    def get_config(self):
        config = super(ClusteringLayer, self).get_config()
        config['n_clusters'] = self.n_clusters
        config['alpha'] = self.alpha
        return config

    def call(self, inputs):
        q = 1.0 / (1.0 + (K.sum(K.square(K.expand_dims(inputs, axis=1) - self.clusters), axis=2) / self.alpha))
        q **= (self.alpha + 1.0) / 2.0
        q = K.transpose(K.transpose(q) / K.sum(q, axis=1))
        return q

class DEC(object):
    def __init__(self, dims, n_clusters=2, alpha=1.0):
        self.dims = dims
        self.input_dim = dims[0]
        self.n_stacks = len(self.dims) - 1
        self.n_clusters = n_clusters
        self.alpha = alpha
        self.autoencoder = autoencoder(self.dims)
        hidden = self.autoencoder.get_layer(name='encoder_%d' % (self.n_stacks - 1)).output
        self.encoder = Model(inputs=self.autoencoder.input, outputs=hidden)
        clustering_layer = ClusteringLayer(self.n_clusters, alpha=self.alpha, name='clustering')(hidden)
        self.model = Model(inputs=self.autoencoder.input, outputs=clustering_layer)

    def pretrain(self, x, batch_size=256, epochs=200, optimizer='adam'):
        self.autoencoder.compile(loss='mse', optimizer=optimizer)
        self.autoencoder.fit(x, x, batch_size=batch_size, epochs=epochs)

    def compile(self, loss='kld', optimizer='adam'):
        self.model.compile(loss=loss, optimizer=optimizer)

    def fit(self, x, y=None, batch_size=256, maxiter=2e4, tol=1e-3, update_interval=140, save_dir='./results/dec'):
        print('Update interval', update_interval)
        save_interval = int(x.shape[0] / batch_size) * 5
        print('Save interval', save_interval)
        kmeans = KMeans(n_clusters=self.n_clusters, n_init=20)
        y_pred = kmeans.fit_predict(self.encoder.predict(x))
        y_pred_last = np.copy(y_pred)
        self.model.get_layer(name='clustering').set_weights([kmeans.cluster_centers_])
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        logfile = open(save_dir + '/dec_log.csv', 'w')
        logwriter = csv.DictWriter(logfile, fieldnames=['iter', 'acc', 'nmi', 'ari', 'L'])
        logwriter.writeheader()
        loss = 0
        index = 0
        for ite in range(int(maxiter)):
            if ite % update_interval == 0:
                q = self.model.predict(x, verbose=0)
                p = self.target_distribution(q)
                y_pred = q.argmax(1)
                if y is not None:
                    acc = cluster_acc(y, y_pred)
                    nmi = normalized_mutual_info_score(y, y_pred)
                    ari = adjusted_rand_score(y, y_pred)
                    loss = self.model.evaluate(x, p, verbose=0)
                    logwriter.writerow({'iter': ite, 'acc': acc, 'nmi': nmi, 'ari': ari, 'L': loss})
                    print('Iter-%d: ACC=%.4f, NMI=%.4f, ARI=%.4f;  L=%.5f' % (ite, acc, nmi, ari, loss))
                delta_label = np.sum(y_pred != y_pred_last).astype(np.float32) / y_pred.shape[0]
                y_pred_last = np.copy(y_pred)
                if ite > 0 and delta_label < tol:
                    print('Reached tolerance threshold. Stopping training.')
                    logfile.close()
                    break
            if (index + 1) * batch_size > x.shape[0]:
                loss = self.model.train_on_batch(x=x[index * batch_size::], y=p[index * batch_size::])
                index = 0
            else:
                loss = self.model.train_on_batch(x=x[index * batch_size:(index + 1) * batch_size],
                                                 y=p[index * batch_size:(index + 1) * batch_size])
                index += 1
            if ite % save_interval == 0:
                self.model.save_weights(save_dir + '/DEC_model_' + str(ite) + '.h5')
            ite += 1
        logfile.close()
        self.model.save_weights(save_dir + '/DEC_model_final.h5')
        return y_pred

    def predict_clusters(self, x):
      q = self.model.predict(x, verbose=0)
      return q.argmax(axis=1)

    @staticmethod
    def target_distribution(q):
        weight = q ** 2 / q.sum(0)
        return (weight.T / weight.sum(1)).T



In [20]:
# fatyp = 'TypicalFA_comb1'
# drivepath = 'finalData/'+ fatyp +'/';
# filee = drivepath+'GER_train_fisher-2000_FA_GT_ESTphnTrans_estStress.mat'
original_dim = 38

# train_path = filee; test_path = filee.replace('train','test')
train_path='/content/drive/MyDrive/finalData/TypicalFA_comb1/GER_train_fisher-2000_FA_GT_ESTphnTrans_estStress.mat'
test_path='/content/drive/MyDrive/finalData/TypicalFA_comb1/GER_test_fisher-2000_FA_GT_ESTphnTrans_estStress.mat'
x, y, wtrain1, info_train1 = dataloader(train_path);
xtest, ytest, wtest1, info_test1 = dataloader(test_path);
xtest= normalization(xtest)
x= normalization(x)

# Create and Train the DEC Model
dec = DEC(dims=[x.shape[-1], 500, 500, 2000, 10], n_clusters=2)
dec.pretrain(x, batch_size=256, epochs=200, optimizer='adam')
dec.compile(loss='kld', optimizer='adam')
dec.fit(x, y=y, batch_size=256, tol=0.001, maxiter=20000, update_interval=140, save_dir='./results/dec')

# Show the final results
y_pred = dec.predict_clusters(x)
print('ACC:', cluster_acc(y, y_pred))

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

In [21]:
def calculate_accuracy(arr1, arr2):
    count = sum(1 for itr1, itr2 in zip(arr1, arr2) if itr1 == itr2)
    return count / len(arr1)

In [22]:
print(calculate_accuracy(y,y_pred))

0.5239656912209889


In [23]:
def flip(arr):
  result = []
  for num in arr:
      if num == 1:
          result.append(0)
      else:
          result.append(1)
  return result

In [24]:
y_pred_flip=flip(y_pred)

In [25]:
print(calculate_accuracy(y,y_pred_flip))

0.4760343087790111


In [26]:
print(y_pred)

[0 1 0 ... 1 1 1]


In [28]:
import numpy as np
import tensorflow as tf
from keras.layers import Dense, Input
from keras.models import Model
from keras.optimizers import SGD
from sklearn.cluster import KMeans
from sklearn.metrics import normalized_mutual_info_score, adjusted_rand_score
from scipy.optimize import linear_sum_assignment
import csv, os
import matplotlib.pyplot as plt

In [29]:
def plot_clusters(x, y_pred, centers):
    plt.figure(figsize=(10, 7))
    plt.scatter(x[:, 0], x[:, 1], c=y_pred, cmap='tab10', s=50)
    plt.scatter(centers[:, 0], centers[:, 1], marker='X', s=200, color='red', label='Cluster Centers')
    plt.legend()
    plt.title('DEC Clustering Results')
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.show()

def plot_loss(logfile_path):
    log_data = np.genfromtxt(logfile_path, delimiter=',', names=True)
    plt.figure(figsize=(10, 5))
    plt.plot(log_data['iter'], log_data['L'])
    plt.title('Clustering Loss during Training')
    plt.xlabel('Iteration')
    plt.ylabel('Clustering Loss')
    plt.grid(True)
    plt.show()

In [31]:
    plot_clusters(x, y_pred, dec.model.get_layer(name='clustering').get_weights()[0])

    # Visualize clustering loss during training
    plot_loss('./results/dec/dec_log.csv')

InvalidIndexError: ignored

<Figure size 1000x700 with 0 Axes>

In [32]:
!pip install matplotlib seaborn




In [33]:
def plot_clusters(x, y_true, y_pred, title):
    plt.figure(figsize=(10, 6))
    plt.scatter(x[:, 0], x[:, 1], c=y_pred, cmap='tab10', s=50)
    plt.scatter(x[:, 0], x[:, 1], c=y_true, cmap='Set1', marker='x', s=100, edgecolor='black')
    plt.title(title)
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.colorbar()
    plt.show()

def plot_tsne(x, y_pred, title):
    tsne = TSNE(n_components=2, perplexity=30, n_iter=300)
    x_tsne = tsne.fit_transform(x)
    plt.figure(figsize=(10, 6))
    plt.scatter(x_tsne[:, 0], x_tsne[:, 1], c=y_pred, cmap='tab10', s=50)
    plt.title(title)
    plt.xlabel('t-SNE Component 1')
    plt.ylabel('t-SNE Component 2')
    plt.colorbar()
    plt.show()

def plot_confusion_matrix(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(10, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    plt.show()



In [36]:
  plot_clusters(x, y_pred, 'True Labels vs Predicted Labels (Clustered)')
  plot_tsne(x, y_pred, 't-SNE Visualization of Predicted Clusters')
  plot_confusion_matrix(y, y_pred)

TypeError: ignored