<a href="https://colab.research.google.com/github/mrzResearchArena/esm-embedding-clustering/blob/main/Clustering-kmeans-HC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Hierarchical Clustering (Agglomerative Clustering)

In [91]:
from google.colab import drive
drive.mount('/drive')

Drive already mounted at /drive; to attempt to forcibly remount, call drive.mount("/drive", force_remount=True).


In [92]:
from sklearn.cluster import AgglomerativeClustering
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import time

In [93]:
# # X = np.array([[1, 2],
#               [1, 4],
#               [1, 0],
#               [10, 2],
#               [10, 4],
#               [10, 0]])
# X

In [94]:
!ls

PyTorchKMeans+ESM1b.npy  PyTorchKMeans+ESM1v.npy  sample_data


In [95]:
!pwd

/content


In [105]:
# dataset load
X = np.load('/drive/My Drive/ESM/Xv-24578.npy')
Y = np.load('/drive/My Drive/ESM/originalLabels.npy')

print(X.shape)
print(Y.shape)

(24578, 1280)
(24578,)


In [106]:
X.shape

(24578, 1280)

In [107]:
HCmodel = AgglomerativeClustering(n_clusters=192, affinity='cosine', linkage='average')

In [108]:
begin = time.time()

HCmodel.fit(X)

print(time.time() - begin)

418.1067752838135


In [109]:
Yp = HCmodel.labels_
Yp

array([ 1,  1,  1, ...,  1, 69, 69])

In [110]:
C=0
#predictedLabel --> yp
#originalLabel  --> y

for y, yp in zip(Y, Yp):
    if y==yp:
        C += 1
print(f'{C/len(Y)}')

0.00528928309870616


In [111]:
np.save(arr=Yp, file='HC+ESM1v')

- *Some* URLs:

    - https://github.com/DeMoriarty/fast_pytorch_kmeans/blob/master/fast_pytorch_kmeans/kmeans.py
    - https://github.com/subhadarship/kmeans_pytorch
    - https://stackoverflow.com/questions/44759982/are-there-any-implementations-of-kmeans-with-cosine-distance-in-apache-spark-pys

### K-Means (PyTorch Acceleration)

- This tool can run on both GPU and CPU.
    - However, I am forced to use CPU memory. Be that as it may, the CPU contains more memory compared to GPU in Google Colab Pro (i.e., The available CPU and GPU memory are 25.5 GB and 14.5 GB, respectively.).

- able to train batch-wise

In [20]:
# !pip install kmeans-pytorch
!pip install git+https://github.com/subhadarship/kmeans_pytorch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/subhadarship/kmeans_pytorch
  Cloning https://github.com/subhadarship/kmeans_pytorch to /tmp/pip-req-build-4ijltfby
  Running command git clone -q https://github.com/subhadarship/kmeans_pytorch /tmp/pip-req-build-4ijltfby


In [71]:
import torch
import numpy as np
import matplotlib.pyplot as plt
from kmeans_pytorch import kmeans, kmeans_predict
import time

In [72]:
# set random seed
np.random.seed(101)

In [73]:
from google.colab import drive
drive.mount('/drive')

Drive already mounted at /drive; to attempt to forcibly remount, call drive.mount("/drive", force_remount=True).


In [74]:
# dataset load
X = np.load('/drive/My Drive/ESM/Xv-24578.npy')
Y = np.load('/drive/My Drive/ESM/originalLabels.npy')

print(X.shape)
print(Y.shape)

(24578, 1280)
(24578,)


In [75]:
X = torch.from_numpy(X)
print(X.shape)

torch.Size([24578, 1280])


In [76]:
# set device
# if torch.cuda.is_available():
#     device = torch.device('cuda:0')
# else:
#     device = torch.device('cpu')

device = torch.device('cpu')
print(f'device: {device}')

device: cpu


In [77]:
help(kmeans)

Help on function kmeans in module kmeans_pytorch:

kmeans(X, num_clusters, distance='euclidean', cluster_centers=[], tol=0.0001, tqdm_flag=True, iter_limit=0, device=device(type='cpu'), gamma_for_soft_dtw=0.001, seed=None)
    perform kmeans
    :param X: (torch.tensor) matrix
    :param num_clusters: (int) number of clusters
    :param distance: (str) distance [options: 'euclidean', 'cosine'] [default: 'euclidean']
    :param seed: (int) seed for kmeans
    :param tol: (float) threshold [default: 0.0001]
    :param device: (torch.device) device [default: cpu]
    :param tqdm_flag: Allows to turn logs on and off
    :param iter_limit: hard limit for max number of iterations
    :param gamma_for_soft_dtw: approaches to (hard) DTW as gamma -> 0
    :return: (torch.tensor, torch.tensor) cluster ids, cluster centers



In [78]:
# k-means
num_clusters = 192

begin = time.time()

cluster_ids_x, cluster_centers = kmeans(
    X=X[:24578,:],
    num_clusters=num_clusters,
    distance='cosine',
    device=device,
)

print()
print(f'Time elapsed {time.time() - begin} seconds')

running k-means on cpu..


[running kmeans]: 62it [04:42,  4.56s/it, center_shift=0.000000, iteration=62, tol=0.000100]


Time elapsed 282.54061365127563 seconds





In [79]:
print(cluster_ids_x)
print(cluster_centers)

tensor([54, 54, 54,  ..., 11, 11, 11])
tensor([[-0.2014, -0.1432, -0.1042,  ..., -0.1642, -0.0607, -0.1622],
        [-0.1472,  0.2373, -0.0984,  ..., -0.0676,  0.1003,  0.2316],
        [-0.0628, -0.1229, -0.0441,  ..., -0.1878,  0.0213, -0.1657],
        ...,
        [-0.2269,  0.0989, -0.0133,  ..., -0.4153, -0.0005, -0.0077],
        [-0.2202, -0.0570,  0.1944,  ..., -0.4112,  0.2120,  0.0055],
        [-0.1452,  0.0770,  0.0670,  ..., -0.0075,  0.0632, -0.0102]])


In [80]:
len(set(cluster_ids_x))

24578

In [81]:
type(cluster_ids_x)

torch.Tensor

In [82]:
cluster_ids_x = cluster_ids_x.numpy()

In [83]:
Y

array([  0,   0,   0, ..., 191, 191, 191])

In [84]:
cluster_ids_x

array([54, 54, 54, ..., 11, 11, 11])

In [85]:
# C=1
# for i in cluster_ids_x:
#     print(i)
#     if C==100: break
#     C+=1

# v1 = set(cluster_ids_x)
# v2 = len(set(cluster_ids_x))
# print(v1)
# print(v2)

In [86]:
# d = {}
# for i in cluster_ids_x:
#     if i not in d:
#         d[i] = 1
#     else:
#         d[i] += 1

In [87]:
C=0
#predictedLabel --> yp
#originalLabel  --> y

for y, yp in zip(Y, cluster_ids_x):
    if y==yp:
        C += 1
print(f'{C/len(Y)}')

0.0016274717226788184


In [88]:
np.save(arr=cluster_ids_x, file='PyTorchKMeans+ESM1v')

In [89]:
# device

In [90]:
# !nvidia-smi

### K-Means (PyTorch batch-wise Acceleration)

In [None]:
# print(cluster_ids_x)
# print(cluster_centers)

In [None]:
# cluster_centers

In [None]:
# begin = time.time()

# cluster_ids_x, cluster_centers = kmeans(
#     X=b2,
#     num_clusters=num_clusters,
#     cluster_centers = cluster_centers,
#     distance='cosine',
#     device=device,
# )

# print()
# print(time.time() - begin)

running k-means on cpu..
resuming


[running kmeans]: 38it [00:57,  1.53s/it, center_shift=0.000000, iteration=38, tol=0.000100]


59.47051739692688





In [None]:
# begin = time.time()

# cluster_ids_x, cluster_centers = kmeans(
#     X=b3,
#     cluster_centers = cluster_centers,
#     num_clusters=num_clusters,
#     distance='cosine',
#     device=device
# )

# print()
# print(time.time() - begin)

running k-means on cpu..
resuming


[running kmeans]: 33it [00:49,  1.51s/it, center_shift=0.000000, iteration=33, tol=0.000100]


51.37775921821594





In [None]:
# print(cluster_ids_x)
# print(cluster_centers)

In [15]:
# begin = time.time()

# cluster_ids_x, cluster_centers = kmeans(
#     X=b4,
#     num_clusters=num_clusters,
#     distance='cosine',
#     cluster_centers = cluster_centers,
#     device=device
# )

# print()
# print(time.time() - begin)

In [None]:
# print(cluster_ids_x)
# print(cluster_centers)

In [16]:
# cluster_ids_X = kmeans_predict(
#     X=X[:32000],
#     cluster_centers = cluster_centers,
#     distance='cosine',
#     device=device,
# )

In [17]:
# cluster_ids_X

In [18]:
# len(set(cluster_ids_X))

In [None]:
# cluster_centers.shape

### K-Means (NLTK Acceleration)

In [128]:
from nltk.cluster import KMeansClusterer, cosine_distance
import numpy as np

In [129]:
# vectors = [np.array(f) for f in [[2, 1], [1, 3], [4, 7], [6, 7]]]
# # means = [[4, 3], [5, 5]]

In [130]:
# dataset load
X = np.load('/drive/My Drive/ESM/Xb-24578.npy')
Y = np.load('/drive/My Drive/ESM/originalLabels.npy')

print(X.shape)
print(Y.shape)

(24578, 1280)
(24578,)


In [None]:
K=192

begin = time.time()

model = KMeansClusterer(num_means=K, 
                        distance=cosine_distance,
                        initial_means=None,
                        repeats=1)
clusters = model.cluster(X, True, trace=True)

# print("Clustered:", vectors)
print("Predicted clusters:", clusters)
print("Means:", model.means())
print()

print(f'Time elapsed: {(time.time() - begin)/60.0} minutes')

k-means trial 0
iteration
iteration
iteration
iteration


In [120]:
# help(KMeansClusterer)

In [121]:
#predictedLabel --> yp
#originalLabel  --> y

C=0
for y, yp in zip(Y, clusters):
    if y==yp:
        C += 1
print(f'{C/len(Y)}')

0.0045976076165676625


In [122]:
np.save(arr=clusters, file='NLTK+HC+ESM1b')

In [123]:
len(clusters)

24578