In [1]:
import matplotlib.pyplot as plt
from kneed import KneeLocator
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
import pandas as pd
import matplotlib
from matplotlib import pyplot
import gower
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from numpy import unique
from numpy import where
from sklearn.cluster import *
from survival_plot import cluster_KMplot
from sklearn_extra.cluster import KMedoids, CommonNNClustering
import numpy as np
from sklearn.mixture import GaussianMixture
# Switch matplotlib to the TkAgg backend for every figure drawn below.
matplotlib.use('TkAgg')

In [2]:
# Survival CSV that is passed to cluster_KMplot in the cells below.
surv_file = "../../survival_KIRP.csv"
# Encoded feature CSV that is loaded into `enc` (first column used as the index).
encoding = "../data/raw/sm/kirp_sm251.csv"

In [None]:
# Load the encoded feature table: the first CSV column becomes the index and
# missing entries are replaced by 0.
enc = pd.read_csv(encoding, index_col=0).fillna(0)
cols = enc.columns
# Cast the whole table to float in one call instead of column by column.
enc = enc.astype(float)

In [None]:
# Show the loaded feature table as this cell's output.
enc

In [None]:
# Two-dimensional embeddings of the feature table. X2 (PCA) and X3 (t-SNE) are
# reused by the cluster scatter plots in the sections below, so both reducers
# get a fixed seed: t-SNE is randomly initialised here, and without a seed its
# layout changes on every run, making plots from different runs incomparable.
EMBEDDING_SEED = 0

pca_reducer = PCA(n_components=2, random_state=EMBEDDING_SEED)
X2 = pca_reducer.fit_transform(enc)
plt.scatter(X2[:, 0], X2[:, 1])
plt.show()

tsne_reducer = TSNE(n_components=2, learning_rate='auto', init='random',
                    random_state=EMBEDDING_SEED)
X3 = tsne_reducer.fit_transform(enc)
plt.scatter(X3[:, 0], X3[:, 1])
plt.show()

In [None]:
# Scatter of the first two feature columns against each other.
first_col, second_col = enc.iloc[:, 0], enc.iloc[:, 1]
plt.scatter(first_col, second_col)
plt.show()

# Same view for the Gower matrix of the table: its first two columns.
gow = gower.gower_matrix(enc)
plt.scatter(gow[:, 0], gow[:, 1])
plt.show()

## K-means

In [None]:
# Two-cluster k-means on the raw feature matrix (the Gower matrix is not used here).
X = np.array(enc)
print(X.shape)
model = KMeans(n_clusters=2)
yhat = model.fit_predict(X)
pred = pd.DataFrame(yhat, index=enc.index)
clusters = unique(yhat)

# One figure per embedding (X2 first, then X3); each cluster is drawn with its
# own scatter call so it receives its own colour.
for embedding in (X2, X3):
    for cluster in clusters:
        row_ix = where(yhat == cluster)
        pyplot.scatter(embedding[row_ix, 0], embedding[row_ix, 1])
    pyplot.show()

In [None]:
# Elbow analysis: k-means inertia for k = 2..6.
k_values = range(2, 7)
sse = []
for k in k_values:
    kmeans = KMeans(n_clusters=k).fit(enc)
    sse.append(kmeans.inertia_)
    print(sse[-1])

fig, ax = plt.subplots()
ax.plot(k_values, sse)
ax.set_xticks(k_values)
ax.set_xlabel("Number of Clusters")
ax.set_ylabel("SSE")
plt.show()

# Knee of the decreasing, convex inertia curve; shown as the cell output.
kl = KneeLocator(k_values, sse, curve="convex", direction="decreasing")
kl.elbow

In [None]:
# Silhouette coefficient of the k-means labelling for k = 2..6.
k_values = range(2, 7)
silhouette_coefficients = []
for k in k_values:
    kmeans = KMeans(n_clusters=k).fit(enc)
    score = silhouette_score(enc, kmeans.labels_)
    print(score)
    silhouette_coefficients.append(score)

fig, ax = plt.subplots()
ax.plot(k_values, silhouette_coefficients)
ax.set_xticks(k_values)
ax.set_xlabel("Number of Clusters")
ax.set_ylabel("Silhouette Coefficient")
plt.show()

In [None]:
# Run cluster_KMplot on the k-means labelling for k = 2..6 and collect what it
# returns (presumably a p-value per k -- confirm against survival_plot).
pval = []
for i in range(2, 7):
    kmeans = KMeans(n_clusters=i).fit(enc)
    pred = pd.DataFrame(kmeans.labels_, index=enc.index)

    # Patient -> subtype table; a repeated patient id keeps only its first row.
    cluster_assign = pd.DataFrame([enc.index, kmeans.labels_]).T
    cluster_assign.columns = ["patient", "subtype"]
    cluster_assign = (
        cluster_assign
        .drop_duplicates(subset=["patient"], keep="first")
        .set_index("patient")
    )
    clust_asgn = cluster_assign["subtype"]
    pval.append(cluster_KMplot(clust_asgn, surv_file, delimiter=','))

for i in pval:
    print(i)

## PAM clustering

In [None]:
# Gower matrix of the feature table; the PAM cells below read it as `dist`.
dist = gower.gower_matrix(enc)

In [None]:
# Two-cluster k-medoids (PAM) fitted on the Gower matrix.
# NOTE(review): KMedoids is created without metric=, so `dist` is presumably
# treated as a feature matrix rather than as precomputed distances -- confirm
# that this is intended.
X = dist
print(X.shape)
model = KMedoids(n_clusters=2, init='k-medoids++')
yhat = model.fit_predict(X)
pred = pd.DataFrame(yhat, index=enc.index)
clusters = unique(yhat)

# One figure per embedding (X2 first, then X3), one scatter call per cluster.
for embedding in (X2, X3):
    for cluster in clusters:
        row_ix = where(yhat == cluster)
        pyplot.scatter(embedding[row_ix, 0], embedding[row_ix, 1])
    pyplot.show()

In [None]:
# Silhouette coefficient and inertia of k-medoids (PAM) on the Gower matrix
# for k = 2..6.
silhouette_coefficients = []
sse = []
for k in range(2, 7):
    pam = KMedoids(n_clusters=k, init='k-medoids++')
    pam.fit(dist)
    score = silhouette_score(dist, pam.labels_)
    silhouette_coefficients.append(score)
    # Record the inertia of *this* PAM fit. Reading it from the `kmeans` object
    # left over from the k-means section would append the same unrelated value
    # five times and make the curve below meaningless.
    sse.append(pam.inertia_)
    print(score)

for i in sse:
    print(i)

plt.plot(range(2, 7), sse)
plt.xticks(range(2, 7))
plt.xlabel("Number of Clusters")
plt.ylabel("WCSS")
plt.show()

In [None]:
# Silhouette curve for the PAM runs computed in the previous cell.
k_values = range(2, 7)
fig, ax = plt.subplots()
ax.plot(k_values, silhouette_coefficients)
ax.set_xticks(k_values)
ax.set_xlabel("Number of Clusters")
ax.set_ylabel("Silhouette Coefficient")
plt.show()

In [None]:
# Run cluster_KMplot on the PAM labelling for k = 2..6 and collect what it
# returns (presumably a p-value per k -- confirm against survival_plot).
pval = []
# The Gower matrix depends only on `enc`, so it is computed once here instead
# of being recomputed on every loop iteration.
dist = gower.gower_matrix(enc)
for i in range(2, 7):
    pam = KMedoids(n_clusters=i, init='k-medoids++')
    pam.fit(dist)
    pred = pd.DataFrame(pam.labels_, index=enc.index)

    # Patient -> subtype table; a repeated patient id keeps only its first row.
    cluster_assign = pd.DataFrame([enc.index, pam.labels_]).T
    cluster_assign.columns = ["patient", "subtype"]
    cluster_assign = cluster_assign.drop_duplicates(subset=["patient"], keep="first")
    cluster_assign = cluster_assign.set_index("patient")
    clust_asgn = cluster_assign.subtype
    pval.append(cluster_KMplot(clust_asgn, surv_file, delimiter=','))

for i in pval:
    print(i)

## CommonNN 

In [None]:
# Fit CommonNNClustering on the feature table with the eps / min_samples
# values below; the value of the fit call is shown as the cell output.
clustering = CommonNNClustering(eps=0.005, min_samples=0)
clustering.fit(enc)

In [None]:
# Patient -> subtype table from the CommonNN labels; a repeated patient id
# keeps only its first row.
cluster_assign = pd.DataFrame([enc.index, clustering.labels_]).T
cluster_assign.columns = ["patient", "subtype"]
cluster_assign = (
    cluster_assign
    .drop_duplicates(subset=["patient"], keep="first")
    .set_index("patient")
)
clust_asgn = cluster_assign["subtype"]
# The return value of cluster_KMplot is the cell output.
cluster_KMplot(clust_asgn, surv_file, delimiter=',')

## Affinity propagation

In [None]:
# Affinity propagation fitted on the Gower matrix of the feature table.
dist = gower.gower_matrix(enc)
X = dist
print(X.shape)
model = AffinityPropagation(damping=0.9).fit(X)
yhat = model.predict(X)
pred = pd.DataFrame(yhat, index=enc.index)
clusters = unique(yhat)

# Unlike the other sections, this plot uses the first two columns of the
# Gower matrix itself as coordinates, not the X2 / X3 embeddings.
for cluster in clusters:
    row_ix = where(yhat == cluster)
    pyplot.scatter(X[row_ix, 0], X[row_ix, 1])
pyplot.show()

# Patient -> subtype table from the fitted labels; a repeated patient id
# keeps only its first row.
cluster_assign = pd.DataFrame([enc.index, model.labels_]).T
cluster_assign.columns = ["patient", "subtype"]
cluster_assign = (
    cluster_assign
    .drop_duplicates(subset=["patient"], keep="first")
    .set_index("patient")
)
clust_asgn = cluster_assign["subtype"]
cluster_KMplot(clust_asgn, surv_file, delimiter=',')

## Agglomerative clustering

In [None]:
# Two-cluster agglomerative clustering on the raw feature matrix.
X = np.array(enc)
print(X.shape)
model = AgglomerativeClustering(n_clusters=2)
yhat = model.fit_predict(X)
pred = pd.DataFrame(yhat, index=enc.index)
clusters = unique(yhat)

# One figure per embedding (X2 first, then X3), one scatter call per cluster.
for embedding in (X2, X3):
    for cluster in clusters:
        row_ix = where(yhat == cluster)
        pyplot.scatter(embedding[row_ix, 0], embedding[row_ix, 1])
    pyplot.show()

# Patient -> subtype table from the fitted labels; a repeated patient id
# keeps only its first row.
cluster_assign = pd.DataFrame([enc.index, model.labels_]).T
cluster_assign.columns = ["patient", "subtype"]
cluster_assign = (
    cluster_assign
    .drop_duplicates(subset=["patient"], keep="first")
    .set_index("patient")
)
clust_asgn = cluster_assign["subtype"]
cluster_KMplot(clust_asgn, surv_file, delimiter=',')

In [None]:
# Silhouette coefficient of agglomerative clustering for 2..6 clusters.
k_values = range(2, 7)
silhouette_coefficients = []
for k in k_values:
    clust = AgglomerativeClustering(n_clusters=k).fit(enc)
    score = silhouette_score(enc, clust.labels_)
    print(score)
    silhouette_coefficients.append(score)

fig, ax = plt.subplots()
ax.plot(k_values, silhouette_coefficients)
ax.set_xticks(k_values)
ax.set_xlabel("Number of Clusters")
ax.set_ylabel("Silhouette Coefficient")
plt.show()

In [None]:
# Run cluster_KMplot on the agglomerative labelling for 2..6 clusters and
# collect what it returns (presumably a p-value each -- confirm against
# survival_plot).
pval = []
for i in range(2, 7):
    clust = AgglomerativeClustering(n_clusters=i).fit(enc)
    pred = pd.DataFrame(clust.labels_, index=enc.index)

    # Patient -> subtype table; a repeated patient id keeps only its first row.
    cluster_assign = pd.DataFrame([enc.index, clust.labels_]).T
    cluster_assign.columns = ["patient", "subtype"]
    cluster_assign = (
        cluster_assign
        .drop_duplicates(subset=["patient"], keep="first")
        .set_index("patient")
    )
    clust_asgn = cluster_assign["subtype"]
    pval.append(cluster_KMplot(clust_asgn, surv_file, delimiter=','))

for i in pval:
    print(i)

## BIRCH

In [None]:
# Two-cluster BIRCH fitted on the Gower matrix of the feature table.
dist = gower.gower_matrix(enc)
X = dist
print(X.shape)
model = Birch(threshold=0.01, n_clusters=2)
yhat = model.fit_predict(X)
pred = pd.DataFrame(yhat, index=enc.index)
clusters = unique(yhat)

# One figure per embedding (X2 first, then X3), one scatter call per cluster.
for embedding in (X2, X3):
    for cluster in clusters:
        row_ix = where(yhat == cluster)
        pyplot.scatter(embedding[row_ix, 0], embedding[row_ix, 1])
    pyplot.show()

# Patient -> subtype table from the fitted labels; a repeated patient id
# keeps only its first row.
cluster_assign = pd.DataFrame([enc.index, model.labels_]).T
cluster_assign.columns = ["patient", "subtype"]
cluster_assign = (
    cluster_assign
    .drop_duplicates(subset=["patient"], keep="first")
    .set_index("patient")
)
clust_asgn = cluster_assign["subtype"]
cluster_KMplot(clust_asgn, surv_file, delimiter=',')

In [None]:
# Silhouette coefficient of BIRCH on the Gower matrix for 2..6 clusters.
k_values = range(2, 7)
silhouette_coefficients = []
for k in k_values:
    clust = Birch(n_clusters=k).fit(dist)
    score = silhouette_score(dist, clust.labels_)
    print(score)
    silhouette_coefficients.append(score)

fig, ax = plt.subplots()
ax.plot(k_values, silhouette_coefficients)
ax.set_xticks(k_values)
ax.set_xlabel("Number of Clusters")
ax.set_ylabel("Silhouette Coefficient")
plt.show()

In [None]:
# Run cluster_KMplot on the BIRCH labelling of the Gower matrix for 2..6
# clusters and collect what it returns (presumably a p-value each -- confirm
# against survival_plot).
pval = []
for i in range(2, 7):
    # This is the BIRCH section and the silhouette cell above evaluates
    # Birch(n_clusters=k) on `dist`, so the survival loop uses the same model.
    # It previously built AgglomerativeClustering, which is already covered in
    # its own section.
    clust = Birch(n_clusters=i)
    clust.fit(dist)
    pred = pd.DataFrame(clust.labels_, index=enc.index)

    # Patient -> subtype table; a repeated patient id keeps only its first row.
    cluster_assign = pd.DataFrame([enc.index, clust.labels_]).T
    cluster_assign.columns = ["patient", "subtype"]
    cluster_assign = cluster_assign.drop_duplicates(subset=["patient"], keep="first")
    cluster_assign = cluster_assign.set_index("patient")
    clust_asgn = cluster_assign.subtype
    pval.append(cluster_KMplot(clust_asgn, surv_file, delimiter=','))

for i in pval:
    print(i)

## DBSCAN

In [None]:
# DBSCAN on the raw feature matrix with the eps / min_samples values below.
X = np.array(enc)
print(X.shape)
model = DBSCAN(eps=0.01, min_samples=10)
yhat = model.fit_predict(X)
pred = pd.DataFrame(yhat, index=enc.index)
clusters = unique(yhat)

# One figure per embedding (X2 first, then X3), one scatter call per label.
for embedding in (X2, X3):
    for cluster in clusters:
        row_ix = where(yhat == cluster)
        pyplot.scatter(embedding[row_ix, 0], embedding[row_ix, 1])
    pyplot.show()

# Patient -> subtype table from the fitted labels; a repeated patient id
# keeps only its first row.
cluster_assign = pd.DataFrame([enc.index, model.labels_]).T
cluster_assign.columns = ["patient", "subtype"]
cluster_assign = (
    cluster_assign
    .drop_duplicates(subset=["patient"], keep="first")
    .set_index("patient")
)
clust_asgn = cluster_assign["subtype"]
cluster_KMplot(clust_asgn, surv_file, delimiter=',')

## Mean shift

In [None]:
# Mean shift with default settings on the raw feature matrix.
X = np.array(enc)
print(X.shape)
model = MeanShift()
yhat = model.fit_predict(X)
pred = pd.DataFrame(yhat, index=enc.index)
clusters = unique(yhat)

# One figure per embedding (X2 first, then X3), one scatter call per cluster.
for embedding in (X2, X3):
    for cluster in clusters:
        row_ix = where(yhat == cluster)
        pyplot.scatter(embedding[row_ix, 0], embedding[row_ix, 1])
    pyplot.show()

# Patient -> subtype table from the fitted labels; a repeated patient id
# keeps only its first row.
cluster_assign = pd.DataFrame([enc.index, model.labels_]).T
cluster_assign.columns = ["patient", "subtype"]
cluster_assign = (
    cluster_assign
    .drop_duplicates(subset=["patient"], keep="first")
    .set_index("patient")
)
clust_asgn = cluster_assign["subtype"]
cluster_KMplot(clust_asgn, surv_file, delimiter=',')

In [None]:
# OPTICS fitted on the Gower matrix of the feature table.
dist = gower.gower_matrix(enc)
X = dist
print(X.shape)
model = OPTICS(eps=0.8, min_samples=10)
yhat = model.fit_predict(X)
pred = pd.DataFrame(yhat, index=enc.index)
clusters = unique(yhat)

# One figure per embedding (X2 first, then X3), one scatter call per label.
for embedding in (X2, X3):
    for cluster in clusters:
        row_ix = where(yhat == cluster)
        pyplot.scatter(embedding[row_ix, 0], embedding[row_ix, 1])
    pyplot.show()

# Patient -> subtype table from the fitted labels; a repeated patient id
# keeps only its first row.
cluster_assign = pd.DataFrame([enc.index, model.labels_]).T
cluster_assign.columns = ["patient", "subtype"]
cluster_assign = (
    cluster_assign
    .drop_duplicates(subset=["patient"], keep="first")
    .set_index("patient")
)
clust_asgn = cluster_assign["subtype"]
cluster_KMplot(clust_asgn, surv_file, delimiter=',')

## Spectral Clustering

In [None]:
# Three-cluster spectral clustering on the raw feature matrix.
X = np.array(enc)
print(X.shape)
model = SpectralClustering(n_clusters=3)
yhat = model.fit_predict(X)
pred = pd.DataFrame(yhat, index=enc.index)
clusters = unique(yhat)

# One figure per embedding (X2 first, then X3), one scatter call per cluster.
for embedding in (X2, X3):
    for cluster in clusters:
        row_ix = where(yhat == cluster)
        pyplot.scatter(embedding[row_ix, 0], embedding[row_ix, 1])
    pyplot.show()

# Patient -> subtype table from the fitted labels; a repeated patient id
# keeps only its first row.
cluster_assign = pd.DataFrame([enc.index, model.labels_]).T
cluster_assign.columns = ["patient", "subtype"]
cluster_assign = (
    cluster_assign
    .drop_duplicates(subset=["patient"], keep="first")
    .set_index("patient")
)
clust_asgn = cluster_assign["subtype"]
cluster_KMplot(clust_asgn, surv_file, delimiter=',')

In [None]:
# Silhouette coefficient of spectral clustering for 2..6 clusters.
k_values = range(2, 7)
silhouette_coefficients = []
for k in k_values:
    clust = SpectralClustering(n_clusters=k).fit(enc)
    score = silhouette_score(enc, clust.labels_)
    print(score)
    silhouette_coefficients.append(score)

fig, ax = plt.subplots()
ax.plot(k_values, silhouette_coefficients)
ax.set_xticks(k_values)
ax.set_xlabel("Number of Clusters")
ax.set_ylabel("Silhouette Coefficient")
plt.show()

In [None]:
# Run cluster_KMplot on the spectral labelling for 2..6 clusters and collect
# what it returns (presumably a p-value each -- confirm against survival_plot).
pval = []
for i in range(2, 7):
    clust = SpectralClustering(n_clusters=i).fit(enc)
    pred = pd.DataFrame(clust.labels_, index=enc.index)

    # Patient -> subtype table; a repeated patient id keeps only its first row.
    cluster_assign = pd.DataFrame([enc.index, clust.labels_]).T
    cluster_assign.columns = ["patient", "subtype"]
    cluster_assign = (
        cluster_assign
        .drop_duplicates(subset=["patient"], keep="first")
        .set_index("patient")
    )
    clust_asgn = cluster_assign["subtype"]
    pval.append(cluster_KMplot(clust_asgn, surv_file, delimiter=','))

for i in pval:
    print(i)

## Gaussian Mixture

In [None]:
# Three-component Gaussian mixture on the raw feature matrix; fit_predict
# supplies the per-row component assignment used as the cluster label.
X = np.array(enc)
print(X.shape)
model = GaussianMixture(n_components=3)
yhat = model.fit_predict(X)
pred = pd.DataFrame(yhat, index=enc.index)
clusters = unique(yhat)

# One figure per embedding (X2 first, then X3), one scatter call per component.
for embedding in (X2, X3):
    for cluster in clusters:
        row_ix = where(yhat == cluster)
        pyplot.scatter(embedding[row_ix, 0], embedding[row_ix, 1])
    pyplot.show()

# Patient -> subtype table from the predicted assignments; a repeated patient
# id keeps only its first row.
cluster_assign = pd.DataFrame([enc.index, yhat]).T
cluster_assign.columns = ["patient", "subtype"]
cluster_assign = (
    cluster_assign
    .drop_duplicates(subset=["patient"], keep="first")
    .set_index("patient")
)
clust_asgn = cluster_assign["subtype"]
cluster_KMplot(clust_asgn, surv_file, delimiter=',')

In [None]:
# Silhouette coefficient of Gaussian-mixture assignments for 2..6 components.
# The mixture is fitted on `X` (set in the previous cell) and scored against `enc`.
k_values = range(2, 7)
silhouette_coefficients = []
for k in k_values:
    clust = GaussianMixture(n_components=k)
    yhat = clust.fit_predict(X)
    score = silhouette_score(enc, yhat)
    print(score)
    silhouette_coefficients.append(score)

fig, ax = plt.subplots()
ax.plot(k_values, silhouette_coefficients)
ax.set_xticks(k_values)
ax.set_xlabel("Number of Clusters")
ax.set_ylabel("Silhouette Coefficient")
plt.show()

In [None]:
# Run cluster_KMplot on the Gaussian-mixture assignment for 2..6 components
# and collect what it returns (presumably a p-value each -- confirm against
# survival_plot).
pval = []
for i in range(2, 7):
    # The component count must follow the loop variable `i`. Using `k`, a
    # leftover from an earlier cell, fits the same model on every iteration.
    clust = GaussianMixture(n_components=i)
    # GaussianMixture has no labels_ attribute; fit_predict returns the
    # per-row component assignment, as in the other Gaussian-mixture cells.
    labels = clust.fit_predict(enc)
    pred = pd.DataFrame(labels, index=enc.index)

    # Patient -> subtype table; a repeated patient id keeps only its first row.
    cluster_assign = pd.DataFrame([enc.index, labels]).T
    cluster_assign.columns = ["patient", "subtype"]
    cluster_assign = cluster_assign.drop_duplicates(subset=["patient"], keep="first")
    cluster_assign = cluster_assign.set_index("patient")
    clust_asgn = cluster_assign.subtype
    pval.append(cluster_KMplot(clust_asgn, surv_file, delimiter=','))

for i in pval:
    print(i)