This notebook performs K-Means clustering on the 216 features used for prediction, with the number of clusters set to 4 to match the theory of ovarian cancer subtypes.

In [33]:
import os
import pandas as pd
from sklearn.cluster import KMeans
import sklearn
# Record the scikit-learn version next to the results.
print(f'The scikit-learn version is {sklearn.__version__}.')

The scikit-learn version is 1.0.2.


In [34]:
# Load the table, keep only the feature columns, impute missing values and
# z-score every feature; the result is exposed as `df` (DataFrame) and `X` (ndarray).
DATA_FILE = os.path.join('Data','mibi_table_main.csv')
df_orig = pd.read_csv(DATA_FILE)

# Remove ID and outcomes to cluster just on features
df = df_orig.drop(labels=['fov_id','TMA_ID','PFS','OS','OS_high','PFS_high','Primary','Recurrence','Death'],axis=1)
assert len(df.columns) == 216, f"unexpected feature length, {len(df.columns)}"

# Replace NA values with the mean of that column
# (reassign instead of inplace=True so `df` is always rebuilt from df_orig on re-run)
mean = df.mean()
df = df.fillna(mean)

# Normalize all the columns before clustering (z-score with pandas' sample std).
# A constant column has std == 0, and 0/0 would turn the whole column into NaN;
# dividing by 1 instead leaves such a column as all zeros.
feature_std = df.std()
df = (df - df.mean()) / feature_std.mask(feature_std == 0, 1.0)
# Fail here with a clear message rather than later inside the clustering call.
assert not df.isna().any().any(), "NaN values remain after imputation/normalization"
print(df)

X = df.to_numpy()
print(X)

    CD8+ T cells_assort  CD8+ T cells_prop  CD8+ T cells_region  \
0             -0.106953           0.236041             0.094694   
1              1.347605           2.648545             2.014089   
2             -0.581531          -0.416611            -0.340943   
3              0.153444          -0.025814             0.086600   
4             -0.146928           0.084471            -0.023723   
..                  ...                ...                  ...   
72            -1.148825          -0.613270            -1.908599   
73            -1.148825          -0.613270            -1.908599   
74            -1.148825          -0.613270            -1.908599   
75            -1.148825          -0.613270            -1.908599   
76            -1.148825          -0.613270            -1.908599   

    CD8+ T cells_tumor_med_dist  CD8+ T cells_tumor_contact  \
0                 -8.707584e-01                    1.895720   
1                 -9.761482e-01                   -0.221999   
2     

In [35]:
# Fit K-Means on the standardized feature matrix X and print one cluster label per row.
SEED = 1989
N_CLUSTERS = 4  # number of clusters to fit

# `algorithm` is left at the library default: the previous explicit value 'auto'
# is that default in scikit-learn 1.0.2 but was deprecated in 1.1 and removed in 1.3.
# `n_init` is pinned to 10 (the 1.0.2 default) because newer releases changed the
# default, which would silently change the number of restarts.
kmeans = KMeans(n_clusters=N_CLUSTERS, init='k-means++', n_init=10, random_state=SEED, copy_x=True, verbose=1)
# fit() returns the fitted estimator itself, so `out` and `kmeans` are the same object.
out = kmeans.fit(X)
print(out.labels_)

Initialization complete
Iteration 0, inertia 24080.833979872834
Iteration 1, inertia 15135.432502747692
Iteration 2, inertia 14880.069583120568
Iteration 3, inertia 14724.933829744494
Iteration 4, inertia 14668.9135295015
Converged at iteration 4: strict convergence.
Initialization complete
Iteration 0, inertia 22646.266481458177
Iteration 1, inertia 14994.419077228318
Iteration 2, inertia 14896.25470489588
Iteration 3, inertia 14859.994540147121
Iteration 4, inertia 14735.94392705613
Iteration 5, inertia 14592.780260546017
Iteration 6, inertia 14445.244261288699
Iteration 7, inertia 14394.697430204096
Converged at iteration 7: strict convergence.
Initialization complete
Iteration 0, inertia 23552.122599746126
Iteration 1, inertia 14738.943602772377
Iteration 2, inertia 14535.374631302187
Iteration 3, inertia 14428.31605124131
Iteration 4, inertia 14398.159182417372
Converged at iteration 4: strict convergence.
Initialization complete
Iteration 0, inertia 23644.61518936932
Iteration 1,

In [36]:
# Save each sample's cluster label alongside its ID and outcome columns.
RESULTS_DIR = "Clustering_Results"
# exist_ok=True avoids the check-then-create race and keeps the cell re-runnable.
os.makedirs(RESULTS_DIR, exist_ok=True)
cluster_df = df_orig[['fov_id','TMA_ID','PFS','OS','OS_high','PFS_high','Primary','Recurrence','Death']].copy()
# X was built from df_orig without dropping or reordering rows, so labels_ lines up row-for-row.
cluster_df['cluster_labels'] = out.labels_.copy()
print(cluster_df)
# Join directory and file name separately instead of hard-coding the path separator.
cluster_df.to_csv(os.path.join(RESULTS_DIR, 'cluster_labels.csv'), index=False)

    fov_id  TMA_ID  PFS   OS  OS_high  PFS_high  Primary  Recurrence  Death  \
0        9      92  122  122        1         1        1           0      0   
1        6      87   12   75        1         0        0           1      1   
2       10      93   30  112        1         1        0           1      1   
3       15      98   38  153        1         1        0           1      0   
4       18     102   13   78        1         0        1           1      1   
..     ...     ...  ...  ...      ...       ...      ...         ...    ...   
72      56      13   15   29        0         0        1           1      1   
73      37     126   35   35        0         1        1           0      1   
74      69      46   15   15        0         0        1           0      1   
75      76      61   26   82        1         1        1           1      1   
76      44     139   10   38        0         0        1           1      1   

    cluster_labels  
0                1  
1        