In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
from src.tools import MMSeqs
from sklearn.metrics import normalized_mutual_info_score, mutual_info_score, adjusted_mutual_info_score

%load_ext autoreload
%autoreload 2

In [2]:
# Load the dataset table from the swissprot data directory; the first CSV column becomes the index.
dataset_df = pd.read_csv('../data/datasets/swissprot/dataset.csv', index_col=0)

In [4]:
# Number of rows per value of the `domain` column.
dataset_df['domain'].value_counts()

domain
Bacteria    259078
none         19487
Viruses      15941
Name: count, dtype: int64

In [None]:
# The MMSeqs clustering is not recomputed in this cell. The disabled lines below are kept as a record
# of the call that presumably produced the cached TSV read further down (TODO confirm).
# mmseqs = MMSeqs()
# mmseqs_cluster_df = mmseqs.cluster(dataset_df, name='dataset', output_dir='../data/datasets/swissprot', sequence_identity=0.5, overwrite=False)
# mmseqs.cleanup()

# Read each cached cluster assignment, then reorder its rows to follow dataset_df's index.
# `.loc` with the full index raises a KeyError if any dataset entry is missing from the cluster file.
mmseqs_cluster_df = MMSeqs.load_cluster('../data/datasets/swissprot/dataset_cluster_mmseqs.tsv')
mmseqs_cluster_df = mmseqs_cluster_df.loc[dataset_df.index]

kmeans_cluster_df = pd.read_csv('../data/datasets/swissprot/dataset_cluster_kmeans.csv', index_col=0)
kmeans_cluster_df = kmeans_cluster_df.loc[dataset_df.index]

In [41]:
# Agreement between the two clusterings, measured with adjusted mutual information.
# Disabled alternative using normalized mutual information instead:
# normalized_mutual_info_score(kmeans_cluster_df.cluster_id, mmseqs_cluster_df.cluster_id)
adjusted_mutual_info_score(
    labels_true=kmeans_cluster_df['cluster_id'],
    labels_pred=mmseqs_cluster_df['cluster_id'],
)

np.float64(0.2076523047432236)

In [33]:
# Copy the `label` and `lineage` columns from the dataset onto both cluster tables.
# Column assignment aligns on the index, which all three frames share after the earlier `.loc` reorder.
for cluster_df in (mmseqs_cluster_df, kmeans_cluster_df):
    for column in ('label', 'lineage'):
        cluster_df[column] = dataset_df[column]

In [None]:
def get_cluster_metadata(cluster_df:pd.DataFrame) -> pd.DataFrame:
    """Summarize every cluster of a per-member cluster assignment table.

    Args:
        cluster_df: One row per clustered member. Must provide `cluster_id`
            (the grouping key), `label` and `lineage`.

    Returns:
        A DataFrame indexed by `cluster_id` (in the sorted order `groupby`
        produces) with the columns, in this order:
            singleton  - True when the cluster has exactly one member.
            homogenous - True when the cluster has exactly one distinct non-null label.
            size       - number of members in the cluster.
            n_taxa     - number of distinct non-null `lineage` values in the cluster.
        An empty `cluster_df` yields an empty frame with these same columns.

    Raises:
        KeyError: if `cluster_id`, `label` or `lineage` is missing from `cluster_df`.
    """
    grouped = cluster_df.groupby('cluster_id')

    # Built-in groupby reductions replace a Python-level loop over every group, which is slow
    # when there are tens of thousands of clusters. `nunique` ignores nulls by default.
    sizes = grouped.size()
    n_labels = grouped['label'].nunique()
    n_taxa = grouped['lineage'].nunique()

    # All three Series share the same cluster_id index, so the columns line up row for row.
    cluster_metadata_df = pd.DataFrame({
        'singleton': sizes == 1,
        'homogenous': n_labels == 1,
        'size': sizes,
        'n_taxa': n_taxa,
    })
    return cluster_metadata_df.rename_axis('cluster_id')

# Per-cluster summaries for both clustering strategies, computed in one pass over the two tables.
mmseqs_cluster_metadata_df, kmeans_cluster_metadata_df = (
    get_cluster_metadata(assignments)
    for assignments in (mmseqs_cluster_df, kmeans_cluster_df)
)

In [None]:
def table_1(kmeans_cluster_metadata_df, mmseqs_cluster_metadata_df):
    """Build a one-row-per-strategy summary of the two clusterings.

    Args:
        kmeans_cluster_metadata_df: Per-cluster metadata for the k-means clustering,
            with the columns `size`, `singleton`, `homogenous` and `n_taxa`.
        mmseqs_cluster_metadata_df: The same metadata for the MMSeqs clustering.

    Returns:
        A DataFrame with rows `mmseqs` and `kmeans` (in that order) and the columns
        `n_clusters`, `mean_size`, `max_size`, `n_singleton`, `n_homogenous`,
        `mean_n_taxa` and `max_n_taxa`. Every column has a numeric dtype.
    """
    cols = ['n_clusters', 'mean_size', 'max_size', 'n_singleton', 'n_homogenous', 'mean_n_taxa', 'max_n_taxa']

    def _summarize(df):
        """Collapse one per-cluster metadata frame into a single summary record."""
        return {
            'n_clusters': len(df),
            'mean_size': df['size'].mean(),
            'max_size': df['size'].max(),
            'n_singleton': df.singleton.sum(),
            'n_homogenous': df.homogenous.sum(),
            'mean_n_taxa': df.n_taxa.mean(),
            'max_n_taxa': df.n_taxa.max(),
        }

    # Insertion order fixes the row order of the table.
    strategies = {'mmseqs': mmseqs_cluster_metadata_df, 'kmeans': kmeans_cluster_metadata_df}

    # Building the frame from records lets pandas infer a numeric dtype per column. Filling a
    # pre-allocated empty frame cell by cell leaves every column as `object`, on which numeric
    # operations such as `.round()` have no effect.
    records = [_summarize(df) for df in strategies.values()]
    return pd.DataFrame(records, index=list(strategies), columns=cols)

# Render the strategy comparison table as this cell's output.
table_1(
    kmeans_cluster_metadata_df=kmeans_cluster_metadata_df,
    mmseqs_cluster_metadata_df=mmseqs_cluster_metadata_df,
)


Unnamed: 0,n_clusters,mean_size,max_size,n_singleton,n_homogenous,mean_n_taxa,max_n_taxa
mmseqs,64591,4.559552,840,41997,64545,2.701119,335
kmeans,50000,5.89012,12,6642,50000,3.68082,12
