In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys

**Functions**

In [3]:
# Create a function, when given a foldseek ID, returns all proteins within that cluster

def get_proteins(cluster_id, foldseek_dict, mmseqs_dict):
    ''' Input a cluster id and 2 dictionaries describing cluster
        patterns, return a list of all proteins in that cluster'''
    
    mmseqs_rep = foldseek_dict[cluster_id]

    protein_list = []
    for id in mmseqs_rep:
        protein_list.extend(mmseqs_dict[id])
    
    return protein_list

In [None]:
# Function to map single protein IDS to their respective clusters

def get_cluster(raw_list, mmseqs_dict, foldseek_dict):
    ''' Input a list of ids and 2 dictionaries describing cluster
        patterns, return a list of foldseek cluster ids'''
    
    mmseqs_list = []
    hits = []

    for value in raw_list:
        for key, content in mmseqs_dict.items():
            if value in content or value == key:
                mmseqs_list.append(key)
                hits.append(value)

    # Need to figure out what to do here if mmseqs dont map
    no_match = [x for x in raw_list if x not in hits]

    print(f'length of no hits: {len(no_match)}')

    foldseek_list = []
    hits = []

    for value in mmseqs_list:

        for key, content in foldseek_dict.items():
            if value in content or value == key:
                foldseek_list.append(key)
                hits.append(value)

    no_match = [x for x in mmseqs_list if x not in hits]

    foldseek_list.extend(no_match)

    return foldseek_list

In [37]:
# testing for new function

foldseek_ids = list(foldseek_dict.keys())

# print(len(foldseek_ids), foldseek_ids)

new_df = pd.DataFrame(columns=['cluster_id', 'proteins'])

for idx, id in enumerate(foldseek_ids):
    proteins = get_proteins(id, foldseek_dict, mmseqs_dict)
    new_df.loc[idx] = [id, len(proteins)]

print(new_df)


     cluster_id  proteins
0    A0A078MK14         2
1    A0A0A2VBZ4         3
2    A0A0B7MQS5         1
3    A0A0D0RVH7         7
4    A0A0G1MTH1         1
..          ...       ...
127      S7U7G8         1
128      V4R1H7         1
129      V6MEX2         6
130      V7PZ44         1
131      W4QUP1         1

[132 rows x 2 columns]


In [60]:
print(sum(new_df['proteins']))

1879


In [36]:
signalp = pd.read_csv('C:\\Users\\odesa\\OneDrive - University of Toronto\\CRC\\LatestDataJan\\signalp\\prediction_results.txt', sep='\t')

# display(signalp.head()) 

signalp['# ID'] = signalp['# ID'].str.split('_').str[2]

signalp['# ID'] = signalp['# ID'].str.split('|').str[0]

display(signalp.head()) 


Unnamed: 0,# ID,Prediction,OTHER,SP(Sec/SPI),LIPO(Sec/SPII),TAT(Tat/SPI),TATLIPO(Sec/SPII),PILIN(Sec/SPIII),CS Position
0,A0A023CM44,OTHER,1.000064,0.0,0.0,0.0,0.0,0.0,
1,A0A060DHE3,OTHER,0.612178,0.00843,0.379395,1.3e-05,1.3e-05,1.3e-05,
2,A0A072NJA5,OTHER,1.000076,0.0,0.0,0.0,0.0,0.0,
3,A0A072NN92,OTHER,1.000046,2e-06,0.0,0.0,0.0,0.0,
4,A0A077MGS3,OTHER,1.000048,1e-06,0.0,0.0,0.0,0.0,


In [42]:
sec_dict = {}

for idx, id in enumerate(foldseek_ids):
    proteins = get_proteins(id, foldseek_dict, mmseqs_dict)
    for protein in proteins:
        if protein in signalp['# ID'].values:
            other_column_value = signalp.loc[signalp['# ID'] == protein, 'Prediction'].values[0]
            sec_dict[protein] = other_column_value


print(len(sec_dict))

1879


**Import cluster data and clean it**

In [54]:
mmseqs_df = pd.read_csv('mmseqs_cluster.tsv', sep="\t")
foldseek_df = pd.read_csv('foldseek_result_cluster.tsv', sep="\t")

In [None]:
mmseqs_df.columns = ['mmseqs_cluster', 'unclustered']

foldseek_df.columns = ['foldseek_cluster', 'unclustered']

display(mmseqs_df.head())
display(foldseek_df.head())

In [None]:

mmseqs_df['mmseqs_cluster'] = mmseqs_df['mmseqs_cluster'].apply(lambda x: x.split('_')[2])
mmseqs_df['mmseqs_cluster'] = mmseqs_df['mmseqs_cluster'].apply(lambda x: x.split('|')[0])
mmseqs_df['unclustered'] = mmseqs_df['unclustered'].apply(lambda x: x.split('_')[2])
mmseqs_df['unclustered'] = mmseqs_df['unclustered'].apply(lambda x: x.split('|')[0])


display(mmseqs_df.head())


In [None]:
foldseek_df['foldseek_cluster'] = foldseek_df['foldseek_cluster'].apply(lambda x: x.split('-')[1])
foldseek_df['unclustered'] = foldseek_df['unclustered'].apply(lambda x: x.split('-')[1])


display(foldseek_df.head())

**Create the Foldseek and mmseqs2 dicts**

In [70]:
# Group by the representative sequence and aggregate the clustered sequences into lists
foldseek_dict = foldseek_df.groupby('foldseek_cluster')['unclustered'].apply(list).to_dict()

print(len(list(foldseek_dict.items())))

132


In [69]:
print(len(set(foldseek_dict.keys())))

132


In [59]:
# Group by the representative sequence and aggregate the clustered sequences into lists
mmseqs_dict = mmseqs_df.groupby('mmseqs_cluster')['unclustered'].apply(list).to_dict()

print(len(list(mmseqs_dict.items())))

542


**Test the clustering on the PRJEB7774 Data**

In [None]:
sample_humann = pd.read_csv('C:\\Users\\odesa\\OneDrive - University of Toronto\\CRC\\LatestDataJan\\PRJEB7774\\clean_joined_genefamilies_relab_7774.tsv', sep="\t")

display(sample_humann.head())

In [None]:
# Select only the DL columns
dl_df = sample_humann.loc[:, sample_humann.columns.str.startswith('DL-endopeptidase')]

display(dl_df.head())

In [14]:
# Make a list of the column names and check length
column_names = dl_df.columns.tolist()

print(len(column_names))

369


In [15]:
# Extract the UniRef IDs from the column names
column_ids = [x.split('_')[2] for x in column_names]

print(column_ids)

['A0A0A1MTG8', 'A0A0F0C6W3', 'A0A0F0CDC4', 'A0A0F0CGA5', 'A0A0F0CIG8', 'A0A0F0CJY0', 'A0A0F0CLG3', 'A0A0M6WWS7', 'A0A0P0FDA5', 'A0A0P0GID1', 'A0A108T752', 'A0A133S199', 'A0A143X3H5', 'A0A143X992', 'A0A143Y3F3', 'A0A143ZRA8', 'A0A151G0K1', 'A0A151G1C0', 'A0A151G438', 'A0A173R646', 'A0A173RM29', 'A0A173S7J0', 'A0A173SB35', 'A0A173SDE8', 'A0A173SF50', 'A0A173SQ00', 'A0A173SYG7', 'A0A173T002', 'A0A173T2N3', 'A0A173TCM6', 'A0A173TCP1', 'A0A173TWP3', 'A0A173TXE3', 'A0A173U138', 'A0A173U4W7', 'A0A173U6N6', 'A0A173UKT6', 'A0A173VAM1', 'A0A173WBD2', 'A0A173WHC1', 'A0A173WZX5', 'A0A173XJ87', 'A0A173Y9D9', 'A0A173YAR0', 'A0A173YI13', 'A0A173YY14', 'A0A173Z3W7', 'A0A174AG47', 'A0A174B8I9', 'A0A174BSD0', 'A0A174BWQ9', 'A0A174C4S4', 'A0A174CBG2', 'A0A174CGJ4', 'A0A174DPP8', 'A0A174DTW2', 'A0A174E351', 'A0A174E8P5', 'A0A174EN32', 'A0A174F2M5', 'A0A174FMR5', 'A0A174G463', 'A0A174GWM5', 'A0A174JNK9', 'A0A174JP72', 'A0A174KQX2', 'A0A174M4B0', 'A0A174MFZ4', 'A0A174N6D9', 'A0A174NG16', 'A0A174Q425', 'A0A1

In [None]:
# Apply the function to the test set and print some diagnostics

test = get_cluster(column_ids, mmseqs_dict, foldseek_dict)

print(len(test), len(column_ids))
print(test)

In [None]:
# Rename single proteins with their respective foldseek cluster reps

dl_df.columns = test

display(dl_df.head())

In [19]:
# Combine the columns with the same name and sum the values

agg_df = dl_df.groupby(dl_df.columns, axis=1).sum()

display(agg_df.head())

  agg_df = dl_df.groupby(dl_df.columns, axis=1).sum()


Unnamed: 0,A0A165B3M3,A0A174N6D9,A0A174RHF1,A0A1C5KMM8,A0A1C5KPL5,A0A1C5ML14,A0A1C5NTX1,A0A1C5QMY6,A0A1C5S0Y9,A0A1C5SKU4,...,A0A2K4ZQP4,A0A2K9E4F8,A0A399ERK4,A0A3S5AQD8,A0A4R7RUE4,A0A6V8LYJ9,A0A7R7E2C2,A0A928K5J0,R7C958,R7F3K0
0,8e-06,7.1e-05,6.4e-05,5.8e-05,2.9e-05,0.0,0.0,2.20625e-06,8.5e-05,0.0,...,2e-06,0.0,3e-05,3.6e-05,0.0,3.4e-05,5e-06,4.03368e-07,1.3e-05,0.0
1,4e-06,5.4e-05,2.2e-05,4.7e-05,2.7e-05,9.29607e-07,0.0,1.13545e-06,6.6e-05,0.0,...,2e-06,0.0,1.6e-05,8e-06,0.0,2e-05,6e-06,1.28513e-05,4e-06,4.48847e-07
2,1.4e-05,1.4e-05,2.7e-05,2.9e-05,2.3e-05,8.24217e-07,0.0,3.34088e-06,0.000104,2.11397e-07,...,2e-06,0.0,2.4e-05,3.1e-05,0.0,1.6e-05,9e-06,5.322443e-06,5e-06,1.056356e-06
3,1.5e-05,2.1e-05,3.2e-05,2.3e-05,1.4e-05,6.14132e-07,7.66209e-07,0.0,4.3e-05,0.0,...,1e-06,0.0,3e-06,1.5e-05,0.0,7e-06,5e-06,1.635855e-05,1e-05,4.0299e-07
4,6e-06,3.2e-05,5.1e-05,3.6e-05,1.7e-05,5.18522e-07,0.0,3.74502e-07,0.000139,0.0,...,2e-06,0.0,7e-06,1.1e-05,0.0,4.8e-05,5e-06,3.316007e-06,1e-05,1.96286e-07


In [None]:
# Replace the sample id column to the aggregated dataframe

agg_df = agg_df.merge(sample_humann[['sample_id']], left_index=True, right_index=True)

display(agg_df.head())

In [None]:
# Save the dataframe to a feather file

agg_df.to_feather('clustered_complete_7774.feather')

**Clustering DRA008156**

In [8]:
DRA008156_humann = pd.read_feather('C:\\Users\\odesa\\OneDrive - University of Toronto\\CRC\\LatestDataJan\\DRA008156\\clean_joined_genefamilies_relab_DRA008156.feather')

display(DRA008156_humann.head())

# Gene Family,sample_id,UNMAPPED,Amidase_UniRef100_A0A010NJR5,Amidase_UniRef100_A0A010NWV0,Amidase_UniRef100_A0A010PQ73,Amidase_UniRef100_A0A010YT92,Amidase_UniRef100_A0A010ZI67,Amidase_UniRef100_A0A015SN82,Amidase_UniRef100_A0A015SSH9,Amidase_UniRef100_A0A015SVI2,...,UC118_WP_242439805.1,UC118_WP_242458478.1,UC118_WP_243463155.1,UC118_WP_249742344.1,UC118_WP_250200784.1,UC118_WP_253005939.1,UC118_WP_255820014.1,UC118_WP_263296879.1,UC118_WP_263297069.1,UC118_WP_263298109.1
0,DRR127476,0.995873,0.0,0.0,0.0,0.0,0.0,5.38203e-08,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,DRR127478,0.994367,0.0,0.0,0.0,0.0,0.0,1.67309e-07,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,DRR127481,0.995911,0.0,0.0,0.0,0.0,0.0,1.51146e-07,0.0,6.41207e-07,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,DRR127485,0.99606,0.0,0.0,0.0,0.0,0.0,4.45893e-08,0.0,1.83732e-07,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,DRR127488,0.995879,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Select only the DL columns
dl_df_8156 = DRA008156_humann.loc[:, DRA008156_humann.columns.str.startswith('DL-endopeptidase')]

display(dl_df_8156.head())

# Gene Family,DL-endopeptidase_UniRef100_A0A0A1MTG8,DL-endopeptidase_UniRef100_A0A0C3KJ32,DL-endopeptidase_UniRef100_A0A0D1KI97,DL-endopeptidase_UniRef100_A0A0D1KRP9,DL-endopeptidase_UniRef100_A0A0F0C6W3,DL-endopeptidase_UniRef100_A0A0F0CDC4,DL-endopeptidase_UniRef100_A0A0F0CGA5,DL-endopeptidase_UniRef100_A0A0F0CIG8,DL-endopeptidase_UniRef100_A0A0F0CJY0,DL-endopeptidase_UniRef100_A0A0F0CLG3,...,DL-endopeptidase_UniRef100_UPI00203B6123,DL-endopeptidase_UniRef100_UPI00203BC975,DL-endopeptidase_UniRef100_UPI00204244DF,DL-endopeptidase_UniRef100_UPI0020B33D17,DL-endopeptidase_UniRef100_UPI0021558C01,DL-endopeptidase_UniRef100_UPI00227E98D5,DL-endopeptidase_UniRef100_UPI00228231F2,DL-endopeptidase_UniRef100_UPI002282FA9A,DL-endopeptidase_UniRef100_UPI002286517A,DL-endopeptidase_UniRef100_UPI0022AA0C16
0,2.55904e-07,0.0,0.0,0.0,0.0,3.10215e-07,6.50298e-07,0.0,2.17124e-07,1.41587e-06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.61218e-06,0.0,0.0,0.0,0.0,6.42969e-07,1.50553e-06,4.64553e-07,4.8775e-07,7.87507e-07,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.47969e-07,0.0,0.0,0.0,0.0,1.58424e-07,5.5901e-07,0.0,0.0,6.58255e-07,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.12123e-06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2.30417e-08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.82275e-06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# Make a list of the column names and check length
column_names = dl_df_8156.columns.tolist()

print(len(column_names))

461


In [11]:
# Extract the UniRef IDs from the column names
column_ids = [x.split('_')[2] for x in column_names]

print(len(column_ids))

461


In [20]:
cluster_reps_8156 = get_cluster(column_ids, mmseqs_dict, foldseek_dict)

length of no hits: 0


In [21]:
print(len(cluster_reps_8156), len(column_ids))

461 461


In [22]:
dl_df_8156.columns = cluster_reps_8156

display(dl_df_8156.head())

Unnamed: 0,A0A1Y2MSQ6,A0A1I0SEW6,A0A6C1VQR4,A0A6C1VQR4.1,A0A1I6KY25,A0A399ERK4,A0A1C5NTX1,A0A1C5YXG5,A0A165B3M3,A0A3S5AQD8,...,A0A928K5J0,A0A928K5J0.1,A0A928K5J0.2,A0A928K5J0.3,A0A928K5J0.4,A0A928K5J0.5,A0A928K5J0.6,A0A928K5J0.7,A0A928K5J0.8,A0A928K5J0.9
0,2.55904e-07,0.0,0.0,0.0,0.0,3.10215e-07,6.50298e-07,0.0,2.17124e-07,1.41587e-06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.61218e-06,0.0,0.0,0.0,0.0,6.42969e-07,1.50553e-06,4.64553e-07,4.8775e-07,7.87507e-07,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.47969e-07,0.0,0.0,0.0,0.0,1.58424e-07,5.5901e-07,0.0,0.0,6.58255e-07,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.12123e-06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2.30417e-08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.82275e-06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
agg_df_8156 = dl_df_8156.groupby(dl_df_8156.columns, axis=1).sum()

display(agg_df_8156.head())

  agg_df_8156 = dl_df_8156.groupby(dl_df_8156.columns, axis=1).sum()


Unnamed: 0,A0A165B3M3,A0A174N6D9,A0A174RHF1,A0A1C5KMM8,A0A1C5KPL5,A0A1C5ML14,A0A1C5NTX1,A0A1C5QMY6,A0A1C5S0Y9,A0A1C5SKU4,...,A0A4R7RUE4,A0A6C1VQR4,A0A6V8LYJ9,A0A7R7E2C2,A0A928K5J0,C3JB20,F5XN07,R7C958,R7F3K0,R7K962
0,3.2e-05,1.3e-05,7e-06,2.4e-05,3.306468e-06,0.0,1.603904e-06,0.0,4.2e-05,0.0,...,0.0,5.86635e-07,5.42681e-07,3e-06,0.0,0.0,2.98616e-08,1.442146e-06,0.0,0.0
1,7e-06,0.000123,1.1e-05,2.3e-05,1.03257e-06,0.0,7.26073e-06,0.0,1.7e-05,0.0,...,0.0,0.0,7.561224e-06,2e-06,0.0,0.0,0.0,5.64572e-07,0.0,0.0
2,2e-06,7e-06,8e-06,1.1e-05,9.75243e-07,0.0,2.687863e-06,0.0,3.5e-05,0.0,...,0.0,0.0,2.118201e-06,3e-06,0.0,0.0,0.0,4.78786e-07,0.0,0.0
3,2.7e-05,2e-06,7e-06,2.1e-05,3.860533e-06,0.0,6.32727e-07,4e-06,6.1e-05,0.0,...,0.0,3.94496e-07,1.347567e-06,1e-06,0.0,0.0,3.57363e-07,3.844152e-06,8.83811e-08,0.0
4,1e-05,3.4e-05,3e-06,1.7e-05,4.284396e-06,0.0,0.0,4e-06,2.8e-05,0.0,...,0.0,1.90404e-07,1.39144e-06,1e-06,0.0,0.0,0.0,1.084904e-06,1.03047e-07,0.0


In [None]:
agg_df_8156 = agg_df_8156.merge(DRA008156_humann[['sample_id']], left_index=True, right_index=True)

display(agg_df_8156.head())

In [26]:
agg_df_8156.to_feather('clustered_complete_DRA008156.feather')