In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
mmseqs_df = pd.read_csv('mmseqs_cluster.tsv', sep="\t")
foldseek_df = pd.read_csv('foldseek_result_cluster.tsv', sep="\t")

In [3]:
mmseqs_df.columns = ['mmseqs_cluster', 'unclustered']

foldseek_df.columns = ['foldseek_cluster', 'unclustered']


In [4]:
pattern = r'UniRef100_([A-Za-z0-9]+)\|'
mmseqs_df['mmseqs_cluster'] = mmseqs_df['mmseqs_cluster'].str.extract(pattern)
mmseqs_df['unclustered'] = mmseqs_df['unclustered'].str.extract(pattern)


display(mmseqs_df.head())


Unnamed: 0,mmseqs_cluster,unclustered
0,A0A174B8I9,A0A174B8I9
1,A0A1E2RWF6,A0A1E2RWF6
2,A0A1V6HF93,A0A1V6HF93
3,A0A380MZP3,A0A380MZP3
4,A0A428I3D7,A0A428I3D7


In [5]:
foldseek_df['foldseek_cluster'] = foldseek_df['foldseek_cluster'].apply(lambda x: x.split('-')[1])
foldseek_df['unclustered'] = foldseek_df['unclustered'].apply(lambda x: x.split('-')[1])


display(foldseek_df.head())

Unnamed: 0,foldseek_cluster,unclustered
0,A0A078MK14,A0A4V2DZ80
1,A0A078MK14,A0A2R2W5C2
2,A0A0A2VBZ4,A0A0A2VBZ4
3,A0A0A2VBZ4,A0A1G8S2X6
4,A0A0B7MQS5,A0A0B7MQS5


In [6]:
mmseqs_df.to_csv('mmseqs__cleaned_cluster.tsv', sep="\t", index=False)
foldseek_df.to_csv('foldseek__cleaned_cluster.tsv', sep="\t", index=False)


In [7]:
# Group by the representative sequence and aggregate the clustered sequences into lists
foldseek_dict = foldseek_df.groupby('foldseek_cluster')['unclustered'].apply(list).to_dict()

list(foldseek_dict.items())[:10]

[('A0A078MK14', ['A0A4V2DZ80', 'A0A2R2W5C2']),
 ('A0A0A2VBZ4', ['A0A0A2VBZ4', 'A0A1G8S2X6']),
 ('A0A0B7MQS5', ['A0A0B7MQS5']),
 ('A0A0D0RVH7', ['A0A0D0RVH7', 'A0A098EI80', 'A0A2S5D4V8']),
 ('A0A0G1MTH1', ['A0A0G1MTH1']),
 ('A0A117EBF4', ['A0A117EBF4', 'A0A1V5U177', 'A0A829ZIN9']),
 ('A0A151B350',
  ['A0A151B350', 'A0A162TRS5', 'A0A1V4IFT6', 'A0A0L6Z818', 'A0A1V5L937']),
 ('A0A163ZA77', ['A0A163ZA77']),
 ('A0A165B3M3',
  ['A0A165B3M3',
   'A0A1H0WYG5',
   'A0A1V6C403',
   'A0A4Y7R8Q8',
   'A0A6V7R5K0',
   'A0A7T1F3W4',
   'S7SZV7',
   'A0A2R3JUC6',
   'A0A1Y0WKX0',
   'A0A6F9XLF6',
   'A0A330LFL9',
   'A0A7U9QYQ8',
   'A0A7U9MPR8',
   'A0A143X992',
   'A0A1C5KYE3',
   'A0A173U6N6',
   'A0A174B8I9',
   'A0A1C5PI34',
   'A0A1C6A4C0',
   'A0A1C6JX73',
   'A0A7U9WXR3',
   'A0A7V8FF20',
   'A0A2T0MBL5',
   'A0A378I0S2',
   'A0A857JNU5',
   'A3TVW4']),
 ('A0A166IG42', ['A0A166IG42'])]

In [10]:
mmseqs_values = mmseqs_df['mmseqs_cluster'].tolist()

print(mmseqs_values, len(mmseqs_values))

['A0A174B8I9', 'A0A1E2RWF6', 'A0A1V6HF93', 'A0A380MZP3', 'A0A428I3D7', 'R5RW36', 'R5RW36', 'A0A285NMQ5', 'A0A285NMQ5', 'A0A285NMQ5', 'A0A7J0AAJ5', 'A0A916HN28', 'A0A916HN28', 'A0A916HN28', 'A0A916HN28', 'A0A916HN28', 'A0A916HN28', 'A0A916HN28', 'A0A916HN28', 'A0A916HN28', 'A0A916HN28', 'A0A916HN28', 'A0A916HN28', 'A0A916HN28', 'A0A916HN28', 'A0A916HN28', 'A0A916HN28', 'A0A916HN28', 'A0A916HN28', 'A0A0J6VLZ1', 'A0A0J6VLZ1', 'A0A1C6G745', 'A0A0J1I6K8', 'A0A1V4SDX7', 'A0A3S5AQD8', 'A0A3S5AQD8', 'A0A3S5AQD8', 'A0A3S5AQD8', 'A0A3S5AQD8', 'A0A6M1ZB96', 'A0A248JXY1', 'A0A433XPS9', 'A0A433XPS9', 'A0A1C5ML14', 'A0A1C5YPL9', 'A0A1V4WZV6', 'A0A1V6JEK4', 'A0A246KEF4', 'A0A246KEF4', 'A0A246KEF4', 'A0A246KEF4', 'A0A6N2SLV1', 'A0A977II70', 'R5STP1', 'A0A7U9MND5', 'A0A7U9MND5', 'R6SGY6', 'A0A1I6KYN6', 'A0A841RQA6', 'A0A841RQA6', 'A0A841RQA6', 'A0A841RQA6', 'A0A841RQA6', 'A0A841RQA6', 'A0A841RQA6', 'A0A1U7M8N9', 'A0A379DHS9', 'A0A379DHS9', 'A0A6N7B1P8', 'C3JB20', 'A0A0V8JNZ3', 'A0A0V8JNZ3', 'A0A0V8JNZ3

1879 1879


In [17]:
no_match = [x for x in mmseqs_values if x not in hits]

print(no_match)

['A0A916HN28', 'A0A916HN28', 'A0A916HN28', 'A0A916HN28', 'A0A916HN28', 'A0A916HN28', 'A0A916HN28', 'A0A916HN28', 'A0A916HN28', 'A0A916HN28', 'A0A916HN28', 'A0A916HN28', 'A0A916HN28', 'A0A916HN28', 'A0A916HN28', 'A0A916HN28', 'A0A916HN28', 'A0A916HN28', 'A0A977II70', 'A0A916IAR6', 'A0A2U3N4J8', 'A0A8A0RIX7', 'A0A919ZDK0', 'A0A919ZDK0', 'A0A919ZDK0', 'A0A916BQR8', 'A0A916BQR8', 'A0A916BQR8', 'A0A917VZF3', 'A0A917VZF3', 'UPI0001CF73DB', 'A0A928K5J0', 'A0A928K5J0', 'A0A928K5J0', 'A0A928K5J0', 'A0A928K5J0', 'A0A928K5J0', 'A0A928K5J0', 'A0A928K5J0', 'A0A928K5J0', 'A0A928K5J0', 'A0A928K5J0', 'A0A928K5J0', 'A0A928K5J0', 'A0A928K5J0', 'A0A928K5J0', 'A0A928K5J0', 'A0A928K5J0', 'A0A928K5J0', 'A0A928K5J0', 'A0A928K5J0', 'A0A928K5J0', 'A0A928K5J0', 'A0A928K5J0', 'A0A928K5J0', 'A0A928K5J0', 'A0A928K5J0', 'A0A928K5J0', 'A0A928K5J0', 'A0A928K5J0', 'A0A928K5J0', 'A0A928K5J0', 'A0A928K5J0', 'A0A928K5J0', 'A0A928K5J0', 'A0A949RNE6', 'A0A916KA10', 'A0A916HNI5', 'A0A940ZKM3', 'A0A8S0GDS2', 'A0A942PKR4', 'A

In [56]:
print(len(foldseek_list))

0


In [21]:
sample_humann = pd.read_csv('C:\\Users\\odesa\\Desktop\\CRCFinal\\PRJEB7774\\clean_joined_genefamilies_relab_7774.tsv', sep="\t")

In [None]:
dl_df = sample_humann.loc[:, sample_humann.columns.str.startswith('DL-endopeptidase')]

display(dl_df.head())

In [58]:
column_names = dl_df.columns.tolist()

print(len(column_names))

369


In [33]:
column_ids = [x.split('_')[2] for x in column_names]

print(column_ids)

['A0A0A1MTG8', 'A0A0F0C6W3', 'A0A0F0CDC4', 'A0A0F0CGA5', 'A0A0F0CIG8', 'A0A0F0CJY0', 'A0A0F0CLG3', 'A0A0M6WWS7', 'A0A0P0FDA5', 'A0A0P0GID1', 'A0A108T752', 'A0A133S199', 'A0A143X3H5', 'A0A143X992', 'A0A143Y3F3', 'A0A143ZRA8', 'A0A151G0K1', 'A0A151G1C0', 'A0A151G438', 'A0A173R646', 'A0A173RM29', 'A0A173S7J0', 'A0A173SB35', 'A0A173SDE8', 'A0A173SF50', 'A0A173SQ00', 'A0A173SYG7', 'A0A173T002', 'A0A173T2N3', 'A0A173TCM6', 'A0A173TCP1', 'A0A173TWP3', 'A0A173TXE3', 'A0A173U138', 'A0A173U4W7', 'A0A173U6N6', 'A0A173UKT6', 'A0A173VAM1', 'A0A173WBD2', 'A0A173WHC1', 'A0A173WZX5', 'A0A173XJ87', 'A0A173Y9D9', 'A0A173YAR0', 'A0A173YI13', 'A0A173YY14', 'A0A173Z3W7', 'A0A174AG47', 'A0A174B8I9', 'A0A174BSD0', 'A0A174BWQ9', 'A0A174C4S4', 'A0A174CBG2', 'A0A174CGJ4', 'A0A174DPP8', 'A0A174DTW2', 'A0A174E351', 'A0A174E8P5', 'A0A174EN32', 'A0A174F2M5', 'A0A174FMR5', 'A0A174G463', 'A0A174GWM5', 'A0A174JNK9', 'A0A174JP72', 'A0A174KQX2', 'A0A174M4B0', 'A0A174MFZ4', 'A0A174N6D9', 'A0A174NG16', 'A0A174Q425', 'A0A1

In [8]:
# Group by the representative sequence and aggregate the clustered sequences into lists
mmseqs_dict = mmseqs_df.groupby('mmseqs_cluster')['unclustered'].apply(list).to_dict()

print(len(list(mmseqs_dict.items())))

542


In [39]:
mmseqs_list = []
hits = []

for value in column_ids:
    for key, content in mmseqs_dict.items():
        if value in content or value == key:
            mmseqs_list.append(key)
            hits.append(value)

no_match = [x for x in column_ids if x not in hits]

print(len(no_match), len(mmseqs_list))

369
0 369


In [41]:
foldseek_list = []
hits = []

for value in mmseqs_list:

    for key, content in foldseek_dict.items():
        if value in content or value == key:
            foldseek_list.append(key)
            hits.append(value)
            


no_match = [x for x in mmseqs_list if x not in hits]

print(len(no_match), len(foldseek_list))  


18 351


In [55]:
def get_cluster(raw_list, mmseqs_dict, foldseek_dict):
    ''' Input a list of ids and 2 dictionaries describing cluster
        patterns, return a list of foldseek cluster ids'''
    
    mmseqs_list = []
    hits = []

    for value in raw_list:
        for key, content in mmseqs_dict.items():
            if value in content or value == key:
                mmseqs_list.append(key)
                hits.append(value)

    # Need to figure out what to do here if mmseqs dont map
    no_match = [x for x in raw_list if x not in hits]

    print(f'length of no hits: {len(no_match)}')

    foldseek_list = []
    hits = []

    for value in mmseqs_list:

        for key, content in foldseek_dict.items():
            if value in content or value == key:
                foldseek_list.append(key)
                hits.append(value)

    no_match = [x for x in mmseqs_list if x not in hits]

    foldseek_list.extend(no_match)

    return foldseek_list

In [56]:
test = get_cluster(column_ids, mmseqs_dict, foldseek_dict)

length of no hits: 0


In [57]:
print(len(test), len(column_ids))
print(test)

369 369
['A0A1Y2MSQ6', 'A0A1I6KY25', 'A0A399ERK4', 'A0A1C5NTX1', 'A0A1C5YXG5', 'A0A165B3M3', 'A0A3S5AQD8', 'A0A1M5VYK8', 'A0A1C6JJG3', 'A0A1C6JJG3', 'A0A1C6JJG3', 'A0A7R7E2C2', 'A0A1C5S0Y9', 'A0A165B3M3', 'R7C958', 'A0A285HCD2', 'A0A165B3M3', 'A0A1C6JJG3', 'A0A399ERK4', 'A0A1U7M8N9', 'A0A399ERK4', 'A0A1I6KY25', 'A0A1M5VYK8', 'A0A1M5VYK8', 'A0A1I2NVF2', 'A0A1C5YXG5', 'A0A1C6JJG3', 'A0A165B3M3', 'A0A1C6JJG3', 'A0A1U7M8N9', 'A0A1C5KPL5', 'R7C958', 'A0A1C5KPL5', 'A0A174RHF1', 'A0A1C6JJG3', 'A0A165B3M3', 'A0A399ERK4', 'A0A1C5KMM8', 'A0A399ERK4', 'A0A1C5KPL5', 'A0A174RHF1', 'A0A1C5YXG5', 'A0A1C5KPL5', 'A0A1C5KPL5', 'A0A7R7E2C2', 'A0A1U7M8N9', 'A0A174RHF1', 'A0A6V8LYJ9', 'A0A165B3M3', 'A0A1V6BVN9', 'A0A6V8LYJ9', 'A0A399ERK4', 'A0A1C6JJG3', 'A0A1C5KPL5', 'A0A399ERK4', 'A0A1C5KPL5', 'A0A174RHF1', 'A0A1U7M8N9', 'A0A1C5KPL5', 'A0A174N6D9', 'A0A399ERK4', 'A0A1M5VYK8', 'A0A1C5KMM8', 'A0A1U7M8N9', 'A0A1C5KPL5', 'A0A1U7M8N9', 'A0A174RHF1', 'A0A1C5YXG5', 'A0A174N6D9', 'A0A6V8LYJ9', 'A0A6V8LYJ9', 'A0A1

In [59]:
dl_df.columns = test

display(dl_df.head())

Unnamed: 0,A0A1Y2MSQ6,A0A1I6KY25,A0A399ERK4,A0A1C5NTX1,A0A1C5YXG5,A0A165B3M3,A0A3S5AQD8,A0A1M5VYK8,A0A1C6JJG3,A0A1C6JJG3.1,...,A0A928K5J0,A0A928K5J0.1,A0A928K5J0.2,A0A928K5J0.3,A0A928K5J0.4,A0A928K5J0.5,A0A928K5J0.6,A0A928K5J0.7,A0A928K5J0.8,A0A928K5J0.9
0,3.6524e-07,0.0,0.0,0.0,0.0,0.0,1.3177e-05,0.0,9.11035e-07,1.24682e-06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.03368e-07
1,1.11603e-07,0.0,0.0,0.0,0.0,0.0,3.28863e-06,0.0,1.08744e-06,2.89881e-06,...,1.75495e-06,0.0,0.0,0.0,1.00561e-06,0.0,0.0,0.0,6.90165e-07,1.6436e-07
2,5.16826e-07,0.0,1.96652e-07,0.0,0.0,0.0,1.03072e-05,6.76531e-08,0.0,0.0,...,5.82473e-07,0.0,0.0,0.0,0.0,0.0,2e-06,0.0,0.0,0.0
3,1.85868e-07,0.0,0.0,3.04414e-07,0.0,1.21514e-07,5.98678e-06,1.48626e-06,4.10468e-07,9.42576e-07,...,1.11619e-06,0.0,1.2e-05,0.0,0.0,0.0,1e-06,0.0,0.0,0.0
4,3.249e-07,0.0,0.0,0.0,0.0,0.0,3.03995e-07,1.51553e-06,0.0,0.0,...,0.0,0.0,0.0,0.0,8.59547e-07,0.0,0.0,0.0,0.0,0.0


In [62]:
agg_df = dl_df.groupby(dl_df.columns, axis=1).sum()

display(agg_df.head())

  agg_df = dl_df.groupby(dl_df.columns, axis=1).sum()


Unnamed: 0,A0A165B3M3,A0A174N6D9,A0A174RHF1,A0A1C5KMM8,A0A1C5KPL5,A0A1C5ML14,A0A1C5NTX1,A0A1C5QMY6,A0A1C5S0Y9,A0A1C5SKU4,...,A0A2K4ZQP4,A0A2K9E4F8,A0A399ERK4,A0A3S5AQD8,A0A4R7RUE4,A0A6V8LYJ9,A0A7R7E2C2,A0A928K5J0,R7C958,R7F3K0
0,8e-06,7.1e-05,6.4e-05,5.8e-05,2.9e-05,0.0,0.0,2.20625e-06,8.5e-05,0.0,...,2e-06,0.0,3e-05,3.6e-05,0.0,3.4e-05,5e-06,4.03368e-07,1.3e-05,0.0
1,4e-06,5.4e-05,2.2e-05,4.7e-05,2.7e-05,9.29607e-07,0.0,1.13545e-06,6.6e-05,0.0,...,2e-06,0.0,1.6e-05,8e-06,0.0,2e-05,6e-06,1.28513e-05,4e-06,4.48847e-07
2,1.4e-05,1.4e-05,2.7e-05,2.9e-05,2.3e-05,8.24217e-07,0.0,3.34088e-06,0.000104,2.11397e-07,...,2e-06,0.0,2.4e-05,3.1e-05,0.0,1.6e-05,9e-06,5.322443e-06,5e-06,1.056356e-06
3,1.5e-05,2.1e-05,3.2e-05,2.3e-05,1.4e-05,6.14132e-07,7.66209e-07,0.0,4.3e-05,0.0,...,1e-06,0.0,3e-06,1.5e-05,0.0,7e-06,5e-06,1.635855e-05,1e-05,4.0299e-07
4,6e-06,3.2e-05,5.1e-05,3.6e-05,1.7e-05,5.18522e-07,0.0,3.74502e-07,0.000139,0.0,...,2e-06,0.0,7e-06,1.1e-05,0.0,4.8e-05,5e-06,3.316007e-06,1e-05,1.96286e-07
