In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
from functions import *
import os

**Import cluster data and create dicts**

In [2]:
# Directory containing the TSV files
directory_path = '../../data/clustering/cluster_maps/'

# List all files in the directory
file_paths = [os.path.join(directory_path, file) for file in os.listdir(directory_path) if file.endswith('.tsv')]


# Read each TSV file into a DataFrame and store them in a list
dataframes = []
for file_path in file_paths:
    df = pd.read_csv(file_path, sep='\t')
    dataframes.append(df.iloc[:, 1:])

# Concatenate all DataFrames side by side
combined_df = pd.concat(dataframes, axis=1)

# Save the combined DataFrame to a new TSV file
combined_df.to_csv('combined_clusters.tsv', sep='\t', index=False)

In [None]:
cluster_df = pd.read_csv('../../data/clustering/cluster_maps/combined_clusters.tsv', sep='\t')

headers = list(cluster_df.columns)

enzymes = set([x.split('_')[0] for x in headers])

print(enzymes) 

In [16]:
mmseqs_df = pd.read_csv('../../data/clustering/mmseqs_cleaned_cluster.tsv', sep="\t")
foldseek_df = pd.read_csv('../../data/clustering/foldseek_cleaned_cluster.tsv', sep="\t")

In [17]:
# Group by the representative sequence and aggregate the clustered sequences into lists
foldseek_dict = foldseek_df.groupby('foldseek_cluster')['unclustered'].apply(list).to_dict()

print(len(list(foldseek_dict.items())))

132


In [18]:
# Group by the representative sequence and aggregate the clustered sequences into lists
mmseqs_dict = mmseqs_df.groupby('mmseqs_cluster')['unclustered'].apply(list).to_dict()

print(len(list(mmseqs_dict.items())))

542


In [None]:
# Flatten the list of mmseqs cluster representatives from foldseek_dict
foldseek_mmseqs_reps = set()
for reps in foldseek_dict.values():
    foldseek_mmseqs_reps.update(reps)

# Check if each mmseqs cluster representative is in foldseek_mmseqs_reps
unmapped_mmseqs_reps = [rep for rep in mmseqs_dict if rep not in foldseek_mmseqs_reps]

unmapped_mmseqs_reps

# Some proteins do not have their 3d structures predicted yet and so they are not in the foldseek_dict

**Create Secretion table and dictionary**

In [None]:
signalp = pd.read_csv('C:\\Users\\odesa\\OneDrive - University of Toronto\\CRC\\LatestDataJan\\signalp\\prediction_results.txt', sep='\t')

# signalp = pd.read_csv('/home/oliver/CRConedriveData/LatestDataJan/signalp/prediction_results.txt', sep='\t')

# display(signalp.head()) 

signalp['# ID'] = signalp['# ID'].str.split('_').str[2]

signalp['# ID'] = signalp['# ID'].str.split('|').str[0]

display(signalp.head()) 


In [17]:
foldseek_ids = list(foldseek_dict.keys())

In [18]:
# Create a dictionary containing the sec tag status for each of the original proteins present in the db

sec_dict = {}

for idx, id in enumerate(foldseek_ids):
    proteins = get_proteins(id, foldseek_dict, mmseqs_dict)
    for protein in proteins:
        if protein in signalp['# ID'].values:
            other_column_value = signalp.loc[signalp['# ID'] == protein, 'Prediction'].values[0]
            sec_dict[protein] = other_column_value

print(sec_dict)


{'A0A4V2DZ80': 'OTHER', 'A0A2R2W5C2': 'OTHER', 'A0A0A2VBZ4': 'SP', 'A0A0A2TFV7': 'SP', 'A0A1G8S2X6': 'SP', 'A0A0B7MQS5': 'OTHER', 'A0A0D0RVH7': 'SP', 'A0A098EI80': 'SP', 'A0A8F5H2Q7': 'SP', 'A0A2S5D4V8': 'SP', 'A0A828ZA80': 'SP', 'B1HNB1': 'SP', 'A0A828ZJD3': 'SP', 'A0A0G1MTH1': 'OTHER', 'A0A117EBF4': 'OTHER', 'A0A2S6XDR4': 'OTHER', 'A0A3N6H9U2': 'SP', 'A0A3N6HDX8': 'SP', 'A0A6C0QBM0': 'SP', 'A0A1V5U177': 'OTHER', 'A0A829ZIN9': 'SP', 'A0A151B350': 'SP', 'A0A162TRS5': 'SP', 'A0A1V4IFT6': 'SP', 'A0A161XER4': 'SP', 'A0A162L9K6': 'OTHER', 'A0A0L6Z818': 'SP', 'A0A1V5L937': 'SP', 'A0A163ZA77': 'OTHER', 'A0A165B3M3': 'SP', 'A0A8G0S7I1': 'SP', 'UPI0022864AF2': 'SP', 'A0A0M2PCH6': 'SP', 'UPI00155FFE64': 'SP', 'UPI0008FB71ED': 'SP', 'S7VHP7': 'OTHER', 'A0A0T6BMY2': 'SP', 'A0A179SV92': 'SP', 'UPI00203B6123': 'SP', 'UPI000745E235': 'SP', 'UPI001363BBCC': 'SP', 'UPI0022815CEC': 'SP', 'A0A6H2JRD4': 'SP', 'A0A8G0TKA2': 'SP', 'UPI0007726AF7': 'SP', 'UPI0013748E59': 'SP', 'UPI002281B2C3': 'SP', 'A0A8I1

In [19]:
# Doesnt currently work but need to check if SP is the only possible sec tag

# Flattening lists and extracting unique values
unique_values = set(value for values_list in sec_dict.values() for value in values_list)

print(unique_values)

{'O', 'L', 'S', 'E', 'T', 'P', 'H', 'A', 'I', 'R'}


**Create Domain Table**

In [None]:
IPS = pd.read_csv('/home/oliver/CRConedriveData/LatestDataJan/IPS/all_dl_endo_domains.tsv', sep='\t', header=None)

display(IPS.head())

In [None]:
IPS[0] = IPS[0].str.split('_').str[2]

IPS[0] = IPS[0].str.split('|').str[0]

display(IPS.head())

In [40]:
column_5_list = IPS.iloc[:, 5].tolist()
unique_values = set(column_5_list)
print(unique_values)


{'Transglycosylase-like domain', 'CAP-associated N-terminal', 'NlpC/P60 family', 'Immunoglobulin-like domain of bacterial spore germination', 'Amyloid A4 N-terminal heparin-binding', 'Transglycosylase SLT domain', 'Cell Wall Hydrolase', 'Lysozyme-like', 'SPOR domain', 'Bacterial Ig-like domain (group 2)', 'Zinc carboxypeptidase', 'SH3 domain (SH3b1 type)', 'FG-GAP repeat', 'LysM domain', 'Bacterial protein of unknown function (DUF882)', 'Fibronectin type III domain', 'S-layer homology domain', 'Putative peptidoglycan binding domain', 'Bacterial SH3 domain', 'Copper amine oxidase N-terminal domain', 'Cysteine-rich secretory protein family', 'SH3 domain of SH3b2 type', 'Beta-lactamase', 'Receptor family ligand binding region', 'Peptidase family M23', 'Bacterial Ig-like domain', 'Phage-related minor tail protein', 'Protein of unknown function (DUF5818)', 'Penicillin-insensitive murein endopeptidase', 'NLPC_P60 stabilising domain, N term', 'Bacterial dipeptidyl-peptidase Sh3 domain', 'alph

In [53]:
a = 'A0A4Y7RJ07'
foldseek_id = get_cluster([a], mmseqs_dict, foldseek_dict)[0]
print(foldseek_id)

A0A1C3RCD0


In [None]:
for idx, value in enumerate(IPS[0]):
    foldseek_id = get_cluster([value], mmseqs_dict, foldseek_dict)[0]
    IPS.at[idx, 0] = foldseek_id

display(IPS.head())

**Test the clustering on the PRJEB7774 Data**

In [2]:
PRJEB7774 = pd.read_feather('C:/Users/odesa/OneDrive - University of Toronto/CRC/LatestDataJan/PRJEB7774/clean_joined_genefamilies_relab_7774.feather')
print(PRJEB7774.shape)

(155, 80749)


In [3]:
clustered_7774 = cluster_humann_table('C:/Users/odesa/OneDrive - University of Toronto/CRC/LatestDataJan/PRJEB7774/clean_joined_genefamilies_relab_7774.feather',
                                      'C:/Users/odesa/Desktop/Code/CRC-Final/data/clustering/cluster_maps/combined_clusters.tsv')

print(clustered_7774.shape)

369 DL-endopeptidase found
369 DL-endopeptidase found, 9 DL-endopeptidase unclustered
4154 LD-carboxypeptidase found


KeyboardInterrupt: 

In [10]:
clustered_7774.to_csv('C:\\Users\\odesa\\Desktop\\Code\\CRC-Final\\data\\clustering\\humann_clustered\\clustered_complete_PRJEB7774.tsv', sep='\t', index=False)

**Clustering PRJEB10878**

In [3]:
PRJEB10878 = pd.read_feather('C:/Users/odesa/OneDrive - University of Toronto/CRC/LatestDataJan/PRJEB10878/clean_joined_genefamilies_relab_10878.feather')
print(PRJEB10878.shape)

(127, 76121)


In [4]:
clustered_10878 = cluster_humann_table('C:/Users/odesa/OneDrive - University of Toronto/CRC/LatestDataJan/PRJEB10878/clean_joined_genefamilies_relab_10878.feather',
                                      'C:/Users/odesa/Desktop/Code/CRC-Final/data/clustering/cluster_maps/combined_clusters.tsv')

print(clustered_10878.shape)

329 DL-endopeptidase found
329 DL-endopeptidase found, 8 DL-endopeptidase unclustered
3935 LD-carboxypeptidase found
3935 LD-carboxypeptidase found, 647 LD-carboxypeptidase unclustered
31 LD-endopeptidase found
31 LD-endopeptidase found, 31 LD-endopeptidase unclustered
2165 Glucosaminidase found
2165 Glucosaminidase found, 652 Glucosaminidase unclustered
11481 DD-carboxypeptidase found
11481 DD-carboxypeptidase found, 956 DD-carboxypeptidase unclustered
0 Diadenylate-cyclase found
0 Diadenylate-cyclase found, 0 Diadenylate-cyclase unclustered
20960 Amidase found
20960 Amidase found, 1172 Amidase unclustered
34465 Muramidase found
34465 Muramidase found, 2549 Muramidase unclustered
(127, 1379)


In [5]:
clustered_10878.to_csv('C:\\Users\\odesa\\Desktop\\Code\\CRC-Final\\data\\clustering\\humann_clustered\\clustered_complete_PRJEB10878.tsv', sep='\t', index=False)

**Clustering DRA008156**

In [None]:
DRA = pd.read_feather('~/Downloads/clean_joined_genefamilies_relab_DRA008156.feather')
print(DRA.shape)

In [2]:
# Test cluster_humann_table function

test = cluster_humann_table('~/Downloads/clean_joined_genefamilies_relab_DRA008156.feather',
                             '~/CRC-Final/data/clustering/cluster_maps/combined_clusters.tsv')

461 DL-endopeptidase found
461 DL-endopeptidase found, 11 DL-endopeptidase unclustered
4871 LD-carboxypeptidase found
4871 LD-carboxypeptidase found, 691 LD-carboxypeptidase unclustered
81 LD-endopeptidase found
81 LD-endopeptidase found, 81 LD-endopeptidase unclustered
3259 Glucosaminidase found
3259 Glucosaminidase found, 1107 Glucosaminidase unclustered
13996 DD-carboxypeptidase found
13996 DD-carboxypeptidase found, 1102 DD-carboxypeptidase unclustered
0 Diadenylate-cyclase found
0 Diadenylate-cyclase found, 0 Diadenylate-cyclase unclustered
26364 Amidase found
26364 Amidase found, 1631 Amidase unclustered
46941 Muramidase found
46941 Muramidase found, 4995 Muramidase unclustered


In [7]:
test.to_csv('~/CRC-Final/data/clustering/humann_clustered/clustered_complete_DRA008156.tsv', sep='\t', index=False)

**Clustering ICI trial PRJEB22893**

In [13]:
ici_22893 = pd.read_feather('E:/ICI/ici_humann/clean_joined_PRJEB22893_relab.feather')

print(ici_22893.shape)

(25, 42545)


In [14]:
ici_22893_clustered = cluster_humann_table('E:/ICI/ici_humann/clean_joined_PRJEB22893_relab.feather',
                                            '../../data/clustering/cluster_maps/combined_clusters.tsv')

249 DL-endopeptidase found
249 DL-endopeptidase found, 6 DL-endopeptidase unclustered
1504 LD-carboxypeptidase found
1504 LD-carboxypeptidase found, 119 LD-carboxypeptidase unclustered
26 LD-endopeptidase found
26 LD-endopeptidase found, 26 LD-endopeptidase unclustered
1266 Glucosaminidase found
1266 Glucosaminidase found, 285 Glucosaminidase unclustered
7003 DD-carboxypeptidase found
7003 DD-carboxypeptidase found, 383 DD-carboxypeptidase unclustered
0 Diadenylate-cyclase found
0 Diadenylate-cyclase found, 0 Diadenylate-cyclase unclustered
12719 Amidase found
12719 Amidase found, 597 Amidase unclustered
17803 Muramidase found
17803 Muramidase found, 1060 Muramidase unclustered


In [15]:
ici_22893_clustered.to_csv('E:/ICI/ici_humann/clustered_complete_PRJEB22893.tsv', sep='\t', index=False)

**Clustering ICI trial PRJNA399742**

In [6]:
ici_399742 = pd.read_feather('E:/ICI/ici_humann/clean_joined_PRJNA399742_relab.feather')

print(ici_399742.shape)

(39, 62621)


In [7]:
ici_399742_clustered = cluster_humann_table('E:/ICI/ici_humann/clean_joined_PRJNA399742_relab.feather',
                                            '../../data/clustering/cluster_maps/combined_clusters.tsv')

317 DL-endopeptidase found
317 DL-endopeptidase found, 8 DL-endopeptidase unclustered
3194 LD-carboxypeptidase found
3194 LD-carboxypeptidase found, 555 LD-carboxypeptidase unclustered
30 LD-endopeptidase found
30 LD-endopeptidase found, 30 LD-endopeptidase unclustered
2118 Glucosaminidase found
2118 Glucosaminidase found, 834 Glucosaminidase unclustered
10072 DD-carboxypeptidase found
10072 DD-carboxypeptidase found, 814 DD-carboxypeptidase unclustered
0 Diadenylate-cyclase found
0 Diadenylate-cyclase found, 0 Diadenylate-cyclase unclustered
18278 Amidase found
18278 Amidase found, 967 Amidase unclustered
26147 Muramidase found
26147 Muramidase found, 2266 Muramidase unclustered


In [None]:
ici_399742_clustered.to_csv('E:/ICI/ici_humann/clustered_complete_PRJEB399742.tsv', sep='\t', index=False)

**Clustering Bariatric Data**

In [5]:
bariatric = pd.read_feather('E:/bariatric/clean_joined_genefamilies_relab_bariatric.feather')

print(bariatric['sample_id'])

1      T_103
2      T_103
3      T_103
4      T_103
5      T_107
       ...  
109     T_79
110     T_79
112     T_94
113     T_94
114     T_94
Name: sample_id, Length: 110, dtype: object


In [None]:
# make the timepoint column for metadata

bariatric_ids = pd.DataFrame(bariatric['sample_id'])

bariatric_ids['TimePoint'] = bariatric_ids['sample_id'].apply(lambda x: x.split('_')[2])

print(bariatric_ids)

In [8]:
# Format IDs

bariatric_ids['sample_id'] = bariatric_ids['sample_id'].apply(lambda x: '_'.join(x.split('_')[:2]))

print(bariatric_ids)

    sample_id TimePoint
1       T_103        1M
2       T_103        6M
3       T_103        OR
4       T_103        BL
5       T_107        1M
..        ...       ...
109      T_79        BL
110      T_79        OR
112      T_94        1M
113      T_94        6M
114      T_94        BL

[110 rows x 2 columns]


In [None]:
# Fix stupid naming scheme...

bariatric['base_id'] = bariatric['sample_id'].apply(lambda x: '_'.join(x.split('_')[:-1]))

base_ids_with_211001 = bariatric[bariatric['sample_id'].str.endswith('211001')]['base_id'].unique()

# Mark _pooled entries for removal if their base_id is in the list identified above
bariatric['remove_flag'] = bariatric.apply(lambda row: row['base_id'] in base_ids_with_211001 and row['sample_id'].endswith('pooled'), axis=1)

# Filter out the rows marked for removal
bariatric = bariatric[~bariatric['remove_flag']].drop(columns=['base_id', 'remove_flag'])  # Drop the helper columns

bariatric.head(), bariatric.shape

In [51]:
bariatric.reset_index(drop=True, inplace=True)

In [None]:
bariatric.drop([0, 9, 59, 64, 111, 115, 116, 117], inplace=True)

In [54]:
bariatric['TimePoint'] = bariatric['sample_id'].apply(lambda x: '1M' if '1M' in x else ('6M' if '6M' in x else ('OR' if 'OR' in x else ('BL' if 'L' in x else ''))))

In [55]:
bariatric['sample_id'] = bariatric['sample_id'].apply(lambda x: '_'.join(x.split('_')[0].split('-')[:2]).strip('L'))

In [58]:
bariatric['sample_id'] = bariatric.apply(lambda row: row['sample_id'] + '_' + row['TimePoint'], axis=1)


In [60]:
bariatric.drop(columns=['TimePoint'], inplace=True)

In [None]:
bariatric['sample_id']

In [None]:
bariatric['sample_id'] = bariatric_ids['sample_id']

print(bariatric['sample_id'])

In [11]:
bariatric.to_feather('/media/oliver/PGH_Backup/bariatric/clean_joined_genefamilies_relab_bariatric.feather')
bariatric.to_csv('/media/oliver/PGH_Backup/bariatric/clean_joined_genefamilies_relab_bariatric.tsv', sep='\t', index=False)

In [12]:
make_outputs(bariatric_ids, '/media/oliver/PGH_Backup/bariatric/bariatric_metadata')

'/media/oliver/PGH_Backup/bariatric/bariatric_metadata'

In [6]:
bariatric_clustered = cluster_humann_table('E:/bariatric/clean_joined_genefamilies_relab_bariatric.feather',
                                            '../../data/clustering/cluster_maps/combined_clusters.tsv')

432 DL-endopeptidase found
432 DL-endopeptidase found, 8 DL-endopeptidase unclustered
3856 LD-carboxypeptidase found
3856 LD-carboxypeptidase found, 571 LD-carboxypeptidase unclustered
113 LD-endopeptidase found
113 LD-endopeptidase found, 113 LD-endopeptidase unclustered
1889 Glucosaminidase found
1889 Glucosaminidase found, 773 Glucosaminidase unclustered
10083 DD-carboxypeptidase found
10083 DD-carboxypeptidase found, 736 DD-carboxypeptidase unclustered
0 Diadenylate-cyclase found
0 Diadenylate-cyclase found, 0 Diadenylate-cyclase unclustered
17138 Amidase found
17138 Amidase found, 961 Amidase unclustered
29971 Muramidase found
29971 Muramidase found, 2806 Muramidase unclustered


In [7]:
print(bariatric_clustered.shape)
bariatric_clustered = bariatric_clustered.drop(columns=['unclustered'])
print(bariatric_clustered.shape)


(110, 1257)
(110, 1250)


In [8]:
make_outputs(bariatric_clustered, 'E:/bariatric/clustered_complete_bariatric')

'E:/bariatric/clustered_complete_bariatric'

In [None]:
grouped_bariatric = group_humann_table('E:/bariatric/clean_joined_genefamilies_relab_bariatric.feather')

print(grouped_bariatric.shape, grouped_bariatric.columns)

In [None]:
grouped_bariatric.rename(columns={'sample': 'sample_id'}, inplace=True)

print(grouped_bariatric.columns)


In [4]:
make_outputs(grouped_bariatric, 'E:/bariatric/grouped_bariatric')

'E:/bariatric/grouped_bariatric'

In [4]:
clustered_7774 = pd.read_csv('C:\\Users\\odesa\\Desktop\\Code\\CRC-Final\\data\\clustering\\humann_clustered\\clustered_complete_PRJEB7774.tsv', sep='\t')

print(clustered_7774.shape)

(155, 1462)


In [10]:
# Cluster large ICI trial

ici = pd.read_feather('~/Downloads/clean_joined_70966_43119.feather')

print(ici.shape)


(397, 99685)


In [7]:
clustered_ici = cluster_humann_table('~/Downloads/clean_joined_70966_43119.feather',
                                    '../../data/clustering/cluster_maps/combined_clusters.tsv')

397 DL-endopeptidase found
397 DL-endopeptidase found, 9 DL-endopeptidase unclustered
4751 LD-carboxypeptidase found
4751 LD-carboxypeptidase found, 668 LD-carboxypeptidase unclustered
45 LD-endopeptidase found
45 LD-endopeptidase found, 45 LD-endopeptidase unclustered
3008 Glucosaminidase found
3008 Glucosaminidase found, 997 Glucosaminidase unclustered
13941 DD-carboxypeptidase found
13941 DD-carboxypeptidase found, 1323 DD-carboxypeptidase unclustered
0 Diadenylate-cyclase found
0 Diadenylate-cyclase found, 0 Diadenylate-cyclase unclustered
25442 Amidase found
25442 Amidase found, 1573 Amidase unclustered
45648 Muramidase found
45648 Muramidase found, 5524 Muramidase unclustered


In [8]:
print(clustered_ici.shape)

(397, 1638)


In [9]:
clustered_ici.to_csv('/Users/odesa/OneDrive - University of Toronto/LabWork/ICI/LatestData/clustered_complete_70966_43119.tsv', sep='\t', index=False)

In [11]:
grouped_ici = group_humann_table('~/Downloads/clean_joined_70966_43119.feather')

  grouped_df = humann_df.groupby(humann_df.columns, axis=1).sum()


Original width: 99685, Grouped width: 13


In [1]:
display(grouped_ici.head())

NameError: name 'grouped_ici' is not defined

In [14]:
grouped_ici.to_csv('/Users/odesa/OneDrive - University of Toronto/LabWork/ICI/LatestData/grouped_70966_43119.tsv', sep='\t', index=False)

**iHMP2 Data, IBD**

In [None]:
ibd = pd.read_csv('E:/ibd_data/humann_second_run/ibd_genefamilies_relab_p2.tsv', sep='\t')

display(ibd.head())

In [6]:
clean_ibd = clean_table('E:/ibd_data/humann_second_run/ibd_genefamilies_relab_p2.tsv')

display(clean_ibd.head())

# Gene Family,sample_id,UNMAPPED,Amidase_UniRef100_A0A010ZI67,Amidase_UniRef100_A0A015SN82,Amidase_UniRef100_A0A015SSH9,Amidase_UniRef100_A0A015SVI2,Amidase_UniRef100_A0A015T2F2,Amidase_UniRef100_A0A015T8Q8,Amidase_UniRef100_A0A015TT75,Amidase_UniRef100_A0A015UH29,...,Muramidase_UniRef100_W9B3G2,Muramidase_UniRef100_W9B6S0,Muramidase_UniRef100_W9BES9,Muramidase_UniRef100_W9BN36,Muramidase_UniRef100_X5NUJ9,Muramidase_UniRef100_X5NVJ2,Muramidase_UniRef100_X5NWD2,Muramidase_UniRef100_X7RW21,Muramidase_UniRef100_X8GUH2,Muramidase_UniRef100_X8ISV2
0,CSM5FZ4M,0.994914,0.0,0.0,0.0,0.0,0.0,1e-06,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,CSM5MCUO,0.995403,0.0,0.0,0.0,0.0,0.0,1e-06,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,CSM5MCVL,0.994398,0.0,1e-06,0.0,0.0,1e-06,2e-06,0.0,1e-06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,CSM5MCVN,0.99473,0.0,1e-06,0.0,0.0,1e-06,1e-06,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,CSM5MCW6,0.99539,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
make_outputs(clean_ibd, 'E:/ibd_data/humann_second_run/ibd_genefamiles_relab_cleaned')

'E:/ibd_data/humann_second_run/ibd_genefamiles_relab_cleaned'

In [4]:
ibd = pd.read_csv('E:/ibd_data/humann_second_run/ibd_genefamiles_relab_cleaned.tsv', sep='\t')

display(ibd.head())

Unnamed: 0.1,Unnamed: 0,sample_id,UNMAPPED,Amidase_UniRef100_A0A010ZI67,Amidase_UniRef100_A0A015SN82,Amidase_UniRef100_A0A015SSH9,Amidase_UniRef100_A0A015SVI2,Amidase_UniRef100_A0A015T2F2,Amidase_UniRef100_A0A015T8Q8,Amidase_UniRef100_A0A015TT75,...,Muramidase_UniRef100_W9B3G2,Muramidase_UniRef100_W9B6S0,Muramidase_UniRef100_W9BES9,Muramidase_UniRef100_W9BN36,Muramidase_UniRef100_X5NUJ9,Muramidase_UniRef100_X5NVJ2,Muramidase_UniRef100_X5NWD2,Muramidase_UniRef100_X7RW21,Muramidase_UniRef100_X8GUH2,Muramidase_UniRef100_X8ISV2
0,0,CSM5FZ4M,0.994914,0.0,3.82633e-07,0.0,4.06192e-07,4.02904e-07,1.17311e-06,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,CSM5MCUO,0.995403,0.0,2.07056e-07,2.20017e-08,1.52552e-07,2.15811e-07,5.36638e-07,0.0,...,2.73077e-08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,CSM5MCVL,0.994398,0.0,9.14115e-07,4.28739e-07,0.0,9.96035e-07,1.57402e-06,4.73219e-07,...,0.0,0.0,0.0,7.89048e-09,0.0,0.0,0.0,0.0,0.0,0.0
3,3,CSM5MCVN,0.99473,0.0,7.5434e-07,2.4813e-07,0.0,7.99161e-07,1.4534e-06,3.3317e-07,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,CSM5MCW6,0.99539,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
clustered_ibd = cluster_humann_parallel('E:/ibd_data/humann_second_run/ibd_genefamiles_relab_cleaned.feather',
                                    '../../data/clustering/cluster_maps/combined_clusters.tsv')

323 DL-endopeptidase found
323 DL-endopeptidase found, 8 DL-endopeptidase unclustered
3190 LD-carboxypeptidase found
3190 LD-carboxypeptidase found, 8 LD-carboxypeptidase unclustered
32 LD-endopeptidase found
32 LD-endopeptidase found, 32 LD-endopeptidase unclustered
2037 Glucosaminidase found
2037 Glucosaminidase found, 52 Glucosaminidase unclustered
11066 DD-carboxypeptidase found
11066 DD-carboxypeptidase found, 70 DD-carboxypeptidase unclustered
0 Diadenylate-cyclase found
0 Diadenylate-cyclase found, 0 Diadenylate-cyclase unclustered
22232 Amidase found
22232 Amidase found, 167 Amidase unclustered
39750 Muramidase found
39750 Muramidase found, 234 Muramidase unclustered


In [4]:
clustered_ibd_old = cluster_humann_table('E:/ibd_data/humann_second_run/ibd_genefamiles_relab_cleaned.feather',
                                    '../../data/clustering/cluster_maps/combined_clusters.tsv')

323 DL-endopeptidase found
323 DL-endopeptidase found, 8 DL-endopeptidase unclustered
3190 LD-carboxypeptidase found
3190 LD-carboxypeptidase found, 386 LD-carboxypeptidase unclustered
32 LD-endopeptidase found
32 LD-endopeptidase found, 32 LD-endopeptidase unclustered
2037 Glucosaminidase found
2037 Glucosaminidase found, 506 Glucosaminidase unclustered
11066 DD-carboxypeptidase found
11066 DD-carboxypeptidase found, 906 DD-carboxypeptidase unclustered
0 Diadenylate-cyclase found
0 Diadenylate-cyclase found, 0 Diadenylate-cyclase unclustered
22232 Amidase found
22232 Amidase found, 1284 Amidase unclustered
39750 Muramidase found
39750 Muramidase found, 3468 Muramidase unclustered


In [7]:
def cluster_humann_table_p_test(humann_feather, cluster_tsv):
    """Cluster the humann table for each of the PGH enzymes
       input the raw humann df, clustering dataframes, and
       output the clustered humann df"""
    
    # read in the humann table
    humann_df = pd.read_feather(humann_feather)

    # read in the clustering dataframes
    cluster_df = pd.read_csv(cluster_tsv, sep='\t', low_memory=False)

    # list of enzymes
    enzymes = ['DL-endopeptidase', 'LD-carboxypeptidase', 
               'LD-endopeptidase', 'Glucosaminidase',
               'DD-carboxypeptidase', 'Diadenylate-cyclase',
               'Amidase', 'Muramidase']

    clustered_dfs = []
    for enzyme in enzymes:
        # Filter columns for the enzyme
        enzyme_columns = [col for col in humann_df.columns if col.startswith(enzyme)]
        df = humann_df[enzyme_columns]

        print(f'{len(enzyme_columns)} {enzyme} found')

        # Extract UniRef IDs
        column_ids = [col.split('_')[2] for col in enzyme_columns]
        
        # Create a mapping DataFrame for the enzyme
        cluster_col_unclustered = f"{enzyme.replace('-', '_').lower()}-unclustered"
        cluster_col_clustered = f"{enzyme.replace('-', '_').lower()}-foldseek_cluster"
        
        cluster_mapping = cluster_df.set_index(cluster_col_unclustered)[cluster_col_clustered]

        # Debugging: Print the first few rows of the cluster mapping
        print(f"Cluster Mapping for {enzyme}:\n", cluster_mapping.head())

        # Map the column_ids to clusters, fill "unclustered" where no match
        cluster_labels = [cluster_mapping.get(col_id, "unclustered") for col_id in column_ids]

        # Debugging: Check for unexpected cluster mappings
        unmatched_ids = [col_id for col_id, cluster in zip(column_ids, cluster_labels) if cluster == "unclustered"]
        print(f"{len(cluster_labels)} {enzyme} found, {cluster_labels.count('unclustered')} {enzyme} unclustered")
        print(f"Unmatched IDs for {enzyme}:\n", unmatched_ids[:10])  # Print first 10 unmatched for debug

        # Replace the column names with the foldseek cluster
        df.columns = cluster_labels

        # Aggregate the columns by foldseek cluster
        agg_df = df.T.groupby(df.columns).sum().T

        # Add the aggregated df to the list of clustered dfs
        clustered_dfs.append(agg_df)

    # Concatenate all the clustered dfs
    clustered_df = pd.concat(clustered_dfs, axis=1)

    # Add the sample id column back to the dataframe
    clustered_df['sample_id'] = humann_df['sample_id']

    return clustered_df

In [9]:
clustered_ibd_test = cluster_humann_table_p_test('E:/ibd_data/humann_second_run/ibd_genefamiles_relab_cleaned.feather',
                                    '../../data/clustering/cluster_maps/combined_clusters.tsv')

323 DL-endopeptidase found
Cluster Mapping for DL-endopeptidase:
 dl_endopeptidase-unclustered
A0A7S8CCP0    A0A7X2Z4H8
A0A928LPM8    A0A1C5Q6A7
A0A413G4S5    A0A1C5Q6A7
A0A926IJ38    A0A1C5Q6A7
A0A498CNA2    A0A1C5Q6A7
Name: dl_endopeptidase-foldseek_cluster, dtype: object
323 DL-endopeptidase found, 8 DL-endopeptidase unclustered
Unmatched IDs for DL-endopeptidase:
 ['A0A173Y9D9', 'A0A1C5QMY6', 'A0A1C5WT81', 'A0A1C6GSN7', 'A0A1V5U0D9', 'A0A422M907', 'R5KSG7', 'R5YP05']
3190 LD-carboxypeptidase found
Cluster Mapping for LD-carboxypeptidase:
 ld_carboxypeptidase-unclustered
A0A964YMG5           NaN
A0A969WWE3    A0A7X8WLE8
A0A7X8WLE8    A0A7X8WLE8
E0RLM3        A0A2E9X5T2
S5LZK0        A0A3E3E572
Name: ld_carboxypeptidase-foldseek_cluster, dtype: object
3190 LD-carboxypeptidase found, 8 LD-carboxypeptidase unclustered
Unmatched IDs for LD-carboxypeptidase:
 ['A0A1M7FYP6', 'A0A373P8V8', 'A0A3Q8SN33', 'A0A6N3DZJ8', 'A0A7U9RH74', 'F9LZ01', 'G9YG34', 'W1I3D0']
32 LD-endopeptidase found
Clu

In [5]:
make_outputs(clustered_ibd_old, 'E:/ibd_data/humann_second_run/clustered_ibd_relab')

ValueError: Duplicate column names found: ['DL-endopeptidase-A0A0B0HUJ2', 'DL-endopeptidase-A0A0P8W7Z2', 'DL-endopeptidase-A0A132I0H7', 'DL-endopeptidase-A0A174CGJ4', 'DL-endopeptidase-A0A174M4B0', 'DL-endopeptidase-A0A174TDS3', 'DL-endopeptidase-A0A1C5KMM8', 'DL-endopeptidase-A0A1C5KNH6', 'DL-endopeptidase-A0A1C5PVP3', 'DL-endopeptidase-A0A1C5Q6A7', 'DL-endopeptidase-A0A1C5VJV2', 'DL-endopeptidase-A0A1C5WD06', 'DL-endopeptidase-A0A1C6FAW9', 'DL-endopeptidase-A0A1C6G745', 'DL-endopeptidase-A0A1I0DL10', 'DL-endopeptidase-A0A1I3GQ51', 'DL-endopeptidase-A0A1V5YPB2', 'DL-endopeptidase-A0A1V6BVN9', 'DL-endopeptidase-A0A1Y6AQ80', 'DL-endopeptidase-A0A259UIB8', 'DL-endopeptidase-A0A259UPD7', 'DL-endopeptidase-A0A2W7MNT1', 'DL-endopeptidase-A0A348AMN4', 'DL-endopeptidase-A0A4R1N543', 'DL-endopeptidase-A0A4R7RUE4', 'DL-endopeptidase-A0A518C016', 'DL-endopeptidase-A0A564W7U7', 'DL-endopeptidase-A0A6N2TU31', 'DL-endopeptidase-A0A6N3BHG0', 'DL-endopeptidase-A0A6N3EZH1', 'DL-endopeptidase-A0A6N3I5I0', 'DL-endopeptidase-A0A7U9SHV5', 'DL-endopeptidase-A0A829ZMP2', 'DL-endopeptidase-A0A8B4YK50', 'DL-endopeptidase-C0D0G2', 'DL-endopeptidase-R5ATJ0', 'DL-endopeptidase-R5N6Z6', 'DL-endopeptidase-R6MIH8', 'DL-endopeptidase-R6YTV6', 'DL-endopeptidase-R7C958', 'DL-endopeptidase-R7HVK1', 'unclustered', 'LD-carboxypeptidase-A0A0D1L643', 'LD-carboxypeptidase-A0A0G1MST7', 'LD-carboxypeptidase-A0A0J1ISN8', 'LD-carboxypeptidase-A0A108T442', 'LD-carboxypeptidase-A0A147KAH7', 'LD-carboxypeptidase-A0A1B8WKW2', 'LD-carboxypeptidase-A0A1E3AR66', 'LD-carboxypeptidase-A0A1G0GV85', 'LD-carboxypeptidase-A0A1I0QSS3', 'LD-carboxypeptidase-A0A1I2ZTD4', 'LD-carboxypeptidase-A0A1V5Q4F1', 'LD-carboxypeptidase-A0A1V5X6T1', 'LD-carboxypeptidase-A0A1W6KC51', 'LD-carboxypeptidase-A0A2C6E889', 'LD-carboxypeptidase-A0A2E9X5T2', 'LD-carboxypeptidase-A0A2V2CDK1', 'LD-carboxypeptidase-A0A2V9NRA2', 'LD-carboxypeptidase-A0A317QD51', 'LD-carboxypeptidase-A0A380P0X3', 'LD-carboxypeptidase-A0A380XUM0', 'LD-carboxypeptidase-A0A3D3KCN0', 'LD-carboxypeptidase-A0A3E3E572', 'LD-carboxypeptidase-A0A523Q9X9', 'LD-carboxypeptidase-A0A543A0R1', 'LD-carboxypeptidase-A0A5C1Q7L6', 'LD-carboxypeptidase-A0A5K1I9X5', 'LD-carboxypeptidase-A0A660MZL1', 'LD-carboxypeptidase-A0A6A0IMM5', 'LD-carboxypeptidase-A0A6I1MJ35', 'LD-carboxypeptidase-A0A7D5MQW5', 'LD-carboxypeptidase-A0A7W1YUM5', 'LD-carboxypeptidase-A0A7X6EPJ9', 'LD-carboxypeptidase-A0A7X7CCE1', 'LD-carboxypeptidase-A0A7X7FE45', 'LD-carboxypeptidase-A0A7X7QSS1', 'LD-carboxypeptidase-A0A7X8WLE8', 'LD-carboxypeptidase-A0A811IE54', 'LD-carboxypeptidase-A0A828RRP0', 'LD-carboxypeptidase-A0A844GDF8', 'LD-carboxypeptidase-C9N182', 'LD-carboxypeptidase-R5H9T8', 'unclustered', 'unclustered', 'Glucosaminidase-A0A031IDF1', 'Glucosaminidase-A0A069RIP0', 'Glucosaminidase-A0A0B4WBQ7', 'Glucosaminidase-A0A0G1FTE9', 'Glucosaminidase-A0A0M0VNU9', 'Glucosaminidase-A0A0S2DKP3', 'Glucosaminidase-A0A173TGS3', 'Glucosaminidase-A0A174D5Y6', 'Glucosaminidase-A0A174DAB8', 'Glucosaminidase-A0A174F7D0', 'Glucosaminidase-A0A174IZW4', 'Glucosaminidase-A0A174LS49', 'Glucosaminidase-A0A174P1K1', 'Glucosaminidase-A0A1C5L9S3', 'Glucosaminidase-A0A1C5QA98', 'Glucosaminidase-A0A1C5VT36', 'Glucosaminidase-A0A1C5XSX6', 'Glucosaminidase-A0A1C5YF09', 'Glucosaminidase-A0A1C5Z9X0', 'Glucosaminidase-A0A1C6D969', 'Glucosaminidase-A0A1C6H8R9', 'Glucosaminidase-A0A1D4PPZ9', 'Glucosaminidase-A0A1H0T8H2', 'Glucosaminidase-A0A1H2Z0U0', 'Glucosaminidase-A0A1H9CDL0', 'Glucosaminidase-A0A1I1ZQ20', 'Glucosaminidase-A0A1I2NKJ3', 'Glucosaminidase-A0A1I3K6S5', 'Glucosaminidase-A0A1K1P5S6', 'Glucosaminidase-A0A1S8NJE1', 'Glucosaminidase-A0A1T4VMW3', 'Glucosaminidase-A0A1T5BVD3', 'Glucosaminidase-A0A1V4IKH7', 'Glucosaminidase-A0A1V4SU15', 'Glucosaminidase-A0A1Y3WIU4', 'Glucosaminidase-A0A242ASC8', 'Glucosaminidase-A0A242BET2', 'Glucosaminidase-A0A255U393', 'Glucosaminidase-A0A286TBS1', 'Glucosaminidase-A0A2E9R175', 'Glucosaminidase-A0A2P2BVU5', 'Glucosaminidase-A0A2S0UAV6', 'Glucosaminidase-A0A2T3QJ39', 'Glucosaminidase-A0A2T6K1D7', 'Glucosaminidase-A0A2U2EEE4', 'Glucosaminidase-A0A2V3VWI7', 'Glucosaminidase-A0A2Z4IUI6', 'Glucosaminidase-A0A2Z5VGM9', 'Glucosaminidase-A0A318KKH9', 'Glucosaminidase-A0A318MMB0', 'Glucosaminidase-A0A351ADY8', 'Glucosaminidase-A0A376TZ39', 'Glucosaminidase-A0A377E257', 'Glucosaminidase-A0A377FWM0', 'Glucosaminidase-A0A377KMG8', 'Glucosaminidase-A0A380LML7', 'Glucosaminidase-A0A3R9QFF2', 'Glucosaminidase-A0A3S4IAF0', 'Glucosaminidase-A0A485AMI6', 'Glucosaminidase-A0A4R0UIH4', 'Glucosaminidase-A0A4R0V501', 'Glucosaminidase-A0A4R1L6A7', 'Glucosaminidase-A0A4R2LZN7', 'Glucosaminidase-A0A4R3J4M3', 'Glucosaminidase-A0A4R3KCG4', 'Glucosaminidase-A0A4U9D433', 'Glucosaminidase-A0A4V2SL39', 'Glucosaminidase-A0A4Y3VQ71', 'Glucosaminidase-A0A5P3IEM7', 'Glucosaminidase-A0A650MU22', 'Glucosaminidase-A0A6G7ZBN2', 'Glucosaminidase-A0A6N2RZA5', 'Glucosaminidase-A0A7C1VDL4', 'Glucosaminidase-A0A7U6QWL4', 'Glucosaminidase-A0A809N6U3', 'Glucosaminidase-A0A822LKY6', 'Glucosaminidase-A0A855GEP2', 'Glucosaminidase-A1IHD1', 'Glucosaminidase-A4ITC8', 'Glucosaminidase-A6ERN7', 'Glucosaminidase-B0M9G9', 'Glucosaminidase-B6G019', 'Glucosaminidase-C2EH32', 'Glucosaminidase-C5EPD4', 'Glucosaminidase-C6LIN5', 'Glucosaminidase-C9B8X0', 'Glucosaminidase-D7J6K9', 'Glucosaminidase-E4KQG4', 'Glucosaminidase-E5VFB3', 'Glucosaminidase-G5K3P9', 'Glucosaminidase-G5KCX5', 'Glucosaminidase-H0U4F0', 'Glucosaminidase-J7CVT0', 'Glucosaminidase-M7MUC1', 'Glucosaminidase-R5U3B0', 'Glucosaminidase-R5UHX4', 'Glucosaminidase-R5XUB8', 'Glucosaminidase-R6C641', 'Glucosaminidase-R6D0B5', 'Glucosaminidase-R6JF01', 'Glucosaminidase-R6MBQ9', 'Glucosaminidase-R6RW29', 'Glucosaminidase-R6VPZ4', 'Glucosaminidase-R7I4L6', 'Glucosaminidase-S4B8W7', 'Glucosaminidase-S6EJE2', 'Glucosaminidase-T1WGN1', 'Glucosaminidase-T2RE69', 'Glucosaminidase-T3DBU4', 'Glucosaminidase-U2P4K3', 'Glucosaminidase-U2T162', 'Glucosaminidase-U6RX90', 'Glucosaminidase-U7ULZ9', 'Glucosaminidase-W4P4D8', 'Glucosaminidase-W4TPE5', 'Glucosaminidase-W4TQN4', 'unclustered', 'DD-carboxypeptidase-A0A090R9T9', 'DD-carboxypeptidase-A0A091C383', 'DD-carboxypeptidase-A0A0S2PEW3', 'DD-carboxypeptidase-A0A0T9Q4J1', 'DD-carboxypeptidase-A0A136MDS1', 'DD-carboxypeptidase-A0A139PQX0', 'DD-carboxypeptidase-A0A139RKZ9', 'DD-carboxypeptidase-A0A159Z0V5', 'DD-carboxypeptidase-A0A173SXF8', 'DD-carboxypeptidase-A0A174LTW7', 'DD-carboxypeptidase-A0A174LYR8', 'DD-carboxypeptidase-A0A174V5H5', 'DD-carboxypeptidase-A0A1A9C453', 'DD-carboxypeptidase-A0A1C0VFZ5', 'DD-carboxypeptidase-A0A1C3J0Z3', 'DD-carboxypeptidase-A0A1C4CKS7', 'DD-carboxypeptidase-A0A1C5LH51', 'DD-carboxypeptidase-A0A1C5ZJ20', 'DD-carboxypeptidase-A0A1C6H3C2', 'DD-carboxypeptidase-A0A1C6KKW6', 'DD-carboxypeptidase-A0A1E3W6P6', 'DD-carboxypeptidase-A0A1G2IYF1', 'DD-carboxypeptidase-A0A1G6RCT9', 'DD-carboxypeptidase-A0A1G6RER6', 'DD-carboxypeptidase-A0A1G7N0U6', 'DD-carboxypeptidase-A0A1G8EUZ3', 'DD-carboxypeptidase-A0A1H4KCD0', 'DD-carboxypeptidase-A0A1I1LI54', 'DD-carboxypeptidase-A0A1I1MXD1', 'DD-carboxypeptidase-A0A1I6ZPS0', 'DD-carboxypeptidase-A0A1M5MRY1', 'DD-carboxypeptidase-A0A1M7GAP3', 'DD-carboxypeptidase-A0A1N6LUX6', 'DD-carboxypeptidase-A0A1Q2CHA6', 'DD-carboxypeptidase-A0A1Q6LGJ1', 'DD-carboxypeptidase-A0A1Q6QQ59', 'DD-carboxypeptidase-A0A1Q9JVE2', 'DD-carboxypeptidase-A0A1R1SMN3', 'DD-carboxypeptidase-A0A1S8L9Y7', 'DD-carboxypeptidase-A0A1V5DYL7', 'DD-carboxypeptidase-A0A1V5P587', 'DD-carboxypeptidase-A0A1V5SBF9', 'DD-carboxypeptidase-A0A1V5X3P0', 'DD-carboxypeptidase-A0A1V9VC09', 'DD-carboxypeptidase-A0A1W1V6H8', 'DD-carboxypeptidase-A0A1W9KMP0', 'DD-carboxypeptidase-A0A1Y4MJD3', 'DD-carboxypeptidase-A0A249LE24', 'DD-carboxypeptidase-A0A257S370', 'DD-carboxypeptidase-A0A272EWR4', 'DD-carboxypeptidase-A0A2A7IKU2', 'DD-carboxypeptidase-A0A2A9ESX1', 'DD-carboxypeptidase-A0A2G6MBS6', 'DD-carboxypeptidase-A0A2N3Y022', 'DD-carboxypeptidase-A0A2S6SYE8', 'DD-carboxypeptidase-A0A2S8KF42', 'DD-carboxypeptidase-A0A2U3MQN8', 'DD-carboxypeptidase-A0A2V9T716', 'DD-carboxypeptidase-A0A2W4PXU4', 'DD-carboxypeptidase-A0A2X3U5A0', 'DD-carboxypeptidase-A0A2X3U8F1', 'DD-carboxypeptidase-A0A316PCN6', 'DD-carboxypeptidase-A0A349Q945', 'DD-carboxypeptidase-A0A366FRF3', 'DD-carboxypeptidase-A0A369P8F3', 'DD-carboxypeptidase-A0A374BHZ0', 'DD-carboxypeptidase-A0A374NV70', 'DD-carboxypeptidase-A0A376KXG0', 'DD-carboxypeptidase-A0A376RLE8', 'DD-carboxypeptidase-A0A377DMK2', 'DD-carboxypeptidase-A0A377W4R6', 'DD-carboxypeptidase-A0A377XNE3', 'DD-carboxypeptidase-A0A378AHG2', 'DD-carboxypeptidase-A0A378EEC0', 'DD-carboxypeptidase-A0A378G239', 'DD-carboxypeptidase-A0A379BD30', 'DD-carboxypeptidase-A0A379FEM7', 'DD-carboxypeptidase-A0A379GGW0', 'DD-carboxypeptidase-A0A379U1R6', 'DD-carboxypeptidase-A0A379WDN8', 'DD-carboxypeptidase-A0A380E108', 'DD-carboxypeptidase-A0A380K364', 'DD-carboxypeptidase-A0A3A9G105', 'DD-carboxypeptidase-A0A3B9JHN5', 'DD-carboxypeptidase-A0A3D5PDC1', 'DD-carboxypeptidase-A0A3D9L5Q8', 'DD-carboxypeptidase-A0A3E3E6W8', 'DD-carboxypeptidase-A0A3N0I4M4', 'DD-carboxypeptidase-A0A3P1WU16', 'DD-carboxypeptidase-A0A3P8KP37', 'DD-carboxypeptidase-A0A3Q9RPA1', 'DD-carboxypeptidase-A0A3R9H3A0', 'DD-carboxypeptidase-A0A3S4HG07', 'DD-carboxypeptidase-A0A3S4I3R5', 'DD-carboxypeptidase-A0A413H312', 'DD-carboxypeptidase-A0A414MAG5', 'DD-carboxypeptidase-A0A415DTS0', 'DD-carboxypeptidase-A0A416IU48', 'DD-carboxypeptidase-A0A416ZZD6', 'DD-carboxypeptidase-A0A447JKI8', 'DD-carboxypeptidase-A0A447RSB1', 'DD-carboxypeptidase-A0A447RSI8', 'DD-carboxypeptidase-A0A455TKA5', 'DD-carboxypeptidase-A0A484ZMW2', 'DD-carboxypeptidase-A0A484ZV02', 'DD-carboxypeptidase-A0A485AFX9', 'DD-carboxypeptidase-A0A485AHN2', 'DD-carboxypeptidase-A0A485B5S5', 'DD-carboxypeptidase-A0A485JCH6', 'DD-carboxypeptidase-A0A494T9E7', 'DD-carboxypeptidase-A0A4P0UDW6', 'DD-carboxypeptidase-A0A4P8IH93', 'DD-carboxypeptidase-A0A4Q3CKM9', 'DD-carboxypeptidase-A0A4R8HNK7', 'DD-carboxypeptidase-A0A4U9HH08', 'DD-carboxypeptidase-A0A4U9ITZ3', 'DD-carboxypeptidase-A0A4U9UH14', 'DD-carboxypeptidase-A0A4V2RHZ3', 'DD-carboxypeptidase-A0A509E7T9', 'DD-carboxypeptidase-A0A535ERD0', 'DD-carboxypeptidase-A0A538G3W9', 'DD-carboxypeptidase-A0A538U787', 'DD-carboxypeptidase-A0A542ZE06', 'DD-carboxypeptidase-A0A562ICL0', 'DD-carboxypeptidase-A0A564TVH4', 'DD-carboxypeptidase-A0A5B9YAV8', 'DD-carboxypeptidase-A0A5C5V2E1', 'DD-carboxypeptidase-A0A5J4E6H9', 'DD-carboxypeptidase-A0A660KWX8', 'DD-carboxypeptidase-A0A660SPQ0', 'DD-carboxypeptidase-A0A663BP17', 'DD-carboxypeptidase-A0A6H3TM56', 'DD-carboxypeptidase-A0A6J4P204', 'DD-carboxypeptidase-A0A6L9G7Z4', 'DD-carboxypeptidase-A0A6N2X415', 'DD-carboxypeptidase-A0A6N2Z4U7', 'DD-carboxypeptidase-A0A7C5DJG4', 'DD-carboxypeptidase-A0A7C6YP48', 'DD-carboxypeptidase-A0A7D5SBJ3', 'DD-carboxypeptidase-A0A7J9W7Q1', 'DD-carboxypeptidase-A0A7V1URB0', 'DD-carboxypeptidase-A0A7V2JNQ2', 'DD-carboxypeptidase-A0A7V5N9K8', 'DD-carboxypeptidase-A0A7V7T1L3', 'DD-carboxypeptidase-A0A7W0F5M9', 'DD-carboxypeptidase-A0A7W0QN99', 'DD-carboxypeptidase-A0A7W8H7C0', 'DD-carboxypeptidase-A0A7X3VGL9', 'DD-carboxypeptidase-A0A7X5DBR2', 'DD-carboxypeptidase-A0A7X8JS66', 'DD-carboxypeptidase-A0A7X9HWB6', 'DD-carboxypeptidase-A0A7Y6HIQ6', 'DD-carboxypeptidase-A0A7Z9I726', 'DD-carboxypeptidase-A0A826HES1', 'DD-carboxypeptidase-A0A839IQ00', 'DD-carboxypeptidase-A0A840P717', 'DD-carboxypeptidase-A0A844DZI0', 'DD-carboxypeptidase-A0A847KV57', 'DD-carboxypeptidase-A0A847PYN2', 'DD-carboxypeptidase-A0A850QT51', 'DD-carboxypeptidase-A0A850SRV6', 'DD-carboxypeptidase-A0A853IPX3', 'DD-carboxypeptidase-A0A8B5XJT8', 'DD-carboxypeptidase-B1XRX6', 'DD-carboxypeptidase-C2SUM1', 'DD-carboxypeptidase-D4MMG2', 'DD-carboxypeptidase-D6E7C8', 'DD-carboxypeptidase-D7GNX4', 'DD-carboxypeptidase-F9YA98', 'DD-carboxypeptidase-I4C2Y9', 'DD-carboxypeptidase-K6Q8H8', 'DD-carboxypeptidase-K9TH29', 'DD-carboxypeptidase-M1ULY0', 'DD-carboxypeptidase-M7NSZ5', 'DD-carboxypeptidase-Q59439', 'DD-carboxypeptidase-R5AND2', 'DD-carboxypeptidase-R5H8T8', 'DD-carboxypeptidase-R5PUW3', 'DD-carboxypeptidase-R5V930', 'DD-carboxypeptidase-R6TSS7', 'DD-carboxypeptidase-R6Z7X6', 'DD-carboxypeptidase-R7A918', 'DD-carboxypeptidase-R7FCR3', 'DD-carboxypeptidase-R7H1N8', 'DD-carboxypeptidase-S3H7H7', 'DD-carboxypeptidase-S7SW74', 'DD-carboxypeptidase-S9R355', 'DD-carboxypeptidase-T0CFX8', 'DD-carboxypeptidase-T0THQ0', 'DD-carboxypeptidase-T4VPV1', 'DD-carboxypeptidase-W1DKQ9', 'DD-carboxypeptidase-W4QY04', 'DD-carboxypeptidase-X0PBS9', 'DD-carboxypeptidase-X8FBX9', 'unclustered', 'Amidase-A0A015UH29', 'Amidase-A0A017NBB5', 'Amidase-A0A024GY02', 'Amidase-A0A062BTC2', 'Amidase-A0A072N4U2', 'Amidase-A0A072Y8E9', 'Amidase-A0A077FDY1', 'Amidase-A0A081BJ85', 'Amidase-A0A084JCH1', 'Amidase-A0A087D016', 'Amidase-A0A087DHD9', 'Amidase-A0A090Z2H6', 'Amidase-A0A098CXV0', 'Amidase-A0A0A7FY91', 'Amidase-A0A0B0HK70', 'Amidase-A0A0D1LZ26', 'Amidase-A0A0E1KSA8', 'Amidase-A0A0E8SZE3', 'Amidase-A0A0G0CCG0', 'Amidase-A0A0G0YHS7', 'Amidase-A0A0G1KR24', 'Amidase-A0A0G3A1E9', 'Amidase-A0A0J1FV79', 'Amidase-A0A0J1FXL9', 'Amidase-A0A0J1I5B6', 'Amidase-A0A0N0UWP8', 'Amidase-A0A0Q0UMY4', 'Amidase-A0A0Q9B0T8', 'Amidase-A0A0R1KHR0', 'Amidase-A0A0R1SRI3', 'Amidase-A0A0R2AB19', 'Amidase-A0A0R2DGX0', 'Amidase-A0A0R2H2Y4', 'Amidase-A0A0S6U251', 'Amidase-A0A0U5BA96', 'Amidase-A0A0V8DV25', 'Amidase-A0A109LBF6', 'Amidase-A0A125WBN3', 'Amidase-A0A127SWP4', 'Amidase-A0A133QC30', 'Amidase-A0A139MHB2', 'Amidase-A0A139RHN0', 'Amidase-A0A143WQS9', 'Amidase-A0A143YBH3', 'Amidase-A0A150KZN0', 'Amidase-A0A161YNG5', 'Amidase-A0A169PQX9', 'Amidase-A0A173XLG2', 'Amidase-A0A174HCM0', 'Amidase-A0A174V1B3', 'Amidase-A0A174XEP7', 'Amidase-A0A178IKK0', 'Amidase-A0A178TU57', 'Amidase-A0A193QGV4', 'Amidase-A0A193QHI5', 'Amidase-A0A1B9L178', 'Amidase-A0A1C5N059', 'Amidase-A0A1C6BH18', 'Amidase-A0A1C6ET88', 'Amidase-A0A1C6G934', 'Amidase-A0A1C6IGR4', 'Amidase-A0A1C6JPP0', 'Amidase-A0A1C6K4X4', 'Amidase-A0A1D3TP38', 'Amidase-A0A1E5GRD6', 'Amidase-A0A1G0H9T4', 'Amidase-A0A1G2F1W6', 'Amidase-A0A1G5XMQ8', 'Amidase-A0A1G9N1Y1', 'Amidase-A0A1G9R1B4', 'Amidase-A0A1H1YDW9', 'Amidase-A0A1H2TIS4', 'Amidase-A0A1H3QAV4', 'Amidase-A0A1H3SQU2', 'Amidase-A0A1H7HT27', 'Amidase-A0A1H8AKG4', 'Amidase-A0A1H8PCX1', 'Amidase-A0A1I0GGG7', 'Amidase-A0A1I1DKS8', 'Amidase-A0A1I7F2B8', 'Amidase-A0A1J0TQA8', 'Amidase-A0A1M4WFY9', 'Amidase-A0A1M6AI56', 'Amidase-A0A1M6H5K2', 'Amidase-A0A1Q3S3W8', 'Amidase-A0A1Q5PUK4', 'Amidase-A0A1Q6TM79', 'Amidase-A0A1Q8QGC9', 'Amidase-A0A1Q8QXM9', 'Amidase-A0A1R4GAF5', 'Amidase-A0A1S1GT51', 'Amidase-A0A1S8SVE6', 'Amidase-A0A1T5KGS0', 'Amidase-A0A1V2KL28', 'Amidase-A0A1V5IY25', 'Amidase-A0A1V5MWM8', 'Amidase-A0A1V5P9L7', 'Amidase-A0A1V6A5C0', 'Amidase-A0A1V6CBR5', 'Amidase-A0A1V9IRZ0', 'Amidase-A0A1W6MX80', 'Amidase-A0A1W7ABN8', 'Amidase-A0A1X7QVK9', 'Amidase-A0A1X9MHQ9', 'Amidase-A0A1Y2K9D2', 'Amidase-A0A1Y4EN81', 'Amidase-A0A220U202', 'Amidase-A0A233V5Y8', 'Amidase-A0A239IHU5', 'Amidase-A0A243GSF0', 'Amidase-A0A250IAJ5', 'Amidase-A0A257VFN3', 'Amidase-A0A261GNJ8', 'Amidase-A0A268TMA5', 'Amidase-A0A269XKT4', 'Amidase-A0A288GUM4', 'Amidase-A0A2A5SDN1', 'Amidase-A0A2D0BGD7', 'Amidase-A0A2D5EVF5', 'Amidase-A0A2D6BJD8', 'Amidase-A0A2E9I252', 'Amidase-A0A2H0QCB0', 'Amidase-A0A2H0YBH7', 'Amidase-A0A2I1PH08', 'Amidase-A0A2I3EU51', 'Amidase-A0A2I8AIV3', 'Amidase-A0A2K9P053', 'Amidase-A0A2M7A5W4', 'Amidase-A0A2N3I6X9', 'Amidase-A0A2N5GVE0', 'Amidase-A0A2N6UEJ0', 'Amidase-A0A2R8C4N2', 'Amidase-A0A2S5HEZ7', 'Amidase-A0A2S8JGS3', 'Amidase-A0A2T0BP98', 'Amidase-A0A2T0BSM7', 'Amidase-A0A2T4SEC3', 'Amidase-A0A2V3YDB3', 'Amidase-A0A2V8K4Z1', 'Amidase-A0A2V8S043', 'Amidase-A0A2V9WNN9', 'Amidase-A0A2W2GLL7', 'Amidase-A0A2W5XFD7', 'Amidase-A0A2X0QZY0', 'Amidase-A0A2X0RE73', 'Amidase-A0A2X1QJJ0', 'Amidase-A0A2X3CU47', 'Amidase-A0A2X4UUI0', 'Amidase-A0A2X4ZGU8', 'Amidase-A0A2X5CLD8', 'Amidase-A0A2Y9AJB5', 'Amidase-A0A315Y564', 'Amidase-A0A316E1D1', 'Amidase-A0A316SD96', 'Amidase-A0A318EIU5', 'Amidase-A0A327ZS46', 'Amidase-A0A330LGV3', 'Amidase-A0A347ZW42', 'Amidase-A0A348ZD10', 'Amidase-A0A349D4R6', 'Amidase-A0A349HAM4', 'Amidase-A0A352WFF5', 'Amidase-A0A367ZUP3', 'Amidase-A0A371S018', 'Amidase-A0A373D171', 'Amidase-A0A373D712', 'Amidase-A0A373D8A2', 'Amidase-A0A373IZC1', 'Amidase-A0A374AB68', 'Amidase-A0A374BQD9', 'Amidase-A0A374E5V1', 'Amidase-A0A374IQI0', 'Amidase-A0A374NA86', 'Amidase-A0A374VU40', 'Amidase-A0A376L2A3', 'Amidase-A0A376L3D5', 'Amidase-A0A376L9K0', 'Amidase-A0A376VGL9', 'Amidase-A0A376ZL45', 'Amidase-A0A377AEW0', 'Amidase-A0A377AFF2', 'Amidase-A0A377ARD5', 'Amidase-A0A377BU28', 'Amidase-A0A377CDU1', 'Amidase-A0A377FMP1', 'Amidase-A0A377G251', 'Amidase-A0A377K170', 'Amidase-A0A377K339', 'Amidase-A0A377PCJ9', 'Amidase-A0A377PEK5', 'Amidase-A0A377TYT2', 'Amidase-A0A378AI88', 'Amidase-A0A378AP80', 'Amidase-A0A378BE14', 'Amidase-A0A378CET5', 'Amidase-A0A379CEW0', 'Amidase-A0A379CF33', 'Amidase-A0A379WQ87', 'Amidase-A0A380EPW8', 'Amidase-A0A380FIS5', 'Amidase-A0A380JF82', 'Amidase-A0A380KPZ1', 'Amidase-A0A381KMG1', 'Amidase-A0A385NSM3', 'Amidase-A0A388PG48', 'Amidase-A0A395YJQ6', 'Amidase-A0A395YRS9', 'Amidase-A0A396PDF8', 'Amidase-A0A399IPU6', 'Amidase-A0A399IT86', 'Amidase-A0A3A8UJG0', 'Amidase-A0A3B0J7M9', 'Amidase-A0A3C1EJV6', 'Amidase-A0A3D3U8F8', 'Amidase-A0A3D4R4K2', 'Amidase-A0A3D6ABC2', 'Amidase-A0A3E0KKG9', 'Amidase-A0A3E4PZL9', 'Amidase-A0A3G1J182', 'Amidase-A0A3M1TWX2', 'Amidase-A0A3M2BRK7', 'Amidase-A0A3M3ELE5', 'Amidase-A0A3M5V7X2', 'Amidase-A0A3M6PPG7', 'Amidase-A0A3N2CW57', 'Amidase-A0A3N2K3H7', 'Amidase-A0A3N5B293', 'Amidase-A0A3P1V5J5', 'Amidase-A0A3R6L668', 'Amidase-A0A3R6RNK4', 'Amidase-A0A3R6T0U5', 'Amidase-A0A3R6U3R8', 'Amidase-A0A3R6UTV7', 'Amidase-A0A3R6V5M3', 'Amidase-A0A3R9GUJ3', 'Amidase-A0A3R9HRN6', 'Amidase-A0A3R9L190', 'Amidase-A0A3S4VPW1', 'Amidase-A0A412HLN3', 'Amidase-A0A413FTU5', 'Amidase-A0A413TDI8', 'Amidase-A0A414T839', 'Amidase-A0A415TCM1', 'Amidase-A0A416ERY2', 'Amidase-A0A416VUU7', 'Amidase-A0A416ZWG9', 'Amidase-A0A417AD96', 'Amidase-A0A417AYZ1', 'Amidase-A0A417DHI3', 'Amidase-A0A417SAG9', 'Amidase-A0A417VMT3', 'Amidase-A0A419GEF5', 'Amidase-A0A427ZV69', 'Amidase-A0A432L6Q0', 'Amidase-A0A446IA63', 'Amidase-A0A447JKD1', 'Amidase-A0A447NND8', 'Amidase-A0A447TW82', 'Amidase-A0A447TYH6', 'Amidase-A0A448A9B8', 'Amidase-A0A448QT76', 'Amidase-A0A455SSC1', 'Amidase-A0A455T3E6', 'Amidase-A0A485AN42', 'Amidase-A0A485CCP0', 'Amidase-A0A496N986', 'Amidase-A0A497AS56', 'Amidase-A0A4P8HAK8', 'Amidase-A0A4Q5YDB4', 'Amidase-A0A4Q7DNX7', 'Amidase-A0A4R2L1Y6', 'Amidase-A0A4R2N2N3', 'Amidase-A0A4U8Q727', 'Amidase-A0A4U9D676', 'Amidase-A0A4U9D6H0', 'Amidase-A0A4U9DAW4', 'Amidase-A0A4U9HI93', 'Amidase-A0A4U9HUU1', 'Amidase-A0A4U9IGU3', 'Amidase-A0A4V0ET45', 'Amidase-A0A4V2R1G3', 'Amidase-A0A4V3BEQ8', 'Amidase-A0A4V6JH64', 'Amidase-A0A4V6LWM3', 'Amidase-A0A4Y1ZF83', 'Amidase-A0A4Y8KZE2', 'Amidase-A0A4Y8X0Y5', 'Amidase-A0A510WEK9', 'Amidase-A0A521KP95', 'Amidase-A0A523M319', 'Amidase-A0A534TJ56', 'Amidase-A0A535HK62', 'Amidase-A0A535X4W3', 'Amidase-A0A536S610', 'Amidase-A0A538PWY1', 'Amidase-A0A554X977', 'Amidase-A0A558QQW8', 'Amidase-A0A559UAY2', 'Amidase-A0A562HLZ4', 'Amidase-A0A563BFR8', 'Amidase-A0A564SMW6', 'Amidase-A0A5B7XAL7', 'Amidase-A0A5B8TF84', 'Amidase-A0A5C6KTV7', 'Amidase-A0A5C7Q7Q4', 'Amidase-A0A5E7XWZ8', 'Amidase-A0A5F0K4Q7', 'Amidase-A0A5K1J6W4', 'Amidase-A0A5Q2TJ94', 'Amidase-A0A5S9PZB0', 'Amidase-A0A646HSM4', 'Amidase-A0A655EIP8', 'Amidase-A0A655RZV8', 'Amidase-A0A656A205', 'Amidase-A0A661IHI7', 'Amidase-A0A663BB50', 'Amidase-A0A6A7RYW8', 'Amidase-A0A6G1VJS5', 'Amidase-A0A6I1REW9', 'Amidase-A0A6I2U068', 'Amidase-A0A6J4HED9', 'Amidase-A0A6L3JRB4', 'Amidase-A0A6L4AY65', 'Amidase-A0A6L5H4M1', 'Amidase-A0A6L5TXU8', 'Amidase-A0A6L9EQE5', 'Amidase-A0A6N2RTI6', 'Amidase-A0A6N2U056', 'Amidase-A0A6N2UGB3', 'Amidase-A0A6N2UIN0', 'Amidase-A0A6N2ZGP1', 'Amidase-A0A6N2ZVH0', 'Amidase-A0A6N3ADZ5', 'Amidase-A0A6N3B9W7', 'Amidase-A0A6N3BUZ5', 'Amidase-A0A6N3CUE4', 'Amidase-A0A6N3FSR5', 'Amidase-A0A6N7BRA0', 'Amidase-A0A6N7UXE1', 'Amidase-A0A6N8HTQ1', 'Amidase-A0A6N9P4Y1', 'Amidase-A0A6V7RLH1', 'Amidase-A0A6V8LZG1', 'Amidase-A0A7C1P4V8', 'Amidase-A0A7C2P1V6', 'Amidase-A0A7C2UYR9', 'Amidase-A0A7C6SBI0', 'Amidase-A0A7C6UVS0', 'Amidase-A0A7C6ZNH3', 'Amidase-A0A7G3EQI1', 'Amidase-A0A7G9WG71', 'Amidase-A0A7K0ASC9', 'Amidase-A0A7K3KPA7', 'Amidase-A0A7R6PCR6', 'Amidase-A0A7U6H661', 'Amidase-A0A7U6KIQ4', 'Amidase-A0A7U9R3U1', 'Amidase-A0A7U9S8H5', 'Amidase-A0A7U9WR97', 'Amidase-A0A7U9X8I7', 'Amidase-A0A7V0IH32', 'Amidase-A0A7V2MRL5', 'Amidase-A0A7V8XZM4', 'Amidase-A0A7V9C1P4', 'Amidase-A0A7W0ENP4', 'Amidase-A0A7W1IHX0', 'Amidase-A0A7W1RE05', 'Amidase-A0A7W1W1M2', 'Amidase-A0A7W2SPU1', 'Amidase-A0A7W8CWH4', 'Amidase-A0A7W9SUW4', 'Amidase-A0A7X0S9L4', 'Amidase-A0A7X2P4C9', 'Amidase-A0A7X2Y2N6', 'Amidase-A0A7X3P1M2', 'Amidase-A0A7X5DHP9', 'Amidase-A0A7X6D7C2', 'Amidase-A0A7X6J9T2', 'Amidase-A0A7X7HDU7', 'Amidase-A0A7X8EV73', 'Amidase-A0A7X8G0P9', 'Amidase-A0A7X9S5S8', 'Amidase-A0A7Y1V3J3', 'Amidase-A0A7Y2DE71', 'Amidase-A0A7Y5CCF2', 'Amidase-A0A7Y5U384', 'Amidase-A0A7Z7QXY5', 'Amidase-A0A7Z9PKR5', 'Amidase-A0A7Z9XDS8', 'Amidase-A0A829A4C9', 'Amidase-A0A832QCY8', 'Amidase-A0A841LJQ4', 'Amidase-A0A841PWQ3', 'Amidase-A0A844E2U4', 'Amidase-A0A844JUQ0', 'Amidase-A0A844X885', 'Amidase-A0A847PGS9', 'Amidase-A0A848Y5N8', 'Amidase-A0A849E9X1', 'Amidase-A0A849YFD0', 'Amidase-A0A850HLQ7', 'Amidase-A0A851HC77', 'Amidase-A0A851IA66', 'Amidase-A0A854XCG6', 'Amidase-A0A8B3FKV7', 'Amidase-A0ALP4', 'Amidase-A1QZW1', 'Amidase-A4P0M2', 'Amidase-A5ZNJ9', 'Amidase-B0TEC1', 'Amidase-B6FYK5', 'Amidase-B9W4U8', 'Amidase-C9KNZ6', 'Amidase-C9KUE4', 'Amidase-C9RCM7', 'Amidase-D0WBL1', 'Amidase-F3MWW3', 'Amidase-F3UXY6', 'Amidase-G5NAG9', 'Amidase-G9ZXA6', 'Amidase-H0UGJ6', 'Amidase-I7J6A5', 'Amidase-N1UHZ3', 'Amidase-Q38Y95', 'Amidase-Q3EK55', 'Amidase-Q3EQP3', 'Amidase-Q897G3', 'Amidase-Q897P1', 'Amidase-Q93G55', 'Amidase-Q9I2Q1', 'Amidase-R1CGG8', 'Amidase-R5AHW3', 'Amidase-R5GLE2', 'Amidase-R5IRD9', 'Amidase-R5IZP6', 'Amidase-R5J0Q8', 'Amidase-R5JM70', 'Amidase-R5L864', 'Amidase-R5TJC0', 'Amidase-R5ZXG6', 'Amidase-R6H0K7', 'Amidase-R6MVY6', 'Amidase-R6PEQ6', 'Amidase-R6SHQ9', 'Amidase-R6TEU6', 'Amidase-R6XXB6', 'Amidase-R7FK28', 'Amidase-R7H9H9', 'Amidase-R7P2J0', 'Amidase-R9BXU6', 'Amidase-R9RCK7', 'Amidase-T0D5H5', 'Amidase-T0V5P5', 'Amidase-T1DVH4', 'Amidase-T2IUT6', 'Amidase-V6F1Z2', 'Amidase-W0FL63', 'Amidase-W1UEN2', 'Amidase-W1X2M4', 'Amidase-W4UXJ1', 'Amidase-W7D8H4', 'unclustered', 'Muramidase-A0A011N0Q1', 'Muramidase-A0A075R6B1', 'Muramidase-A0A078LJB7', 'Muramidase-A0A083ZYP5', 'Muramidase-A0A087B2V4', 'Muramidase-A0A090N7Q0', 'Muramidase-A0A090QL71', 'Muramidase-A0A0A8FMV8', 'Muramidase-A0A0A8X5Z1', 'Muramidase-A0A0C2S3R7', 'Muramidase-A0A0E2HEG0', 'Muramidase-A0A0E3WFK5', 'Muramidase-A0A0F4VIV9', 'Muramidase-A0A0F4VMI7', 'Muramidase-A0A0F5EYD9', 'Muramidase-A0A0F5I4G4', 'Muramidase-A0A0F5P9U3', 'Muramidase-A0A0G1YBW8', 'Muramidase-A0A0H2Z3S5', 'Muramidase-A0A0J5VE76', 'Muramidase-A0A0J7XYQ7', 'Muramidase-A0A0K4X6P2', 'Muramidase-A0A0K6H947', 'Muramidase-A0A0K9TA50', 'Muramidase-A0A0L1LLS7', 'Muramidase-A0A0L8F022', 'Muramidase-A0A0M2NIX8', 'Muramidase-A0A0N0D2Z2', 'Muramidase-A0A0N0V431', 'Muramidase-A0A0Q5DHU7', 'Muramidase-A0A0Q8QAC5', 'Muramidase-A0A0R1P8F1', 'Muramidase-A0A0R1UCA3', 'Muramidase-A0A0R2DK37', 'Muramidase-A0A0R2IBS7', 'Muramidase-A0A0U1QLU6', 'Muramidase-A0A0U3FW74', 'Muramidase-A0A0U5II31', 'Muramidase-A0A0V8BJX0', 'Muramidase-A0A0V8BZD9', 'Muramidase-A0A0W1KIA0', 'Muramidase-A0A128ET70', 'Muramidase-A0A136L233', 'Muramidase-A0A139MG29', 'Muramidase-A0A141RGP2', 'Muramidase-A0A142INH0', 'Muramidase-A0A150K5H8', 'Muramidase-A0A158J9I7', 'Muramidase-A0A165RMP4', 'Muramidase-A0A173XZ41', 'Muramidase-A0A173ZBW9', 'Muramidase-A0A174BGG6', 'Muramidase-A0A174D8S5', 'Muramidase-A0A174RK63', 'Muramidase-A0A174VG92', 'Muramidase-A0A174ZDU7', 'Muramidase-A0A177LSL6', 'Muramidase-A0A1A7KFR6', 'Muramidase-A0A1C2JG90', 'Muramidase-A0A1C3H387', 'Muramidase-A0A1C3SD71', 'Muramidase-A0A1C3WN92', 'Muramidase-A0A1C3Y8G2', 'Muramidase-A0A1C5L8D3', 'Muramidase-A0A1C5S4K0', 'Muramidase-A0A1C5XZR0', 'Muramidase-A0A1C6AE64', 'Muramidase-A0A1C6CM69', 'Muramidase-A0A1C6CUZ1', 'Muramidase-A0A1C6GJ66', 'Muramidase-A0A1C6JVK9', 'Muramidase-A0A1C9VN45', 'Muramidase-A0A1D2L7R0', 'Muramidase-A0A1D2S554', 'Muramidase-A0A1F3VIL9', 'Muramidase-A0A1F6ETM3', 'Muramidase-A0A1F9PN79', 'Muramidase-A0A1G3JRM4', 'Muramidase-A0A1G6CE93', 'Muramidase-A0A1G6CPZ6', 'Muramidase-A0A1G9CQ73', 'Muramidase-A0A1H0QLS2', 'Muramidase-A0A1H6LY20', 'Muramidase-A0A1H7Z7N1', 'Muramidase-A0A1H9CAE9', 'Muramidase-A0A1H9JMJ9', 'Muramidase-A0A1I3CT30', 'Muramidase-A0A1I5G9K1', 'Muramidase-A0A1I6KS43', 'Muramidase-A0A1I6PGW2', 'Muramidase-A0A1I9X0B9', 'Muramidase-A0A1L8TG29', 'Muramidase-A0A1M3PX98', 'Muramidase-A0A1M4M9Q8', 'Muramidase-A0A1M7LW81', 'Muramidase-A0A1Q6L3T8', 'Muramidase-A0A1Q6RAD2', 'Muramidase-A0A1Q6YS27', 'Muramidase-A0A1Q8YNE5', 'Muramidase-A0A1R4F0V8', 'Muramidase-A0A1S1NQL5', 'Muramidase-A0A1S8L485', 'Muramidase-A0A1T4Q9G8', 'Muramidase-A0A1U7GM02', 'Muramidase-A0A1V4IW64', 'Muramidase-A0A1V5G9Y1', 'Muramidase-A0A1V5GAE1', 'Muramidase-A0A1V5KV49', 'Muramidase-A0A1V5UF42', 'Muramidase-A0A1V5YU78', 'Muramidase-A0A1V5ZPA0', 'Muramidase-A0A1V6HZD5', 'Muramidase-A0A1V8PLK1', 'Muramidase-A0A1W6LMA8', 'Muramidase-A0A1X3A2L7', 'Muramidase-A0A1X3IA24', 'Muramidase-A0A1Y0BZ80', 'Muramidase-A0A1Y0FT80', 'Muramidase-A0A1Y4QK08', 'Muramidase-A0A1Y4QR82', 'Muramidase-A0A1Y6K8I0', 'Muramidase-A0A1Z2SJG6', 'Muramidase-A0A200L0V0', 'Muramidase-A0A220U840', 'Muramidase-A0A223P7U4', 'Muramidase-A0A226BWH4', 'Muramidase-A0A229I452', 'Muramidase-A0A246WM26', 'Muramidase-A0A255TB78', 'Muramidase-A0A257G7K2', 'Muramidase-A0A258HMI9', 'Muramidase-A0A259NLB3', 'Muramidase-A0A261F7G8', 'Muramidase-A0A261UHQ5', 'Muramidase-A0A2A2CB46', 'Muramidase-A0A2A2EFS3', 'Muramidase-A0A2A5S4W0', 'Muramidase-A0A2A7TB69', 'Muramidase-A0A2A9INK4', 'Muramidase-A0A2A9KB92', 'Muramidase-A0A2B9TMH9', 'Muramidase-A0A2D4QW59', 'Muramidase-A0A2D5F729', 'Muramidase-A0A2D7Y4X9', 'Muramidase-A0A2D9XMD3', 'Muramidase-A0A2E2QEV1', 'Muramidase-A0A2E5KWR7', 'Muramidase-A0A2E8RFU8', 'Muramidase-A0A2G6LD42', 'Muramidase-A0A2H5W9N1', 'Muramidase-A0A2J7THI1', 'Muramidase-A0A2K4ZGU8', 'Muramidase-A0A2M8A1B0', 'Muramidase-A0A2M9HSI6', 'Muramidase-A0A2M9ZCG7', 'Muramidase-A0A2N0TKC4', 'Muramidase-A0A2N0WIA4', 'Muramidase-A0A2N2ZW25', 'Muramidase-A0A2N5WBQ5', 'Muramidase-A0A2N7S326', 'Muramidase-A0A2N8I6H3', 'Muramidase-A0A2N8MCM5', 'Muramidase-A0A2P9E797', 'Muramidase-A0A2S0KP84', 'Muramidase-A0A2S3U6T6', 'Muramidase-A0A2S8X4A1', 'Muramidase-A0A2T1ARM9', 'Muramidase-A0A2T3F9E5', 'Muramidase-A0A2T3FWS9', 'Muramidase-A0A2T7YIN2', 'Muramidase-A0A2V1K5W8', 'Muramidase-A0A2V2CCV5', 'Muramidase-A0A2V2SRQ9', 'Muramidase-A0A2V5LFQ3', 'Muramidase-A0A2W7IUA4', 'Muramidase-A0A2X0QXT8', 'Muramidase-A0A2X0WBY3', 'Muramidase-A0A2X1B4M0', 'Muramidase-A0A2X1JGI3', 'Muramidase-A0A2X1QQB9', 'Muramidase-A0A2X2SX28', 'Muramidase-A0A2X2SZ50', 'Muramidase-A0A2X3CUA9', 'Muramidase-A0A2X3E7A0', 'Muramidase-A0A2X3GNH8', 'Muramidase-A0A2X3IGF9', 'Muramidase-A0A2X3IRA4', 'Muramidase-A0A2X3JWG6', 'Muramidase-A0A2X3KBH5', 'Muramidase-A0A2X3KDX5', 'Muramidase-A0A2X4TJ27', 'Muramidase-A0A2X4XL63', 'Muramidase-A0A2Z6B1U6', 'Muramidase-A0A2Z6PZV7', 'Muramidase-A0A317GVF6', 'Muramidase-A0A347ZWQ0', 'Muramidase-A0A349BY18', 'Muramidase-A0A350IAI4', 'Muramidase-A0A352CN15', 'Muramidase-A0A353NQ63', 'Muramidase-A0A353XW06', 'Muramidase-A0A354HIH9', 'Muramidase-A0A357Y4U8', 'Muramidase-A0A359LME5', 'Muramidase-A0A367ZCK7', 'Muramidase-A0A371WIE9', 'Muramidase-A0A373VFH9', 'Muramidase-A0A374NZH9', 'Muramidase-A0A376FGU0', 'Muramidase-A0A376FJT0', 'Muramidase-A0A376L982', 'Muramidase-A0A376LI87', 'Muramidase-A0A376SBK3', 'Muramidase-A0A376U6H5', 'Muramidase-A0A376UEZ2', 'Muramidase-A0A376UGA3', 'Muramidase-A0A376Y679', 'Muramidase-A0A377A418', 'Muramidase-A0A377AGP5', 'Muramidase-A0A377B8V0', 'Muramidase-A0A377BSU4', 'Muramidase-A0A377BXF4', 'Muramidase-A0A377C2B7', 'Muramidase-A0A377C4Q1', 'Muramidase-A0A377C8K4', 'Muramidase-A0A377D7B9', 'Muramidase-A0A377DNF6', 'Muramidase-A0A377E5S9', 'Muramidase-A0A377LFD4', 'Muramidase-A0A377NGP6', 'Muramidase-A0A377NGR8', 'Muramidase-A0A377NH98', 'Muramidase-A0A377PFR3', 'Muramidase-A0A377TNA6', 'Muramidase-A0A377U367', 'Muramidase-A0A377XAN5', 'Muramidase-A0A378AHN3', 'Muramidase-A0A378AYU6', 'Muramidase-A0A378C2N6', 'Muramidase-A0A378LWD9', 'Muramidase-A0A379AGG1', 'Muramidase-A0A379DX85', 'Muramidase-A0A379SB51', 'Muramidase-A0A379SBA0', 'Muramidase-A0A379VR70', 'Muramidase-A0A380L0B2', 'Muramidase-A0A381CJV6', 'Muramidase-A0A381G2F9', 'Muramidase-A0A381GHS2', 'Muramidase-A0A381H835', 'Muramidase-A0A3A1Y9X2', 'Muramidase-A0A3A9FAE2', 'Muramidase-A0A3B8HH68', 'Muramidase-A0A3B9HVM0', 'Muramidase-A0A3C0P180', 'Muramidase-A0A3D0FIP6', 'Muramidase-A0A3D0FJV7', 'Muramidase-A0A3D2W4D2', 'Muramidase-A0A3D4L433', 'Muramidase-A0A3D8PBP5', 'Muramidase-A0A3E2TRV3', 'Muramidase-A0A3G6YV78', 'Muramidase-A0A3L7E2P4', 'Muramidase-A0A3M4K7D9', 'Muramidase-A0A3N2M7Q4', 'Muramidase-A0A3N6ES69', 'Muramidase-A0A3P1VCJ6', 'Muramidase-A0A3P5XF32', 'Muramidase-A0A3P5XN18', 'Muramidase-A0A3Q9FY36', 'Muramidase-A0A3R6PBB4', 'Muramidase-A0A3S3QNA2', 'Muramidase-A0A3S4HZU0', 'Muramidase-A0A3S4KNE9', 'Muramidase-A0A3S4LSY7', 'Muramidase-A0A3S4PBG6', 'Muramidase-A0A3S4YFD5', 'Muramidase-A0A3S5DHT2', 'Muramidase-A0A3T0D2Y2', 'Muramidase-A0A3T2V1H4', 'Muramidase-A0A413UD28', 'Muramidase-A0A413Z4A2', 'Muramidase-A0A414RH06', 'Muramidase-A0A415K0P1', 'Muramidase-A0A415ZT48', 'Muramidase-A0A416S5N0', 'Muramidase-A0A417NBK2', 'Muramidase-A0A417T1Y9', 'Muramidase-A0A417WSJ1', 'Muramidase-A0A418PIP4', 'Muramidase-A0A419NC47', 'Muramidase-A0A433GEU9', 'Muramidase-A0A446ZIB5', 'Muramidase-A0A447JNW3', 'Muramidase-A0A447M3H3', 'Muramidase-A0A447M9P0', 'Muramidase-A0A447P8W4', 'Muramidase-A0A447PAN8', 'Muramidase-A0A447U1U4', 'Muramidase-A0A447WEF5', 'Muramidase-A0A448D759', 'Muramidase-A0A450SW72', 'Muramidase-A0A455VZ36', 'Muramidase-A0A455X1X1', 'Muramidase-A0A484HJJ3', 'Muramidase-A0A484YSY4', 'Muramidase-A0A484Z8J9', 'Muramidase-A0A484ZFN5', 'Muramidase-A0A485AD95', 'Muramidase-A0A485ANB7', 'Muramidase-A0A485ARF8', 'Muramidase-A0A485AS74', 'Muramidase-A0A485B2E3', 'Muramidase-A0A485B3B6', 'Muramidase-A0A485BBT2', 'Muramidase-A0A485BU55', 'Muramidase-A0A485BUM2', 'Muramidase-A0A485BWD7', 'Muramidase-A0A485C067', 'Muramidase-A0A485C8J4', 'Muramidase-A0A485ICV5', 'Muramidase-A0A486DDE4', 'Muramidase-A0A486S997', 'Muramidase-A0A495V7Y5', 'Muramidase-A0A4J1XYS2', 'Muramidase-A0A4P0TMI5', 'Muramidase-A0A4P6DQP2', 'Muramidase-A0A4P6X184', 'Muramidase-A0A4Q0IHD2', 'Muramidase-A0A4Q3M5B0', 'Muramidase-A0A4Q3MM19', 'Muramidase-A0A4R0FKA5', 'Muramidase-A0A4R0PCF0', 'Muramidase-A0A4R2BHD2', 'Muramidase-A0A4R2TDI3', 'Muramidase-A0A4R3JAE4', 'Muramidase-A0A4R3MZR7', 'Muramidase-A0A4R5N7B0', 'Muramidase-A0A4R6SHW0', 'Muramidase-A0A4R6XTX5', 'Muramidase-A0A4S2AN32', 'Muramidase-A0A4S4A5T5', 'Muramidase-A0A4T2ATD2', 'Muramidase-A0A4U8Q8V3', 'Muramidase-A0A4U9WHG4', 'Muramidase-A0A4Y9SMQ5', 'Muramidase-A0A4Z0DE45', 'Muramidase-A0A4Z0S334', 'Muramidase-A0A508W1P5', 'Muramidase-A0A509CW04', 'Muramidase-A0A509CZF1', 'Muramidase-A0A511YKT9', 'Muramidase-A0A512TQB7', 'Muramidase-A0A519BQL5', 'Muramidase-A0A520YPY2', 'Muramidase-A0A521STF0', 'Muramidase-A0A522FJC4', 'Muramidase-A0A522GMT4', 'Muramidase-A0A522MNZ5', 'Muramidase-A0A534PM21', 'Muramidase-A0A534QK47', 'Muramidase-A0A537JPG7', 'Muramidase-A0A537PTU6', 'Muramidase-A0A537YI38', 'Muramidase-A0A538IDP4', 'Muramidase-A0A554JJH7', 'Muramidase-A0A554KEP9', 'Muramidase-A0A556CN56', 'Muramidase-A0A556UTV3', 'Muramidase-A0A561DCD8', 'Muramidase-A0A563DX96', 'Muramidase-A0A564S8J2', 'Muramidase-A0A5C0DVP2', 'Muramidase-A0A5C1DFD6', 'Muramidase-A0A5D0TT65', 'Muramidase-A0A5E4WJ01', 'Muramidase-A0A5E4WL62', 'Muramidase-A0A5E6NVI4', 'Muramidase-A0A5J6PX28', 'Muramidase-A0A5N7MBX0', 'Muramidase-A0A5P2M9D4', 'Muramidase-A0A5Q0BIM9', 'Muramidase-A0A5R9B8G6', 'Muramidase-A0A605RST0', 'Muramidase-A0A628M6N4', 'Muramidase-A0A641BE80', 'Muramidase-A0A654C297', 'Muramidase-A0A656HH03', 'Muramidase-A0A660C797', 'Muramidase-A0A6A7VWJ7', 'Muramidase-A0A6A8VHR4', 'Muramidase-A0A6B8JBQ0', 'Muramidase-A0A6C0PWV3', 'Muramidase-A0A6C8GM04', 'Muramidase-A0A6H0UL77', 'Muramidase-A0A6H1W7P4', 'Muramidase-A0A6H2L001', 'Muramidase-A0A6I6GJB4', 'Muramidase-A0A6M0LFB0', 'Muramidase-A0A6N2THX7', 'Muramidase-A0A6N2U349', 'Muramidase-A0A6N2YZI8', 'Muramidase-A0A6N3E4Q6', 'Muramidase-A0A6P2YFT3', 'Muramidase-A0A6S5JMF8', 'Muramidase-A0A6S5XQK8', 'Muramidase-A0A6S6SHP9', 'Muramidase-A0A6V8MXP5', 'Muramidase-A0A7C1N3A2', 'Muramidase-A0A7C1PEX6', 'Muramidase-A0A7C1TGY1', 'Muramidase-A0A7C2WUQ3', 'Muramidase-A0A7C3H047', 'Muramidase-A0A7C4TYD7', 'Muramidase-A0A7C5GTC2', 'Muramidase-A0A7C5GTR5', 'Muramidase-A0A7C6SMU5', 'Muramidase-A0A7C8CYK6', 'Muramidase-A0A7D5LE34', 'Muramidase-A0A7G5NUR8', 'Muramidase-A0A7H4MMS4', 'Muramidase-A0A7H4MMS7', 'Muramidase-A0A7H4N042', 'Muramidase-A0A7H4NHR3', 'Muramidase-A0A7H4NSS4', 'Muramidase-A0A7H4P4T2', 'Muramidase-A0A7H4PBY9', 'Muramidase-A0A7H4S8X5', 'Muramidase-A0A7H4SHY1', 'Muramidase-A0A7H4YRR5', 'Muramidase-A0A7H9QWB7', 'Muramidase-A0A7I8ZS27', 'Muramidase-A0A7J5JD64', 'Muramidase-A0A7L7WKR6', 'Muramidase-A0A7T2JK76', 'Muramidase-A0A7T4G882', 'Muramidase-A0A7T5ES45', 'Muramidase-A0A7U3LVJ2', 'Muramidase-A0A7U9NT98', 'Muramidase-A0A7U9RWR4', 'Muramidase-A0A7V5EWS4', 'Muramidase-A0A7V7UCF9', 'Muramidase-A0A7V8A8R5', 'Muramidase-A0A7V8Z8E0', 'Muramidase-A0A7V9P2W4', 'Muramidase-A0A7W0MLZ8', 'Muramidase-A0A7W1C7J8', 'Muramidase-A0A7W4NQQ4', 'Muramidase-A0A7W4TYT8', 'Muramidase-A0A7W8SQF1', 'Muramidase-A0A7X2XUP7', 'Muramidase-A0A7X3DJV0', 'Muramidase-A0A7X5L589', 'Muramidase-A0A7X7J836', 'Muramidase-A0A7X7ZLW4', 'Muramidase-A0A7X9L6V6', 'Muramidase-A0A7Y0GBM1', 'Muramidase-A0A7Y2HUU2', 'Muramidase-A0A7Y6TTM5', 'Muramidase-A0A7Y8XNP5', 'Muramidase-A0A7Y9Z657', 'Muramidase-A0A7Z0MN56', 'Muramidase-A0A7Z8R0A6', 'Muramidase-A0A822LL96', 'Muramidase-A0A827KGV1', 'Muramidase-A0A829GN82', 'Muramidase-A0A829HJR8', 'Muramidase-A0A831EVP6', 'Muramidase-A0A832GZH1', 'Muramidase-A0A836ZFR9', 'Muramidase-A0A838QJX5', 'Muramidase-A0A838V9B1', 'Muramidase-A0A839WWJ1', 'Muramidase-A0A840RCT4', 'Muramidase-A0A841QL93', 'Muramidase-A0A843T9T0', 'Muramidase-A0A843YNC1', 'Muramidase-A0A845RNG7', 'Muramidase-A0A847JMP9', 'Muramidase-A0A847KRI9', 'Muramidase-A0A848J9C3', 'Muramidase-A0A849IE73', 'Muramidase-A0A850FBP6', 'Muramidase-A0A8B3UU51', 'Muramidase-A0A8B4MPE7', 'Muramidase-A0A8B4QC66', 'Muramidase-A0A8B4WQB3', 'Muramidase-A0A8B6KSL4', 'Muramidase-A0L421', 'Muramidase-A0LH35', 'Muramidase-A1SKG1', 'Muramidase-A4ENI4', 'Muramidase-A4N4N4', 'Muramidase-A6Q8G4', 'Muramidase-B0NER8', 'Muramidase-B2JF15', 'Muramidase-B2PZV4', 'Muramidase-B6WQQ4', 'Muramidase-B7GS44', 'Muramidase-B8IVK2', 'Muramidase-C0AVY3', 'Muramidase-C0BVQ8', 'Muramidase-C0BXW5', 'Muramidase-C3X361', 'Muramidase-C6XGM8', 'Muramidase-C9MTK8', 'Muramidase-D1AI81', 'Muramidase-D1AQE9', 'Muramidase-D2TW44', 'Muramidase-D3EZ02', 'Muramidase-D4BID4', 'Muramidase-D4CET6', 'Muramidase-D4CQ27', 'Muramidase-D8NA15', 'Muramidase-E0WSU1', 'Muramidase-E0XYZ6', 'Muramidase-E5VDJ6', 'Muramidase-E6BK21', 'Muramidase-E7GN27', 'Muramidase-E7GTP4', 'Muramidase-F3KDT5', 'Muramidase-F4N5M1', 'Muramidase-F8LXE0', 'Muramidase-F9HE68', 'Muramidase-F9VBQ4', 'Muramidase-G1VIU0', 'Muramidase-G7Q957', 'Muramidase-H0HPH5', 'Muramidase-I0T458', 'Muramidase-I2URK2', 'Muramidase-I4MBW6', 'Muramidase-I6CX42', 'Muramidase-I9VAN4', 'Muramidase-J1KG57', 'Muramidase-J4KU53', 'Muramidase-K6U5B8', 'Muramidase-L0D878', 'Muramidase-M6HJZ1', 'Muramidase-Q32EA3', 'Muramidase-Q6AKF3', 'Muramidase-Q8Y464', 'Muramidase-R0BXB3', 'Muramidase-R5D4J2', 'Muramidase-R5IWH4', 'Muramidase-R5MAD1', 'Muramidase-R5VGU4', 'Muramidase-R5Y8J5', 'Muramidase-R5YKQ7', 'Muramidase-R6J1C5', 'Muramidase-R6KXE4', 'Muramidase-S0F861', 'Muramidase-S4B424', 'Muramidase-T0CXG3', 'Muramidase-T0UBX6', 'Muramidase-T0WIP7', 'Muramidase-U2ECY2', 'Muramidase-U7D9Q6', 'Muramidase-V1GVJ2', 'Muramidase-V5EDU1', 'Muramidase-W4UCM7', 'Muramidase-W5XD20', 'Muramidase-W9H589', 'unclustered', 'sample_id']

In [2]:
colnames = clustered_ibd_old.columns

print(colnames)

NameError: name 'clustered_ibd_old' is not defined

In [1]:
duplicates = colnames[colnames.duplicated()]

NameError: name 'colnames' is not defined