In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
from functions import *
import os

**Import cluster data and create dicts**

In [2]:
# Directory containing the TSV files
directory_path = '../../data/clustering/cluster_maps/'

# List all files in the directory
file_paths = [os.path.join(directory_path, file) for file in os.listdir(directory_path) if file.endswith('.tsv')]


# Read each TSV file into a DataFrame and store them in a list
dataframes = []
for file_path in file_paths:
    df = pd.read_csv(file_path, sep='\t')
    dataframes.append(df.iloc[:, 1:])

# Concatenate all DataFrames side by side
combined_df = pd.concat(dataframes, axis=1)

# Save the combined DataFrame to a new TSV file
combined_df.to_csv('combined_clusters.tsv', sep='\t', index=False)

In [None]:
cluster_df = pd.read_csv('../../data/clustering/cluster_maps/combined_clusters.tsv', sep='\t')

headers = list(cluster_df.columns)

enzymes = set([x.split('_')[0] for x in headers])

print(enzymes) 

In [16]:
mmseqs_df = pd.read_csv('../../data/clustering/mmseqs_cleaned_cluster.tsv', sep="\t")
foldseek_df = pd.read_csv('../../data/clustering/foldseek_cleaned_cluster.tsv', sep="\t")

In [17]:
# Group by the representative sequence and aggregate the clustered sequences into lists
foldseek_dict = foldseek_df.groupby('foldseek_cluster')['unclustered'].apply(list).to_dict()

print(len(list(foldseek_dict.items())))

132


In [18]:
# Group by the representative sequence and aggregate the clustered sequences into lists
mmseqs_dict = mmseqs_df.groupby('mmseqs_cluster')['unclustered'].apply(list).to_dict()

print(len(list(mmseqs_dict.items())))

542


In [None]:
# Flatten the list of mmseqs cluster representatives from foldseek_dict
foldseek_mmseqs_reps = set()
for reps in foldseek_dict.values():
    foldseek_mmseqs_reps.update(reps)

# Check if each mmseqs cluster representative is in foldseek_mmseqs_reps
unmapped_mmseqs_reps = [rep for rep in mmseqs_dict if rep not in foldseek_mmseqs_reps]

unmapped_mmseqs_reps

# Some proteins do not have their 3d structures predicted yet and so they are not in the foldseek_dict

**Create Secretion table and dictionary**

In [None]:
signalp = pd.read_csv('C:\\Users\\odesa\\OneDrive - University of Toronto\\CRC\\LatestDataJan\\signalp\\prediction_results.txt', sep='\t')

# signalp = pd.read_csv('/home/oliver/CRConedriveData/LatestDataJan/signalp/prediction_results.txt', sep='\t')

# display(signalp.head()) 

signalp['# ID'] = signalp['# ID'].str.split('_').str[2]

signalp['# ID'] = signalp['# ID'].str.split('|').str[0]

display(signalp.head()) 


In [17]:
foldseek_ids = list(foldseek_dict.keys())

In [18]:
# Create a dictionary containing the sec tag status for each of the original proteins present in the db

sec_dict = {}

for idx, id in enumerate(foldseek_ids):
    proteins = get_proteins(id, foldseek_dict, mmseqs_dict)
    for protein in proteins:
        if protein in signalp['# ID'].values:
            other_column_value = signalp.loc[signalp['# ID'] == protein, 'Prediction'].values[0]
            sec_dict[protein] = other_column_value

print(sec_dict)


{'A0A4V2DZ80': 'OTHER', 'A0A2R2W5C2': 'OTHER', 'A0A0A2VBZ4': 'SP', 'A0A0A2TFV7': 'SP', 'A0A1G8S2X6': 'SP', 'A0A0B7MQS5': 'OTHER', 'A0A0D0RVH7': 'SP', 'A0A098EI80': 'SP', 'A0A8F5H2Q7': 'SP', 'A0A2S5D4V8': 'SP', 'A0A828ZA80': 'SP', 'B1HNB1': 'SP', 'A0A828ZJD3': 'SP', 'A0A0G1MTH1': 'OTHER', 'A0A117EBF4': 'OTHER', 'A0A2S6XDR4': 'OTHER', 'A0A3N6H9U2': 'SP', 'A0A3N6HDX8': 'SP', 'A0A6C0QBM0': 'SP', 'A0A1V5U177': 'OTHER', 'A0A829ZIN9': 'SP', 'A0A151B350': 'SP', 'A0A162TRS5': 'SP', 'A0A1V4IFT6': 'SP', 'A0A161XER4': 'SP', 'A0A162L9K6': 'OTHER', 'A0A0L6Z818': 'SP', 'A0A1V5L937': 'SP', 'A0A163ZA77': 'OTHER', 'A0A165B3M3': 'SP', 'A0A8G0S7I1': 'SP', 'UPI0022864AF2': 'SP', 'A0A0M2PCH6': 'SP', 'UPI00155FFE64': 'SP', 'UPI0008FB71ED': 'SP', 'S7VHP7': 'OTHER', 'A0A0T6BMY2': 'SP', 'A0A179SV92': 'SP', 'UPI00203B6123': 'SP', 'UPI000745E235': 'SP', 'UPI001363BBCC': 'SP', 'UPI0022815CEC': 'SP', 'A0A6H2JRD4': 'SP', 'A0A8G0TKA2': 'SP', 'UPI0007726AF7': 'SP', 'UPI0013748E59': 'SP', 'UPI002281B2C3': 'SP', 'A0A8I1

In [19]:
# Doesnt currently work but need to check if SP is the only possible sec tag

# Flattening lists and extracting unique values
unique_values = set(value for values_list in sec_dict.values() for value in values_list)

print(unique_values)

{'O', 'L', 'S', 'E', 'T', 'P', 'H', 'A', 'I', 'R'}


**Create Domain Table**

In [None]:
IPS = pd.read_csv('/home/oliver/CRConedriveData/LatestDataJan/IPS/all_dl_endo_domains.tsv', sep='\t', header=None)

display(IPS.head())

In [None]:
IPS[0] = IPS[0].str.split('_').str[2]

IPS[0] = IPS[0].str.split('|').str[0]

display(IPS.head())

In [40]:
column_5_list = IPS.iloc[:, 5].tolist()
unique_values = set(column_5_list)
print(unique_values)


{'Transglycosylase-like domain', 'CAP-associated N-terminal', 'NlpC/P60 family', 'Immunoglobulin-like domain of bacterial spore germination', 'Amyloid A4 N-terminal heparin-binding', 'Transglycosylase SLT domain', 'Cell Wall Hydrolase', 'Lysozyme-like', 'SPOR domain', 'Bacterial Ig-like domain (group 2)', 'Zinc carboxypeptidase', 'SH3 domain (SH3b1 type)', 'FG-GAP repeat', 'LysM domain', 'Bacterial protein of unknown function (DUF882)', 'Fibronectin type III domain', 'S-layer homology domain', 'Putative peptidoglycan binding domain', 'Bacterial SH3 domain', 'Copper amine oxidase N-terminal domain', 'Cysteine-rich secretory protein family', 'SH3 domain of SH3b2 type', 'Beta-lactamase', 'Receptor family ligand binding region', 'Peptidase family M23', 'Bacterial Ig-like domain', 'Phage-related minor tail protein', 'Protein of unknown function (DUF5818)', 'Penicillin-insensitive murein endopeptidase', 'NLPC_P60 stabilising domain, N term', 'Bacterial dipeptidyl-peptidase Sh3 domain', 'alph

In [53]:
a = 'A0A4Y7RJ07'
foldseek_id = get_cluster([a], mmseqs_dict, foldseek_dict)[0]
print(foldseek_id)

A0A1C3RCD0


In [None]:
for idx, value in enumerate(IPS[0]):
    foldseek_id = get_cluster([value], mmseqs_dict, foldseek_dict)[0]
    IPS.at[idx, 0] = foldseek_id

display(IPS.head())

**Test the clustering on the PRJEB7774 Data**

In [2]:
PRJEB7774 = pd.read_feather('C:/Users/odesa/OneDrive - University of Toronto/CRC/LatestDataJan/PRJEB7774/clean_joined_genefamilies_relab_7774.feather')
print(PRJEB7774.shape)

(155, 80749)


In [3]:
clustered_7774 = cluster_humann_table('C:/Users/odesa/OneDrive - University of Toronto/CRC/LatestDataJan/PRJEB7774/clean_joined_genefamilies_relab_7774.feather',
                                      'C:/Users/odesa/Desktop/Code/CRC-Final/data/clustering/cluster_maps/combined_clusters.tsv')

print(clustered_7774.shape)

369 DL-endopeptidase found
369 DL-endopeptidase found, 9 DL-endopeptidase unclustered
4154 LD-carboxypeptidase found


KeyboardInterrupt: 

In [10]:
clustered_7774.to_csv('C:\\Users\\odesa\\Desktop\\Code\\CRC-Final\\data\\clustering\\humann_clustered\\clustered_complete_PRJEB7774.tsv', sep='\t', index=False)

**Clustering PRJEB10878**

In [3]:
PRJEB10878 = pd.read_feather('C:/Users/odesa/OneDrive - University of Toronto/CRC/LatestDataJan/PRJEB10878/clean_joined_genefamilies_relab_10878.feather')
print(PRJEB10878.shape)

(127, 76121)


In [4]:
clustered_10878 = cluster_humann_table('C:/Users/odesa/OneDrive - University of Toronto/CRC/LatestDataJan/PRJEB10878/clean_joined_genefamilies_relab_10878.feather',
                                      'C:/Users/odesa/Desktop/Code/CRC-Final/data/clustering/cluster_maps/combined_clusters.tsv')

print(clustered_10878.shape)

329 DL-endopeptidase found
329 DL-endopeptidase found, 8 DL-endopeptidase unclustered
3935 LD-carboxypeptidase found
3935 LD-carboxypeptidase found, 647 LD-carboxypeptidase unclustered
31 LD-endopeptidase found
31 LD-endopeptidase found, 31 LD-endopeptidase unclustered
2165 Glucosaminidase found
2165 Glucosaminidase found, 652 Glucosaminidase unclustered
11481 DD-carboxypeptidase found
11481 DD-carboxypeptidase found, 956 DD-carboxypeptidase unclustered
0 Diadenylate-cyclase found
0 Diadenylate-cyclase found, 0 Diadenylate-cyclase unclustered
20960 Amidase found
20960 Amidase found, 1172 Amidase unclustered
34465 Muramidase found
34465 Muramidase found, 2549 Muramidase unclustered
(127, 1379)


In [5]:
clustered_10878.to_csv('C:\\Users\\odesa\\Desktop\\Code\\CRC-Final\\data\\clustering\\humann_clustered\\clustered_complete_PRJEB10878.tsv', sep='\t', index=False)

**Clustering DRA008156**

In [None]:
DRA = pd.read_feather('~/Downloads/clean_joined_genefamilies_relab_DRA008156.feather')
print(DRA.shape)

In [2]:
# Test cluster_humann_table function

test = cluster_humann_table('~/Downloads/clean_joined_genefamilies_relab_DRA008156.feather',
                             '~/CRC-Final/data/clustering/cluster_maps/combined_clusters.tsv')

461 DL-endopeptidase found
461 DL-endopeptidase found, 11 DL-endopeptidase unclustered
4871 LD-carboxypeptidase found
4871 LD-carboxypeptidase found, 691 LD-carboxypeptidase unclustered
81 LD-endopeptidase found
81 LD-endopeptidase found, 81 LD-endopeptidase unclustered
3259 Glucosaminidase found
3259 Glucosaminidase found, 1107 Glucosaminidase unclustered
13996 DD-carboxypeptidase found
13996 DD-carboxypeptidase found, 1102 DD-carboxypeptidase unclustered
0 Diadenylate-cyclase found
0 Diadenylate-cyclase found, 0 Diadenylate-cyclase unclustered
26364 Amidase found
26364 Amidase found, 1631 Amidase unclustered
46941 Muramidase found
46941 Muramidase found, 4995 Muramidase unclustered


In [7]:
test.to_csv('~/CRC-Final/data/clustering/humann_clustered/clustered_complete_DRA008156.tsv', sep='\t', index=False)

**Clustering ICI trial PRJEB22893**

In [13]:
ici_22893 = pd.read_feather('E:/ICI/ici_humann/clean_joined_PRJEB22893_relab.feather')

print(ici_22893.shape)

(25, 42545)


In [14]:
ici_22893_clustered = cluster_humann_table('E:/ICI/ici_humann/clean_joined_PRJEB22893_relab.feather',
                                            '../../data/clustering/cluster_maps/combined_clusters.tsv')

249 DL-endopeptidase found
249 DL-endopeptidase found, 6 DL-endopeptidase unclustered
1504 LD-carboxypeptidase found
1504 LD-carboxypeptidase found, 119 LD-carboxypeptidase unclustered
26 LD-endopeptidase found
26 LD-endopeptidase found, 26 LD-endopeptidase unclustered
1266 Glucosaminidase found
1266 Glucosaminidase found, 285 Glucosaminidase unclustered
7003 DD-carboxypeptidase found
7003 DD-carboxypeptidase found, 383 DD-carboxypeptidase unclustered
0 Diadenylate-cyclase found
0 Diadenylate-cyclase found, 0 Diadenylate-cyclase unclustered
12719 Amidase found
12719 Amidase found, 597 Amidase unclustered
17803 Muramidase found
17803 Muramidase found, 1060 Muramidase unclustered


In [15]:
ici_22893_clustered.to_csv('E:/ICI/ici_humann/clustered_complete_PRJEB22893.tsv', sep='\t', index=False)

**Clustering ICI trial PRJNA399742**

In [6]:
ici_399742 = pd.read_feather('E:/ICI/ici_humann/clean_joined_PRJNA399742_relab.feather')

print(ici_399742.shape)

(39, 62621)


In [7]:
ici_399742_clustered = cluster_humann_table('E:/ICI/ici_humann/clean_joined_PRJNA399742_relab.feather',
                                            '../../data/clustering/cluster_maps/combined_clusters.tsv')

317 DL-endopeptidase found
317 DL-endopeptidase found, 8 DL-endopeptidase unclustered
3194 LD-carboxypeptidase found
3194 LD-carboxypeptidase found, 555 LD-carboxypeptidase unclustered
30 LD-endopeptidase found
30 LD-endopeptidase found, 30 LD-endopeptidase unclustered
2118 Glucosaminidase found
2118 Glucosaminidase found, 834 Glucosaminidase unclustered
10072 DD-carboxypeptidase found
10072 DD-carboxypeptidase found, 814 DD-carboxypeptidase unclustered
0 Diadenylate-cyclase found
0 Diadenylate-cyclase found, 0 Diadenylate-cyclase unclustered
18278 Amidase found
18278 Amidase found, 967 Amidase unclustered
26147 Muramidase found
26147 Muramidase found, 2266 Muramidase unclustered


In [None]:
ici_399742_clustered.to_csv('E:/ICI/ici_humann/clustered_complete_PRJEB399742.tsv', sep='\t', index=False)

**Clustering Bariatric Data**

In [5]:
bariatric = pd.read_feather('E:/bariatric/clean_joined_genefamilies_relab_bariatric.feather')

print(bariatric['sample_id'])

1      T_103
2      T_103
3      T_103
4      T_103
5      T_107
       ...  
109     T_79
110     T_79
112     T_94
113     T_94
114     T_94
Name: sample_id, Length: 110, dtype: object


In [None]:
# make the timepoint column for metadata

bariatric_ids = pd.DataFrame(bariatric['sample_id'])

bariatric_ids['TimePoint'] = bariatric_ids['sample_id'].apply(lambda x: x.split('_')[2])

print(bariatric_ids)

In [8]:
# Format IDs

bariatric_ids['sample_id'] = bariatric_ids['sample_id'].apply(lambda x: '_'.join(x.split('_')[:2]))

print(bariatric_ids)

    sample_id TimePoint
1       T_103        1M
2       T_103        6M
3       T_103        OR
4       T_103        BL
5       T_107        1M
..        ...       ...
109      T_79        BL
110      T_79        OR
112      T_94        1M
113      T_94        6M
114      T_94        BL

[110 rows x 2 columns]


In [None]:
# Fix stupid naming scheme...

bariatric['base_id'] = bariatric['sample_id'].apply(lambda x: '_'.join(x.split('_')[:-1]))

base_ids_with_211001 = bariatric[bariatric['sample_id'].str.endswith('211001')]['base_id'].unique()

# Mark _pooled entries for removal if their base_id is in the list identified above
bariatric['remove_flag'] = bariatric.apply(lambda row: row['base_id'] in base_ids_with_211001 and row['sample_id'].endswith('pooled'), axis=1)

# Filter out the rows marked for removal
bariatric = bariatric[~bariatric['remove_flag']].drop(columns=['base_id', 'remove_flag'])  # Drop the helper columns

bariatric.head(), bariatric.shape

In [51]:
bariatric.reset_index(drop=True, inplace=True)

In [None]:
bariatric.drop([0, 9, 59, 64, 111, 115, 116, 117], inplace=True)

In [54]:
bariatric['TimePoint'] = bariatric['sample_id'].apply(lambda x: '1M' if '1M' in x else ('6M' if '6M' in x else ('OR' if 'OR' in x else ('BL' if 'L' in x else ''))))

In [55]:
bariatric['sample_id'] = bariatric['sample_id'].apply(lambda x: '_'.join(x.split('_')[0].split('-')[:2]).strip('L'))

In [58]:
bariatric['sample_id'] = bariatric.apply(lambda row: row['sample_id'] + '_' + row['TimePoint'], axis=1)


In [60]:
bariatric.drop(columns=['TimePoint'], inplace=True)

In [None]:
bariatric['sample_id']

In [None]:
bariatric['sample_id'] = bariatric_ids['sample_id']

print(bariatric['sample_id'])

In [11]:
bariatric.to_feather('/media/oliver/PGH_Backup/bariatric/clean_joined_genefamilies_relab_bariatric.feather')
bariatric.to_csv('/media/oliver/PGH_Backup/bariatric/clean_joined_genefamilies_relab_bariatric.tsv', sep='\t', index=False)

In [12]:
make_outputs(bariatric_ids, '/media/oliver/PGH_Backup/bariatric/bariatric_metadata')

'/media/oliver/PGH_Backup/bariatric/bariatric_metadata'

In [6]:
bariatric_clustered = cluster_humann_table('E:/bariatric/clean_joined_genefamilies_relab_bariatric.feather',
                                            '../../data/clustering/cluster_maps/combined_clusters.tsv')

432 DL-endopeptidase found
432 DL-endopeptidase found, 8 DL-endopeptidase unclustered
3856 LD-carboxypeptidase found
3856 LD-carboxypeptidase found, 571 LD-carboxypeptidase unclustered
113 LD-endopeptidase found
113 LD-endopeptidase found, 113 LD-endopeptidase unclustered
1889 Glucosaminidase found
1889 Glucosaminidase found, 773 Glucosaminidase unclustered
10083 DD-carboxypeptidase found
10083 DD-carboxypeptidase found, 736 DD-carboxypeptidase unclustered
0 Diadenylate-cyclase found
0 Diadenylate-cyclase found, 0 Diadenylate-cyclase unclustered
17138 Amidase found
17138 Amidase found, 961 Amidase unclustered
29971 Muramidase found
29971 Muramidase found, 2806 Muramidase unclustered


In [7]:
print(bariatric_clustered.shape)
bariatric_clustered = bariatric_clustered.drop(columns=['unclustered'])
print(bariatric_clustered.shape)


(110, 1257)
(110, 1250)


In [8]:
make_outputs(bariatric_clustered, 'E:/bariatric/clustered_complete_bariatric')

'E:/bariatric/clustered_complete_bariatric'

In [None]:
grouped_bariatric = group_humann_table('E:/bariatric/clean_joined_genefamilies_relab_bariatric.feather')

print(grouped_bariatric.shape, grouped_bariatric.columns)

In [None]:
grouped_bariatric.rename(columns={'sample': 'sample_id'}, inplace=True)

print(grouped_bariatric.columns)


In [4]:
make_outputs(grouped_bariatric, 'E:/bariatric/grouped_bariatric')

'E:/bariatric/grouped_bariatric'

In [4]:
clustered_7774 = pd.read_csv('C:\\Users\\odesa\\Desktop\\Code\\CRC-Final\\data\\clustering\\humann_clustered\\clustered_complete_PRJEB7774.tsv', sep='\t')

print(clustered_7774.shape)

(155, 1462)
