In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
from functions import *
import os

**Import cluster data and create dicts**

In [None]:
# Directory containing the per-source cluster-map TSV files
directory_path = '../../data/clustering/cluster_maps/'

# The combined table is written into the same directory, because that is where
# the later cells (and the cluster_humann_table calls) read it from.
combined_path = os.path.join(directory_path, 'combined_clusters.tsv')

# Collect the input TSVs. They are sorted because os.listdir returns entries in
# arbitrary order, and the combined output itself is skipped so that re-running
# this cell does not fold a previous result back into the new one.
file_paths = sorted(
    os.path.join(directory_path, file)
    for file in os.listdir(directory_path)
    if file.endswith('.tsv') and file != 'combined_clusters.tsv'
)
if not file_paths:
    raise FileNotFoundError(f'No cluster-map .tsv files found in {directory_path}')

# Read each TSV and keep every column except the first one
dataframes = []
for file_path in file_paths:
    df = pd.read_csv(file_path, sep='\t')
    dataframes.append(df.iloc[:, 1:])

# Concatenate all DataFrames side by side
combined_df = pd.concat(dataframes, axis=1)

# Save the combined DataFrame as a TSV without the index column
combined_df.to_csv(combined_path, sep='\t', index=False)

In [None]:
# Load the combined cluster map
cluster_map_path = '../../data/clustering/cluster_maps/combined_clusters.tsv'
cluster_df = pd.read_csv(cluster_map_path, sep='\t')

headers = cluster_df.columns.tolist()

# Text before the first underscore of every column name
# (presumably the enzyme name -- confirm against the cluster-map files)
enzymes = {header.split('_')[0] for header in headers}

print(enzymes)

In [None]:
# Load the cleaned MMseqs and Foldseek cluster tables (tab-separated)
mmseqs_path = '../../data/clustering/mmseqs_cleaned_cluster.tsv'
foldseek_path = '../../data/clustering/foldseek_cleaned_cluster.tsv'

mmseqs_df = pd.read_csv(mmseqs_path, sep="\t")
foldseek_df = pd.read_csv(foldseek_path, sep="\t")

In [None]:
# Map every value of 'foldseek_cluster' to the list of 'unclustered' entries in its rows
foldseek_members = foldseek_df.groupby('foldseek_cluster')['unclustered']
foldseek_dict = foldseek_members.apply(list).to_dict()

# Number of distinct Foldseek clusters
print(len(foldseek_dict))

In [None]:
# Map every value of 'mmseqs_cluster' to the list of 'unclustered' entries in its rows
mmseqs_members = mmseqs_df.groupby('mmseqs_cluster')['unclustered']
mmseqs_dict = mmseqs_members.apply(list).to_dict()

# Number of distinct MMseqs clusters
print(len(mmseqs_dict))

In [None]:
# Pool the members of every Foldseek cluster into a single set
foldseek_mmseqs_reps = {rep for reps in foldseek_dict.values() for rep in reps}

# MMseqs cluster keys that are not a member of any Foldseek cluster
unmapped_mmseqs_reps = list(filter(lambda rep: rep not in foldseek_mmseqs_reps, mmseqs_dict))

unmapped_mmseqs_reps

# Some proteins do not have their 3D structures predicted yet, so they are not in foldseek_dict

**Create Secretion table and dictionary**

In [None]:
# NOTE(review): machine-specific absolute path; the commented-out call below
# appears to be the same file on a Linux machine.
signalp_path = 'C:\\Users\\odesa\\OneDrive - University of Toronto\\CRC\\LatestDataJan\\signalp\\prediction_results.txt'
signalp = pd.read_csv(signalp_path, sep='\t')

# signalp = pd.read_csv('/home/oliver/CRConedriveData/LatestDataJan/signalp/prediction_results.txt', sep='\t')

# display(signalp.head())

# Reduce each ID to its third '_'-separated field, then cut it at the first '|'
signalp['# ID'] = signalp['# ID'].str.split('_').str[2].str.split('|').str[0]

display(signalp.head())


In [None]:
# All Foldseek cluster IDs, in dictionary order
foldseek_ids = list(foldseek_dict)

In [None]:
# Create a dictionary containing the sec tag status (the SignalP 'Prediction'
# value) for each of the original proteins present in the db

# Build the ID -> Prediction lookup once instead of scanning the whole SignalP
# table for every protein. setdefault keeps the first row seen for an ID, which
# matches taking `.values[0]` of the filtered table.
prediction_by_id = {}
for signalp_id, prediction in zip(signalp['# ID'], signalp['Prediction']):
    prediction_by_id.setdefault(signalp_id, prediction)

sec_dict = {}

# `cluster_id` rather than `id`, so the builtin is not shadowed for later cells
for cluster_id in foldseek_ids:
    for protein in get_proteins(cluster_id, foldseek_dict, mmseqs_dict):
        # Proteins without a SignalP row are skipped
        if protein in prediction_by_id:
            sec_dict[protein] = prediction_by_id[protein]

print(sec_dict)


In [None]:
# Check whether SP is the only possible sec tag.
# sec_dict maps each protein to a single prediction value (not a list), so the
# distinct tags are the set of its values. Iterating each value as if it were a
# list would split a string prediction into individual characters.
unique_values = set(sec_dict.values())

print(unique_values)

**Create Domain Table**

In [None]:
# NOTE(review): machine-specific absolute path.
ips_path = '/home/oliver/CRConedriveData/LatestDataJan/IPS/all_dl_endo_domains.tsv'

# Read without a header row, so the columns are labelled 0, 1, 2, ...
IPS = pd.read_csv(ips_path, sep='\t', header=None)

display(IPS.head())

In [None]:
# Reduce column 0 to its third '_'-separated field, then cut it at the first '|'
# (the same ID clean-up that is applied to the SignalP table)
IPS[0] = IPS[0].str.split('_').str[2].str.split('|').str[0]

display(IPS.head())

In [None]:
# Distinct values in the column at position 5 of the IPS table
column_5 = IPS.iloc[:, 5]
column_5_list = column_5.tolist()
unique_values = set(column_5_list)
print(unique_values)


In [None]:
# Spot-check get_cluster on a single ID; the first element of its result is used
a = 'A0A4Y7RJ07'
cluster_hits = get_cluster([a], mmseqs_dict, foldseek_dict)
foldseek_id = cluster_hits[0]
print(foldseek_id)

In [None]:
# Replace every ID in column 0 with the first element returned by get_cluster for it.
# The new column is built in one pass and assigned once, instead of writing each
# cell through `.at` with an enumerate counter (which is only correct while the
# index is the default 0..n-1) while iterating over the same column.
IPS[0] = [
    get_cluster([value], mmseqs_dict, foldseek_dict)[0]
    for value in IPS[0]
]

display(IPS.head())

**Test the clustering on the PRJEB7774 Data**

In [None]:
# Load the PRJEB7774 feather table and report its (rows, columns)
# NOTE(review): machine-specific absolute path.
prjeb7774_feather = 'C:/Users/odesa/OneDrive - University of Toronto/CRC/LatestDataJan/PRJEB7774/clean_joined_genefamilies_relab_7774.feather'
PRJEB7774 = pd.read_feather(prjeb7774_feather)
print(PRJEB7774.shape)

In [None]:
# Run cluster_humann_table on the PRJEB7774 table with the combined cluster map
prjeb7774_table_path = 'C:/Users/odesa/OneDrive - University of Toronto/CRC/LatestDataJan/PRJEB7774/clean_joined_genefamilies_relab_7774.feather'
prjeb7774_cluster_map_path = 'C:/Users/odesa/Desktop/Code/CRC-Final/data/clustering/cluster_maps/combined_clusters.tsv'

clustered_7774 = cluster_humann_table(prjeb7774_table_path, prjeb7774_cluster_map_path)

print(clustered_7774.shape)

In [None]:
# Save the clustered PRJEB7774 table as a TSV without the index column
clustered_7774_tsv = 'C:\\Users\\odesa\\Desktop\\Code\\CRC-Final\\data\\clustering\\humann_clustered\\clustered_complete_PRJEB7774.tsv'
clustered_7774.to_csv(clustered_7774_tsv, sep='\t', index=False)

**Clustering PRJEB10878**

In [None]:
# Load the PRJEB10878 feather table and report its (rows, columns)
# NOTE(review): machine-specific absolute path.
prjeb10878_feather = 'C:/Users/odesa/OneDrive - University of Toronto/CRC/LatestDataJan/PRJEB10878/clean_joined_genefamilies_relab_10878.feather'
PRJEB10878 = pd.read_feather(prjeb10878_feather)
print(PRJEB10878.shape)

In [None]:
# Run cluster_humann_table on the PRJEB10878 table with the combined cluster map
prjeb10878_table_path = 'C:/Users/odesa/OneDrive - University of Toronto/CRC/LatestDataJan/PRJEB10878/clean_joined_genefamilies_relab_10878.feather'
prjeb10878_cluster_map_path = 'C:/Users/odesa/Desktop/Code/CRC-Final/data/clustering/cluster_maps/combined_clusters.tsv'

clustered_10878 = cluster_humann_table(prjeb10878_table_path, prjeb10878_cluster_map_path)

print(clustered_10878.shape)

In [None]:
# Save the clustered PRJEB10878 table as a TSV without the index column
clustered_10878_tsv = 'C:\\Users\\odesa\\Desktop\\Code\\CRC-Final\\data\\clustering\\humann_clustered\\clustered_complete_PRJEB10878.tsv'
clustered_10878.to_csv(clustered_10878_tsv, sep='\t', index=False)

**Clustering DRA008156**

In [None]:
# Load the DRA008156 feather table and report its (rows, columns)
dra_feather = '~/Downloads/clean_joined_genefamilies_relab_DRA008156.feather'
DRA = pd.read_feather(dra_feather)
print(DRA.shape)

In [None]:
# Test the cluster_humann_table function on the DRA008156 table
dra_table_path = '~/Downloads/clean_joined_genefamilies_relab_DRA008156.feather'
dra_cluster_map_path = '~/CRC-Final/data/clustering/cluster_maps/combined_clusters.tsv'

test = cluster_humann_table(dra_table_path, dra_cluster_map_path)

In [None]:
# Save the clustered DRA008156 table as a TSV without the index column
dra_clustered_tsv = '~/CRC-Final/data/clustering/humann_clustered/clustered_complete_DRA008156.tsv'
test.to_csv(dra_clustered_tsv, sep='\t', index=False)

**Clustering ICI trial PRJEB22893**

In [None]:
# Load the PRJEB22893 feather table and report its (rows, columns)
ici_22893_feather = 'E:/ICI/ici_humann/clean_joined_PRJEB22893_relab.feather'
ici_22893 = pd.read_feather(ici_22893_feather)

print(ici_22893.shape)

In [None]:
# Run cluster_humann_table on the PRJEB22893 table with the combined cluster map
ici_22893_table_path = 'E:/ICI/ici_humann/clean_joined_PRJEB22893_relab.feather'
ici_22893_cluster_map_path = '../../data/clustering/cluster_maps/combined_clusters.tsv'

ici_22893_clustered = cluster_humann_table(ici_22893_table_path, ici_22893_cluster_map_path)

In [None]:
# Save the clustered PRJEB22893 table as a TSV without the index column
ici_22893_clustered_tsv = 'E:/ICI/ici_humann/clustered_complete_PRJEB22893.tsv'
ici_22893_clustered.to_csv(ici_22893_clustered_tsv, sep='\t', index=False)

**Clustering ICI trial PRJNA399742**

In [None]:
# Load the PRJNA399742 feather table and report its (rows, columns)
ici_399742_feather = 'E:/ICI/ici_humann/clean_joined_PRJNA399742_relab.feather'
ici_399742 = pd.read_feather(ici_399742_feather)

print(ici_399742.shape)

In [None]:
# Run cluster_humann_table on the PRJNA399742 table with the combined cluster map
ici_399742_table_path = 'E:/ICI/ici_humann/clean_joined_PRJNA399742_relab.feather'
ici_399742_cluster_map_path = '../../data/clustering/cluster_maps/combined_clusters.tsv'

ici_399742_clustered = cluster_humann_table(ici_399742_table_path, ici_399742_cluster_map_path)

In [None]:
# Save the clustered PRJNA399742 table as a TSV without the index column.
# The accession is PRJNA399742 (see the input file name); the output used to be
# written as 'clustered_complete_PRJEB399742.tsv', so anything still reading the
# old file name needs to be pointed at this one.
ici_399742_clustered.to_csv('E:/ICI/ici_humann/clustered_complete_PRJNA399742.tsv', sep='\t', index=False)

**Clustering Bariatric Data**

In [None]:
# Load the bariatric feather table and show its sample IDs
bariatric_feather = 'E:/bariatric/clean_joined_genefamilies_relab_bariatric.feather'
bariatric = pd.read_feather(bariatric_feather)

print(bariatric['sample_id'])

In [None]:
# Make the timepoint column for the metadata: a one-column frame of sample IDs,
# plus the third '_'-separated field of each ID as 'TimePoint'

bariatric_ids = pd.DataFrame(bariatric['sample_id'])

bariatric_ids['TimePoint'] = [
    sample.split('_')[2] for sample in bariatric_ids['sample_id']
]

print(bariatric_ids)

In [None]:
# Format IDs: keep only the first two '_'-separated fields of each sample ID

bariatric_ids['sample_id'] = [
    '_'.join(sample.split('_')[:2]) for sample in bariatric_ids['sample_id']
]

print(bariatric_ids)

In [None]:
# Work around the sample naming scheme: when a base ID has a sample whose ID
# ends in '211001', the '...pooled' sample with the same base ID is removed.

sample_ids = bariatric['sample_id']

# Base ID = sample ID with its last '_'-separated field removed
base_ids = sample_ids.apply(lambda x: '_'.join(x.split('_')[:-1]))

base_ids_with_211001 = base_ids[sample_ids.str.endswith('211001')].unique()

# Vectorised mask over the two ID series, instead of a row-wise apply that
# rebuilds every (very wide) row of the table just to read two fields.
remove_mask = base_ids.isin(base_ids_with_211001) & sample_ids.str.endswith('pooled')

# Filter out the flagged rows. The helper columns are no longer written onto the
# table; the drop only cleans up leftovers from an earlier run of the old cell.
bariatric = bariatric[~remove_mask].drop(columns=['base_id', 'remove_flag'], errors='ignore')

bariatric.head(), bariatric.shape

In [None]:
# Renumber the rows 0..n-1 and discard the old index
bariatric = bariatric.reset_index(drop=True)

In [None]:
# Drop rows by hard-coded index label.
# NOTE(review): the labels only refer to the intended rows directly after the
# reset_index cell above, and re-running this cell raises KeyError because the
# labels are already gone -- confirm which samples these are.
rows_to_drop = [0, 9, 59, 64, 111, 115, 116, 117]
bariatric = bariatric.drop(index=rows_to_drop)

In [None]:
# Derive the time point from substrings of the sample ID. The markers are tried
# in this order and the first hit wins; an ID matching none of them gets ''.
timepoint_markers = (('1M', '1M'), ('6M', '6M'), ('OR', 'OR'), ('L', 'BL'))
bariatric['TimePoint'] = [
    next((label for marker, label in timepoint_markers if marker in sample), '')
    for sample in bariatric['sample_id']
]

In [None]:
# Take the text before the first '_', keep its first two '-'-separated pieces
# joined by '_', then strip any 'L' characters from both ends
bariatric['sample_id'] = [
    '_'.join(sample.split('_')[0].split('-')[:2]).strip('L')
    for sample in bariatric['sample_id']
]

In [None]:
# Append the time point to the sample ID: '<sample_id>_<TimePoint>'
bariatric['sample_id'] = bariatric['sample_id'] + '_' + bariatric['TimePoint']


In [None]:
# The time point is now part of sample_id, so drop the helper column
bariatric = bariatric.drop(columns=['TimePoint'])

In [None]:
# Display the rebuilt sample IDs
bariatric['sample_id']

In [None]:
# Overwrite the sample IDs with the ones stored in bariatric_ids.
# NOTE(review): this assignment aligns on index labels, not on row position.
# bariatric_ids was built from `bariatric` before rows were filtered out, the
# index was reset and further rows were dropped in the cells above, so a label
# may no longer refer to the same sample in both frames -- confirm before
# relying on these IDs. It also replaces the '<id>_<TimePoint>' values that the
# preceding cells assembled.
bariatric['sample_id'] = bariatric_ids['sample_id']

print(bariatric['sample_id'])

In [None]:
# Write the cleaned bariatric table as both feather and TSV
# NOTE(review): machine-specific absolute paths (the table was read from E:/bariatric).
bariatric_feather_out = '/media/oliver/PGH_Backup/bariatric/clean_joined_genefamilies_relab_bariatric.feather'
bariatric_tsv_out = '/media/oliver/PGH_Backup/bariatric/clean_joined_genefamilies_relab_bariatric.tsv'

bariatric.to_feather(bariatric_feather_out)
bariatric.to_csv(bariatric_tsv_out, sep='\t', index=False)

In [None]:
# Write the sample-ID / time-point table through make_outputs (from functions)
bariatric_metadata_out = '/media/oliver/PGH_Backup/bariatric/bariatric_metadata'
make_outputs(bariatric_ids, bariatric_metadata_out)

In [None]:
# Run cluster_humann_table on the bariatric table with the combined cluster map
bariatric_table_path = 'E:/bariatric/clean_joined_genefamilies_relab_bariatric.feather'
bariatric_cluster_map_path = '../../data/clustering/cluster_maps/combined_clusters.tsv'

bariatric_clustered = cluster_humann_table(bariatric_table_path, bariatric_cluster_map_path)

In [None]:
# Drop the 'unclustered' column, printing the shape before and after.
# Raises KeyError if the column is absent (e.g. when this cell is re-run).
shape_before = bariatric_clustered.shape
print(shape_before)
bariatric_clustered = bariatric_clustered.drop('unclustered', axis=1)
shape_after = bariatric_clustered.shape
print(shape_after)


In [None]:
# Write the clustered bariatric table through make_outputs (from functions)
bariatric_clustered_out = 'E:/bariatric/clustered_complete_bariatric'
make_outputs(bariatric_clustered, bariatric_clustered_out)

In [None]:
# Run group_humann_table (from functions) on the bariatric table, then show the
# result's shape and column labels
bariatric_group_input = 'E:/bariatric/clean_joined_genefamilies_relab_bariatric.feather'
grouped_bariatric = group_humann_table(bariatric_group_input)

print(grouped_bariatric.shape, grouped_bariatric.columns)

In [None]:
# Rename the 'sample' column to 'sample_id'
grouped_bariatric = grouped_bariatric.rename(columns={'sample': 'sample_id'})

print(grouped_bariatric.columns)


In [None]:
# Write the grouped bariatric table through make_outputs (from functions)
grouped_bariatric_out = 'E:/bariatric/grouped_bariatric'
make_outputs(grouped_bariatric, grouped_bariatric_out)

In [None]:
# Reload the clustered PRJEB7774 table from the TSV written earlier
clustered_7774_path = 'C:\\Users\\odesa\\Desktop\\Code\\CRC-Final\\data\\clustering\\humann_clustered\\clustered_complete_PRJEB7774.tsv'
clustered_7774 = pd.read_csv(clustered_7774_path, sep='\t')

print(clustered_7774.shape)

In [None]:
# Cluster large ICI trial: load the joined feather table and report its (rows, columns)

ici_feather = '~/Downloads/clean_joined_70966_43119.feather'
ici = pd.read_feather(ici_feather)

print(ici.shape)


In [None]:
# Run cluster_humann_table on the large ICI table with the combined cluster map
ici_table_path = '~/Downloads/clean_joined_70966_43119.feather'
ici_cluster_map_path = '../../data/clustering/cluster_maps/combined_clusters.tsv'

clustered_ici = cluster_humann_table(ici_table_path, ici_cluster_map_path)

In [None]:
# (rows, columns) of the clustered ICI table
print(clustered_ici.shape)

In [None]:
# Save the clustered ICI table as a TSV without the index column
# NOTE(review): machine-specific absolute path.
clustered_ici_tsv = '/Users/odesa/OneDrive - University of Toronto/LabWork/ICI/LatestData/clustered_complete_70966_43119.tsv'
clustered_ici.to_csv(clustered_ici_tsv, sep='\t', index=False)

In [None]:
# Run group_humann_table (from functions) on the large ICI table
ici_group_input = '~/Downloads/clean_joined_70966_43119.feather'
grouped_ici = group_humann_table(ici_group_input)

In [None]:
# Preview the first rows of the grouped ICI table
display(grouped_ici.head())

In [None]:
# Save the grouped ICI table as a TSV without the index column
# NOTE(review): machine-specific absolute path.
grouped_ici_tsv = '/Users/odesa/OneDrive - University of Toronto/LabWork/ICI/LatestData/grouped_70966_43119.tsv'
grouped_ici.to_csv(grouped_ici_tsv, sep='\t', index=False)

**iHMP2 Data, IBD**

In [None]:
# Load the raw IBD gene-family TSV and preview it
ibd_raw_tsv = 'E:/ibd_data/humann_second_run/ibd_genefamilies_relab_p2.tsv'
ibd = pd.read_csv(ibd_raw_tsv, sep='\t')

display(ibd.head())

In [None]:
# Run clean_table (from functions) on the raw IBD TSV and preview the result
ibd_clean_input = 'E:/ibd_data/humann_second_run/ibd_genefamilies_relab_p2.tsv'
clean_ibd = clean_table(ibd_clean_input)

display(clean_ibd.head())

In [None]:
# Write the cleaned IBD table through make_outputs; later cells read this
# prefix back with both a '.tsv' and a '.feather' extension
clean_ibd_out = 'E:/ibd_data/humann_second_run/ibd_genefamiles_relab_cleaned'
make_outputs(clean_ibd, clean_ibd_out)

In [None]:
# Reload the cleaned IBD table from its TSV and preview it
ibd_cleaned_tsv = '/Volumes/PGH-Backup/ibd_data/humann_second_run/ibd_genefamiles_relab_cleaned.tsv'
ibd = pd.read_csv(ibd_cleaned_tsv, sep='\t')

display(ibd.head())

In [None]:
# Run cluster_humann_table on the cleaned IBD table with the combined cluster map
ibd_table_path = '/Volumes/PGH-backup/ibd_data/humann_second_run/ibd_genefamiles_relab_cleaned.feather'
ibd_cluster_map_path = '../../data/clustering/cluster_maps/combined_clusters.tsv'

clustered_ibd_old = cluster_humann_table(ibd_table_path, ibd_cluster_map_path)

In [None]:
# Load the combined cluster map for inspection
combined_clusters_tsv = '../../data/clustering/cluster_maps/combined_clusters.tsv'
cluster_map = pd.read_csv(combined_clusters_tsv, sep='\t')



In [None]:
# For every column, flag the rows whose text contains the ID (missing values
# count as no match), then stack the per-column flags into one 2-D array
hits_per_column = [
    cluster_map[col].str.contains("A0A373P8V8", na=False)
    for col in cluster_map
]
mask = np.column_stack(hits_per_column)

# Rows where the ID appears in at least one column
cluster_map.loc[mask.any(axis=1)]

In [None]:
# Load the cleaned IBD feather table and preview it
humann_output_feather = '/Volumes/PGH-backup/ibd_data/humann_second_run/ibd_genefamiles_relab_cleaned.feather'
humann_output = pd.read_feather(humann_output_feather)

display(humann_output.head())

In [None]:
# Skip the first two columns, then keep the third '_'-separated field of each
# remaining column name
data_columns = list(humann_output.columns)[2:]
colnames = [col.split('_')[2] for col in data_columns]

print(colnames[:10])



In [None]:
# Is this ID among the IDs extracted from the column names?
'A0A373P8V8' in colnames

In [None]:
from collections import Counter

# Column labels of the clustered IBD table, as a plain list
colnames = clustered_ibd_old.columns.tolist()

def find_duplicates(input_list):
    """Return the items that occur more than once in ``input_list``.

    Each repeated item is listed once, in order of its first appearance.
    """
    tally = Counter(input_list)
    return [entry for entry in tally if tally[entry] > 1]


# Column names that appear more than once in the clustered IBD table
find_duplicates(colnames)

In [None]:
# Remove duplicate column names by keeping only the first column of each name.
# NOTE(review): the later columns with a repeated name are discarded, not merged --
# confirm that this is intended.
repeated_columns = clustered_ibd_old.columns.duplicated()
clustered_ibd_old = clustered_ibd_old.loc[:, ~repeated_columns]

# Now drop the 'unclustered' column, if there is one
clustered_ibd_old = clustered_ibd_old.drop(columns='unclustered', errors='ignore')

print(clustered_ibd_old)

In [None]:
# Re-check for duplicate column names on the table as it is now. `colnames` was
# captured before the de-duplication above, so checking it again would only
# repeat the earlier result.
find_duplicates(list(clustered_ibd_old.columns))

In [None]:
# Write the clustered IBD table through make_outputs (from functions)
clustered_ibd_out = '/Volumes/PGH-backup/ibd_data/humann_second_run/clustered_ibd'
make_outputs(clustered_ibd_old, clustered_ibd_out)

In [None]:
# Load a clustered IBD table from TSV and preview it
# NOTE(review): this file name differs from the 'clustered_ibd' prefix written
# by the previous cell -- confirm which file is meant.
clustered_ibd_tsv = '/Volumes/PGH-backup/ibd_data/humann_second_run/ibd_genefamilies_relab_clustered.tsv'
clustered_ibd = pd.read_csv(clustered_ibd_tsv, sep='\t')

display(clustered_ibd.head())

In [None]:
# Run clean_table (from functions) on the joined IBD RNA-seq table
ibd_rna_joined_tsv = '/Volumes/PGH-backup/ibd_data/rnaseq/ibd_rnaseq_relab_joined.tsv'
clean_ibd_rna = clean_table(ibd_rna_joined_tsv)

In [None]:
# Write the cleaned IBD RNA-seq table through make_outputs (from functions)
clean_ibd_rna_out = '/Volumes/PGH-backup/ibd_data/rnaseq/ibd_rnaseq_relab_cleaned'
make_outputs(clean_ibd_rna, clean_ibd_rna_out)

In [None]:
# Run cluster_humann_table on the cleaned IBD RNA-seq table with the combined cluster map
ibd_rna_table_path = '/Volumes/PGH-backup/ibd_data/rnaseq/ibd_rnaseq_relab_cleaned.feather'
ibd_rna_cluster_map_path = '../../data/clustering/cluster_maps/combined_clusters.tsv'

ibd_rna_clustered = cluster_humann_table(ibd_rna_table_path, ibd_rna_cluster_map_path)

In [None]:
# Keep only the first column of each repeated column name
repeated_rna_columns = ibd_rna_clustered.columns.duplicated()
ibd_rna_clustered = ibd_rna_clustered.loc[:, ~repeated_rna_columns]

# Now drop the 'unclustered' column, if there is one
ibd_rna_clustered = ibd_rna_clustered.drop(columns='unclustered', errors='ignore')

print(ibd_rna_clustered)

In [None]:
# Write the clustered IBD RNA-seq table through make_outputs (from functions)
ibd_rna_clustered_out = '/Volumes/PGH-backup/ibd_data/rnaseq/ibd_rnaseq_clustered'
make_outputs(ibd_rna_clustered, ibd_rna_clustered_out)

In [None]:
# Load the HMP2 metadata CSV and preview it
hmp2_metadata_csv = '/Volumes/PGH-backup/ibd_data/hmp2_metadata_2018-08-20.csv'
metadata = pd.read_csv(hmp2_metadata_csv)

display(metadata.head())

In [None]:
# Keep only the rows whose data_type is 'metagenomics'
is_metagenomics = metadata['data_type'] == 'metagenomics'
metadata = metadata.loc[is_metagenomics]

In [None]:
# Save the metagenomics-only metadata as CSV without the index column
metagenomics_metadata_csv = '/Volumes/PGH-backup/ibd_data/hmp2_metagenomics_metadata.csv'
metadata.to_csv(metagenomics_metadata_csv, index=False)