In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
from functions import *
import os
import subprocess
import requests
import re

In [None]:
# Enzyme family names
enzymes = [
    'dl_endopeptidase',
    'ld_carboxypeptidase',
    'ld_endopeptidase',
    'amidase',
    'dd_carboxypeptidase',
    'diadenylate_cyclase',
    'muramidase',
    'glucosaminidase',
]

In [None]:
# Load the dl_endopeptidases cluster map and keep columns 1-3 (column 0 is dropped)
cluster_map_path = '../../data/clustering/cluster_maps/dl_endopeptidases.tsv'
df = pd.read_csv(cluster_map_path, sep='\t').iloc[:, 1:4]

# Keep only the rows whose third remaining column is missing
third_col = df.columns[2]
df = df[df[third_col].isna()]

# One row per mmseqs cluster, carrying the list of its unclustered member IDs
grouped = (
    df.groupby('dl_endopeptidases-mmseqs_cluster')['dl_endopeptidases-unclustered']
    .apply(list)
    .reset_index(name='unclustered_list')
)

display(grouped, len(grouped))

In [None]:
# One list of candidate IDs per row (mmseqs cluster) of `grouped`
to_download = grouped['unclustered_list'].tolist()

print(to_download)

In [None]:
def download_and_check(ids, output_dir, replacements, timeout=30):
    """Download the first available AlphaFold model among ``ids``.

    Candidates are tried in order. A candidate is accepted when the server
    answers with HTTP 200 and the body does not contain the ``NoSuchKey``
    error marker; its PDB file is then saved in ``output_dir`` and the search
    stops. Rejected candidates are not written to disk.

    Parameters
    ----------
    ids : iterable of str
        Accessions to try, in order of preference.
    output_dir : str
        Existing directory that receives ``AF-<id>-F1-model_v4.pdb``.
    replacements : list
        Mutated in place: exactly one item is appended per call, either the
        accepted accession or the string ``'None'`` when no candidate worked.
    timeout : float, optional
        Seconds passed to ``requests.get`` so a stalled connection raises
        instead of blocking the whole batch.

    Raises
    ------
    requests.RequestException
        Propagated on network failure or timeout. It is deliberately not
        swallowed, so a transient outage is not recorded as ``'None'``.
    """
    for accession in ids:
        file_name = f"AF-{accession}-F1-model_v4.pdb"
        url = f"https://alphafold.ebi.ac.uk/files/{file_name}"
        pdb_file_path = os.path.join(output_dir, file_name)

        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            continue

        # Inspect the payload in memory rather than writing it, re-reading it
        # as text and deleting it again when it turns out to be an error body.
        payload = response.content
        if b"NoSuchKey" in payload:
            continue

        with open(pdb_file_path, 'wb') as file:
            file.write(payload)

        replacements.append(accession)
        return  # first usable model wins

    # The string 'None' (not the None object) is the sentinel callers filter on
    replacements.append('None')




In [None]:
# Scratch directory that receives the downloaded PDB files
output_dir = "C:/Users/odesa/Desktop/PDB_test/"
os.makedirs(output_dir, exist_ok=True)

# download_and_check appends exactly one entry to `replacements` per candidate
# list: the first ID whose download succeeded, or the string 'None'.
replacements = []
for id_list in to_download:
    download_and_check(id_list, output_dir, replacements)

In [None]:
# Inspect the replacement chosen for each cluster ('None' where nothing could be downloaded)
print(replacements)

In [None]:
# Overwrite the per-cluster candidate lists with the single chosen replacement.
# `replacements` holds one entry per row of `grouped`, in the same order.
grouped['unclustered_list'] = replacements
print(grouped)

In [None]:
# Reload the complete cluster map so the replacements can be merged back onto every row
original = pd.read_csv('../../data/clustering/cluster_maps/dl_endopeptidases.tsv', sep='\t')

# Keep columns 1-3 of the freshly loaded frame, as the per-enzyme loop further down does.
# This line used to re-slice the already filtered `df`, which left `original`
# untouched and dropped a column from `df` instead.
original = original.iloc[:, 1:4]

In [None]:
# Left join: every row of `original` picks up the replacement of its mmseqs cluster
# (NaN for clusters that are absent from `grouped`)
merged_df = original.merge(grouped, how='left', on='dl_endopeptidases-mmseqs_cluster')

# Series assignment aligns on index labels
original['replacements'] = merged_df['unclustered_list']

In [None]:
# Rows that received a replacement take it as their mmseqs cluster; all others keep theirs
mmseqs_col = 'dl_endopeptidases-mmseqs_cluster'
has_replacement = original['replacements'].notna()
original[mmseqs_col] = np.where(has_replacement, original['replacements'], original[mmseqs_col])

In [None]:
# Clusters whose replacement is the 'None' placeholder string
filtered_df = grouped.loc[grouped['unclustered_list'] == 'None']

# mmseqs cluster identifiers of those rows
values = filtered_df['dl_endopeptidases-mmseqs_cluster'].values

# Write one identifier per line
with open('E:/PDBs/dl_endopeptidases_representatives/to_delete.txt', 'w') as file:
    file.writelines(str(value) + '\n' for value in values)


In [None]:
# TODO: turn this cell into a function and optimise it. Multiprocessing should be
# implemented for downloading the PDBs and checking whether they are empty.

enzymes = ['dd_endopeptidase']

# Single base directory for everything this cell writes. The update table used to be
# written relative to `~`, while the PDB directory, to_delete.txt and the cell that
# reads the table back all use this absolute path.
pdb_test_dir = "/Users/odesa/Desktop/PDB_test"

for enzyme in enzymes:
    mmseqs_col = f'{enzyme}-mmseqs_cluster'

    # read in the cluster map, drop the first column
    df = pd.read_csv('../../data/clustering/cluster_maps/' + enzyme + '.tsv', sep='\t')
    df = df.iloc[:, 1:4]

    # keep only the rows where the third column (the foldseek cluster) is empty
    no_foldseek = df[pd.isna(df[df.columns[2]])]

    # group by the mmseqs cluster and create a list of the unclustered ids
    grouped = no_foldseek.groupby(mmseqs_col)[f'{enzyme}-unclustered'].apply(list).reset_index(name='unclustered_list')

    # one candidate list per mmseqs cluster
    to_download = grouped['unclustered_list'].tolist()

    # create a directory for the PDB files
    output_dir = f"{pdb_test_dir}/{enzyme}"
    os.makedirs(output_dir, exist_ok=True)

    # Loop over each list of IDs
    replacements = []
    for id_list in to_download:
        # download new PDBs, check if empty, creating a list of replacements
        download_and_check(id_list, output_dir, replacements)

    # Replace the 'unclustered_list' column with the chosen replacement per cluster
    grouped['unclustered_list'] = replacements

    # Merge the original dataframe with the new one on mmseqs cluster
    merged_df = pd.merge(df, grouped, on=mmseqs_col, how='left')

    # Create a new column with the replacement of each row's cluster (aligned on index)
    df['replacements'] = merged_df['unclustered_list']

    # Replace the mmseqs cluster with the replacement wherever one is present
    df[mmseqs_col] = np.where(df['replacements'].notna(),
                              df['replacements'],
                              df[mmseqs_col])

    # Clusters for which no replacement could be downloaded ('None' placeholder)
    filtered_df = grouped[grouped['unclustered_list'] == 'None']

    # Get the values from this enzyme's mmseqs cluster column
    values = filtered_df[mmseqs_col].values

    # Create a text file and write the values to it
    with open(f'{output_dir}/to_delete.txt', 'w') as file:
        for value in values:
            file.write(str(value) + '\n')

    df.to_csv(f'{pdb_test_dir}/{enzyme}_cluster_update.tsv', sep='\t', index=False)

In [None]:

for enzyme in enzymes:
    df = pd.read_csv(f'/Users/odesa/Desktop/PDB_test/{enzyme}_cluster_update.tsv', sep='\t')

    unique_values = df[f'{enzyme}-mmseqs_cluster'].unique().tolist()
    # Set copy for constant-time membership tests; scanning the list once per
    # downloaded file was quadratic.
    kept_ids = set(unique_values)

    # NOTE(review): both paths below are hard-coded to dd_endopeptidase although the
    # loop iterates over `enzymes`; with several enzymes to_remove.txt would be
    # overwritten on every iteration.
    with open('/Volumes/PGH-Backup/dd_endopeptidase_clustering/dd_endopeptidases_representatives/all_files.txt', 'r') as file:
        downloaded = [line.strip() for line in file]

    # Downloaded entries that no longer appear as an mmseqs cluster (order preserved)
    to_remove = [x for x in downloaded if x not in kept_ids]
    print(f'{enzyme}: {len(to_remove)} {to_remove}')

    with open('/Volumes/PGH-Backup/dd_endopeptidase_clustering/dd_endopeptidases_representatives/to_remove.txt', 'w') as file:
        for value in to_remove:
            file.write(str(value) + '\n')

In [None]:
# Enzyme family names iterated by the per-enzyme loops in the following cells
enzymes = [
    'dl_endopeptidase',
    'ld_carboxypeptidase',
    'ld_endopeptidase',
    'amidase',
    'dd_carboxypeptidase',
    'diadenylate_cyclase',
    'muramidase',
    'glucosaminidase',
]

In [None]:
foldseek_path = '/media/oliver/PGH_Backup/clustering/new_foldseek/'
mmseqs_path = '/media/oliver/PGH_Backup/clustering/maps/'

for enzyme in enzymes:
    # Keep the first two columns of the update table and write them in reversed order
    df = (
        pd.read_csv(f'{mmseqs_path}{enzyme}_cluster_update.tsv', sep='\t')
        .iloc[:, 0:2]
        .iloc[:, ::-1]
    )

    df.to_csv(f'{mmseqs_path}{enzyme}_cluster_no_foldseek.tsv', sep='\t', index=False)



In [None]:
foldseek_path = '/media/oliver/PGH_Backup/clustering/new_foldseek/'
mmseqs_path = '/media/oliver/PGH_Backup/clustering/maps/'

# Call create_maps once per enzyme with its two-column mmseqs map and its foldseek result
for enzyme in enzymes:
    mmseqs_map = f'{mmseqs_path}{enzyme}_cluster_no_foldseek.tsv'
    foldseek_result = f'{foldseek_path}{enzyme}/foldseek_result_{enzyme}_cluster.tsv'
    create_maps(mmseqs_map, foldseek_result, f'{enzyme}')

In [None]:
unique_values = []

for enzyme in enzymes:
    # The same file is read and then overwritten in place
    map_path = f'../../data/clustering/cluster_maps/{enzyme}.tsv'
    df = pd.read_csv(map_path, sep='\t')

    print(f"Original DataFrame shape for {enzyme}: {df.shape}")

    # Values of the fourth column that occur exactly once
    fourth_col = df.iloc[:, 3]
    value_counts = fourth_col.value_counts()
    single_occurrence_values = value_counts[value_counts == 1].index

    # Drop every row carrying one of those values
    df = df[~fourth_col.isin(single_occurrence_values)]

    df.to_csv(map_path, sep='\t', index=False)

    print(f"Modified DataFrame shape for {enzyme}: {df.shape}")

In [None]:
# Build the dd_endopeptidase map from its cluster table and foldseek result.
# NOTE(review): `create_maps` is not defined in this notebook; presumably it comes from `from functions import *`.
create_maps('/Volumes/PGH-Backup/foldseek/dd_endopeptidases_clusters.tsv', '/Volumes/PGH-Backup/foldseek/foldseek_result/foldseek_result_cluster.tsv', 'dd_endopeptidase')

In [None]:
# Combined table (read here and overwritten below) and the dd_endopeptidase table
combined_path = '../../data/clustering/cluster_maps/combined_clusters.tsv'
df_10_columns = pd.read_csv(combined_path, sep='\t')
df_3_columns = pd.read_csv('../../data/clustering/cluster_maps/dd_endopeptidase.tsv', sep='\t')

# Place the two tables side by side, then discard the 'Unnamed: 0' column
combined_df = pd.concat([df_10_columns, df_3_columns], axis=1).drop(columns='Unnamed: 0')

# Save the combined DataFrame back to the same TSV file
combined_df.to_csv(combined_path, sep='\t', index=False)

In [None]:
# Load the dd_endopeptidase cluster map.
# NOTE(review): `dd_endopeptidase` is not referenced by any later cell visible here.
dd_endopeptidase = pd.read_csv('../../data/clustering/cluster_maps/dd_endopeptidase.tsv', sep='\t')

In [None]:
# Load the dd_endopeptidase update table written by the per-enzyme loop above
dd_endopeptidase_cluster = pd.read_csv('/Users/odesa/Desktop/PDB_test/dd_endopeptidase_cluster_update.tsv', sep='\t')


display(dd_endopeptidase_cluster)

In [None]:
# Where the foldseek cluster is missing, take the value from the 'replacements' column
foldseek_col = 'dd_endopeptidase-foldseek_cluster'
missing_foldseek = dd_endopeptidase_cluster[foldseek_col].isna()
dd_endopeptidase_cluster.loc[missing_foldseek, foldseek_col] = dd_endopeptidase_cluster['replacements']

In [None]:
# Inspect the table after the missing foldseek clusters were filled in
display(dd_endopeptidase_cluster)

In [None]:
# The helper column is no longer needed once its values have been copied over
dd_endopeptidase_cluster = dd_endopeptidase_cluster.drop(columns=['replacements'])

In [None]:
# Overwrite the dd_endopeptidase cluster map with the updated table (index not written)
dd_endopeptidase_cluster.to_csv('../../data/clustering/cluster_maps/dd_endopeptidase.tsv', sep = '\t', index=False)

In [None]:
# Load the combined cluster table that holds the columns of all enzymes
master_map = pd.read_csv('../../data/clustering/cluster_maps/combined_clusters.tsv', sep = '\t')



In [None]:
# Remove the existing dd_endopeptidase columns before the updated ones are attached
master_map = master_map.drop(columns=[
    'dd_endopeptidase-unclustered',
    'dd_endopeptidase-mmseqs_cluster',
    'dd_endopeptidase-foldseek_cluster',
])

In [None]:
# Attach the updated dd_endopeptidase columns side by side (axis=1 aligns rows on index labels)
master_map = pd.concat([master_map, dd_endopeptidase_cluster], axis=1)

In [None]:
# Overwrite the combined cluster table with the updated version (index not written)
master_map.to_csv('../../data/clustering/cluster_maps/combined_clusters.tsv', sep = '\t', index=False)

**Testing for Clustering Function**

In [22]:
def cluster_humann_table_with_detailed_checks(humann_feather, cluster_tsv):
    """Collapse per-ID HUMAnN columns into Foldseek clusters, printing diagnostics.

    For each enzyme, the columns of the HUMAnN table whose name starts with the
    enzyme name are selected, the ID is taken from the third ``_``-separated
    field of the column name, and columns whose IDs share a Foldseek cluster
    are summed into one ``"<enzyme>-<cluster>"`` column. IDs without a cluster
    are summed into a column named ``"unclustered"``.

    Parameters
    ----------
    humann_feather : str
        Path of the feather file holding the HUMAnN table; it must contain a
        ``sample_id`` column.
    cluster_tsv : str
        Path of the tab-separated cluster table with ``<enzyme>-unclustered``
        and ``<enzyme>-foldseek_cluster`` columns for every enzyme
        (lower-case, ``-`` replaced by ``_``).

    Returns
    -------
    pandas.DataFrame
        Aggregated columns of all enzymes followed by ``sample_id``.
        NOTE(review): every enzyme contributes its own ``"unclustered"``
        column, so that name can occur more than once in the result.

    Raises
    ------
    KeyError
        If an expected column is missing from either table.
    IndexError
        If a selected column name has fewer than three ``_``-separated fields.
    """
    # read in the humann table
    humann_df = pd.read_feather(humann_feather)

    # read in the clustering dataframes
    cluster_df = pd.read_csv(cluster_tsv, sep='\t', low_memory=False)

    # list of enzymes
    enzymes = ['DL-endopeptidase', 'LD-carboxypeptidase',
               'LD-endopeptidase', 'Glucosaminidase',
               'DD-carboxypeptidase', 'DD-endopeptidase',
               'Amidase', 'Muramidase']

    # ID -> Foldseek cluster lookup per enzyme. Rows where the ID or the cluster
    # is missing are dropped first: otherwise an ID with a NaN cluster counts as
    # clustered and is summed into a bogus "<enzyme>-nan" column.
    cluster_map = {}
    for enzyme in enzymes:
        prefix = enzyme.replace('-', '_').lower()
        enzyme_col = f"{prefix}-unclustered"
        cluster_col = f"{prefix}-foldseek_cluster"
        pairs = cluster_df[[enzyme_col, cluster_col]].dropna()
        cluster_map[enzyme] = dict(zip(pairs[enzyme_col], pairs[cluster_col]))

    # Per-enzyme aggregated frames, concatenated once at the end
    aggregated_frames = []

    # Process each enzyme
    for enzyme in enzymes:
        df = humann_df.loc[:, humann_df.columns.str.startswith(enzyme)].copy()
        column_names = df.columns.tolist()

        print(f'{len(column_names)} {enzyme} found')

        # Extract the UniRef IDs from the column names
        column_ids = [name.split('_')[2] for name in column_names]

        # Get the foldseek cluster for each UniRef ID
        results = []
        unclustered = []
        clusters_info = {}
        for uniref_id in column_ids:
            result = cluster_map[enzyme].get(uniref_id, "unclustered")
            if result == "unclustered":
                unclustered.append(uniref_id)
                results.append("unclustered")
            else:
                cluster_id = f"{enzyme}-{result}"
                clusters_info.setdefault(cluster_id, []).append(uniref_id)
                results.append(cluster_id)

        print(f"{len(results)} {enzyme} processed, {len(unclustered)} {enzyme} unclustered")

        # Detailed inspection of clusters (first 5 IDs for brevity)
        for cluster_id, ids in clusters_info.items():
            print(f"Cluster {cluster_id} contains {len(ids)} UniRef100 IDs: {ids[:5]}...")

        # Print the unclustered IDs (first 5 for brevity)
        if unclustered:
            print(f"Unclustered UniRef100 IDs for {enzyme}: {unclustered[:5]}...")

        # Nothing to aggregate for an enzyme that is absent from the table
        if not column_names:
            continue

        # Replace the column names with the foldseek cluster
        df.columns = results

        # Aggregate the columns by foldseek cluster
        aggregated_frames.append(df.T.groupby(df.columns).sum().T)

    clustered_df = pd.concat(aggregated_frames, axis=1) if aggregated_frames else pd.DataFrame()

    # Add the sample id column back to the dataframe
    clustered_df['sample_id'] = humann_df['sample_id']

    return clustered_df


In [None]:
# Run the diagnostic clustering on the PRJEB7774 HUMAnN feather file using the combined cluster table
new_clust = cluster_humann_table_with_detailed_checks("E:\\CRC\\PRJEB7774\\humann\\new_combined\\clean_joined_genefamilies_relab_7774.feather", "../../data/clustering/cluster_maps/combined_clusters.tsv")

In [13]:
# (rows, columns) of the clustered table
print(new_clust.shape)

(155, 1550)


In [14]:
def validate_enzyme_clusters(cluster_tsv_path):
    """Check that no ID is assigned to more than one Foldseek cluster per enzyme.

    The tab-separated table at ``cluster_tsv_path`` is read and, for each
    enzyme, the number of distinct ``<enzyme>-foldseek_cluster`` values is
    counted per ``<enzyme>-unclustered`` value. One status line is printed
    per enzyme.

    Returns a dict that maps each offending enzyme to a DataFrame (indexed by
    ID, one column holding the number of distinct clusters) restricted to IDs
    with more than one cluster. Enzymes without offenders are omitted.
    """
    cluster_df = pd.read_csv(cluster_tsv_path, sep='\t', low_memory=False)

    enzymes = [
        'dl_endopeptidase', 'ld_carboxypeptidase', 'ld_endopeptidase',
        'glucosaminidase', 'diadenylate_cyclase', 'muramidase',
        'dd_carboxypeptidase', 'amidase', 'dd_endopeptidase',
    ]

    validation_results = {}
    for enzyme in enzymes:
        id_col = f'{enzyme}-unclustered'
        foldseek_col = f'{enzyme}-foldseek_cluster'

        # Number of distinct Foldseek clusters seen for every ID
        clusters_per_id = cluster_df.groupby(id_col)[[foldseek_col]].nunique()

        # IDs that are spread over several clusters
        duplicates = clusters_per_id.loc[clusters_per_id[foldseek_col] > 1]

        if len(duplicates) == 0:
            print(f"Validation passed for {enzyme}: All UniRef100 IDs map to a single Foldseek cluster.")
            continue

        validation_results[enzyme] = duplicates
        print(f"Warning: {len(duplicates)} UniRef100 IDs for {enzyme} map to multiple clusters.")

    return validation_results

In [15]:
# Validate the combined cluster table; `res` holds only the enzymes with conflicting IDs
res = validate_enzyme_clusters("../../data/clustering/cluster_maps/combined_clusters.tsv")

Validation passed for dl_endopeptidase: All UniRef100 IDs map to a single Foldseek cluster.
Validation passed for ld_carboxypeptidase: All UniRef100 IDs map to a single Foldseek cluster.
Validation passed for ld_endopeptidase: All UniRef100 IDs map to a single Foldseek cluster.
Validation passed for glucosaminidase: All UniRef100 IDs map to a single Foldseek cluster.
Validation passed for diadenylate_cyclase: All UniRef100 IDs map to a single Foldseek cluster.
Validation passed for muramidase: All UniRef100 IDs map to a single Foldseek cluster.
Validation passed for dd_carboxypeptidase: All UniRef100 IDs map to a single Foldseek cluster.
Validation passed for amidase: All UniRef100 IDs map to a single Foldseek cluster.
Validation passed for dd_endopeptidase: All UniRef100 IDs map to a single Foldseek cluster.


In [37]:
def cluster_humann_table_improved(humann_feather, cluster_tsv):
    """Collapse per-ID HUMAnN columns into Foldseek clusters and summarise them.

    For each enzyme, the columns of the HUMAnN table whose name starts with the
    enzyme name are selected, the ID is taken from the third ``_``-separated
    field of the column name, and columns whose IDs share a Foldseek cluster
    are summed into one ``"<enzyme>-<cluster>"`` column. IDs without a cluster
    are summed into a column named ``"unclustered"``. Columns starting with
    ``Saga`` or ``UC118`` are each summed into one ``"<class>_aggregated"``
    column.

    Parameters
    ----------
    humann_feather : str
        Path of the feather file holding the HUMAnN table; it must contain a
        ``sample_id`` column.
    cluster_tsv : str
        Path of the tab-separated cluster table with ``<enzyme>-unclustered``
        and ``<enzyme>-foldseek_cluster`` columns for every enzyme
        (lower-case, ``-`` replaced by ``_``).

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The clustered table (aggregated columns, extra classes, ``sample_id``)
        and a summary with one row per cluster or extra class holding
        ``cluster_id``, ``enzyme``, ``num_uniref_ids`` and ``final_abundance``
        (the column total over all samples).
        NOTE(review): every enzyme contributes its own ``"unclustered"``
        column, so that name can occur more than once in the clustered table.

    Raises
    ------
    KeyError
        If an expected column is missing from either table.
    IndexError
        If a selected column name has fewer than three ``_``-separated fields.
    """
    # read in the humann table
    humann_df = pd.read_feather(humann_feather)

    # read in the clustering dataframes
    cluster_df = pd.read_csv(cluster_tsv, sep='\t', low_memory=False)

    # list of enzymes
    enzymes = ['DL-endopeptidase', 'LD-carboxypeptidase',
               'LD-endopeptidase', 'Glucosaminidase',
               'DD-carboxypeptidase', 'DD-endopeptidase',
               'Amidase', 'Muramidase']

    extra_classes = ['Saga', 'UC118']

    # This will store information about each cluster
    cluster_info_list = []

    # ID -> Foldseek cluster lookup per enzyme. Rows where the ID or the cluster
    # is missing are dropped first: otherwise an ID with a NaN cluster counts as
    # clustered and is summed into a bogus "<enzyme>-nan" column.
    cluster_map = {}
    for enzyme in enzymes:
        prefix = enzyme.replace('-', '_').lower()
        enzyme_col = f"{prefix}-unclustered"
        cluster_col = f"{prefix}-foldseek_cluster"
        pairs = cluster_df[[enzyme_col, cluster_col]].dropna()
        cluster_map[enzyme] = dict(zip(pairs[enzyme_col], pairs[cluster_col]))

    # Per-enzyme aggregated frames, concatenated once after the loop
    aggregated_frames = []

    # Process each enzyme
    for enzyme in enzymes:
        df = humann_df.loc[:, humann_df.columns.str.startswith(enzyme)].copy()
        column_names = df.columns.tolist()

        print(f'{len(column_names)} {enzyme} found')

        # Nothing to aggregate for an enzyme that is absent from the table
        if not column_names:
            continue

        # Extract the UniRef IDs from the column names
        column_ids = [name.split('_')[2] for name in column_names]

        # Get the foldseek cluster for each UniRef ID
        results = []
        clusters_info = {}
        for uniref_id in column_ids:
            result = cluster_map[enzyme].get(uniref_id, "unclustered")
            if result == "unclustered":
                results.append("unclustered")
            else:
                cluster_id = f"{enzyme}-{result}"
                clusters_info.setdefault(cluster_id, []).append(uniref_id)
                results.append(cluster_id)

        # Replace the column names with the foldseek cluster
        df.columns = results

        # Aggregate the columns by foldseek cluster
        agg_df = df.T.groupby(df.columns).sum().T
        aggregated_frames.append(agg_df)

        # Collect the cluster information for analysis
        for cluster_id, ids in clusters_info.items():
            cluster_info_list.append({
                'cluster_id': cluster_id,
                'enzyme': enzyme,
                'num_uniref_ids': len(ids),
                # total abundance of this cluster over all samples
                'final_abundance': agg_df[cluster_id].sum()
            })

    clustered_df = pd.concat(aggregated_frames, axis=1) if aggregated_frames else pd.DataFrame()

    # Aggregate the extra classes (Saga and UC118) into single columns each
    for extra_class in extra_classes:
        df_extra = humann_df.loc[:, humann_df.columns.str.startswith(extra_class)]

        if not df_extra.empty:
            print(f'{len(df_extra.columns)} {extra_class} found')
            # Sum all columns for the extra class into one column
            extra_class_agg = df_extra.sum(axis=1)
            clustered_df[f'{extra_class}_aggregated'] = extra_class_agg

            # Collect the info for the extra classes
            cluster_info_list.append({
                'cluster_id': f'{extra_class}_aggregated',
                'enzyme': extra_class,
                'num_uniref_ids': df_extra.shape[1],
                'final_abundance': extra_class_agg.sum()
            })
        else:
            print(f'No {extra_class} found')

    # Add the sample id column back to the dataframe
    clustered_df['sample_id'] = humann_df['sample_id']

    # Convert cluster info list to DataFrame
    cluster_info_df = pd.DataFrame(cluster_info_list)

    return clustered_df, cluster_info_df


In [38]:
# Cluster the PRJEB7774 HUMAnN table; returns the clustered table and a per-cluster summary
new_clust, new_clust_info = cluster_humann_table_improved("E:\\CRC\\PRJEB7774\\humann\\new_combined\\clean_joined_genefamilies_relab_7774.feather", "../../data/clustering/cluster_maps/combined_clusters.tsv")

369 DL-endopeptidase found
4154 LD-carboxypeptidase found
37 LD-endopeptidase found
2602 Glucosaminidase found
11868 DD-carboxypeptidase found
2659 DD-endopeptidase found
21146 Amidase found
34888 Muramidase found
No Saga found
99 UC118 found


In [36]:
# NOTE(review): `group_humann_table` is not defined in this notebook; presumably it comes from `from functions import *`.
group = group_humann_table("E:\\CRC\\PRJEB7774\\humann\\new_combined\\clean_joined_genefamilies_relab_7774.feather")

  def group_humann_table(humann_table):


Original width: 80749, Grouped width: 12
