In [58]:
from functions import *
import pandas as pd
from pathlib import Path

**Read in the data for table generation**

In [46]:
secretion = pd.read_csv('E:\\blastp_hits\\dl_endo_signalp.txt', sep='\t')

dl_map = pd.read_csv('../../data/clustering/cluster_maps/dl_endopeptidase.tsv', sep='\t')

taxonomy = pd.read_csv('E:\\blastp_hits\\taxonomy_mapping.tsv', sep='\t')

uniref_genome = pd.read_csv('E:\\blastp_hits\\uniref_genome_map.txt', sep='\t', header=None, names=['Genome', 'Uniref'])

# print(secretion.head(), secretion.shape)
# print(dl_map.head(), dl_map.shape)
# print(taxonomy.head(), taxonomy.shape)
# print(uniref_genome.head(), uniref_genome.shape)

In [47]:
# Clean the secretion file to include genome IDs and predicitons

secretion['# ID'] = secretion['# ID'].str.split('#').str[0]

secretion['# ID'] = secretion['# ID'].str.split('_').str[0]

secretion = secretion[['# ID', 'Prediction']]

secretion = secretion.rename(columns={'# ID': 'Genome'})

# IDs have extra indicators like this: MGYG000155050_27_22 not found in the mapping file. strip themn off for now 

display(secretion.head()) 

Unnamed: 0,Genome,Prediction
0,MGYG000155050,OTHER
1,MGYG000096674,OTHER
2,MGYG000074457,SP
3,MGYG000074457,SP
4,MGYG000002288,SP


In [48]:
# Clean the uniref map add names to columns

uniref_genome['Genome'] = uniref_genome['Genome'].str.split('_').str[0]

uniref_genome['Uniref'] = uniref_genome['Uniref'].str.split('_').str[1]

display(uniref_genome.head())


Unnamed: 0,Genome,Uniref
0,MGYG000017941,A0A1C5WD06
1,MGYG000040828,A0A1C5LUC3
2,MGYG000089934,A0A174QFW1
3,MGYG000089934,A0A1C5LUC3
4,MGYG000097266,A0A329TNF5


In [49]:
# Merge the secretion data with the taxonomy data

secretion_and_taxa = secretion.merge(taxonomy, left_on='Genome', right_on='Genome', how='left')

# Merge the secretion and taxonomy data with the uniref_genome data

dle_info = secretion_and_taxa.merge(uniref_genome, left_on='Genome', right_on='Genome', how='left')

display(dle_info.head())

Unnamed: 0,Genome,Prediction,Lineage,Uniref
0,MGYG000155050,OTHER,d__Bacteria;p__Firmicutes;c__Bacilli;o__RFN20;...,A0A3S5AQD8
1,MGYG000096674,OTHER,d__Bacteria;p__Firmicutes;c__Bacilli;o__RFN20;...,A0A1C6G2Q5
2,MGYG000074457,SP,d__Bacteria;p__Firmicutes;c__Bacilli;o__Bacill...,A0A3P1BK63
3,MGYG000074457,SP,d__Bacteria;p__Firmicutes;c__Bacilli;o__Bacill...,A0A2S9XVU2
4,MGYG000074457,SP,d__Bacteria;p__Firmicutes;c__Bacilli;o__Bacill...,A0A3P1BK63


In [50]:
print(secretion.shape, secretion_and_taxa.shape, taxonomy.shape, dle_info.shape)

# (75556, 2) (75556, 3) (289232, 2) (126340, 4)

(75556, 2) (75556, 3) (289232, 2) (126340, 4)


In [51]:
dle_info.isna().sum()

# Genome          0
# Prediction      0
# Lineage       545
# Uniref          0
# dtype: int64

Genome          0
Prediction      0
Lineage       545
Uniref          0
dtype: int64

In [52]:
# Write this data to a file

dle_info.to_csv('E:\\blastp_hits\\dl_endo_info.tsv', sep='\t', index=False)

In [53]:
display(dl_map.head())

Unnamed: 0.1,Unnamed: 0,dl_endopeptidase-unclustered,dl_endopeptidase-mmseqs_cluster,dl_endopeptidase-foldseek_cluster
0,0,A0A7S8CCP0,A0A7S8CCP0,A0A7X2Z4H8
1,1,A0A928LPM8,A0A413G4S5,A0A1C5Q6A7
2,2,A0A413G4S5,A0A413G4S5,A0A1C5Q6A7
3,3,A0A926IJ38,A0A413G4S5,A0A1C5Q6A7
4,4,A0A498CNA2,A0A413G4S5,A0A1C5Q6A7


In [54]:
dl_map.rename(columns={'dl_endopeptidase-unclustered': 'Uniref'}, inplace=True)

display(dl_map.head())

Unnamed: 0.1,Unnamed: 0,Uniref,dl_endopeptidase-mmseqs_cluster,dl_endopeptidase-foldseek_cluster
0,0,A0A7S8CCP0,A0A7S8CCP0,A0A7X2Z4H8
1,1,A0A928LPM8,A0A413G4S5,A0A1C5Q6A7
2,2,A0A413G4S5,A0A413G4S5,A0A1C5Q6A7
3,3,A0A926IJ38,A0A413G4S5,A0A1C5Q6A7
4,4,A0A498CNA2,A0A413G4S5,A0A1C5Q6A7


In [55]:
filtered_dle_info = dle_info[dle_info['Uniref'].isin(dl_map['Uniref'])]

print(filtered_dle_info.shape)

(124263, 4)


In [56]:
dle_grouped = dle_info.groupby(['Uniref']).agg(list).reset_index()

print(dle_grouped.shape)

(397, 4)


In [57]:
dl_and_sec = dl_map.merge(dle_grouped, on='Uniref', how='inner')

display(dl_and_sec.head())

print(dl_and_sec.shape)

Unnamed: 0.1,Unnamed: 0,Uniref,dl_endopeptidase-mmseqs_cluster,dl_endopeptidase-foldseek_cluster,Genome,Prediction,Lineage
0,13,A7VT68,A0A413G4S5,A0A1C5Q6A7,"[MGYG000158508, MGYG000023630, MGYG000011546, ...","[OTHER, OTHER, OTHER, OTHER, OTHER, OTHER, OTH...",[d__Bacteria;p__Firmicutes_A;c__Clostridia;o__...
1,14,A0A1C5YFQ8,A0A413G4S5,A0A1C5Q6A7,"[MGYG000278675, MGYG000147991, MGYG000250343, ...","[OTHER, OTHER, OTHER, OTHER, OTHER, OTHER, OTH...",[d__Bacteria;p__Firmicutes_A;c__Clostridia;o__...
2,17,A0A143ZRA8,A0A413G4S5,A0A1C5Q6A7,"[MGYG000130225, MGYG000000129, MGYG000000129, ...","[OTHER, OTHER, LIPO, OTHER]",[d__Bacteria;p__Firmicutes_A;c__Clostridia;o__...
3,31,A0A174FMR5,A0A174FMR5,A0A348AMN4,"[MGYG000228068, MGYG000000028, MGYG000000028, ...","[SP, SP, SP, SP, SP, SP, OTHER, SP, SP, SP, SP...",[d__Bacteria;p__Bacteroidota;c__Bacteroidia;o_...
4,32,A0A174C4S4,A0A174FMR5,A0A348AMN4,"[MGYG000062274, MGYG000007799, MGYG000007799, ...","[SP, SP, SP, SP, SP, SP, SP, SP, SP, SP, SP, S...",[d__Bacteria;p__Bacteroidota;c__Bacteroidia;o_...


(388, 7)


In [68]:
result = dl_and_sec.loc[dl_and_sec['Uniref'] == 'A0A1C5YFQ8', 'Lineage'].values[0]

print(len(result))

173


In [60]:
dl_and_sec.to_csv('E:\\blastp_hits\\dl_endo_info_grouped.tsv', sep='\t', index=False)

In [64]:
# Now map is created, need to make a function to quickly index clusters and pull important
# information from the table. Get: number species, taxa info, secretion percentage

def describe_cluster(cluster_id: str, info_table: Path):
    """
    This function takes a cluster ID and a table of information about the clusters and returns
    the number of species, the taxa information, and the percentage of the cluster that is predicted
    to be secreted. 
    """

    info_table = pd.read_csv(info_table, sep='\t')

    cluster_info = info_table[info_table['dl_endopeptidase-foldseek_cluster'] == cluster_id]

    print(cluster_info.shape)
    
    num_species = cluster_info['Lineage'].apply(len).sum()



    # taxa_info = cluster_info['Lineage'].unique()
    # secretion_percentage = (cluster_info['Prediction'] == 'Y').sum() / len(cluster_info)
    return num_species, # taxa_info, secretion_percentage
  

In [65]:
print(describe_cluster('A0A1C5Q6A7', 'E:\\blastp_hits\\dl_endo_info_grouped.tsv'))

(37, 7)
(1093912,)


In [13]:
# dl_and_sec.to_csv('dl_and_sec.tsv', sep='\t', index=False)

In [16]:
# Group by the foldseek cluster ID
grouped_data = dl_and_sec.groupby('dl_endopeptidase-foldseek_cluster')

# Initialize a list to hold the statistics for each cluster
cluster_stats = []

# Iterate over each group to calculate statistics
for cluster_id, group in grouped_data:
    total_proteins = group.shape[0]  # Total proteins in the cluster
    secreted_proteins = group[group['Prediction'] == 'SP'].shape[0]  # Proteins predicted as secreted
    percentage_secreted = (secreted_proteins / total_proteins) * 100  # Calculate the percentage
    
    # Append the statistics for this cluster to the list
    cluster_stats.append({
        'Foldseek Cluster ID': cluster_id,
        'Total Proteins': total_proteins,
        'Secreted Proteins': secreted_proteins,
        'Percentage Secreted': percentage_secreted
    })

# Convert the list of statistics into a DataFrame for better visualization
stats_df = pd.DataFrame(cluster_stats)

# Display the statistics DataFrame
stats_df.head()

Unnamed: 0,Foldseek Cluster ID,Total Proteins,Secreted Proteins,Percentage Secreted
0,A0A077MGS3,4,2,50.0
1,A0A078MK14,3,0,0.0
2,A0A0A2TFV7,3,3,100.0
3,A0A0B0HUJ2,5,0,0.0
4,A0A0D0RVH7,7,7,100.0
