In [1]:
import pandas as pd
import numpy as np
import subprocess
import sys
sys.path.append('..')
from src.useful import *

## Notebook to parse Foldseek Hits to PET Catabolism Enzymes

In [2]:
# Load the data
data = pd.read_csv('../PET_Metabolic_genes.csv')
#Drop the last column
data = data.drop(data.columns[-1], axis=1)
#Remove the rows with no Uniprot identifier
data = data.dropna(subset=['Uniprot'])
#remove '\xa0' from the Uniprot identifiers
data['Uniprot'] = data['Uniprot'].str.replace('\xa0', '')
data.head()


Unnamed: 0,protein,pathway,sequence,Uniprot
0,PETase (LCC cutinase),PET degradation,MDGVLWRVRTAALMAALLALAAWALVWASPSVEAQSNPYQRGPNP...,G9BY57
1,MHETase,PET degradation,MQTTVTTMLLASVALAACAGGGSTPLPLPQQQPPQQEPPPPPVPLA...,A0A0K8P8E7
3,tpaK (transporter),PET degradation,MSLAPSRVTLPDFIDSRPVSRYQYIVIALCGVVMFIDGFDTQSISY...,Q0RWE8
4,tphA1,terephthalate funneling,MNHQIHIHDSDIAFTCAPGQSVLDAALQAGIELPYSCRKGSCGNCA...,Q5D0X3
5,tphA2,terephthalate funneling,MQESIIQWHGATNTRVPFGIYTDTANADQEQQRIYRGEVWNYLCLE...,Q5D0X6


In [10]:
#Iterate over the Uniprot identifiers and wget the corresponding pdb files from the PDB database
for uniprot in data['Uniprot']:
    #Create the url
    url='https://alphafold.ebi.ac.uk/files/AF-{}-F1-model_v4.pdb'.format(uniprot)
    #Create the command
    command = 'wget -O {}.pdb {}'.format(uniprot, url)
    #Run the command
    subprocess.run(command, shell=True)


--2024-09-10 14:15:20--  https://alphafold.ebi.ac.uk/files/AF-
Resolving alphafold.ebi.ac.uk (alphafold.ebi.ac.uk)... 34.149.152.8
Connecting to alphafold.ebi.ac.uk (alphafold.ebi.ac.uk)|34.149.152.8|:443... connected.
HTTP request sent, awaiting response... 404 Not Found
2024-09-10 14:15:21 ERROR 404: Not Found.

--2024-09-10 14:15:21--  http://g9by57-f1-model_v4.pdb/
Resolving g9by57-f1-model_v4.pdb (g9by57-f1-model_v4.pdb)... failed: Name or service not known.
wget: unable to resolve host address ‘g9by57-f1-model_v4.pdb’
--2024-09-10 14:15:21--  https://alphafold.ebi.ac.uk/files/AF-A0A0K8P8E7-F1-model_v4.pdb
Resolving alphafold.ebi.ac.uk (alphafold.ebi.ac.uk)... 34.149.152.8
Connecting to alphafold.ebi.ac.uk (alphafold.ebi.ac.uk)|34.149.152.8|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [application/octet-stream]
Saving to: ‘A0A0K8P8E7.pdb’

     0K .......... .......... .......... .......... ..........  413K
    50K .......... .......... ...

In [11]:
data.shape

(45, 4)

In [4]:
def read_m8(file):
    '''
    Read the m8 file and return a dataframe
    '''
    #Read the file
    data = pd.read_csv(file, sep='\t', header=None)
    #Rename the columns
    data.columns = ['query', 'subject', 'identity', 'length', 'mismatches', 'gap_openings', 'q_start', 'q_end', 's_start', 's_end', 'evalue', 'bit_score']
    return data

In [38]:
b_4 = read_m8('../foldseek/PM_B4_search_sept10.m8')
#Rename subject column to B4_proteins
b_4 = b_4.rename(columns={'subject':'B4_proteins'})
#Drop '.pdb' from the query column
b_4['query'] = b_4['query'].str.replace('.pdb', '')
#Merge the dataframes
b4_data = data.copy()
b4_data = b4_data.merge(b_4, left_on='Uniprot', right_on='query', how='right')
#Drop the sequence column
b4_data = b4_data.drop('sequence ', axis=1)
#Drop the query column
b4_data = b4_data.drop('query', axis=1)
#Write the data to a csv file
b4_data.to_csv('../foldseek/PM_B4_search_sept10.csv', index=False)
b4_data.head()

  b_4['query'] = b_4['query'].str.replace('.pdb', '')


Unnamed: 0,protein,pathway,Uniprot,B4_proteins,identity,length,mismatches,gap_openings,q_start,q_end,s_start,s_end,evalue,bit_score
0,PcaB,"PCA-3,4-ortho-cleavage pathway",P32427,D4994_C39_H2_Bin_234_scaffold_27712_8.pdb,0.256,417,303,0,5,421,1,408,6.954e-24,1092
1,PedE,ethylene glycol metabolism,Q88JH5,D4994_C39_H2_Bin_234_scaffold_59398_12.pdb,0.159,538,268,0,86,623,408,727,2.9420000000000005e-17,386
2,PedE,ethylene glycol metabolism,Q88JH5,D4994_C39_H2_Bin_234_scaffold_31466_2.pdb,0.099,548,243,0,74,621,9,279,3.843e-06,98
3,PmdU,"PCA-4,5-meta-cleavage pathway",D1MW94,D4994_C39_H2_Bin_234_scaffold_38092_2.pdb,0.119,266,201,0,1,229,19,284,1.091e-11,294
4,PmdU,"PCA-4,5-meta-cleavage pathway",D1MW94,D4994_C39_H2_Bin_234_scaffold_69882_3.pdb,0.131,264,196,0,4,229,20,283,7.028e-11,263


In [39]:
for pdb in b4_data['B4_proteins']:
    command = 'cp ../B4_output_pdbs/{} ../foldseek/B4_PM_discovered_pdb/'.format(pdb)
    subprocess.run(command, shell=True)

In [16]:
#Read in m8 files
m8 = pd.read_csv('../foldseek/PETMET_GuaPAS_search_sept10.m8', sep='\t', header=None)
m8.columns = ['query', 'subject', 'identity', 'length', 'mismatches', 'gap_openings', 'q_start', 'q_end', 's_start', 's_end', 'evalue', 'bit_score']
m8.head()

Unnamed: 0,query,subject,identity,length,mismatches,gap_openings,q_start,q_end,s_start,s_end,evalue,bit_score
0,Q88F01.pdb,D4993_C5_H4_Bin_238_scaffold_57372_6.pdb,0.176,300,222,0,2,271,23,322,3.711e-12,317
1,Q88F01.pdb,D4993_C5_H4_Bin_238_scaffold_56676_5.pdb,0.114,311,257,0,2,292,18,328,7.455e-09,182
2,Q88F01.pdb,D4993_C5_H4_Bin_238_scaffold_30544_2.pdb,0.117,272,237,0,1,272,6,274,1.637e-06,156
3,C4TP01.pdb,D4993_C5_H4_Bin_238_scaffold_103874_3.pdb,0.142,266,205,0,1,266,43,282,2.191e-11,248
4,C4TP05.pdb,D4993_C5_H4_Bin_238_scaffold_32816_6.pdb,0.16,323,227,0,67,337,15,337,1.44e-12,338


In [17]:
#Drop '.pdb' from the subject column
m8['query'] = m8['query'].str.replace('.pdb', '')
#rename subject column to GuaPAS_protein
m8 = m8.rename(columns={'subject': 'GuaPAS_protein'})
m8.head()


  m8['query'] = m8['query'].str.replace('.pdb', '')


Unnamed: 0,query,GuaPAS_protein,identity,length,mismatches,gap_openings,q_start,q_end,s_start,s_end,evalue,bit_score
0,Q88F01,D4993_C5_H4_Bin_238_scaffold_57372_6.pdb,0.176,300,222,0,2,271,23,322,3.711e-12,317
1,Q88F01,D4993_C5_H4_Bin_238_scaffold_56676_5.pdb,0.114,311,257,0,2,292,18,328,7.455e-09,182
2,Q88F01,D4993_C5_H4_Bin_238_scaffold_30544_2.pdb,0.117,272,237,0,1,272,6,274,1.637e-06,156
3,C4TP01,D4993_C5_H4_Bin_238_scaffold_103874_3.pdb,0.142,266,205,0,1,266,43,282,2.191e-11,248
4,C4TP05,D4993_C5_H4_Bin_238_scaffold_32816_6.pdb,0.16,323,227,0,67,337,15,337,1.44e-12,338


In [25]:
#Merge the dataframes
data_m8 = data.copy()
data_m8 = data_m8.merge(m8, left_on='Uniprot', right_on='query', how='right')
data_m8.head()

Unnamed: 0,protein,pathway,sequence,Uniprot,query,GuaPAS_protein,identity,length,mismatches,gap_openings,q_start,q_end,s_start,s_end,evalue,bit_score
0,GlxR,ethylene glycol metabolism,MAKIGFIGTGIMGKPMAQNLQKAGHSLFISTHHDAAPADLIAAGAV...,Q88F01,Q88F01,D4993_C5_H4_Bin_238_scaffold_57372_6.pdb,0.176,300,222,0,2,271,23,322,3.711e-12,317
1,GlxR,ethylene glycol metabolism,MAKIGFIGTGIMGKPMAQNLQKAGHSLFISTHHDAAPADLIAAGAV...,Q88F01,Q88F01,D4993_C5_H4_Bin_238_scaffold_56676_5.pdb,0.114,311,257,0,2,292,18,328,7.455e-09,182
2,GlxR,ethylene glycol metabolism,MAKIGFIGTGIMGKPMAQNLQKAGHSLFISTHHDAAPADLIAAGAV...,Q88F01,Q88F01,D4993_C5_H4_Bin_238_scaffold_30544_2.pdb,0.117,272,237,0,1,272,6,274,1.637e-06,156
3,PraA,"PCA-2,3-meta-cleavage pathway",MSLEMALLAAHVPSICHESNVPDFQQDLVKGLKQMRDRINELQTDV...,C4TP01,C4TP01,D4993_C5_H4_Bin_238_scaffold_103874_3.pdb,0.142,266,205,0,1,266,43,282,2.191e-11,248
4,PraF,"PCA-2,3-meta-cleavage pathway",MTTERDVLITEVALRDGSHAIAHQYTVEQVTKVAKALGEAGVPYIE...,C4TP05,C4TP05,D4993_C5_H4_Bin_238_scaffold_32816_6.pdb,0.16,323,227,0,67,337,15,337,1.44e-12,338


In [27]:
#Drop the query column and the sequence column
data_m8 = data_m8.drop('sequence ', axis=1)
data_m8 = data_m8.drop('query', axis=1)
data_m8.head()

Unnamed: 0,protein,pathway,Uniprot,GuaPAS_protein,identity,length,mismatches,gap_openings,q_start,q_end,s_start,s_end,evalue,bit_score
0,GlxR,ethylene glycol metabolism,Q88F01,D4993_C5_H4_Bin_238_scaffold_57372_6.pdb,0.176,300,222,0,2,271,23,322,3.711e-12,317
1,GlxR,ethylene glycol metabolism,Q88F01,D4993_C5_H4_Bin_238_scaffold_56676_5.pdb,0.114,311,257,0,2,292,18,328,7.455e-09,182
2,GlxR,ethylene glycol metabolism,Q88F01,D4993_C5_H4_Bin_238_scaffold_30544_2.pdb,0.117,272,237,0,1,272,6,274,1.637e-06,156
3,PraA,"PCA-2,3-meta-cleavage pathway",C4TP01,D4993_C5_H4_Bin_238_scaffold_103874_3.pdb,0.142,266,205,0,1,266,43,282,2.191e-11,248
4,PraF,"PCA-2,3-meta-cleavage pathway",C4TP05,D4993_C5_H4_Bin_238_scaffold_32816_6.pdb,0.16,323,227,0,67,337,15,337,1.44e-12,338


In [22]:
#For file listed in the GuaPAS_protein column, copy the pdb file from the GuaPAS_output folder to the Guapas_PM_discovered_pdb folder
for pdb in data_m8['GuaPAS_protein']:
    command = 'cp ../GuaPAS_output/{} ../foldseek/GuaPAS_PM_discovered_pdb/'.format(pdb)
    subprocess.run(command, shell=True)

In [28]:
#Output the data
data_m8.to_csv('../foldseek/PETMET_GuaPAS_search_sept10.csv', index=False)

In [5]:
b_1 = read_m8('../foldseek/PETMET_B1_ESMFold_search_sept17.m8')
#Rename subject column to B4_proteins
b_1 = b_1.rename(columns={'subject':'B1_proteins'})
#Drop '.pdb' from the query column
b_1['query'] = b_1['query'].str.replace('.pdb', '')
#Merge the dataframes
b1_data = data.copy()
b1_data = b1_data.merge(b_1, left_on='Uniprot', right_on='query', how='right')
#Drop the sequence column
b1_data = b1_data.drop('sequence ', axis=1)
#Drop the query column
b1_data = b1_data.drop('query', axis=1)
#Write the data to a csv file
b1_data.to_csv('../foldseek/PETMET_B1_ESMFold_search_sept17.csv', index=False)
b1_data.head()

  b_1['query'] = b_1['query'].str.replace('.pdb', '')


Unnamed: 0,protein,pathway,Uniprot,B1_proteins,identity,length,mismatches,gap_openings,q_start,q_end,s_start,s_end,evalue,bit_score
0,GlcE,ethylene glycol metabolism,P52073,D4998_C1112_H3_Bin_236_scaffold_231732_14.pdb,0.188,411,273,0,11,347,59,469,3.2209999999999997e-20,574
1,GlcE,ethylene glycol metabolism,P52073,D4998_C1112_H3_Bin_236_scaffold_190651_43.pdb,0.149,462,286,0,11,347,55,516,2.3120000000000003e-17,454
2,GlcE,ethylene glycol metabolism,P52073,D4998_C1112_H3_Bin_236_scaffold_151881_19.pdb,0.14,305,218,0,9,313,48,302,1.133e-06,131
3,PedI,ethylene glycol metabolism,J2MVI9,D4998_C1112_H3_Bin_236_scaffold_134544_5.pdb,0.268,474,341,0,28,501,1,467,8.403e-40,1425
4,PedI,ethylene glycol metabolism,J2MVI9,D4998_C1112_H3_Bin_236_scaffold_247625_29.pdb,0.161,450,365,0,51,500,46,481,1.3020000000000001e-22,600


In [6]:
#For file listed in the GuaPAS_protein column, copy the pdb file from the GuaPAS_output folder to the Guapas_PM_discovered_pdb folder
for pdb in b1_data['B1_proteins']:
    command = 'cp ../B1_MAG_ESMFolds/{} ../foldseek/B1_PM_discovered_pdb/'.format(pdb)
    subprocess.run(command, shell=True)