In [7]:
from utils import * 
import tarfile

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
linear_ece_19kb_df = FASTAFile.from_file('../data/data/linear_ece_19kb.fa').to_df(parse_description=False)

In [9]:
def foldseek_load(path):
   '''This loads the result of running Foldseek against all databases accessible on the Foldseek web interface.'''
   archive = tarfile.open(path, mode='r:gz')
   cols = ['query_id','subject_id','identity','alignment_length', 'mismatch', 'gap_open', 'query_alignment_start', 'query_alignment_stop', 'subject_alignment_start', 'subject_alignment_stop']
   cols += ['tm_score', 'e_value','bit_score', 'query_length', 'subject_length',  'query_alignment', 'subject_alignment', 'per_residue_lddt', 'subject_seq', 'subject_taxonomy_id', 'subject_species_name']
   
   foldseek_df = list()
   for member in archive.getmembers():
      if ('report' in member.name):
         continue 
      f = archive.extractfile(member)
      content = f.read().decode('utf-8')
      df = pd.read_csv(io.StringIO(content), sep='\t', names=cols)
      df['database'] = member.name.replace('.m8', '').replace('alis_', '')
      if len(df) > 0:
         foldseek_df.append(df)

   if len(foldseek_df) == 0:
      return pd.DataFrame([])
   
   foldseek_df = pd.concat(foldseek_df).reset_index(drop=True)
   foldseek_df['query_id'] = re.search(r'\d+_\d+', path).group(0)
   foldseek_df['subject_species_name'] = foldseek_df['subject_species_name'].fillna('none')
   return foldseek_df

foldseek_df = pd.concat([foldseek_load(path) for path in glob.glob('../data/foldseek/esmfold/*')])
foldseek_df = pd.concat([foldseek_df] + [foldseek_load(path) for path in glob.glob('../data/foldseek/alphafold/*')])
# foldseek_df.to_csv('../data/foldseek/linear_ece_19kb.csv')

In [10]:
foldseek_df['query_original_length'] = foldseek_df.query_id.map(linear_ece_19kb_df.seq.apply(len)) # Foldseek search was done on trimmed structures, so want to re-add original sequence length. 
foldseek_df['query_alignment_length'] = (foldseek_df.query_alignment_stop - foldseek_df.query_alignment_start)
foldseek_df['query_coverage'] = foldseek_df.query_alignment_length / foldseek_df.query_original_length
# foldseek_df = foldseek_df[~foldseek_df.query_original_length.isnull()].copy() # Some of the reverse-strand ORFs are included.

print(f'Num. {ece_id} proteins with hits:', foldseek_df.query_id.nunique())

mask = (foldseek_df.query_alignment_length > 15) | (foldseek_df.query_coverage > 0.5)
mask = mask & (foldseek_df.tm_score > 0.7)
foldseek_df = foldseek_df[mask].copy()

print(f'Num. {ece_id} proteins with hits passing the filter:', foldseek_df.query_id.nunique())

Num. linear_ece_19kb proteins with hits: 30
Num. linear_ece_19kb proteins with hits passing the filter: 21


In [11]:
# https://data.rcsb.org/rest/v1/core/polymer_entity/3F51/1

# https://www.ebi.ac.uk/pdbe/api/mappings/uniprot/6tri



In [12]:
uniparc_id_pattern = r'(?P<uniparc_id>[a-zA-Z\d]{4,})'
uniparc_id_pattern_protvar = r'(?P<uniparc_id_1>[a-zA-Z\d]+)(?P<uniparc_version_1>\-[0-9])*_(?P<uniparc_id_2>[a-zA-Z\d]+)(?P<uniparc_version_2>\-[0-9])*_(?P<uniparc_chain>[A-Z])'
uniparc_version_pattern = r'(?P<uniparc_version>\d+)'
description_pattern = r'(?P<description>.*)'
pdb_id_pattern = r'(?P<pdb_id>[a-z0-9]{4})'
pdb_chain_pattern = r'(?P<pdb_chain>([A-Z0-9a-z])|([A-Z0-9a-z]-\d))'
pdb_assembly_pattern = r'(?P<pdb_assembly>\d)'
# From ChatGPT (so take with a grain of salt), not all Mgnify (and other) IDs are in UniParc; some are directly from metagenome FASTA files. 
gmgc_id_pattern = r'(?P<gmgc_id>GMGC.+)\.pdb*' # These are not often not associated with UniParc (or other database) entries.
mgnify_id_pattern = r'(?P<mgnify_id>MGYP\d+)'


subject_id_patterns = list()
subject_id_patterns += [rf'LevyLab_{uniparc_id_pattern}_V\d_\d_relaxed_{pdb_chain_pattern}']
subject_id_patterns += [rf'AF-{uniparc_id_pattern}-{uniparc_version_pattern}-F1-model_v\d {description_pattern}'] # These IDs show up in UniParc and have descriptions attached.
subject_id_patterns += [rf'AF-{uniparc_id_pattern}-F1-model_v\d {description_pattern}'] # These IDs show up in UniParc and have descriptions attached.
subject_id_patterns += [rf'{mgnify_id_pattern}\.pdb\.gz'] # Unclear how many of these have UniParc entries. They correspond to Mgnify IDs.
subject_id_patterns += [rf'{uniparc_id_pattern}_unrelaxed_rank_\d+_alphafold\d_ptm_model_\d_seed_\d+'] # These IDs show up in UniParc.
subject_id_patterns += [rf'{uniparc_id_pattern}_{uniparc_version_pattern}_unrelaxed_rank_\d+_alphafold\d_ptm_model_\d_seed_\d+'] # These IDs show up in UniParc.
subject_id_patterns += [rf'af_{uniparc_id_pattern}.+'] # These IDs show up in UniParc and have descriptions attached.
subject_id_patterns += [rf'{pdb_id_pattern}{pdb_chain_pattern}\d\d'] # Not sure what the two integers at the end correspond to (possibly the complex), but the capital letter (or first integer) is probably the chain.
subject_id_patterns += [rf'{pdb_id_pattern}-assembly{pdb_assembly_pattern}\.cif\.gz_{pdb_chain_pattern} {description_pattern}'] # These have the PDB ID, assembly number, chain, and a description.
subject_id_patterns += [rf'{pdb_id_pattern}-assembly{pdb_assembly_pattern}\.cif\.gz_{pdb_chain_pattern} {description_pattern}'] # These have the PDB ID, assembly number, chain, and a description.
subject_id_patterns += [rf'ProtVar_{uniparc_id_pattern_protvar}']
subject_id_patterns += [mgnify_id_pattern]
subject_id_patterns += [gmgc_id_pattern]

subject_id_df = list()

def get_uniparc_id_from_protvar_id(info:dict):
    chain = info['uniparc_chain']
    if chain == 'A':
        return {'uniparc_id':info['uniparc_id_1']}
    elif chain == 'B':
        return {'uniparc_id':info['uniparc_id_2']}

subject_ids_no_match = list()
for subject_id in foldseek_df.subject_id:

    # These IDs are messing things up, easier to just skip them. 
    if subject_id.startswith('ModelArchive') or subject_id.startswith('Predictome'):
        continue

    for pattern in subject_id_patterns:
        match = re.search(pattern, subject_id)
        if match is not None:
            info = match.groupdict()
            if 'uniparc_id_1' in info:
                info = get_uniparc_id_from_protvar_id(info)
            info['original_id'] = subject_id
            subject_id_df.append(info)
            break
subject_id_df = pd.DataFrame(subject_id_df)

conditions = [~subject_id_df.uniparc_id.isnull()]
conditions += [~subject_id_df.mgnify_id.isnull()]
conditions += [~subject_id_df.pdb_id.isnull()]
conditions += [~subject_id_df.gmgc_id.isnull()]

categories = ['UniParc', 'Mgnify', 'PDB', 'GMGC']
subject_id_df['database'] = np.select(conditions, categories, default='none')

print(f'Matched {len(subject_id_df)} out of {len(foldseek_df)} Foldseek hits.')

Matched 38324 out of 38411 Foldseek hits.


In [None]:
def _uniparc_get_metadata(query_id:str):
    df = list()

    url = f'https://rest.uniprot.org/uniparc/search?query="{query_id}"'
    result = requests.get(url).text 
    result = json.loads(result)['results'][0]

    info = dict()
    info['query_id'] = query_id
    info['uniparc_id'] = result['uniParcId'] # The actual U* UniParc ID. 
    # Note that this only grabs info for the first taxon.
    info['taxonomy_toplevel'] = result['commonTaxons'][0]['topLevel']
    info['taxonomy'] = result['commonTaxons'][0]['commonTaxon']
    info['seq'] = result['sequence']['value']

    annotations = result.get('sequenceFeatures', [])
    info['n_annotations'] = len(annotations)
    if len(annotations) == 0:
        return pd.DataFrame([info])
    
    for annotation in annotations:
        annotation = annotation
        info_ = info.copy()
        info_['annotation_interpro_id'] = annotation.get('interproGroup', {}).get('id', 'none')
        info_['annotation_interpro_description'] = annotation.get('interproGroup', {}).get('name', 'none')
        info_['annotation_database'] = annotation['database']
        info_['annotation_id'] = annotation['databaseId']
        info_['annotation_start'] = annotation['locations'][0]['start']
        info_['annotation_stop'] = annotation['locations'][0]['end']
        df.append(info_)

    return pd.DataFrame(df)


def uniparc_get_metadata(query_ids:list):
    query_ids = np.unique(query_ids) # Make sure not to download stuff twice. 
    metadata_df = list()
    for query_id in tqdm(query_ids, desc='uniparc_get_metadata: Downloading UniParc metadata.'):
        try:
            metadata_df.append(_uniparc_get_metadata(query_id))
        except:
            print(f'uniparc_get_metadata: Failed on UniParc ID {query_id}.')
    metadata_df = pd.concat(metadata_df)
    return metadata_df

if not os.path.exists('../data/foldseek/uniparc_metadata.csv'):
    query_ids = subject_id_df[subject_id_df.database == 'UniParc'].uniparc_id.drop_duplicates()
    uniparc_metadata_df = uniparc_get_metadata(query_ids)
    uniparc_metadata_df.to_csv('../data/foldseek/uniparc_metadata.csv')
    
uniparc_metadata_df = pd.read_csv('../data/foldseek/uniparc_metadata.csv', index_col=0)

In [30]:
def _pdb_get_uniprot_mappings(**pdb_info):
    pdb_id = pdb_info.get('pdb_id')

    url = f'https://www.ebi.ac.uk/pdbe/api/mappings/uniprot/{pdb_id}'
    result = json.loads(requests.get(url).text)[pdb_id]['UniProt']

    mappings_df = list()
    for uniprot_id_, mappings in result.items():
        mappings_df += [{'pdb_id':pdb_id, 'uniprot_id':uniprot_id_, 'pdb_entity':mapping['entity_id'], 'pdb_chain':mapping['chain_id']} for mapping in mappings['mappings']]

    if len(mappings_df) > 0:
        mappings_df = pd.DataFrame(mappings_df)
        return mappings_df


def pdb_get_uniprot_mappings(pdb_subject_id_df:pd.DataFrame, mappings_path:str='pdb-uniprot_mappings.csv', **pdb_info):

    if not os.path.exists(mappings_path):
        for pdb_info in tqdm(pdb_subject_id_df.to_dict(orient='records'), desc='pdb_get_uniprot_mappings: Downloading PDB mappings.'):
            try:
                mappings_df = _pdb_get_uniprot_mappings(**pdb_info)
                mappings_df.set_index('pdb_id').to_csv(mappings_path, mode='a', header=not os.path.exists(mappings_path))
            except:
                pdb_id, pdb_chain = pdb_info.get('pdb_id'), pdb_info.get('pdb_chain')
                print(f'pdb_get_uniprot_mappings: Failed on PDB ID {pdb_id}, chain {pdb_chain}.')

    mappings_df = pd.read_csv(mappings_path)
    mappings_df = mappings_df.merge(pdb_subject_id_df[['original_id', 'pdb_id', 'pdb_chain']], on=['pdb_id', 'pdb_chain'], how='inner')
    return mappings_df

if not os.path.exists('../data/foldseek/pdb_metadata.csv'):
    pdb_subject_id_df = subject_id_df[subject_id_df.database == 'PDB'].copy()
    pdb_subject_id_df = pdb_subject_id_df.drop_duplicates('original_id')

    pdb_metadata_df = pdb_get_uniprot_mappings(pdb_subject_id_df).rename(columns={'uniprot_id':'query_id'})
    query_ids = pdb_metadata_df.query_id.unique()
    pdb_metadata_df = pdb_metadata_df.merge(uniparc_get_metadata(query_ids), on='query_id', how='left')
    pdb_metadata_df.to_csv('../data/foldseek/pdb_metadata.csv')

pdb_metadata_df = pd.read_csv('../data/foldseek/pdb_metadata.csv', index_col=0)

In [32]:
foldseek_df

Unnamed: 0,query_id,subject_id,identity,alignment_length,mismatch,gap_open,query_alignment_start,query_alignment_stop,subject_alignment_start,subject_alignment_stop,...,query_alignment,subject_alignment,per_residue_lddt,subject_seq,subject_taxonomy_id,subject_species_name,database,query_original_length,query_alignment_length,query_coverage
0,1_35,A0A345MYA4_unrelaxed_rank_001_alphafold2_ptm_m...,17.9,139,98,5,15,141,20,154,...,YVCFVGGRYSRGKSLILTFLTLMDMILNNRM-----------K-VY...,KSYLFYGNKGNGKSLVYA--SIVDSLLSEYRYIERKYPKLPKRVLY...,"-15.109,-14.000,7.969,-18.312,-11.844,7.973,-1...",MITPKLGFFSPSGFNFRGLKSYLFYGNKGNGKSLVYASIVDSLLSE...,2219103.0,Inoviridae sp.,BFVD,256,126,0.492188
1,1_35,A0A8S5V634_unrelaxed_rank_001_alphafold2_ptm_m...,17.5,131,92,7,16,141,27,146,...,VCFVGGRYSRGKSLILTFLTLM-DMILNNRMKVYSN---MPLNFEQ...,VYFVCGRQGSGKGYYAVKLLLAQDK--KTCASVYTNVHSLKLP---...,"14.672,-34.875,-23.812,16.344,-32.531,-21.250,...",MEFLNRRIEFKSFFKKGLPKIDDRFGVYFVCGRQGSGKGYYAVKLL...,2825777.0,Inoviridae sp. ctDDr4,BFVD,256,125,0.488281
2,1_35,B3GAL5_unrelaxed_rank_001_alphafold2_ptm_model...,15.7,133,95,5,16,141,2,124,...,VCFVGGRYSRGKSLIL-TFLTLMDMILNNRMK-VYSNMPLNFEQSG...,NFIFYGNTGTGKTLSLIIMAEMIREMNKDKEMLIFTDI--------...,"-8.602,-2.791,2.189,-5.020,-3.770,1.630,-3.117...",MNFIFYGNTGTGKTLSLIIMAEMIREMNKDKEMLIFTDIDYKYKNY...,340016.0,uncultured virus,BFVD,256,125,0.488281
3,1_35,A0A8S5UKR1_unrelaxed_rank_001_alphafold2_ptm_m...,14.8,135,93,7,16,141,2,123,...,VCFVGGRYSRGKSLILTFLT---LMDMILNNRMK--VYSNMPLNFE...,ITCFFGLPGCGKSTMLAKIAAKELKRIRKGKSKYKRVFCNYYIK--...,"-2.293,-5.918,6.906,-4.043,-4.227,4.046,-4.445...",MITCFFGLPGCGKSTMLAKIAAKELKRIRKGKSKYKRVFCNYYIKG...,2825776.0,Inoviridae sp. ct4fI15,BFVD,256,125,0.488281
4,1_35,A0A894JLK4_unrelaxed_rank_001_alphafold2_ptm_m...,18.4,141,100,5,2,141,13,139,...,VQEKEILELILSRYVCFVGGRYSRGKSLILTFLTLMDMILNNRMKV...,LFVYIDNYCKNPYKLEAVVGSKGSGKSLYMSR-VADRWLRSNKGLI...,"-47.812,-9.352,-15.625,-46.750,-12.000,-12.914...",MLYGILIFCVCWLFVYIDNYCKNPYKLEAVVGSKGSGKSLYMSRVA...,2202560.0,Inovirus sp.,BFVD,256,139,0.542969
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2982,1_26,MGYP000689662163.pdb.gz,9.8,51,36,2,3,52,70,111,...,NYKFIELDEFGNIPYVDSYQY-EVILNGNFLGYYSKNEHIYYPNNS...,YIRTFFV---------ESYEEGLLMVDGNYVRKLKPGRYDFWKNEH...,"49.376,36.043,-34.272,47.307,34.170,-31.998,44...",MKSLLEIVTVKDNEITLVFKDGNFLKTLKPGRYAFWKGVAKFDFET...,,none,mgnify_esm30,143,49,0.342657
2983,1_26,MGYP000959723854.pdb.gz,22.5,31,24,0,22,52,184,214,...,QYEVILNGNFLGYYSKNEHIYYPNNSNLTII,SFDLYVNGKLVANDSLGFSISVPYDSTYEIK,"49.656,31.867,-49.601,48.289,28.566,-48.142,45...",HCGGHANNYSIYFYGNGGNVKAPNLTLTFHSSNYWNISNYVPTRTG...,,none,mgnify_esm30,143,30,0.209790
2984,1_26,MGYP001551521406.pdb.gz,16.2,37,29,1,21,55,396,432,...,YQYEVILNGNFLGYYSKNEHIYYP--NNSNLTIIIPS,DRAHVRVDGALEAIVERTGDVEIPVSGPCEVEIIVES,"-10.803,-16.007,26.250,-11.850,-18.380,23.496,...",MTLTYDQDGFALGGHPHRIFAGAAHYFRSHPEQWPQLMRSLAAMGL...,,none,mgnify_esm30,143,34,0.237762
62,1_2,AF-A0A0F9P4R2-F1-model_v6 Uncharacterized protein,31.8,91,59,1,1,88,1,91,...,MSDKPRGRKAGVK---TAPDGETKADKFKRLGNKRLVKTQKAMKQI...,MKGVICMAKQSKRRGNPPPKDETKEARFIRVCSMRVKKAVKAINNI...,"42.186,10.803,26.272,38.398,10.923,26.906,36.0...",MKGVICMAKQSKRRGNPPPKDETKEARFIRVCSMRVKKAVKAINNI...,412755.0,marine sediment metagenome,afdb50,88,87,0.988636


In [34]:
pdb_metadata_df[pdb_metadata_df.annotation_start.isnull()]

Unnamed: 0,pdb_id,query_id,pdb_entity,pdb_chain,original_id,uniparc_id,taxonomy_toplevel,taxonomy,seq,n_annotations,annotation_interpro_id,annotation_interpro_description,annotation_database,annotation_id,annotation_start,annotation_stop
1267,2zpa,P76562,1,B,2zpaB02,UPI001FC70572,cellular organisms,Hyalomma asiaticum,MQASDQADLEAGGDLLPHGGTPVVVSVPALLSSPERTGTKPQGFGW...,0.0,,,,,,
3841,6vvo,P40937,3,C,6vvo-assembly1.cif.gz_C Structure of the human...,UPI001FC6F1EB,cellular organisms,Haemaphysalis longicornis,MKYYTTEDGWHHPVRQTQSGEPRKLILRAPGSLQLSQITPLALHRM...,0.0,,,,,,
14919,6af3,A0A0H2UQ20,2,H,6af3-assembly1.cif.gz_H Toxin-Antitoxin module...,,,,,,,,,,,
15125,6af4,A0A0H2UQ20,2,F,6af4-assembly2.cif.gz_F Toxin-Antitoxin module...,,,,,,,,,,,
32026,4v4c,P80564,2,F,4v4c-assembly3.cif.gz_F Crystal Structure of P...,UPI00248F19A5,cellular organisms,Biomphalaria glabrata,MYQCQKHLSEQHSREIKAYDKSRDIPLQVGIVIGVLSVATSAASIL...,0.0,,,,,,
35017,4ahc,P61875,1,A,4ahc-assembly1.cif.gz_A Crystal Structure of a...,UPI001FC72EE3,cellular organisms,Hyalomma asiaticum,MSHQDLNSLCYYLNLLHPVYLHRIDDFRFSSWPRLSGSTRERAVRD...,0.0,,,,,,
35018,4ahc,P61875,1,B,4ahc-assembly2.cif.gz_B Crystal Structure of a...,UPI001FC72EE3,cellular organisms,Hyalomma asiaticum,MSHQDLNSLCYYLNLLHPVYLHRIDDFRFSSWPRLSGSTRERAVRD...,0.0,,,,,,
35055,1wns,P77933,1,A,1wns-assembly1.cif.gz_A Crystal structure of f...,UPI00248D977B,cellular organisms,Biomphalaria glabrata,MIAVSPMSLLAALLGMSLVGLIEPKLHVEFGISPLDGDAETSSDVC...,0.0,,,,,,
35110,4ahc,P61875,1,A,4ahc-assembly1.cif.gz_A Crystal Structure of a...,UPI001FC72EE3,cellular organisms,Hyalomma asiaticum,MSHQDLNSLCYYLNLLHPVYLHRIDDFRFSSWPRLSGSTRERAVRD...,0.0,,,,,,
35111,4ahc,P61875,1,B,4ahc-assembly2.cif.gz_B Crystal Structure of a...,UPI001FC72EE3,cellular organisms,Hyalomma asiaticum,MSHQDLNSLCYYLNLLHPVYLHRIDDFRFSSWPRLSGSTRERAVRD...,0.0,,,,,,
