In [14]:
from utils import * 
import tarfile
import json

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [15]:
# Read the linear_ece_19kb FASTA file with the project's FASTAFile helper and convert it to a DataFrame.
linear_ece_19kb_df = (
    FASTAFile
    .from_file('../data/data/linear_ece_19kb.fa')
    .to_df(parse_description=False)
)

In [16]:
def foldseek_load(path):
    '''Load the result of running Foldseek against all databases accessible on the Foldseek web interface.

    The archive at path is opened as a gzipped tarball. Every member is read as a headerless, tab-separated
    table, except members whose name contains 'report' and members that are not regular files.

    :param path: Path to the .tar.gz archive. The first <digits>_<digits> match in the path is used as the query ID.
    :return: A DataFrame with one row per hit, plus a 'database' column derived from the member name. An empty
        DataFrame is returned if no member contains any hits.
    :raises ValueError: If hits were found but no query ID can be parsed from path.
    '''
    cols = ['id_','subject_id','identity','alignment_length', 'mismatch', 'gap_open', 'query_alignment_start', 'query_alignment_stop', 'subject_alignment_start', 'subject_alignment_stop']
    cols += ['tm_score', 'e_value','bit_score', 'query_length', 'subject_length',  'query_alignment', 'subject_alignment', 'per_residue_lddt', 'subject_seq', 'subject_taxonomy_id', 'subject_species_name']

    foldseek_df = list()
    # The context manager closes the archive handle, even if parsing one of the members fails.
    with tarfile.open(path, mode='r:gz') as archive:
        for member in archive.getmembers():
            if ('report' in member.name):
                continue
            f = archive.extractfile(member)
            if f is None: # extractfile returns None for members which are not regular files (e.g. directories).
                continue
            content = f.read().decode('utf-8')
            df = pd.read_csv(io.StringIO(content), sep='\t', names=cols)
            df['database'] = member.name.replace('.m8', '').replace('alis_', '')
            if len(df) > 0:
                foldseek_df.append(df)

    if len(foldseek_df) == 0:
        return pd.DataFrame([])

    id_match = re.search(r'\d+_\d+', path)
    if id_match is None:
        raise ValueError(f'foldseek_load: Could not parse a query ID from path {path}.')

    foldseek_df = pd.concat(foldseek_df).reset_index(drop=True)
    foldseek_df['id_'] = id_match.group(0) # The ID parsed from the path replaces the query ID column of the tables.
    foldseek_df['subject_species_name'] = foldseek_df['subject_species_name'].fillna('none')
    return foldseek_df

# Load every Foldseek result archive in the esmfold directory, then append the hits from the alphafold directory.
foldseek_df = pd.concat(list(map(foldseek_load, glob.glob('../data/foldseek/esmfold/*'))))
foldseek_df = pd.concat([foldseek_df, *map(foldseek_load, glob.glob('../data/foldseek/alphafold/*'))])
# foldseek_df.to_csv('../data/foldseek/linear_ece_19kb.csv')

In [17]:
# Foldseek search was done on trimmed structures, so want to re-add original sequence length.
foldseek_df['query_original_length'] = foldseek_df.id_.map(linear_ece_19kb_df.seq.apply(len))
# Alignment coordinates are 1-indexed and inclusive, so the aligned span covers (stop - start + 1) residues.
foldseek_df['query_alignment_length'] = (foldseek_df.query_alignment_stop - foldseek_df.query_alignment_start) + 1
foldseek_df['query_coverage'] = foldseek_df.query_alignment_length / foldseek_df.query_original_length
# foldseek_df = foldseek_df[~foldseek_df.query_original_length.isnull()].copy() # Some of the reverse-strand ORFs are included.

print(f'Num. {ece_id} proteins with hits:', foldseek_df.id_.nunique())

# Keep hits with a TM score above 0.7 which also pass at least one of the two alignment criteria.
# NOTE(review): the length and coverage criteria are OR-ed, so any hit spanning more than 15 residues
# passes regardless of its coverage. Confirm this is intended.
mask = (foldseek_df.query_alignment_length > 15) | (foldseek_df.query_coverage > 0.5)
mask = mask & (foldseek_df.tm_score > 0.7)
foldseek_df = foldseek_df[mask].copy()

print(f'Num. {ece_id} proteins with hits passing the filter:', foldseek_df.id_.nunique())

Num. linear_ece_19kb proteins with hits: 30
Num. linear_ece_19kb proteins with hits passing the filter: 21


In [19]:
# From ChatGPT (so take with a grain of salt), not all Mgnify (and other) IDs are in UniParc; some are directly from metagenome FASTA files. 
# Regex fragments with named groups, which are combined below into full patterns for the Foldseek subject IDs.
uniparc_id_pattern = r'(?P<id>[a-zA-Z\d]{4,})'
uniparc_id_pattern_protvar = r'(?P<id_1>[a-zA-Z\d]+)(\-[0-9])*_(?P<id_2>[a-zA-Z\d]+)(\-[0-9])*_(?P<chain>[A-Z])'
uniparc_version_pattern = r'(?P<version>\d+)'
description_pattern = r'(?P<description>.*)'
pdb_id_pattern = r'(?P<id>[a-z0-9]{4})'
pdb_chain_pattern = r'(?P<chain>([A-Z0-9a-z])|([A-Z0-9a-z]-\d))'
pdb_assembly_pattern = r'(?P<assembly>\d)'
gmgc_id_pattern = r'(?P<id>GMGC.+)\.pdb' # These are often not associated with UniParc (or other database) entries.
mgnify_id_pattern = r'(?P<id>MGYP\d+)'


# Maps each full subject ID pattern to the database its ID belongs to. The insertion order matters, because
# the patterns are tried in order with re.search and the first match wins.
subject_id_patterns = dict() 
subject_id_patterns[rf'LevyLab_{uniparc_id_pattern}_V\d_\d_relaxed_{pdb_chain_pattern}'] = 'UniParc'
subject_id_patterns[rf'AF-{uniparc_id_pattern}-{uniparc_version_pattern}-F1-model_v\d {description_pattern}'] = 'UniParc'
subject_id_patterns[rf'AF-{uniparc_id_pattern}-F1-model_v\d {description_pattern}'] = 'UniParc'
subject_id_patterns[rf'{mgnify_id_pattern}\.pdb\.gz'] = 'Mgnify'
subject_id_patterns[rf'{uniparc_id_pattern}_unrelaxed_rank_\d+_alphafold\d_ptm_model_\d_seed_\d+'] = 'UniParc'
subject_id_patterns[rf'{uniparc_id_pattern}_{uniparc_version_pattern}_unrelaxed_rank_\d+_alphafold\d_ptm_model_\d_seed_\d+'] = 'UniParc'
subject_id_patterns[rf'af_{uniparc_id_pattern}.+'] = 'UniParc'
# Anchored at the start of the ID. Unanchored, this short pattern also matches any run of seven digits or
# lowercase letters inside longer IDs (e.g. a bare MGYP ID), which would shadow the patterns listed after it.
subject_id_patterns[rf'^{pdb_id_pattern}{pdb_chain_pattern}\d\d'] = 'PDB'
subject_id_patterns[rf'{pdb_id_pattern}-assembly{pdb_assembly_pattern}\.cif\.gz_{pdb_chain_pattern} {description_pattern}'] = 'PDB'
subject_id_patterns[rf'ProtVar_{uniparc_id_pattern_protvar}'] = 'UniParc'
subject_id_patterns[mgnify_id_pattern] = 'Mgnify'
subject_id_patterns[gmgc_id_pattern] = 'GMGC'

subject_id_df = list()

def get_uniparc_id_from_protvar_id(info:dict):
    '''Reduce the groups of a ProtVar pair ID to a single ID. In these IDs, chain is not the actual chain,
    but seems to indicate the specific sequence in the pair: chain A selects the first ID, anything else
    selects the second. Group dictionaries without an id_1 entry are returned unchanged.'''
    is_protvar_pair = 'id_1' in info
    if is_protvar_pair:
        selected_key = 'id_1' if (info['chain'] == 'A') else 'id_2'
        return {'id':info[selected_key]}
    return info

subject_ids_no_match = list() # Foldseek subject IDs which did not match any of the known ID patterns.

for subject_id in foldseek_df.subject_id:

    if subject_id.startswith(('ModelArchive', 'Predictome')):
        continue # These IDs are messing things up, easier to just skip them. 

    # The patterns are tried in insertion order, and the first one that matches wins.
    for pattern, database in subject_id_patterns.items():
        match = re.search(pattern, subject_id)
        if match is not None:
            info = get_uniparc_id_from_protvar_id(match.groupdict())
            info['foldseek_id'] = subject_id # Store the original ID. 
            info['database'] = database
            subject_id_df.append(info)
            break
    else:
        # Only reached if no pattern matched. Record the ID so that unparsed hits can be inspected.
        subject_ids_no_match.append(subject_id)

subject_id_df = pd.DataFrame(subject_id_df)
# PDB hits are identified by entry and chain, so combine the two into a single ID.
subject_id_df['id'] = [f'{row.id}_{row.chain}' if (row.database == 'PDB') else row.id for row in subject_id_df.itertuples()]

print(f'Matched {len(subject_id_df)} out of {len(foldseek_df)} Foldseek hits.')

Matched 38324 out of 38411 Foldseek hits.


In [None]:
def _uniparc_get_metadata(id_:str):
    df = list()

    url = f'https://rest.uniprot.org/uniparc/search?query="{id_}"'
    result = requests.get(url).text 
    result = json.loads(result)['results'][0]

    info = dict()
    info['id'] = id_
    info['uniparc_id'] = result['uniParcId'] # The actual U* UniParc ID. 
    # Note that this only grabs info for the first taxon.
    info['taxonomy_toplevel'] = result['commonTaxons'][0]['topLevel']
    info['taxonomy'] = result['commonTaxons'][0]['commonTaxon']
    info['seq'] = result['sequence']['value']

    annotations = result.get('sequenceFeatures', [])
    info['n_annotations'] = len(annotations)
    if len(annotations) == 0:
        return pd.DataFrame([info])
    
    for annotation in annotations:
        annotation = annotation
        info_ = info.copy()
        info_['annotation_interpro_id'] = annotation.get('interproGroup', {}).get('id', 'none')
        info_['annotation_interpro_description'] = annotation.get('interproGroup', {}).get('name', 'none')
        info_['annotation_database'] = annotation['database']
        info_['annotation_id'] = annotation['databaseId']
        info_['annotation_start'] = annotation['locations'][0]['start']
        info_['annotation_stop'] = annotation['locations'][0]['end']
        df.append(info_)

    return pd.DataFrame(df)


def uniparc_get_metadata(ids:list):
    '''Download the UniParc metadata for every unique ID in ids.

    IDs for which the download or the parsing fails are reported and skipped.

    :param ids: The IDs to look up. Duplicates are removed before downloading.
    :return: The concatenated per-ID metadata DataFrames, or an empty DataFrame if nothing could be downloaded.
    '''
    ids = np.unique(ids) # Make sure not to download stuff twice. 
    metadata_df = list()
    for id_ in tqdm(ids, desc='uniparc_get_metadata: Downloading UniParc metadata.'):
        try:
            metadata_df.append(_uniparc_get_metadata(id_))
        except Exception as err: # Not a bare except, so that a KeyboardInterrupt can still stop a long download.
            print(f'uniparc_get_metadata: Failed on ID {id_}. {type(err).__name__}: {err}')
    if len(metadata_df) == 0:
        return pd.DataFrame([]) # pd.concat raises an error if there is nothing to concatenate.
    metadata_df = pd.concat(metadata_df)
    return metadata_df



In [None]:
def _pdb_get_mappings(id_:str):
    id_, chain = id_.split('_')
    url = f'https://www.ebi.ac.uk/pdbe/api/mappings/uniprot/{id_}'
    result = json.loads(requests.get(url).text)[id_]['UniProt']
    result = {uniprot_id:mappings['mappings'] for uniprot_id, mappings in result.items()}
    mappings = {f"{id_}_{mapping['chain_id']}":uniprot_id  for uniprot_id, mappings in result.items() for mapping in mappings}
    return mappings


def pdb_get_mappings(ids:list, path:str='../data/foldseek/pdb-uniprot_map.json'):
    '''Download the chain to UniProt mappings for a collection of PDB chain IDs and save them as JSON.

    IDs for which the download fails are reported and skipped.

    :param ids: The IDs to look up, of the form <entry>_<chain>.
    :param path: Where to write the collected mappings.
    :return: A dictionary mapping <entry>_<chain_id> to a UniProt ID.
    '''
    mappings = dict()
    for id_ in tqdm(ids, desc='pdb_get_mappings: Downloading PDB mappings.'):
        if id_ in mappings:
            continue # Already covered, as each download returns all the chains of an entry.
        try:
            mappings.update(_pdb_get_mappings(id_))
        except Exception as err: # Not a bare except, so that a KeyboardInterrupt can still stop a long download.
            print(f'pdb_get_mappings: Failed on PDB ID {id_}. {type(err).__name__}: {err}')
    with open(path, 'w') as f: # json.dump writes str, so the file must be opened in text mode.
        json.dump(mappings, f)
    return mappings

# Collect the distinct IDs of the hits assigned to the PDB, and download their UniProt mappings.
ids = subject_id_df.loc[subject_id_df.database == 'PDB', 'id'].unique()
pdb_mappings = pdb_get_mappings(ids)


# pdb_metadata_df = pd.read_csv('../data/foldseek/pdb_metadata.csv', index_col=0)

pdb_get_mappings: Downloading PDB mappings.:   0%|          | 0/943 [00:00<?, ?it/s]

pdb_get_mappings: Downloading PDB mappings.:  31%|███       | 288/943 [03:58<08:31,  1.28it/s]

In [None]:
# Download the UniParc metadata for the PDB mapping keys and for every hit assigned to UniParc, then save it.
# NOTE(review): the keys of pdb_mappings are <entry>_<chain> IDs, and the values are the UniProt IDs.
# Confirm that the keys (rather than the values) are what should be searched for in UniParc.
ids = [*pdb_mappings.keys(), *subject_id_df.loc[subject_id_df.database == 'UniParc', 'id'].tolist()]
uniparc_metadata_df = uniparc_get_metadata(ids)
uniparc_metadata_df.to_csv('../data/foldseek/uniparc_metadata.csv')

In [None]:
# Only download the UniParc metadata if there is no copy of it on disk yet.
if not os.path.exists('../data/foldseek/uniparc_metadata.csv'):
    ids = subject_id_df[subject_id_df.database == 'UniParc']['id'].unique()
    # The download call was missing here, so ids went unused and the cell depended on uniparc_metadata_df
    # already being defined by another cell.
    uniparc_metadata_df = uniparc_get_metadata(ids)
    uniparc_metadata_df.to_csv('../data/foldseek/uniparc_metadata.csv')
    
uniparc_metadata_df = pd.read_csv('../data/foldseek/uniparc_metadata.csv', index_col=0)

In [None]:
# Map each original Foldseek subject ID to the ID which was parsed from it. The subject_id_df built in this
# notebook stores the original ID in the foldseek_id column and the parsed ID in the id column.
# NOTE(review): this cell previously read uniparc_id, mgnify_id, pdb_id, gmgc_id and original_id columns, which
# the current subject_id_df does not have. Confirm that the parsed id column is the intended replacement.
subject_id_map = dict(zip(subject_id_df.foldseek_id, subject_id_df.id))

# Keep the original ID in its own column, and always map from that column. This way, running the cell a
# second time does not try to map IDs which have already been replaced.
if 'original_subject_id' not in foldseek_df.columns:
    foldseek_df['original_subject_id'] = foldseek_df.subject_id
foldseek_df['subject_id'] = foldseek_df.original_subject_id.map(subject_id_map)
print(f'Removing {foldseek_df.subject_id.isnull().sum()} hits with no parsed ID.')
foldseek_df = foldseek_df[~foldseek_df.subject_id.isnull()].copy()

Removing 87 hits with no parsed ID.


In [None]:
# Stack the PDB and UniParc metadata tables, and rename the ID column so it can be merged with the hits.
# NOTE(review): pdb_metadata_df is only assigned in a commented-out line of an earlier cell, so this cell
# fails on a fresh kernel unless that line is restored.
metadata_df = (
    pd.concat([pdb_metadata_df, uniparc_metadata_df])
    .rename(columns={'id_':'subject_id'})
)
# foldseek_df['merge_column'] = foldseek_df.subject_id.copy()
(
    foldseek_df
    .merge(metadata_df, on='subject_id', how='left')
    [['subject_id', 'original_subject_id']]
)

Unnamed: 0,subject_id,original_subject_id
0,A0A345MYA4,A0A345MYA4_unrelaxed_rank_001_alphafold2_ptm_m...
1,A0A8S5V634,A0A8S5V634_unrelaxed_rank_001_alphafold2_ptm_m...
2,A0A8S5V634,A0A8S5V634_unrelaxed_rank_001_alphafold2_ptm_m...
3,A0A8S5V634,A0A8S5V634_unrelaxed_rank_001_alphafold2_ptm_m...
4,B3GAL5,B3GAL5_unrelaxed_rank_001_alphafold2_ptm_model...
...,...,...
179176,MGYP000689662163,MGYP000689662163.pdb.gz
179177,MGYP000959723854,MGYP000959723854.pdb.gz
179178,MGYP001551521406,MGYP001551521406.pdb.gz
179179,A0A0F9P4R2,AF-A0A0F9P4R2-F1-model_v6 Uncharacterized protein


In [None]:
# Display the combined PDB and UniParc metadata table.
metadata_df

Unnamed: 0,pdb_id,subject_id,pdb_entity,pdb_chain,original_id,uniparc_id,taxonomy_toplevel,taxonomy,seq,n_annotations,annotation_interpro_id,annotation_interpro_description,annotation_database,annotation_id,annotation_start,annotation_stop
0,2r2a,Q9JRY6,1.0,B,2r2aB00,UPI00000C4DA3,cellular organisms,Neisseria meningitidis,MAEICLITGTPGSGKTLKMVSMMANDEMFKPDENGIRRKVFTNIKG...,2.0,IPR008900,"Zona occludens toxin, N-terminal",Pfam,PF05707,4.0,193.0
1,2r2a,Q9JRY6,1.0,B,2r2aB00,UPI00000C4DA3,cellular organisms,Neisseria meningitidis,MAEICLITGTPGSGKTLKMVSMMANDEMFKPDENGIRRKVFTNIKG...,2.0,IPR027417,P-loop containing nucleoside triphosphate hydr...,Gene3D,G3DSA:3.40.50.300,1.0,196.0
2,2r2a,Q9JRY6,1.0,B,2r2a-assembly2.cif.gz_B Crystal structure of N...,UPI00000C4DA3,cellular organisms,Neisseria meningitidis,MAEICLITGTPGSGKTLKMVSMMANDEMFKPDENGIRRKVFTNIKG...,2.0,IPR008900,"Zona occludens toxin, N-terminal",Pfam,PF05707,4.0,193.0
3,2r2a,Q9JRY6,1.0,B,2r2a-assembly2.cif.gz_B Crystal structure of N...,UPI00000C4DA3,cellular organisms,Neisseria meningitidis,MAEICLITGTPGSGKTLKMVSMMANDEMFKPDENGIRRKVFTNIKG...,2.0,IPR027417,P-loop containing nucleoside triphosphate hydr...,Gene3D,G3DSA:3.40.50.300,1.0,196.0
4,3bos,A1S6W5,1.0,A,3bosA01,UPI000054F469,cellular organisms,Shewanella amazonensis,MRSNRVTQHPPLQLSLPVHLPDDETFTSYYPAAGNDELIGALKSAA...,10.0,IPR017788,DnaA regulatory inactivator Hda,NCBIfam,TIGR03420,13.0,239.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17,,Q6IE37,,,,UPI000164912D,cellular organisms,Homo sapiens,MHVHVCVCLCVCIYTSSCVCACVHMCMRDALLAEGRGGGLAAADDF...,22.0,IPR047565,"Alpha-macroglobulin-like, thiol-ester bond-for...",SMART,SM01419,772.0,803.0
18,,Q6IE37,,,,UPI000164912D,cellular organisms,Homo sapiens,MHVHVCVCLCVCIYTSSCVCACVHMCMRDALLAEGRGGGLAAADDF...,22.0,none,none,Gene3D,G3DSA:1.50.10.20,902.0,1042.0
19,,Q6IE37,,,,UPI000164912D,cellular organisms,Homo sapiens,MHVHVCVCLCVCIYTSSCVCACVHMCMRDALLAEGRGGGLAAADDF...,22.0,IPR009048,"Alpha-macroglobulin, receptor-binding",Pfam,PF07677,1104.0,1184.0
20,,Q6IE37,,,,UPI000164912D,cellular organisms,Homo sapiens,MHVHVCVCLCVCIYTSSCVCACVHMCMRDALLAEGRGGGLAAADDF...,22.0,none,none,Gene3D,G3DSA:2.60.40.1940,201.0,293.0


In [None]:
# Display a dictionary which maps each value of the id_ column to the uniparc_id stored in the same row.
dict(zip(uniparc_metadata_df['id_'], uniparc_metadata_df['uniparc_id']))

{'A0A345MYA4': 'UPI000E297719',
 'A0A8S5V634': 'UPI00204E4DEF',
 'B3GAL5': 'UPI0001754A4E',
 'A0A8S5UKR1': 'UPI002045E24F',
 'A0A894JLK4': 'UPI001AF0A22A',
 'Q6Q0J1': 'UPI000035DACD',
 'A0A6J5MA74': 'UPI0014628B5F',
 'A0A1X9VNW4': 'UPI000A3006E9',
 'A0A5K0UAK9': 'UPI00126C24B1',
 'A0A7G8LRL9': 'UPI001860E5C8',
 'A0A0G2Y190': 'UPI000656351E',
 'A0A3G4ZLE4': 'UPI000F6F810D',
 'A0A345MR48': 'UPI000E28CFE2',
 'A0A4D5XEZ6': 'UPI0010C3C1A2',
 'UPI001F13D714': 'UPI001F13D714',
 'A0A0K1Y7P8': 'UPI0006BCFCCB',
 'A0A1V0SL68': 'UPI0009DC5AF7',
 'A0A5J6VI98': 'UPI00129FB84C',
 'A0A0M4JTD0': 'UPI0006B2DCD6',
 'A0A7D4VS28': 'UPI00159933BE',
 'H6WFX2': 'UPI00024EB699',
 'A0A481YZH0': 'UPI0010B9FDC6',
 'A0A481YTC7': 'UPI0010B905E8',
 'M4JFA3': 'UPI0002C5EF1D',
 'A0A7S7YEC9': 'UPI0018C074D0',
 'A0A5K0U9M5': 'UPI001278E667',
 'H9YAG8': 'UPI000060374D',
 'A0A811AAF7': 'UPI001AFB89A3',
 'UPI001F13DEE1': 'UPI001F13DEE1',
 'Q8SDJ0': 'UPI000009AEF8',
 'A0A3G5AB64': 'UPI000F6E0705',
 'A0A5E3ZZS4': 'UPI001251C