In [94]:
from utils import * 
import tarfile
import json

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [95]:
linear_ece_19kb_df = FASTAFile.from_file('../data/data/linear_ece_19kb.fa').to_df(parse_description=False)
linear_ece_19kb_df.index = [id_.split('.')[-1] for id_ in linear_ece_19kb_df.index]

In [96]:
def foldseek_load(path):
   '''This loads the result of running Foldseek against all databases accessible on the Foldseek web interface.'''
   archive = tarfile.open(path, mode='r:gz')
   cols = ['id_','subject_id','identity','alignment_length', 'mismatch', 'gap_open', 'query_alignment_start', 'query_alignment_stop', 'subject_alignment_start', 'subject_alignment_stop']
   cols += ['tm_score', 'e_value','bit_score', 'query_length', 'subject_length',  'query_alignment', 'subject_alignment', 'per_residue_lddt', 'subject_seq', 'subject_taxonomy_id', 'subject_species_name']
   
   foldseek_df = list()
   for member in archive.getmembers():
      if ('report' in member.name):
         continue 
      f = archive.extractfile(member)
      content = f.read().decode('utf-8')
      df = pd.read_csv(io.StringIO(content), sep='\t', names=cols)
      df['database'] = member.name.replace('.m8', '').replace('alis_', '')
      if len(df) > 0:
         foldseek_df.append(df)

   if len(foldseek_df) == 0:
      return pd.DataFrame([])
   
   foldseek_df = pd.concat(foldseek_df).reset_index(drop=True)
   foldseek_df['id_'] = re.search(r'\d+_\d+', path).group(0)
   foldseek_df['subject_species_name'] = foldseek_df['subject_species_name'].fillna('none')
   return foldseek_df

foldseek_df = pd.concat([foldseek_load(path) for path in glob.glob('../data/foldseek/esmfold/*')])
foldseek_df = pd.concat([foldseek_df] + [foldseek_load(path) for path in glob.glob('../data/foldseek/alphafold/*')])
# foldseek_df.to_csv('../data/foldseek/linear_ece_19kb.csv')

In [97]:
foldseek_df['query_original_length'] = foldseek_df.id_.map(linear_ece_19kb_df.seq.apply(len)) # Foldseek search was done on trimmed structures, so want to re-add original sequence length. 
foldseek_df['query_alignment_length'] = (foldseek_df.query_alignment_stop - foldseek_df.query_alignment_start)
foldseek_df['query_coverage'] = foldseek_df.query_alignment_length / foldseek_df.query_original_length
# foldseek_df = foldseek_df[~foldseek_df.query_original_length.isnull()].copy() # Some of the reverse-strand ORFs are included.

print(f'Num. {ece_id} proteins with hits:', foldseek_df.id_.nunique())

mask = (foldseek_df.query_alignment_length > 15) | (foldseek_df.query_coverage > 0.5)
mask = mask & (foldseek_df.tm_score > 0.7)
foldseek_df = foldseek_df[mask].copy()

print(f'Num. {ece_id} proteins with hits passing the filter:', foldseek_df.id_.nunique())

Num. linear_ece_19kb proteins with hits: 30
Num. linear_ece_19kb proteins with hits passing the filter: 21


In [98]:
# From ChatGPT (so take with a grain of salt), not all Mgnify (and other) IDs are in UniParc; some are directly from metagenome FASTA files. 
uniparc_id_pattern = r'(?P<id>[a-zA-Z\d]{4,})'
uniparc_id_pattern_protvar = r'(?P<id_1>[a-zA-Z\d]+)(\-[0-9])*_(?P<id_2>[a-zA-Z\d]+)(\-[0-9])*_(?P<chain>[A-Z])'
uniparc_version_pattern = r'(?P<version>\d+)'
description_pattern = r'(?P<description>.*)'
pdb_id_pattern = r'(?P<id>[a-z0-9]{4})'
pdb_chain_pattern = r'(?P<chain>([A-Z0-9a-z])|([A-Z0-9a-z]-\d))'
pdb_assembly_pattern = r'(?P<assembly>\d)'
gmgc_id_pattern = r'(?P<id>GMGC.+)\.pdb*' # These are not often not associated with UniParc (or other database) entries.
mgnify_id_pattern = r'(?P<id>MGYP\d+)'


subject_id_patterns = dict() 
subject_id_patterns[rf'LevyLab_{uniparc_id_pattern}_V\d_\d_relaxed_{pdb_chain_pattern}'] = 'UniParc'
subject_id_patterns[rf'AF-{uniparc_id_pattern}-{uniparc_version_pattern}-F1-model_v\d {description_pattern}'] = 'UniParc'
subject_id_patterns[rf'AF-{uniparc_id_pattern}-F1-model_v\d {description_pattern}'] = 'UniParc'
subject_id_patterns[rf'{mgnify_id_pattern}\.pdb\.gz'] = 'Mgnify'
subject_id_patterns[rf'{uniparc_id_pattern}_unrelaxed_rank_\d+_alphafold\d_ptm_model_\d_seed_\d+'] = 'UniParc'
subject_id_patterns[rf'{uniparc_id_pattern}_{uniparc_version_pattern}_unrelaxed_rank_\d+_alphafold\d_ptm_model_\d_seed_\d+'] = 'UniParc'
subject_id_patterns[rf'af_{uniparc_id_pattern}.+'] = 'UniParc'
subject_id_patterns[rf'{pdb_id_pattern}{pdb_chain_pattern}\d\d'] = 'PDB'
subject_id_patterns[rf'{pdb_id_pattern}-assembly{pdb_assembly_pattern}\.cif\.gz_{pdb_chain_pattern} {description_pattern}'] = 'PDB'
subject_id_patterns[rf'{pdb_id_pattern}-assembly{pdb_assembly_pattern}\.cif\.gz_{pdb_chain_pattern} {description_pattern}'] = 'PDB'
subject_id_patterns[rf'ProtVar_{uniparc_id_pattern_protvar}'] = 'UniParc'
subject_id_patterns[mgnify_id_pattern] = 'Mgnify'
subject_id_patterns[gmgc_id_pattern] = 'GMGC'

subject_id_df = list()

def get_uniparc_id_from_protvar_id(info:dict):
    '''In these cases, chain is not the actual chain, but seems to indicate the specific sequence in the pair.'''
    if 'id_1' not in info:
        return info
    return {'id':info['id_1']} if (info['chain'] == 'A') else {'id':info['id_2']}

subject_ids_no_match = list()

for subject_id in foldseek_df.subject_id:
    
    if subject_id.startswith('ModelArchive') or subject_id.startswith('Predictome'):
        continue # These IDs are messing things up, easier to just skip them. 

    for pattern, database in subject_id_patterns.items():
        match = re.search(pattern, subject_id)
        if match is not None:
            info = get_uniparc_id_from_protvar_id(match.groupdict())
            info['foldseek_id'] = subject_id # Store the original ID. 
            info['database'] = database
            subject_id_df.append(info)
            break

subject_id_df = pd.DataFrame(subject_id_df)
subject_id_df['id'] = [f'{row.id}_{row.chain}' if (row.database == 'PDB') else row.id for row in subject_id_df.itertuples()]

print(f'Matched {len(subject_id_df)} out of {len(foldseek_df)} Foldseek hits.')

Matched 38324 out of 38411 Foldseek hits.


In [99]:
def _uniparc_get_metadata(id_:str):
    df = list()

    url = f'https://rest.uniprot.org/uniparc/search?query="{id_}"'
    result = requests.get(url).text 
    result = json.loads(result)['results'][0]

    info = dict()
    info['id'] = id_
    info['uniparc_id'] = result['uniParcId'] # The actual U* UniParc ID. 
    # Note that this only grabs info for the first taxon.
    info['taxonomy_toplevel'] = result['commonTaxons'][0]['topLevel']
    info['taxonomy'] = result['commonTaxons'][0]['commonTaxon']
    info['seq'] = result['sequence']['value']

    annotations = result.get('sequenceFeatures', [])
    info['n_annotations'] = len(annotations)
    if len(annotations) == 0:
        return pd.DataFrame([info])
    
    for annotation in annotations:
        annotation = annotation
        info_ = info.copy()
        info_['annotation_interpro_id'] = annotation.get('interproGroup', {}).get('id', 'none')
        info_['annotation_interpro_description'] = annotation.get('interproGroup', {}).get('name', 'none')
        info_['annotation_database'] = annotation['database']
        info_['annotation_id'] = annotation['databaseId']
        info_['annotation_start'] = annotation['locations'][0]['start']
        info_['annotation_stop'] = annotation['locations'][0]['end']
        df.append(info_)

    return pd.DataFrame(df)


def uniparc_get_metadata(ids:list):
    ids = np.unique(ids) # Make sure not to download stuff twice. 
    metadata_df = list()
    for id_ in tqdm(ids, desc='uniparc_get_metadata: Downloading UniParc metadata.'):
        try:
            metadata_df.append(_uniparc_get_metadata(id_))
        except:
            print(f'uniparc_get_metadata: Failed on ID {id_}.')
    metadata_df = pd.concat(metadata_df)
    return metadata_df



In [100]:
def _pdb_get_mappings(id_:str):
    id_, chain = id_.split('_')
    url = f'https://www.ebi.ac.uk/pdbe/api/mappings/uniprot/{id_}'
    print(requests.get(url).text)
    result = json.loads(requests.get(url).text)[id_]['UniProt']
    result = {uniprot_id:mappings['mappings'] for uniprot_id, mappings in result.items()}
    mappings = {f"{id_}_{mapping['chain_id']}":uniprot_id  for uniprot_id, mappings in result.items() for mapping in mappings}
    return mappings


def pdb_get_mappings(ids:pd.DataFrame, path:str='../data/foldseek/pdb-uniprot_map.json'):
    if os.path.exists(path):
         with open(path, 'rb') as f:
            mappings = json.load(f)
            return mappings
         
    mappings = dict()
    for id_ in tqdm(ids, desc='pdb_get_mappings: Downloading PDB mappings.'):
        try:
            mappings.update(_pdb_get_mappings(id_))
        except:
            print(f'pdb_get_mappings: Failed on PDB ID {id_}.')
    with open(path, 'w') as f:
            json.dump(mappings, f)
    return mappings

ids = subject_id_df[subject_id_df.database == 'PDB']['id'].unique() 
pdb_mappings = pdb_get_mappings(ids)

subject_id_df['pdb_id'] = np.where(subject_id_df.database == 'PDB', subject_id_df['id'], 'none')
subject_id_df['id'] = np.where(subject_id_df.database == 'PDB', subject_id_df['id'].map(pdb_mappings), subject_id_df['id'])
print(f'Removing {subject_id_df['id'].isnull().sum()} PDB entries with no UniProt accession.')
subject_id_df = subject_id_df[~subject_id_df['id'].isnull()].copy()

Removing 140 PDB entries with no UniProt accession.


In [101]:
if not os.path.exists('../data/foldseek/uniparc_metadata.csv'):
    ids = subject_id_df[subject_id_df.database.isin(['UniParc', 'PDB'])]['id'].tolist()
    uniparc_metadata_df = uniparc_get_metadata(ids)
    uniparc_metadata_df.to_csv('../data/foldseek/uniparc_metadata.csv')
uniparc_metadata_df = pd.read_csv('../data/foldseek/uniparc_metadata.csv')

In [102]:
uniparc_metadata_df

Unnamed: 0.1,Unnamed: 0,id,uniparc_id,taxonomy_toplevel,taxonomy,seq,n_annotations,annotation_interpro_id,annotation_interpro_description,annotation_database,annotation_id,annotation_start,annotation_stop
0,0,A0A009IFH5,UPI0004484EB2,cellular organisms,Acinetobacter baumannii,MARIRLTTGGIGAGKTYLNVKLADEAHKKGQYTKIYSNIRAHSELT...,3,IPR027417,P-loop containing nucleoside triphosphate hydr...,SUPFAM,SSF52540,6.0,117.0
1,1,A0A009IFH5,UPI0004484EB2,cellular organisms,Acinetobacter baumannii,MARIRLTTGGIGAGKTYLNVKLADEAHKKGQYTKIYSNIRAHSELT...,3,IPR027417,P-loop containing nucleoside triphosphate hydr...,Gene3D,G3DSA:3.40.50.300,1.0,174.0
2,2,A0A009IFH5,UPI0004484EB2,cellular organisms,Acinetobacter baumannii,MARIRLTTGGIGAGKTYLNVKLADEAHKKGQYTKIYSNIRAHSELT...,3,IPR008900,"Zona occludens toxin, N-terminal",Pfam,PF05707,43.0,175.0
3,0,A0A009IPH3,UPI00044793A4,cellular organisms,Acinetobacter calcoaceticus/baumannii complex,MIYLITATPGSGKTLWAVKEIFARANEAEPWNIFSNIDGLKLDTAQ...,2,IPR027417,P-loop containing nucleoside triphosphate hydr...,Gene3D,G3DSA:3.40.50.300,1.0,174.0
4,1,A0A009IPH3,UPI00044793A4,cellular organisms,Acinetobacter calcoaceticus/baumannii complex,MIYLITATPGSGKTLWAVKEIFARANEAEPWNIFSNIDGLKLDTAQ...,2,IPR008900,"Zona occludens toxin, N-terminal",Pfam,PF05707,59.0,172.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
97891,0,Z9JXA4,UPI00044C5CEB,cellular organisms,Brachybacterium phenoliresistens,MTDLESVPGRAPEVRERRLAIPDHLSPFQVLGENDQALAALEDVVP...,5,IPR027417,P-loop containing nucleoside triphosphate hydr...,Gene3D,G3DSA:3.40.50.300,121.0,327.0
97892,1,Z9JXA4,UPI00044C5CEB,cellular organisms,Brachybacterium phenoliresistens,MTDLESVPGRAPEVRERRLAIPDHLSPFQVLGENDQALAALEDVVP...,5,IPR003714,PhoH-like protein,Pfam,PF02562,124.0,326.0
97893,2,Z9JXA4,UPI00044C5CEB,cellular organisms,Brachybacterium phenoliresistens,MTDLESVPGRAPEVRERRLAIPDHLSPFQVLGENDQALAALEDVVP...,5,IPR027417,P-loop containing nucleoside triphosphate hydr...,SUPFAM,SSF52540,123.0,290.0
97894,3,Z9JXA4,UPI00044C5CEB,cellular organisms,Brachybacterium phenoliresistens,MTDLESVPGRAPEVRERRLAIPDHLSPFQVLGENDQALAALEDVVP...,5,none,none,FunFam,G3DSA:3.40.50.300:FF:000013,121.0,325.0


In [103]:
foldseek_df['id'] = foldseek_df['subject_id'].map(subject_id_df.drop_duplicates('foldseek_id').set_index('foldseek_id')['id'])
foldseek_df['subject_database'] = foldseek_df['id'].map(subject_id_df.drop_duplicates('id').set_index('id')['database'])
foldseek_df['subject_description'] = foldseek_df['id'].map(subject_id_df.drop_duplicates('id').set_index('id')['description'])

if 'annotation_database' not in foldseek_df.columns:
    foldseek_df = foldseek_df.merge(uniparc_metadata_df, left_on='id', right_on='id', how='left')

get_annotation_overlap = lambda row : max(min(row.subject_alignment_stop, row.annotation_stop) - max(row.subject_alignment_start, row.annotation_start), 0)
has_annotation = lambda row : row.annotation_start in np.arange(0, 1000)
foldseek_df['annotation_overlap'] = [get_annotation_overlap(row) if has_annotation(row) else np.nan for row in foldseek_df.itertuples()]
foldseek_df['annotation_coverage'] = foldseek_df.annotation_overlap / (foldseek_df.annotation_stop - foldseek_df.annotation_start)
foldseek_df['query_coverage'] = (foldseek_df.query_alignment_stop - foldseek_df.query_alignment_start) / foldseek_df.query_length

# Remove annotations that don't overlap the alignment. 
# mask = (foldseek_df.n_annotations > 1) & (foldseek_df.annotation_overlap < 10)
# print(f'Removing {mask.sum()} useless annotations.')


In [104]:
ece_gene_ids = [f'1_{i + 1}' for i in range(38)]
print('ECE genes with no good hits:', ' '.join([gene_id for gene_id in ece_gene_ids if (gene_id not in foldseek_df.id_.unique())]))
ece_gene_ids_no_hits = '1_4 1_5 1_6 1_7 1_9 1_13 1_15 1_16 1_18 1_22 1_24 1_27 1_29 1_30 1_31 1_34 1_36'.split()
# len(ece_gene_ids_no_hits)

ECE genes with no good hits: 1_4 1_5 1_6 1_7 1_9 1_13 1_15 1_16 1_18 1_22 1_24 1_27 1_29 1_30 1_31 1_34 1_36


In [None]:
def print_info(id_:str, min_annotation_overlap:int=10, min_annotation_coverage:float=0.5, annotations_only:bool=False):
    df = foldseek_df[foldseek_df.id_ == id_].copy()
    print('Query sequence length:', df.query_original_length.iloc[0], f'(trimmed length {df.query_length.iloc[0]})')
    print('Num hits:', df['id'].nunique())
    print('Num. Mgnify hits:', (df.drop_duplicates('id').subject_database == 'Mgnify').sum())
    df = df[df.subject_database != 'Mgnify'].copy()
    print('Num. hits to uncharacterized proteins:', (df.drop_duplicates('id').n_annotations == 0).sum())
    if annotations_only:
        df = df[(df.n_annotations > 0) & (df.annotation_overlap > min_annotation_overlap) & (df.annotation_coverage > min_annotation_coverage)].copy()
    return df.sort_values('query_coverage', ascending=False)

print_info('1_14')# .annotation_interpro_description.value_counts()

Query sequence length: 44 (trimmed length 25)
Num hits: 107
Num. Mgnify hits: 27
Num. hits to uncharacterized proteins: 30


annotation_interpro_description
Zinc finger C2H2-type                                            141
none                                                             103
Zinc finger C2H2 superfamily                                      28
ROS/MUCR transcriptional regulator superfamily                    11
Zinc Finger C2H2-type Transcription Regulators                    10
Zinc finger, AD-type                                               8
ROS/MUCR transcriptional regulator                                 6
Drought induced 19 protein type, zinc-binding domain               5
SANT/Myb domain                                                    4
BTB/POZ domain                                                     3
Zinc finger                                                        3
Snail/Krueppel C2H2-type Zinc-finger                               3
SKP1/BTB/POZ domain superfamily                                    2
Insect cuticle protein                                             2
Ru

In [106]:
foldseek_df.columns

Index(['id_', 'subject_id', 'identity', 'alignment_length', 'mismatch',
       'gap_open', 'query_alignment_start', 'query_alignment_stop',
       'subject_alignment_start', 'subject_alignment_stop', 'tm_score',
       'e_value', 'bit_score', 'query_length', 'subject_length',
       'query_alignment', 'subject_alignment', 'per_residue_lddt',
       'subject_seq', 'subject_taxonomy_id', 'subject_species_name',
       'database', 'query_original_length', 'query_alignment_length',
       'query_coverage', 'id', 'subject_database', 'subject_description',
       'Unnamed: 0', 'uniparc_id', 'taxonomy_toplevel', 'taxonomy', 'seq',
       'n_annotations', 'annotation_interpro_id',
       'annotation_interpro_description', 'annotation_database',
       'annotation_id', 'annotation_start', 'annotation_stop',
       'annotation_overlap', 'annotation_coverage'],
      dtype='object')