In [1]:
import os
import json
import pandas as pd
from tqdm import tqdm
import re

# Introduction
- written by: [GAMA - @miangoar on (X)Twitter](https://twitter.com/miangoar)
- date: 22.03.2024

This notebook contains the code needed to download the follow data from the [AlphaFold databse](https://alphafold.ebi.ac.uk/) given a list of Uniprot IDs:
1. json files: contains 20 decriptions about the model. [For example](https://alphafold.ebi.ac.uk/api/prediction/Q5VSL9): uniprotAccession, sequence, pdbUrl, etc.
2. PDB files: protein structures in .pdb format



You will need a file that contains a list of uniprot IDs that have a match in the AFDB. Here are an example of how your file must be:
- https://raw.githubusercontent.com/miangoar/scratch/main/files/uniport_ids_example.txt

In [4]:
! wget -q https://raw.githubusercontent.com/miangoar/scratch/main/files/uniport_ids_example.txt .
! cat uniport_ids_example.txt

A0A009H6M6
A0A009NI36
A0A009QHH1
A0A009SEN1
A0A009Z9U8
A0A010JTE9
A0A010RDN1
A0A010SKW9
A0A011A179
A0A011MII1


In [5]:
# change the name of your file
df = pd.read_csv("uniport_ids_example.txt", names=["Entry"], sep="\t")
df

Unnamed: 0,Entry
0,A0A009H6M6
1,A0A009NI36
2,A0A009QHH1
3,A0A009SEN1
4,A0A009Z9U8
5,A0A010JTE9
6,A0A010RDN1
7,A0A010SKW9
8,A0A011A179
9,A0A011MII1


In [14]:
# create a set of functions to download the json and pdf files
def download_json(df, json_path, df_name, metrics_path):
    """
    Download json files from the AFDB for a set of UniProt IDs
      df: DataFrame with uniprot ids
      json_path: directory to save the json files
      metrics_path: directory to save the parsed json files as a single df in parquet format
      df_name: desired name to store the parquet dataset
    """

    # set a progress bar
    pbar = tqdm(total=len(df), desc="Downloading json files from AFDB", leave=True)

    # create the url for api request
    for entry in df["Entry"]:
        url = f'https://alphafold.ebi.ac.uk/api/prediction/{entry}'
        output_file = os.path.join(json_path, f'{entry}.json')

        # download the json file with curl
        command = f'curl -s -X "GET" "{url}" -H "accept: application/json" > {output_file}'
        os.system(command)
        pbar.update(1)
    pbar.close()

    # parse all json files into a single df
    print("Parsing json files ...")
    data_list = []
    for file in os.listdir(json_path):
        if file.endswith('.json'):
            filepath = os.path.join(json_path, file)

            # read each json files and append data
            with open(filepath, 'r') as f:
                data = json.load(f)
                data_list.append({
                    "entryId":                data[0]["entryId"],
                    "gene":                   data[0]["gene"],
                    "uniprotAccession":       data[0]["uniprotAccession"],
                    "uniprotId":              data[0]["uniprotId"],
                    "uniprotDescription":     data[0]["uniprotDescription"],
                    "taxId":                  data[0]["taxId"],
                    "organismScientificName": data[0]['organismScientificName'],
                    "uniprotStart":           data[0]['uniprotStart'],
                    "uniprotEnd":             data[0]['uniprotEnd'],
                    "uniprotSequence":        data[0]['uniprotSequence'],
                    "modelCreatedDate":       data[0]['modelCreatedDate'],
                    "latestVersion":          data[0]['latestVersion'],
                    "allVersions":            data[0]['allVersions'],
                    "isReviewed":             data[0]['isReviewed'],
                    "isReferenceProteome":    data[0]['isReferenceProteome'],
                    "cifUrl":                 data[0]['cifUrl'],
                    "bcifUrl":                data[0]['bcifUrl'],
                    "pdbUrl":                 data[0]['pdbUrl'],
                    "paeImageUrl":            data[0]['paeImageUrl'],
                    "paeDocUrl":              data[0]['paeDocUrl']
                })
    # create the parsed df and export it
    df_metrics = pd.DataFrame(data_list)
    filename = ".".join([df_name, "parquet"])
    metrics = os.path.join(metrics_path, filename)
    df_metrics.to_parquet(metrics, index=False)
    print(f"Quality metrics saved at {metrics}")


def download_pdbs(parsed_df, pdb_path):
    """
    Download PDB files from the parsed df with all json files
    """
    # read the parsed df
    df_url = pd.read_parquet(parsed_df)

    # set a progress bar
    pbar2 = tqdm(total=len(df_url['pdbUrl']), desc="Downloading pdb files from AFDB", leave=True)

    # extract the filename from the URL and download the pdb
    for url in df_url['pdbUrl']:
        filename =  re.search(r"/files/(.+\.pdb)$", url).group(1)
        filename = pdb_path + filename
        command = f'curl -s -o {filename} {url}'
        os.system(command)
        pbar2.update(1)
    pbar2.close()

In [7]:
# create directories to store the data
! mkdir metrics_dir pdb_dir json_dir

In [9]:
! ls metrics_dir pdb_dir json_dir

json_dir:

metrics_dir:

pdb_dir:


In [13]:
download_json(df=df, json_path="json_dir", df_name="test_name_for_afdb_data", metrics_path="metrics_dir")

Downloading json files from AFDB: 100%|██████████| 10/10 [00:03<00:00,  3.09it/s]

Parsing json files ...
Quality metrics saved at metrics_dir/test_name_for_afdb_data.parquet





In [17]:
download_pdbs(parsed_df="metrics_dir/test_name_for_afdb_data.parquet", pdb_path="pdb_dir/")

Downloading pdb files from AFDB: 100%|██████████| 10/10 [00:05<00:00,  1.79it/s]


In [18]:
! ls metrics_dir pdb_dir json_dir

json_dir:
A0A009H6M6.json  A0A009QHH1.json  A0A009Z9U8.json  A0A010RDN1.json  A0A011A179.json
A0A009NI36.json  A0A009SEN1.json  A0A010JTE9.json  A0A010SKW9.json  A0A011MII1.json

metrics_dir:
test_name_for_afdb_data.parquet

pdb_dir:
AF-A0A009H6M6-F1-model_v4.pdb  AF-A0A009Z9U8-F1-model_v4.pdb  AF-A0A011A179-F1-model_v4.pdb
AF-A0A009NI36-F1-model_v4.pdb  AF-A0A010JTE9-F1-model_v4.pdb  AF-A0A011MII1-F1-model_v4.pdb
AF-A0A009QHH1-F1-model_v4.pdb  AF-A0A010RDN1-F1-model_v4.pdb
AF-A0A009SEN1-F1-model_v4.pdb  AF-A0A010SKW9-F1-model_v4.pdb


In [19]:
# example oh how looks the parsed df
pd.read_parquet("metrics_dir/test_name_for_afdb_data.parquet")

Unnamed: 0,entryId,gene,uniprotAccession,uniprotId,uniprotDescription,taxId,organismScientificName,uniprotStart,uniprotEnd,uniprotSequence,modelCreatedDate,latestVersion,allVersions,isReviewed,isReferenceProteome,cifUrl,bcifUrl,pdbUrl,paeImageUrl,paeDocUrl
0,AF-A0A011A179-F1,glsA,A0A011A179,A0A011A179_9BACL,Glutaminase,915437,Saccharibacillus sacchari DSM 19268,1,315,MTNKEARLEEVRQFLPQWLEESRKEAASGEVASYIPELSKASHTAL...,2022-06-01,4,"[3, 4]",False,True,https://alphafold.ebi.ac.uk/files/AF-A0A011A17...,https://alphafold.ebi.ac.uk/files/AF-A0A011A17...,https://alphafold.ebi.ac.uk/files/AF-A0A011A17...,https://alphafold.ebi.ac.uk/files/AF-A0A011A17...,https://alphafold.ebi.ac.uk/files/AF-A0A011A17...
1,AF-A0A010SKW9-F1,glsA,A0A010SKW9,A0A010SKW9_PSEFL,Glutaminase,1042209,Pseudomonas fluorescens HK44,1,302,MQALLNEILDTVRPLIGQGKVADYIPALGSVPADQLGIAVYGNDGE...,2022-06-01,4,"[3, 4]",False,True,https://alphafold.ebi.ac.uk/files/AF-A0A010SKW...,https://alphafold.ebi.ac.uk/files/AF-A0A010SKW...,https://alphafold.ebi.ac.uk/files/AF-A0A010SKW...,https://alphafold.ebi.ac.uk/files/AF-A0A010SKW...,https://alphafold.ebi.ac.uk/files/AF-A0A010SKW...
2,AF-A0A009NI36-F1,glsA,A0A009NI36,A0A009NI36_9GAMM,Glutaminase,1310652,Acinetobacter sp. 1475718,1,440,MQTPLPDYLANVIEACDIDNSGHLADYIPELANANPNRLALAMSTV...,2022-06-01,4,"[3, 4]",False,True,https://alphafold.ebi.ac.uk/files/AF-A0A009NI3...,https://alphafold.ebi.ac.uk/files/AF-A0A009NI3...,https://alphafold.ebi.ac.uk/files/AF-A0A009NI3...,https://alphafold.ebi.ac.uk/files/AF-A0A009NI3...,https://alphafold.ebi.ac.uk/files/AF-A0A009NI3...
3,AF-A0A009Z9U8-F1,glsA,A0A009Z9U8,A0A009Z9U8_9GAMM,Glutaminase,1310601,Acinetobacter sp. 479375,1,427,MKTPLPDYLEHVLDECDGDDSGHLADYIPELANASPHRLALAMSTV...,2022-06-01,4,"[3, 4]",False,True,https://alphafold.ebi.ac.uk/files/AF-A0A009Z9U...,https://alphafold.ebi.ac.uk/files/AF-A0A009Z9U...,https://alphafold.ebi.ac.uk/files/AF-A0A009Z9U...,https://alphafold.ebi.ac.uk/files/AF-A0A009Z9U...,https://alphafold.ebi.ac.uk/files/AF-A0A009Z9U...
4,AF-A0A010RDN1-F1,CFIO01_00341,A0A010RDN1,A0A010RDN1_9PEZI,Glutaminase,1445577,Colletotrichum fioriniae PJ7,1,466,MKSPIPDYLNRVLENARPNEAGAPAGYIDVLAKADTSKMAVALAMV...,2022-06-01,4,"[3, 4]",False,True,https://alphafold.ebi.ac.uk/files/AF-A0A010RDN...,https://alphafold.ebi.ac.uk/files/AF-A0A010RDN...,https://alphafold.ebi.ac.uk/files/AF-A0A010RDN...,https://alphafold.ebi.ac.uk/files/AF-A0A010RDN...,https://alphafold.ebi.ac.uk/files/AF-A0A010RDN...
5,AF-A0A011MII1-F1,glsA1_2,A0A011MII1,A0A011MII1_9PROT,Glutaminase,1454001,Candidatus Accumulibacter sp. SK-12,1,197,MYTPLPDDGLPSLISTGHLPPPEEVETLVRQVYARYRDLDEGKVAD...,2022-06-01,4,"[3, 4]",False,True,https://alphafold.ebi.ac.uk/files/AF-A0A011MII...,https://alphafold.ebi.ac.uk/files/AF-A0A011MII...,https://alphafold.ebi.ac.uk/files/AF-A0A011MII...,https://alphafold.ebi.ac.uk/files/AF-A0A011MII...,https://alphafold.ebi.ac.uk/files/AF-A0A011MII...
6,AF-A0A009H6M6-F1,glsA,A0A009H6M6,A0A009H6M6_9GAMM,Glutaminase,1310608,Acinetobacter sp. 1295259,1,440,MQTPLPDYLANVIEACDIDNSGHLADYIPELANANPNRLALAMSTV...,2022-06-01,4,"[3, 4]",False,True,https://alphafold.ebi.ac.uk/files/AF-A0A009H6M...,https://alphafold.ebi.ac.uk/files/AF-A0A009H6M...,https://alphafold.ebi.ac.uk/files/AF-A0A009H6M...,https://alphafold.ebi.ac.uk/files/AF-A0A009H6M...,https://alphafold.ebi.ac.uk/files/AF-A0A009H6M...
7,AF-A0A009SEN1-F1,glsA,A0A009SEN1,A0A009SEN1_ACIBA,Glutaminase,1310630,Acinetobacter baumannii 99063,1,440,MKTPLPDYLANVIEACDIDNSGHLADYIPELANANPNRLALAMSTV...,2022-06-01,4,"[3, 4]",False,True,https://alphafold.ebi.ac.uk/files/AF-A0A009SEN...,https://alphafold.ebi.ac.uk/files/AF-A0A009SEN...,https://alphafold.ebi.ac.uk/files/AF-A0A009SEN...,https://alphafold.ebi.ac.uk/files/AF-A0A009SEN...,https://alphafold.ebi.ac.uk/files/AF-A0A009SEN...
8,AF-A0A010JTE9-F1,glsA,A0A010JTE9,A0A010JTE9_9GAMM,Glutaminase,1310681,Acinetobacter sp. 1542444,1,437,MKTPLPEYLANVIEACDIDNSGHLADYIPELANANPNRLALAMSTV...,2022-06-01,4,"[3, 4]",False,True,https://alphafold.ebi.ac.uk/files/AF-A0A010JTE...,https://alphafold.ebi.ac.uk/files/AF-A0A010JTE...,https://alphafold.ebi.ac.uk/files/AF-A0A010JTE...,https://alphafold.ebi.ac.uk/files/AF-A0A010JTE...,https://alphafold.ebi.ac.uk/files/AF-A0A010JTE...
9,AF-A0A009QHH1-F1,glsA,A0A009QHH1,A0A009QHH1_ACIBA,Glutaminase,1310607,Acinetobacter baumannii 625974,1,440,MKTPLPDYLANVIEACDIDNSGHLADYIPELANANPNRLALAMSTV...,2022-06-01,4,"[3, 4]",False,True,https://alphafold.ebi.ac.uk/files/AF-A0A009QHH...,https://alphafold.ebi.ac.uk/files/AF-A0A009QHH...,https://alphafold.ebi.ac.uk/files/AF-A0A009QHH...,https://alphafold.ebi.ac.uk/files/AF-A0A009QHH...,https://alphafold.ebi.ac.uk/files/AF-A0A009QHH...
