<a href="https://colab.research.google.com/github/paulynamagana/AFDB_notebooks/blob/main/AFDB_API.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Accessing AlphaFold DB structures via API**


This Colab notebook allows you to search for and download protein structures from the AlphaFold database using the AlphaFold API. You can also filter entries based on pLDDT score.

To use this Colab notebook, you will need to have a Google account and be logged in to Google Colab.

---


  ## How to use Google Colab <a name="Quick Start"></a>
1. To run a code cell, click on the cell to select it. You will notice a play button (▶️) on the left side of the cell. Click on the play button or press Shift+Enter to run the code in the selected cell.
2. The code will start executing, and you will see the output, if any, displayed below the code cell.
3. Move to the next code cell and repeat steps 1 and 2 until you have executed all the desired code cells in sequence.
4. The currently running step is indicated by a circle with a stop sign next to it.
If you need to stop or interrupt the execution of a code cell, you can click on the stop button (■) located next to the play button.

*Remember to run the code cells in the correct order, as their execution might depend on variables or functions defined in previous cells. You can modify the code in a code cell and re-run it to see updated results.*

## Contact us

If you experience any bugs please contact afdbhelp@ebi.ac.uk

---

In [None]:
#@title ##1.&nbsp; Input Organism name or Taxonomy ID
import requests, sys, json
import re
from requests.adapters import HTTPAdapter, Retry
from requests.packages.urllib3.util.retry import Retry
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor
import time


#@markdown You need to provide an Organism name
Organism = "" #@param {type:"string"}
# Organism and tax_ID are alternatives: the lookup further down reports an
# error when both are filled in, and uses tax_ID as-is when only it is given.
#@markdown OR Taxonomy ID
tax_ID = "574521wdqwed" #@param {type:"string"}

Protein_status = "Reviewed" #@param ["Reviewed", "Unreviewed", "All"]
#@markdown - `Reviewed` will query only reviewed entries, `Unreviewed` will query only unreviewed entries, and `All` applies no review-status filter

pLDDT_filter = ">95" #@param {type:"string"}
pLDDT_filter = pLDDT_filter.strip()


def parse_plddt_filter(filter_text):
    """Split a pLDDT filter such as ``">95"`` or ``"<= 70.5"`` into its parts.

    Args:
        filter_text: Comparison operator (``>``, ``<``, ``>=``, ``<=``, ``==``
            or ``!=``) followed by a non-negative number, with optional
            whitespace around either part.

    Returns:
        A ``(operator, threshold)`` tuple. The threshold is an ``int`` when the
        number has no decimal point and a ``float`` otherwise.

    Raises:
        ValueError: If the text is not an operator followed by a number.
    """
    # Two-character operators are listed first so ">=" is not read as ">".
    match = re.fullmatch(
        r"(>=|<=|==|!=|>|<)\s*(\d+(?:\.\d+)?)", filter_text.strip()
    )
    if match is None:
        raise ValueError(
            f"Invalid pLDDT filter {filter_text!r}: expected a comparison "
            "operator followed by a number, e.g. '>95' or '<=70'."
        )
    symbol, number = match.groups()
    threshold = float(number) if "." in number else int(number)
    return symbol, threshold


# Extract the comparison operator and value from user input
operator, value = parse_plddt_filter(pLDDT_filter)

# Build the condition string dynamically; only a validated operator and a
# plain number can end up in it.
condition = f"confidence_score {operator} {value}"

#@markdown - This is the comparison applied to the average pLDDT of each structure from the Organism or tax_ID (for example `>95`)

folder_save = "test" #@param {type:"string"}
#@markdown - This will create a folder with this name inside a folder named AFDB_API_files


from urllib.parse import quote

# Percent-encode the organism name so that names with any number of words
# (or other reserved characters) produce a valid query string.
formatted_Organism = quote(Organism.strip())

# Define the base URL
BASE_URL = "https://www.ebi.ac.uk/proteins/api/proteomes?offset=0&size=100&name="
# Construct the full URL
WEBSITE_API = BASE_URL + formatted_Organism


# Resolve the taxonomy ID: exactly one of Organism / tax_ID is expected.
if Organism and tax_ID:
    print("Error, please provide either organism or taxonomy ID, not both.")
elif Organism:
    # Look the organism up by name and take the taxonomy of the first hit.
    response = requests.get(WEBSITE_API, timeout=60)
    if response.status_code != 200:
        # Report the failure and stop here instead of parsing an error body.
        print(f"Error, the proteome lookup failed with HTTP status {response.status_code}.")
        print(response.text)
    else:
        response_data = response.json()
        matches = list(response_data)
        # An unknown organism yields no entries; treat that like a missing key.
        data = matches[0] if matches else {}
        if isinstance(data, dict) and 'taxonomy' in data:
            tax_ID = data['taxonomy']
            print(f"Using taxonomy ID: {tax_ID} for Organism: {Organism}")
        else:
            print("taxonomy ID not found in the JSON data.")
elif tax_ID:
    # Use the taxonomy ID exactly as provided.
    print(f"Using provided taxonomy ID: {tax_ID}")
else:
    # Neither an organism nor a taxonomy ID was provided.
    print("Error, you need to provide organism or taxonomy ID.")


#####
# URL-encoded review-status clause appended to the UniProt query; any
# selection other than the two listed here adds no clause at all.
status = {
    "Reviewed": "+AND+%28reviewed%3Atrue%29",
    "Unreviewed": "+AND+%28reviewed%3Afalse%29",
}.get(Protein_status, "")

Using provided taxonomy ID: 574521wdqwed


In [None]:
#@title ##2.&nbsp; Entries to Scan on UniProt
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import re

# UniProtKB search endpoint (TSV output), ending right before the taxonomy ID.
BASE_URL_API = "https://rest.uniprot.org/uniprotkb/search?fields=accession%2Cid%2Cprotein_name%2Cgene_names&format=tsv&query=%28%28taxonomy_id%3A+"
# Full query: taxonomy ID, closing brackets, optional review clause, page size.
QUERY_UniprotIDs = "".join(
    [BASE_URL_API, f"{tax_ID}", "%29%29", f"{status}", "&size=500"]
)

# Pattern that pulls the next-page URL out of a "Link" response header.
re_next_link = re.compile(r'<(.+)>; rel="next"')

# Shared session that retries server-side errors with a short backoff.
retries = Retry(
    total=5,
    backoff_factor=0.25,
    status_forcelist=[500, 502, 503, 504],
)
session = requests.Session()
https_adapter = HTTPAdapter(max_retries=retries)
session.mount("https://", https_adapter)

def get_next_link(headers):
    """Return the next-page URL from the ``Link`` header, or None if absent."""
    if "Link" not in headers:
        return None
    found = re_next_link.match(headers["Link"])
    return found.group(1) if found else None

def get_batch(batch_url):
    """Yield ``(response, total)`` for every page of a paginated query.

    After the first page is fetched, the total number of results is printed
    and the user is asked to confirm; any answer other than ``y`` ends the
    generator before anything is yielded.

    Args:
        batch_url: URL of the first page. Following pages are taken from the
            ``Link`` header of each response via ``get_next_link``.

    Yields:
        Tuples of the page response and the value of its ``x-total-results``
        header (``"unknown"`` if the header is missing).

    Raises:
        requests.HTTPError: If a page request returns an error status.
    """
    confirmed = False  # The summary and prompt are shown for the first page only
    while batch_url:
        try:
            response = session.get(batch_url)
            response.raise_for_status()
            # Do not fail the whole download just because the count is missing.
            total = response.headers.get("x-total-results", "unknown")

            if not confirmed:
                # Name whichever input the query was actually built from.
                if Organism:
                    target = f"Organism `{Organism}`"
                else:
                    target = f"taxonomy ID `{tax_ID}`"
                print(f"Total items to retrieve: {total} for {target} and Protein status `{Protein_status}`")

                # Ask for user confirmation
                user_input = input("Do you want to proceed with the download? (y/n): ").strip().lower()
                if user_input != 'y':
                    print("Download aborted.")
                    return
                confirmed = True

            yield response, total
            batch_url = get_next_link(response.headers)
        except KeyboardInterrupt:
            print("\nInterrupted by user.")
            return

# Accessions are gathered in a set so repeated rows collapse to one entry.
primary_accessions = set()

try:
    for batch, total in get_batch(QUERY_UniprotIDs):
        # The first TSV line is the header; the accession is the first column.
        rows = batch.text.splitlines()[1:]
        primary_accessions.update(row.split('\t')[0] for row in rows)

        print(f'{len(primary_accessions)} / {total}')

except KeyboardInterrupt:
    print("\nInterrupted by user.")

# The later cells slice this into chunks, which needs a list.
primary_accessions_list = [*primary_accessions]


Total items to retrieve: 779 for Organism `` and Protein status `Reviewed`
Do you want to proceed with the download? (y/n): y
500 / 779
779 / 779


In [None]:
#@title ##3.&nbsp; Filter pLDDT entries
import requests
import concurrent.futures
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import time

# API URL to retrieve the per-accession summary
api_url = "https://alphafold.ebi.ac.uk/api/uniprot/summary/"

def get_data(accession):
    """Return the accession if its first structure passes the pLDDT filter.

    Fetches the summary JSON for ``accession``, reads
    ``confidence_avg_local_score`` from the summary of the first listed
    structure and evaluates the module-level ``condition`` string against it.

    Args:
        accession: UniProt accession appended to ``api_url``.

    Returns:
        The ``ac`` value of the response's ``uniprot_entry`` when the
        condition holds. ``None`` when the request does not return HTTP 200,
        when no score is available, or when the condition is not met.

    Raises:
        requests.RequestException: If the request fails or times out after
            the configured retries.
    """
    url = f"{api_url}{accession}.json"  # Construct the URL for the API

    # Configure a retry mechanism with exponential backoff
    retries = Retry(total=3, backoff_factor=0.1, status_forcelist=[500, 502, 503, 504])
    adapter = HTTPAdapter(max_retries=retries)

    # Make a request using a session with the configured adapter
    with requests.Session() as session:
        session.mount('https://', adapter)
        # A timeout keeps one stalled connection from blocking the whole chunk.
        response = session.get(url, timeout=30)

    if response.status_code != 200:
        return None

    data = response.json()  # Parse the JSON response

    structures = data.get("structures", [])
    first_structure = structures[0] if structures else {}
    summary = first_structure.get("summary", {})
    confidence_score = summary.get("confidence_avg_local_score")

    # Without a score there is nothing to compare; defaulting to 0 would let
    # such entries pass every "<" filter.
    if confidence_score is None:
        return None

    # NOTE(security): `condition` is assembled from form input and run through
    # eval. Builtins are withheld and only the score is exposed, but the
    # string must still be validated where it is built.
    if eval(condition, {"__builtins__": {}}, {"confidence_score": confidence_score}):
        return data.get("uniprot_entry", {}).get("ac")

    # Confidence condition not met
    return None

# Fan one chunk of accessions out over a thread pool
def get_data_chunk(chunk):
    """Run ``get_data`` on every accession in ``chunk`` and drop None results."""
    with concurrent.futures.ThreadPoolExecutor() as pool:
        fetched = list(pool.map(get_data, chunk))
    return [accession for accession in fetched if accession is not None]

# Specify chunk size, pause between requests and retry limit
chunk_size = 40
sleep_interval = 1.5
max_chunk_retries = 3

# Split the accessions into chunks
accession_chunks = [primary_accessions_list[i:i + chunk_size] for i in range(0, len(primary_accessions_list), chunk_size)]

# Start result storage and track total chunks
all_results = []
total_chunks = len(accession_chunks)

# Start processed chunk count
processed_chunks = 0

# Iterate through chunks
for chunk in accession_chunks:
    retry_count = 0
    last_error = None
    while retry_count < max_chunk_retries:
        try:
            # Get and store data for the current chunk
            results = get_data_chunk(chunk)
            all_results.extend(results)
            break
        except requests.RequestException as e:
            # Remember the failure and pause before trying the chunk again
            last_error = e
            retry_count += 1
            time.sleep(sleep_interval)
    else:
        # Every attempt failed: say so instead of silently losing the chunk.
        print(f"Warning: chunk {processed_chunks + 1} skipped after {max_chunk_retries} failed attempts: {last_error}")

    # Print progress after processing each chunk
    processed_chunks += 1
    print(f"Processed {processed_chunks} / {total_chunks} chunks")

    # Delay between chunks (nothing to wait for after the last one)
    if processed_chunks < total_chunks:
        time.sleep(sleep_interval)

print(f"Total entries after filtering by pLDDT: {len(all_results)}")


Processed 1 / 20 chunks
Processed 2 / 20 chunks
Processed 3 / 20 chunks
Processed 4 / 20 chunks
Processed 5 / 20 chunks
Processed 6 / 20 chunks
Processed 7 / 20 chunks
Processed 8 / 20 chunks
Processed 9 / 20 chunks
Processed 10 / 20 chunks
Processed 11 / 20 chunks
Processed 12 / 20 chunks
Processed 13 / 20 chunks
Processed 14 / 20 chunks
Processed 15 / 20 chunks
Processed 16 / 20 chunks
Processed 17 / 20 chunks
Processed 18 / 20 chunks
Processed 19 / 20 chunks
Processed 20 / 20 chunks
Processed 20 / 20 chunks
Total entries after filtering by pLDDT: 306


In [None]:
#@title ##4.&nbsp; Download files to Google Drive
#@markdown **Note:** This will download all the mmCIF files and PAE for the accessions with specified filter for average pLDDT <br>
#@markdown You will find all downloaded files in the folder "AFDB_API_files" on your Google Drive, together with a file called summary.csv
import os
import csv
from google.colab import drive

# Mount Google Drive and make sure the per-run download folder exists.
drive.mount('/content/drive', force_remount=True)
destination_path = f"/content/drive/MyDrive/AFDB_API_files/{folder_save}"

# Remember whether the folder was already there so the message is only shown
# for a fresh folder; exist_ok makes the creation itself safe to repeat.
isExist = os.path.exists(destination_path)
os.makedirs(destination_path, exist_ok=True)
if not isExist:
    print("The new directory was created!")

# Function to download a file from a given URL and save it to the Google Drive
def download_file(url):
    """Download ``url`` into ``destination_path`` using ``wget``.

    Args:
        url: Address of the file to fetch. An empty or missing value is
            ignored so that entries without a file do not invoke wget.

    Returns:
        None. A warning is printed if wget exits with a non-zero status.
    """
    if not url:
        return None

    import subprocess

    # -P sets the target directory, so the process-wide working directory is
    # left alone; this function runs in several worker threads at once.
    completed = subprocess.run(
        ["wget", "-q", "-P", destination_path, url],
        check=False,
    )
    if completed.returncode != 0:
        print(f"Warning: wget exited with status {completed.returncode} for {url}")
    return None

# List to store data for CSV (one row per successfully processed entry)
data_list = []

# Prediction endpoint; this rebinds the BASE_URL name used for the proteome
# lookup in step 1.
BASE_URL = "https://alphafold.ebi.ac.uk/api/prediction/"

# Function to process an entry
def process_entry(entry):
    """Download the model files for one accession and record its metadata.

    Queries the prediction endpoint for ``entry``, downloads the files behind
    ``cifUrl`` and ``paeDocUrl`` of the first returned prediction and appends
    a row to ``data_list``.

    Args:
        entry: UniProt accession appended to ``BASE_URL``.

    Returns:
        None. Entries with a non-200 response or an empty result list are
        reported and skipped.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    api_url = BASE_URL + entry
    response = requests.get(api_url, timeout=60)  # Make a GET request to the URL

    if response.status_code != 200:
        print(f"Skipping {entry}: prediction API returned HTTP status {response.status_code}")
        return

    # Parse the JSON response; only the first prediction in the list is used.
    api_data_list = response.json()
    if not api_data_list:
        print(f"Skipping {entry}: no prediction returned")
        return
    api_data = api_data_list[0]

    # Extract relevant information from the API response
    uniprot_description = api_data.get("uniprotDescription", "")
    uniprot_accession = api_data.get("uniprotAccession", "")
    organism_scientific_name = api_data.get("organismScientificName", "")
    model_created_date = api_data.get("modelCreatedDate", "")
    reviewed = api_data.get("isReviewed", "")

    # Download the structure and PAE files, skipping any URL that is missing.
    for url_key in ("cifUrl", "paeDocUrl"):
        file_url = api_data.get(url_key, "")
        if file_url:
            download_file(file_url)

    # Append data to the list
    data_list.append([uniprot_description, uniprot_accession, organism_scientific_name, model_created_date, reviewed])

# Specify the number of threads
num_threads = 4

# Use ThreadPoolExecutor to parallelize the downloading process
with ThreadPoolExecutor(max_workers=num_threads) as executor:
    pending = {executor.submit(process_entry, entry): entry for entry in all_results}
    # Collect every result so a failure inside a worker is reported instead
    # of being discarded; one bad entry must not stop the remaining ones.
    for future, entry in pending.items():
        try:
            future.result()
        except Exception as exc:
            print(f"Warning: could not process {entry}: {exc}")

# Specify the CSV file path
csv_file_path = '/content/drive/MyDrive/AFDB_API_files/summary.csv'

# Writing data to CSV (explicit encoding so the output does not depend on the
# platform default)
with open(csv_file_path, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)

    # Write header
    writer.writerow(['Uniprot Description', 'Uniprot Accession', 'Organism Scientific Name', 'Model Created Date', 'Reviewed'])

    # Write data
    writer.writerows(data_list)

print(f"Data has been downloaded to {csv_file_path}")


Mounted at /content/drive
Data has been downloaded to /content/drive/MyDrive/AFDB_API_files/summary.csv
