# Get Protein Data
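This notebook reads the protein entry IDs listed in the text files of the input folder (`protein_entry_ids` in the run below) and fetches the data for each entry from PDB, together with its UniProt annotations. One JSON file per entry is saved to the output folder (`../protein_data` in the run below).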

In [85]:
import requests
import json
import os
import time
from requests.exceptions import ConnectionError
from datetime import datetime

## Fetch Data from PDB
Fetch data from PDB using the entry IDs from the text files in the input folder (`protein_entry_ids` in the run below).

### Get list of Protein Entry IDs

In [52]:
def read_protein_entry_ids(folder_path):
    """
    Reads the protein entry IDs listed in every `.txt` file of a folder.

    Each line of each file is treated as a comma-separated list of IDs;
    whitespace around every ID is stripped and empty fields are dropped.
    Files are visited in sorted filename order so that the returned list is
    the same on every run (os.listdir alone returns names in arbitrary order).

    Input: folder_path (str) - path to the folder containing the files
    Output: protein_entry_ids (list) - entry IDs in file order, then line order;
            duplicates are kept
    """
    protein_entry_ids = []
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.txt'):
            continue
        full_path = os.path.join(folder_path, filename)
        with open(full_path, 'r') as file:
            for line in file:
                # Split on commas, trim each field, and keep the non-empty ones
                fields = (field.strip() for field in line.split(','))
                protein_entry_ids.extend(field for field in fields if field)

    return protein_entry_ids

### Make API call to PDB for each Protein Entry ID
Use the list protein_entry_ids to fetch the protein information for each protein from PDB.

In [78]:
def get_uniprot_annotations(entry_id, entity_id):
    """
    Get UniProt annotations for a given entry ID and entity ID

    Requests the RCSB `core/uniprot` endpoint for the entry/entity pair and
    keeps three sections of the first record in the response.

    Input: entry_id (str) - Protein entry ID
           entity_id (str) - Polymer entity ID
    Output: uniprot_data (dict) - Dictionary with the keys 'rcsb_uniprot_protein',
            'rcsb_uniprot_feature' and 'rcsb_uniprot_annotation' (each {} when
            missing from the record). An empty dictionary is returned when the
            server answers with an HTTP error status or the response holds no
            record.
    Note: only HTTP status errors are handled here; other exceptions raised by
          requests.get propagate to the caller.
    """
    url = f'https://data.rcsb.org/rest/v1/core/uniprot/{entry_id}/{entity_id}'

    try:
        response = requests.get(url)
        response.raise_for_status()  # Raises HTTPError for 4xx/5xx responses
    except requests.HTTPError as e:
        status_code = e.response.status_code
        if status_code == 404:  # 'Not Found' gets its own message
            print(f"Data for {entry_id} - {entity_id} not found (404). Skipping...")
        else:
            print(f"Failed to fetch data for {entry_id} - {entity_id}. Response status code: {status_code}")
        return {}  # No data either way

    data = response.json()
    # The payload is indexed as a list of records below. Guard against an empty
    # or differently shaped payload so it cannot crash with IndexError/KeyError.
    if not isinstance(data, list) or not data or not isinstance(data[0], dict):
        print(f"No UniProt record in response for {entry_id} - {entity_id}. Skipping...")
        return {}

    record = data[0]
    wanted_sections = ('rcsb_uniprot_protein', 'rcsb_uniprot_feature', 'rcsb_uniprot_annotation')
    return {section: record.get(section, {}) for section in wanted_sections}


In [79]:
def get_protein_data(entry_id):
    """
    Get protein data for a given entry ID

    Requests the entry record, then for every polymer entity ID listed in it
    requests the polymer entity record and the UniProt annotations.

    Input: entry_id (str) - Protein entry ID
    Output: record (dict) - Dictionary with the keys 'entry_id',
            'polymer_entities' (list of {'entity_id', 'type', 'sequence'}),
            'binding_affinity' and 'uniprot_annotations' (keyed by entity ID)
    Note: the HTTP status of the entry request is not checked; its JSON body is
          used as-is. Entities whose polymer request does not return 200 are
          left out of 'polymer_entities' but still get a UniProt lookup.
    """
    entry_json = requests.get(f"https://data.rcsb.org/rest/v1/core/entry/{entry_id}").json()

    record = {
        'entry_id': entry_id,
        'polymer_entities': [],
        'binding_affinity': entry_json.get('rcsb_binding_affinity', []),
        'uniprot_annotations': {},
    }

    identifiers_key = 'rcsb_entry_container_identifiers'
    if identifiers_key not in entry_json:
        # Nothing to iterate over: return the bare record
        return record

    for polymer_id in entry_json[identifiers_key].get('polymer_entity_ids', []):
        reply = requests.get(f"https://data.rcsb.org/rest/v1/core/polymer_entity/{entry_id}/{polymer_id}")
        if reply.status_code == 200:
            entity_poly = reply.json().get('entity_poly', {})
            record['polymer_entities'].append({
                'entity_id': polymer_id,
                'type': entity_poly.get('type', 'N/A'),
                # Newlines are removed from the one-letter sequence
                'sequence': entity_poly.get('pdbx_seq_one_letter_code', 'N/A').replace('\n', ''),
            })

        # The UniProt lookup runs for every entity ID, whatever the polymer status was
        record['uniprot_annotations'][polymer_id] = get_uniprot_annotations(entry_id, polymer_id)

    return record

### Get data loop

In [117]:
def data_fetcher(input_dir, output_dir, last_print_time=None):
    """
    Fetches protein data for given entry IDs and saves it as JSON.

    Reads the entry IDs from the text files in input_dir and writes one
    `<entry_id>.json` file per entry to output_dir. Entries whose output file
    already exists are skipped, so the function can be re-run to resume.
    Connection errors are retried up to 5 times (5 seconds apart); any other
    exception, or running out of retries, skips the entry WITHOUT writing a
    file for it.

    Input: input_dir (str) - folder with the `.txt` files listing entry IDs
           output_dir (str) - folder for the JSON files (created if missing)
           last_print_time (float or None) - time.time() of the previous
               successful save, used for the 5-minute restart check
    Output: None
    """
    max_retries = 5  # Maximum number of attempts per entry
    os.makedirs(output_dir, exist_ok=True)  # os.listdir/open below need the folder

    # A restart re-reads the ID list and starts the scan over. It is done with a
    # loop rather than a recursive call so the call stack does not grow.
    restart = True
    while restart:
        restart = False
        protein_entry_ids = read_protein_entry_ids(input_dir)

        for entry_id in protein_entry_ids:
            output_file = os.path.join(output_dir, f"{entry_id}.json")

            # Skip if the data has already been fetched
            if os.path.exists(output_file):
                print(f"Data for {entry_id} already exists. Skipping...")
                continue

            # `fetched` records whether THIS entry produced data, so a failed
            # entry can never fall through and save a previous entry's data.
            fetched = False
            protein_data = None
            for attempt in range(max_retries):
                try:
                    protein_data = get_protein_data(entry_id)
                    fetched = True
                    break
                except ConnectionError:
                    # Connection problem: wait, then try again
                    print(f"Connection error for {entry_id}, attempt {attempt + 1}/{max_retries}. Retrying...")
                    time.sleep(5)
                except Exception as e:
                    # Any other exception: do not retry
                    print(f"An error occurred for {entry_id}: {e}. Skipping...")
                    break
            else:
                # The retry loop ran out without a break: every attempt failed
                print(f"Failed to fetch data for {entry_id} after {max_retries} attempts.")

            if not fetched:
                continue  # Skip to the next entry ID

            # Write to a temporary name and rename afterwards, so an interrupted
            # write cannot leave a truncated `<entry_id>.json` that later runs
            # would treat as already fetched.
            temp_file = f"{output_file}.tmp"
            with open(temp_file, 'w') as file:
                json.dump(protein_data, file, indent=4)
            os.replace(temp_file, output_file)

            current_print_time = time.time()
            num_files = len(os.listdir(output_dir))  # Counted after saving, so it includes this file
            print(f"Data successfully saved to {output_file}. Time:{datetime.now().strftime('%m-%d %H:%M:%S')} Total files:{num_files}")

            stalled = last_print_time and current_print_time - last_print_time > 300
            last_print_time = current_print_time
            if stalled:
                print("More than 5 minutes have passed since the last fetch. Re-starting the fetch process...")
                restart = True
                break

    print("Finished fetching data for all protein entry IDs.")

In [122]:
# Read the ID lists from `protein_entry_ids` and write one JSON file per entry to `../protein_data`
data_fetcher(input_dir='protein_entry_ids', output_dir='../protein_data')

Data for 5AK4 already exists. Skipping...
Data for 5AK5 already exists. Skipping...
Data for 5AK6 already exists. Skipping...
Data for 5AKE already exists. Skipping...
Data for 5AKG already exists. Skipping...
Data for 5AKH already exists. Skipping...
Data for 5AKI already exists. Skipping...
Data for 5AKJ already exists. Skipping...
Data for 5AKK already exists. Skipping...
Data for 5AKL already exists. Skipping...
Data for 5AKS already exists. Skipping...
Data for 5AKT already exists. Skipping...
Data for 5AKU already exists. Skipping...
Data for 5AKV already exists. Skipping...
Data for 5AKW already exists. Skipping...
Data for 5AKX already exists. Skipping...
Data for 5AKY already exists. Skipping...
Data for 5AKZ already exists. Skipping...
Data for 5AL0 already exists. Skipping...
Data for 5AL1 already exists. Skipping...
Data for 5AL2 already exists. Skipping...
Data for 5AL3 already exists. Skipping...
Data for 5AL4 already exists. Skipping...
Data for 5AL5 already exists. Skip