In [3]:
import requests
import csv
import os

def download_pdb(pdb_id, structures_folder):
    # Define the URL for the PDB file
    url = f"https://files.rcsb.org/download/{pdb_id}.pdb"

    try:
        # Send a request to the RCSB PDB website
        response = requests.get(url)
        response.raise_for_status()  # Check if the request was successful
    except requests.exceptions.HTTPError as e:
        print(f"Error: {e}")
        return

    # Save the PDB file in the 'pdb_files' folder
    file_path = os.path.join(structures_folder, "pdb_files", f"{pdb_id}.pdb")
    with open(file_path, "wb") as pdb_file:
        pdb_file.write(response.content)
    print(f"PDB file for {pdb_id} has been downloaded and saved as '{file_path}'.")

def get_pdb_with_best_resolution(uniprot_id, sequence_length):
    url = f"https://www.uniprot.org/uniprot/{uniprot_id}.txt"
    response = requests.get(url)

    if response.ok:
        data_lines = response.text.splitlines()
        best_pdb_id = None
        best_resolution = float('inf')
        best_length = 0
        chains = "NULL"  # Default value for chains

        for line in data_lines:
            if line.startswith("DR   PDB; "):
                pdb_id, method, resolution, length, chains = parse_pdb_line(line)
                if length == sequence_length and (length > best_length or (length == best_length and resolution < best_resolution)):
                    best_pdb_id = pdb_id
                    best_resolution = resolution
                    best_length = length

        if best_pdb_id:
            return best_pdb_id, best_length, best_resolution, chains
        else:
            return "NULL", "NULL", "NULL", "NULL"
    else:
        print(f"Failed to retrieve data from Uniprot for UniProt ID: {uniprot_id}")
        return None

def parse_pdb_line(line):
    line_parts = line.strip().split("; ")
    pdb_id = line_parts[1]
    method = line_parts[2]
    resolution_str = line_parts[3].split(" ")[0]
    resolution = None
    if resolution_str != '-':
        resolution = float(resolution_str)
    length_parts = line_parts[4].split("=")[1].split("-")
    length = extract_numeric_length(length_parts[1]) - extract_numeric_length(length_parts[0]) + 1
    chain_ids = line_parts[4]
    return pdb_id, method, resolution, length, chain_ids

def extract_numeric_length(length_str):
    numeric_length = ''
    for char in length_str:
        if char.isdigit():
            numeric_length += char
    return int(numeric_length)

# Prompt for CSV file path
file_name = "Gene-Uniprot-Data.csv"

if not os.path.exists(file_name):
    print(f"Warning: '{file_name}' not found in the current directory.")
    file_name = input("Please provide the full path to the csv file with Gene and UniProt ID columns: ")

# Check if the user-provided path exists
if not os.path.exists(file_name):
    print(f"Error: The file '{file_name}' does not exist. Please check the path and try again.")
else:
    # Create output directory
    structures_folder = "structures"

    if os.path.exists(structures_folder):
        print(f"Warning: The folder '{structures_folder}' already exists.")
        structures_folder = input("Please provide a new name for the folder: ")

    os.makedirs(os.path.join(structures_folder, "pdb_files"), exist_ok=True)
    os.makedirs(os.path.join(structures_folder, "alphafold_structures"), exist_ok=True)

    output_file_name = os.path.join(structures_folder, "PDB-Structure-Results.csv")
    unavailable_structures_file = os.path.join(structures_folder, "Unavailable-Structures.csv")
    retrieved_structures_file = os.path.join(structures_folder, "Retrieved-Structure-Details.csv")

    with open(file_name, "r") as csvfile:
        csv_reader = csv.DictReader(csvfile)  # Use DictReader for easier column access
        # Check for required columns
        if "Gene" not in csv_reader.fieldnames or "UniProt ID" not in csv_reader.fieldnames:
            print("Error: The CSV file must contain 'Gene' and 'UniProt ID' columns.")
            exit()

        with open(output_file_name, "w", newline='') as result_csvfile:
            csv_writer = csv.writer(result_csvfile)
            csv_writer.writerow(['Gene', 'UniProt ID', 'Best PDB ID', 'PDB Length', 'Resolution', 'Chains'])

            # List to track unavailable structures
            unavailable_structures = []
            # List to track retrieved structures details
            retrieved_structures = []

            for row in csv_reader:
                gene = row["Gene"]
                uniprot_id = row["UniProt ID"]
                
                print(f"Processing Gene: {gene}, UniProt ID: {uniprot_id}")
                best_pdb_id, best_length, best_resolution, chains = get_pdb_with_best_resolution(uniprot_id, 0)  # sequence_length set to 0

                # Initialize structure retrieval status
                pdb_status = "Not Available"
                alpha_status = "Not Available"

                if best_pdb_id != "NULL":
                    # Assuming a resolution threshold for retrieved structures
                    if best_resolution < 2.0:  
                        print(f"Best PDB ID: {best_pdb_id}")
                        download_pdb(best_pdb_id, structures_folder)
                        pdb_status = "Retrieved"
                    else:
                        print(f"PDB structure for {best_pdb_id} is skipped due to lower structure quality.")
                else:
                    print(f"No PDB ID found or PDB structure is of poor quality.")
                    print(f"Checking AlphaFold Structure through UniProt ID: {uniprot_id}")
                    pdb_url = f"https://alphafold.ebi.ac.uk/files/AF-{uniprot_id}-F1-model_v4.pdb"
                    file_path = os.path.join(structures_folder, "alphafold_structures", f"AF-{uniprot_id}-F1-model_v4.pdb")

                    # Use requests to download the AlphaFold structure
                    response = requests.get(pdb_url)
                    if response.ok:
                        with open(file_path, "wb") as alpha_file:
                            alpha_file.write(response.content)
                        print(f"AlphaFold structure for accession {uniprot_id} downloaded as '{file_path}'.")
                        alpha_status = "Retrieved"
                    else:
                        print(f"Failed to download AlphaFold structure for accession {uniprot_id}.")
                        # Log unavailable structures
                        unavailable_structures.append([gene, uniprot_id, "Not available", "Not available"])

                csv_writer.writerow([gene, uniprot_id, best_pdb_id, best_length, best_resolution, chains])
                
                # Track the retrieval status for structures
                retrieved_structures.append([gene, uniprot_id, pdb_status, alpha_status])

            # Write unavailable structures to CSV if any
            if unavailable_structures:
                with open(unavailable_structures_file, "w", newline='') as unavailable_csvfile:
                    unavailable_csv_writer = csv.writer(unavailable_csvfile)
                    unavailable_csv_writer.writerow(['Gene', 'UniProt ID', 'PDB', 'AlphaFold'])  # Header for unavailable structures
                    unavailable_csv_writer.writerows(unavailable_structures)
                print(f"Unavailable structure details logged in '{unavailable_structures_file}'.")

            # Write retrieved structures status to CSV
            with open(retrieved_structures_file, "w", newline='') as retrieved_csvfile:
                retrieved_csv_writer = csv.writer(retrieved_csvfile)
                retrieved_csv_writer.writerow(['Gene', 'UniProt ID', 'PDB', 'AlphaFold'])  # Header for retrieved structures
                retrieved_csv_writer.writerows(retrieved_structures)
            print(f"Structure details logged in '{retrieved_structures_file}'.")



Please provide the full path to the csv file with Gene and UniProt ID columns:  C:\Users\FA19-BSI-034.cui\Desktop\Structure-Results.csv




Please provide a new name for the folder:  last


Processing Gene: APOE, UniProt ID: P02649
No PDB ID found or PDB structure is of poor quality.
Checking AlphaFold Structure through UniProt ID: P02649
AlphaFold structure for accession P02649 downloaded as 'last\alphafold_structures\AF-P02649-F1-model_v4.pdb'.
Structure details logged in 'last\Retrieved-Structure-Details.csv'.
