# In [12]:
from Bio.PDB import PDBParser
import pandas as pd
import requests
import os
import sqlite3
import json
import gc
# Do not truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Folder names for downloaded PDB files; master_function stores files for rows
# with Is_TF == 1 in "pdb_TF_files" and for rows with Is_TF == 0 in
# "pdb_non_TF_files".
tf_folder = "pdb_TF_files"
non_tf_folder = "pdb_non_TF_files"

# Create both folders up front; exist_ok=True makes re-running this cell safe.
os.makedirs(tf_folder, exist_ok=True)
os.makedirs(non_tf_folder, exist_ok=True)

# Module-level SQLite connection and cursor. `conn` is the connection that
# master_function passes to insert_protein_summary_results_into_SQLite.
conn = sqlite3.connect("dissertation_SQLitev1.db")
cursor = conn.cursor()

# DDL for the per-protein summary table. The column names match the keys of the
# results dictionary built by analyse_disorder_of_whole_protein when metadata is
# supplied. UniProt_id is the primary key, so appending a second row for the
# same accession fails. Only DBD, the two terminal pLDDT means and
# disordered_regions may be NULL.
create_table_SQLite = """
CREATE TABLE IF NOT EXISTS protein_summary_table (
    UniProt_id TEXT PRIMARY KEY,
    Is_TF INTEGER NOT NULL,
    Entry_Name TEXT NOT NULL,
    DBD TEXT,
    Length_UniProt INTEGER NOT NULL,
    Length_AlphaFold INTEGER NOT NULL,
    AlphaFold_Species TEXT NOT NULL,
    AlphaFold_UniProt_id TEXT NOT NULL,
    AlphaFold_Entry_Name TEXT NOT NULL,
    AlphaFold_is_reference_proteome INTEGER NOT NULL,
    average_pLDDT_for_entire_protein REAL NOT NULL,
    num_disordered_res INTEGER NOT NULL,
    num_ordered_res INTEGER NOT NULL,
    percent_disordered_res_for_entire_protein REAL NOT NULL,
    N_terminal_IDR_presence INTEGER NOT NULL,
    N_IDR_length_over_10 INTEGER NOT NULL,
    C_terminal_IDR_presence INTEGER NOT NULL,
    C_IDR_length_over_10 INTEGER NOT NULL,
    Disordered_tail_N_or_C_presence INTEGER NOT NULL,
    Disordered_tails_N_and_C_presence INTEGER NOT NULL,
    n_IDR_pLDDT_mean REAL,
    c_IDR_pLDDT_mean REAL,
    number_of_dis_regions_over_10_res INTEGER NOT NULL,
    disordered_regions TEXT,
    average_length_dis_region REAL NOT NULL,
    longest_length_dis_region INTEGER NOT NULL,
    shortest_length_dis_region INTEGER NOT NULL
    
    
);

"""

# Create the table if it is missing; IF NOT EXISTS keeps re-runs harmless.
cursor.execute(create_table_SQLite)
conn.commit()


# Shared parser instance used by parse_pdb_file. QUIET=True is Biopython's
# switch for suppressing warnings emitted while a structure is being built.
parser = PDBParser(QUIET=True)



def fetch_protein_pdb_file(uniprot_acc, timeout=30):
    """Look up the AlphaFold PDB download URL for a UniProt accession.

    Queries ``https://alphafold.ebi.ac.uk/api/prediction/<accession>`` and
    returns the ``pdbUrl`` field of the first entry in the JSON response.

    Args:
        uniprot_acc: UniProt accession to look up.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The PDB URL as a string, or ``None`` when the request fails, the
        status code is not 200, or the payload has no usable first entry.
        Every failure is reported with a printed message.
    """
    url = f"https://alphafold.ebi.ac.uk/api/prediction/{uniprot_acc}"
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as error:
        # Connection problems and timeouts are reported like HTTP errors so
        # callers only ever have to handle the ``None`` result.
        print(f"Error fetching {uniprot_acc}: {error}")
        return None

    if response.status_code != 200:
        print(f"Error fetching {uniprot_acc}: {response.status_code}")
        return None

    try:
        return response.json()[0]["pdbUrl"]
    except (ValueError, IndexError, KeyError, TypeError) as error:
        # ValueError covers invalid JSON; the others cover an empty list or an
        # entry that lacks the expected field.
        print(f"Error fetching {uniprot_acc}: unexpected response ({error!r})")
        return None


def fetch_AlphaFold_data(uniprot_acc, timeout=30):
    """Fetch AlphaFold metadata and the PDB URL for a UniProt accession.

    Queries ``https://alphafold.ebi.ac.uk/api/prediction/<accession>`` and
    reads the first entry of the JSON response.

    Args:
        uniprot_acc: UniProt accession to look up.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        A dictionary with the keys ``AlphaFold_Species``,
        ``AlphaFold_UniProt_id``, ``AlphaFold_Entry_Name``,
        ``AlphaFold_is_reference_proteome`` (converted to ``int``) and
        ``AlphaFold_pdbUrl``; or ``None`` when the request fails, the status
        code is not 200, or the payload is missing any of those fields.
        Every failure is reported with a printed message.
    """
    url = f"https://alphafold.ebi.ac.uk/api/prediction/{uniprot_acc}"
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as error:
        # Connection problems and timeouts are reported like HTTP errors so
        # callers only ever have to handle the ``None`` result.
        print(f"Error fetching {uniprot_acc}: {error}")
        return None

    if response.status_code != 200:
        print(f"Error fetching {uniprot_acc}: {response.status_code}")
        return None

    try:
        entry = response.json()[0]
        return {
            "AlphaFold_Species": entry["organismScientificName"],
            "AlphaFold_UniProt_id": entry["uniprotAccession"],
            "AlphaFold_Entry_Name": entry["uniprotId"],
            "AlphaFold_is_reference_proteome": int(entry["isReferenceProteome"]),
            "AlphaFold_pdbUrl": entry["pdbUrl"],
        }
    except (ValueError, IndexError, KeyError, TypeError) as error:
        # ValueError covers invalid JSON; the others cover an empty list, a
        # missing field, or a null reference-proteome flag.
        print(f"Error fetching {uniprot_acc}: unexpected response ({error!r})")
        return None


def download_pdb_file(pdb_URL, pdb_file_name, timeout=60):
    """Download a PDB file and write it to disk.

    Args:
        pdb_URL: URL of the PDB file. ``None`` or an empty string (what the
            lookup helpers yield when no model was found) is treated as a
            failed download instead of being passed on to ``requests``.
        pdb_file_name: Path the response body is written to, in binary mode.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        ``True`` when the file was written, ``False`` when there was no URL,
        the request failed, or the status code was not 200. Nothing is
        written to disk on failure.
    """
    if not pdb_URL:
        print(f"Failed to download the file for {pdb_file_name}. No download URL available")
        return False

    try:
        response = requests.get(pdb_URL, timeout=timeout)
    except requests.RequestException as error:
        print(f"Failed to download the file for {pdb_file_name}. Request error: {error}")
        return False

    if response.status_code != 200:
        print(f"Failed to download the file for {pdb_file_name}. Status code: {response.status_code}")
        return False

    with open(pdb_file_name, "wb") as file:
        file.write(response.content)
    print(f"file {pdb_file_name} successfully downloaded")
    return True

        
def parse_pdb_file(pdb_file_path):
    """Build a per-residue table from a PDB file.

    The file is parsed with the module-level ``parser`` under the structure id
    "AlphaFold". Every residue that has a "CA" atom contributes one row; the
    B-factor of that atom is stored in the ``pLDDT`` column.

    Args:
        pdb_file_path: Path of the PDB file to read.

    Returns:
        A DataFrame with the columns ``residue``, ``chain``, ``residue_id``
        and ``pLDDT``, one row per residue, in file iteration order across
        all models and chains. The frame has no columns when no residue with
        a "CA" atom is found.
    """
    structure = parser.get_structure("AlphaFold", pdb_file_path)

    rows = [
        {
            "residue": res.get_resname(),
            "chain": chain.id,
            "residue_id": res.id[1],
            "pLDDT": res["CA"].get_bfactor(),
        }
        for model in structure
        for chain in model
        for res in chain
        if res.has_id("CA")
    ]

    return pd.DataFrame(rows)




def check_IDR_over_10_in_termini(pLDDT_df_n_and_c, threshold_pLDDT_score=68.8, window_size=10):
    """Detect disordered tails at the N- and C-terminus of a protein.

    Residues are ordered by ``residue_id``. A terminus counts as disordered
    when its first ``window_size`` residues (counted from that end) all have
    a pLDDT below ``threshold_pLDDT_score``. The tail then extends inwards
    until the first residue at or above the threshold.

    Args:
        pLDDT_df_n_and_c: DataFrame with ``residue_id`` and ``pLDDT`` columns.
        threshold_pLDDT_score: Residues strictly below this value are treated
            as disordered.
        window_size: Minimum number of consecutive disordered residues, from
            the terminus, needed to call a disordered tail. Inputs with fewer
            residues than this never have a disordered tail.

    Returns:
        A dictionary with:
        ``N_terminal_IDR_presence`` / ``C_terminal_IDR_presence`` (0 or 1),
        ``N_IDR_length_over_10`` / ``C_IDR_length_over_10`` (tail length, 0
        when absent), ``Disordered_tail_N_or_C_presence``,
        ``Disordered_tails_N_and_C_presence`` (0 or 1), and
        ``n_IDR_pLDDT_mean`` / ``c_IDR_pLDDT_mean`` (mean pLDDT over the whole
        tail, ``None`` when the tail is absent).

    Raises:
        ValueError: If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError("window_size must be at least 1")

    df_sorted = pLDDT_df_n_and_c.sort_values("residue_id")
    scores = df_sorted["pLDDT"]
    below_threshold = (scores < threshold_pLDDT_score).tolist()

    def leading_run(flags):
        # Number of consecutive True values at the start of ``flags``.
        run = 0
        for flag in flags:
            if not flag:
                break
            run += 1
        return run

    n_run = leading_run(below_threshold)
    c_run = leading_run(reversed(below_threshold))

    # "First window_size residues all disordered" is the same as "the
    # uninterrupted run from that end is at least window_size long". This also
    # rules out inputs that are shorter than the window.
    n_terminal_disordered = n_run >= window_size
    c_terminal_disordered = c_run >= window_size

    n_IDR_length_over_10 = n_run if n_terminal_disordered else 0
    c_IDR_length_over_10 = c_run if c_terminal_disordered else 0

    n_IDR_pLDDT_mean = None
    if n_terminal_disordered:
        n_IDR_pLDDT_mean = float(scores.iloc[:n_IDR_length_over_10].mean())

    c_IDR_pLDDT_mean = None
    if c_terminal_disordered:
        # Take exactly the last c_IDR_length_over_10 residues; a reversed
        # slice that stops at -length would leave one residue out.
        c_IDR_pLDDT_mean = float(scores.iloc[-c_IDR_length_over_10:].mean())

    return {
        "N_terminal_IDR_presence": int(n_terminal_disordered),
        "N_IDR_length_over_10": n_IDR_length_over_10,
        "C_terminal_IDR_presence": int(c_terminal_disordered),
        "C_IDR_length_over_10": c_IDR_length_over_10,
        "Disordered_tail_N_or_C_presence": int(n_terminal_disordered or c_terminal_disordered),
        "Disordered_tails_N_and_C_presence": int(n_terminal_disordered and c_terminal_disordered),
        "n_IDR_pLDDT_mean": n_IDR_pLDDT_mean,
        "c_IDR_pLDDT_mean": c_IDR_pLDDT_mean,
    }


def _disordered_region_lengths(sorted_residue_ids, min_length):
    """Return the lengths of runs of consecutive ids that reach min_length."""
    lengths = []
    run_length = 0
    previous_id = None
    for residue_id in sorted_residue_ids:
        if previous_id is not None and residue_id == previous_id + 1:
            run_length += 1
        else:
            # A gap (or the very first residue) closes the current run.
            if run_length >= min_length:
                lengths.append(run_length)
            run_length = 1
        previous_id = residue_id

    # The last run is not followed by a gap, so close it explicitly.
    if run_length >= min_length:
        lengths.append(run_length)
    return lengths


def analyse_disorder_of_whole_protein(pLDDT_df, additional_dict=None):
    """Summarise pLDDT-based disorder for one protein as a one-row DataFrame.

    Residues with a pLDDT below 68.8 are treated as disordered. Disordered
    regions are runs of at least 10 disordered residues with consecutive
    ``residue_id`` values. Terminal-tail metrics come from
    ``check_IDR_over_10_in_termini``.

    Args:
        pLDDT_df: Per-residue DataFrame with ``residue_id`` and ``pLDDT``
            columns and at least one row.
        additional_dict: Optional metadata with the keys ``UniProt_id``,
            ``Is_TF``, ``Entry_Name``, ``DBD``, ``Length``,
            ``AlphaFold_Species``, ``AlphaFold_UniProt_id``,
            ``AlphaFold_Entry_Name`` and ``AlphaFold_is_reference_proteome``.

    Returns:
        A one-row DataFrame. With ``additional_dict`` the metadata columns
        come first and ``disordered_regions`` is a JSON string (the form
        stored in SQLite). Without it only the computed metrics are returned
        and ``disordered_regions`` is the plain list of region lengths.

    Raises:
        ValueError: If ``pLDDT_df`` has no rows or lacks a required column.
    """
    required_columns = {"residue_id", "pLDDT"}
    if pLDDT_df.empty or not required_columns.issubset(pLDDT_df.columns):
        # Fail with a clear message instead of a KeyError or ZeroDivisionError
        # from deeper inside the computation.
        raise ValueError(
            "pLDDT_df must contain at least one residue with 'residue_id' and 'pLDDT' columns"
        )

    termini_function_results = check_IDR_over_10_in_termini(pLDDT_df)

    disorder_threshold = 68.8
    min_length = 10

    disordered_residues = pLDDT_df.loc[pLDDT_df["pLDDT"] < disorder_threshold]
    disordered_residues = disordered_residues.sort_values("residue_id")
    segments = _disordered_region_lengths(
        disordered_residues["residue_id"].tolist(), min_length
    )

    total_residues = len(pLDDT_df)
    num_disordered_res = len(disordered_residues)
    average_pLDDT_for_entire_protein = pLDDT_df["pLDDT"].mean()

    with_metadata = additional_dict is not None
    termini_keys = (
        "N_terminal_IDR_presence",
        "N_IDR_length_over_10",
        "C_terminal_IDR_presence",
        "C_IDR_length_over_10",
        "Disordered_tail_N_or_C_presence",
        "Disordered_tails_N_and_C_presence",
        "n_IDR_pLDDT_mean",
        "c_IDR_pLDDT_mean",
    )

    # Metrics shared by both output layouts, in their final column order.
    disorder_metrics = {
        "num_disordered_res": num_disordered_res,
        "num_ordered_res": total_residues - num_disordered_res,
        "percent_disordered_res_for_entire_protein": (num_disordered_res / total_residues) * 100,
    }
    for key in termini_keys:
        disorder_metrics[key] = termini_function_results[key]
    disorder_metrics.update({
        "number_of_dis_regions_over_10_res": len(segments),
        # The SQLite column is TEXT, so the list is serialised for that layout.
        "disordered_regions": json.dumps(segments) if with_metadata else segments,
        "average_length_dis_region": sum(segments) / len(segments) if segments else 0,
        "longest_length_dis_region": max(segments) if segments else 0,
        "shortest_length_dis_region": min(segments) if segments else 0,
    })

    if with_metadata:
        results_dict = {
            "UniProt_id": additional_dict["UniProt_id"],
            "Is_TF": additional_dict["Is_TF"],
            "Entry_Name": additional_dict["Entry_Name"],
            "DBD": additional_dict["DBD"],
            "Length_UniProt": additional_dict["Length"],
            "Length_AlphaFold": total_residues,
            "AlphaFold_Species": additional_dict["AlphaFold_Species"],
            "AlphaFold_UniProt_id": additional_dict["AlphaFold_UniProt_id"],
            "AlphaFold_Entry_Name": additional_dict["AlphaFold_Entry_Name"],
            "AlphaFold_is_reference_proteome": additional_dict["AlphaFold_is_reference_proteome"],
            "average_pLDDT_for_entire_protein": average_pLDDT_for_entire_protein,
            **disorder_metrics,
        }
    else:
        results_dict = {
            "average_pLDDT_for_entire_protein": average_pLDDT_for_entire_protein,
            "Length_AlphaFold": total_residues,
            **disorder_metrics,
        }

    return pd.DataFrame([results_dict])



def extract_UniProt_DBD_isTF_Entry_Name_and_Length(input_df_row):
    """Copy the UniProt fields of one input row into a plain dictionary.

    Args:
        input_df_row: Mapping-like row (for example a DataFrame row) that can
            be indexed by ``UniProt_id``, ``DBD``, ``Is_TF``, ``Entry_Name``
            and ``Length``.

    Returns:
        A dictionary holding exactly those five keys, in that order.
    """
    wanted_fields = ("UniProt_id", "DBD", "Is_TF", "Entry_Name", "Length")
    return {field: input_df_row[field] for field in wanted_fields}

def insert_protein_summary_results_into_SQLite(input_df, conn, table_name="protein_summary_table"):
    """Append a protein summary DataFrame to a SQLite table.

    Args:
        input_df: DataFrame to append; its first ``UniProt_id`` value is used
            in the printed status messages.
        conn: Database connection handed to ``DataFrame.to_sql`` and
            committed afterwards.
        table_name: Name of the target table.

    Returns:
        ``True`` on success. On failure the caught exception object is
        returned (not raised) after an error message has been printed, so
        callers must compare the result against ``True``.
    """
    # Bound before the try block so the error message can always be built,
    # even when reading the accession is the step that fails.
    UniProt = "<unknown accession>"
    try:
        UniProt = input_df["UniProt_id"].iat[0]
        input_df.to_sql(table_name, conn, if_exists="append", index=False)
        conn.commit()
    except Exception as e:
        print(f"""

        
        unable to add {UniProt} to {table_name}: {e}

        
        """)
        return e

    print(f"data for {UniProt} added to {table_name}")
    return True

def combine_metadata_dictionaries(UniProt_dict, AlphaFold_dict):
    """Merge the UniProt and AlphaFold metadata of one protein.

    Args:
        UniProt_dict: Mapping with the keys ``UniProt_id``, ``DBD``,
            ``Is_TF``, ``Entry_Name`` and ``Length``.
        AlphaFold_dict: Mapping with the keys ``AlphaFold_Species``,
            ``AlphaFold_UniProt_id``, ``AlphaFold_Entry_Name`` and
            ``AlphaFold_is_reference_proteome``. Any other keys it holds are
            not copied.

    Returns:
        A new dictionary with the five UniProt keys followed by the four
        AlphaFold keys.
    """
    uniprot_fields = ("UniProt_id", "DBD", "Is_TF", "Entry_Name", "Length")
    alphafold_fields = (
        "AlphaFold_Species",
        "AlphaFold_UniProt_id",
        "AlphaFold_Entry_Name",
        "AlphaFold_is_reference_proteome",
    )

    merged = {name: UniProt_dict[name] for name in uniprot_fields}
    for name in alphafold_fields:
        merged[name] = AlphaFold_dict[name]
    return merged
    








def _print_skip_notice(accession, detail):
    """Print the padded "Skipping ..." notice for a row that is not processed."""
    pad = " " * 20
    print(f"\n\n{pad}\n{pad}Skipping {accession} - {detail}\n{pad}\n{pad}\n{pad}")


def _print_batch_summary(failures):
    """Print how many DataFrame rows could not be stored, and why."""
    if failures:
        pad = " " * 16
        bad_accessions = "\n".join(
            f"{number}: {failure['accession']}, {failure['reason']}"
            for number, failure in enumerate(failures, start=1)
        )
        print(f"\n{pad}{len(failures)} protein/s failed to be added to SQLite\n{pad}{bad_accessions}\n{pad}")
    else:
        print("ALL proteins added successfully")


def _analyse_accession_without_metadata(accession):
    """Download one model into the working directory and analyse it."""
    file_name = f"{accession}.pdb"
    pdb_url = fetch_protein_pdb_file(accession)
    if pdb_url is None:
        raise RuntimeError(f"no AlphaFold PDB URL found for {accession}")
    if not download_pdb_file(pdb_url, file_name):
        # Never fall through to parsing: a file with this name left over from
        # an earlier run would otherwise be analysed as if it were fresh.
        raise RuntimeError(f"could not download the PDB file for {accession}")
    return analyse_disorder_of_whole_protein(parse_pdb_file(file_name))


def _process_summary_row(row, accession):
    """Process one input row; return None on success or the failure reason."""
    is_TF = row["Is_TF"]
    if is_TF == 1:
        folder_name = tf_folder
    elif is_TF == 0:
        folder_name = non_tf_folder
    else:
        # Without this branch the folder of the previous row would be reused.
        _print_skip_notice(accession, f"invalid Is_TF value {is_TF!r}")
        return "invalid Is_TF value"

    row_information = extract_UniProt_DBD_isTF_Entry_Name_and_Length(row)
    file_name = os.path.join(folder_name, f"{accession}.pdb")

    AlphaFold_pdb_and_metadata = fetch_AlphaFold_data(accession)
    if AlphaFold_pdb_and_metadata is None:
        _print_skip_notice(accession, "not found in AlphaFold API call.")
        return "failed API call"

    if AlphaFold_pdb_and_metadata["AlphaFold_UniProt_id"] != accession:
        _print_skip_notice(accession, "wrong UniProt id")
        return "wrong UniProt id"

    if not download_pdb_file(AlphaFold_pdb_and_metadata["AlphaFold_pdbUrl"], file_name):
        _print_skip_notice(accession, "download failed.")
        return "failed download"

    metadata_dict = combine_metadata_dictionaries(row_information, AlphaFold_pdb_and_metadata)
    residue_table = parse_pdb_file(file_name)
    whole_protein_analysis = analyse_disorder_of_whole_protein(residue_table, metadata_dict)
    SQLite_status = insert_protein_summary_results_into_SQLite(whole_protein_analysis, conn)
    # The insert helper returns True or the exception it caught.
    return None if SQLite_status is True else SQLite_status


def master_function(input_data_type):
    """Run the disorder analysis for one accession, a list, or a DataFrame.

    Args:
        input_data_type: One of
            * ``str`` - a single UniProt accession;
            * ``list`` - several accessions;
            * ``pandas.DataFrame`` - one row per protein with the columns
              ``UniProt_id``, ``Is_TF`` (1 or 0), ``DBD``, ``Entry_Name`` and
              ``Length``.

    Returns:
        * ``str`` input: the one-row analysis DataFrame.
        * ``list`` input: a list of one-row analysis DataFrames, in input
          order. A failed lookup or download aborts the whole list so the
          results can never be misaligned with the input.
        * DataFrame input: ``None``. Each row's PDB file is stored in the TF
          or non-TF folder and its summary is appended to SQLite through the
          module-level ``conn``. Rows that fail are skipped, collected and
          listed in a printed summary; one bad row does not stop the batch.
        * Any error outside per-row processing (including an unsupported
          input type) is printed and ``None`` is returned.
    """
    try:
        if isinstance(input_data_type, list):
            return [
                _analyse_accession_without_metadata(accession)
                for accession in input_data_type
            ]

        if isinstance(input_data_type, str):
            return _analyse_accession_without_metadata(input_data_type)

        if isinstance(input_data_type, pd.DataFrame):
            unfound_protein_accessions = []
            for _, row in input_data_type.iterrows():
                accession = row["UniProt_id"]
                try:
                    failure_reason = _process_summary_row(row, accession)
                except Exception as error:
                    # Batch boundary: record the problem and carry on with the
                    # remaining proteins instead of aborting the whole run.
                    _print_skip_notice(accession, f"unexpected error: {error}")
                    failure_reason = error
                finally:
                    # Per-protein tables are released when the helper returns;
                    # collect promptly to keep memory flat on long batches.
                    gc.collect()

                if failure_reason is not None:
                    unfound_protein_accessions.append(
                        {"accession": accession, "reason": failure_reason}
                    )

            _print_batch_summary(unfound_protein_accessions)
            return None

        raise TypeError("Input not a string, list, or dataframe")

    except Exception as e:
        print(f"Error processing {input_data_type}: {e}")
