In [8]:
import glob
import os
import shutil
import sys
import warnings

import pandas as pd
import seaborn as sns
from Bio import SeqIO

sys.path.append('../../src/utils/')
sys.path.append('../../bin/utils')
sns.set_context("talk")
sns.set_theme(style="white")
warnings.filterwarnings("ignore")

# Structures AF2 
def get_seq(PDBFile):
    """Return the sequence of the first record parsed from a PDB file.

    Parameters
    ----------
    PDBFile : str
        Path to the PDB file; it is parsed with ``SeqIO`` using the
        ``'pdb-atom'`` format.

    Returns
    -------
    str or None
        The first record's sequence as a string, or ``None`` when the parser
        yields no record at all.
    """
    with open(PDBFile, 'r') as handle:
        # Only the first record is of interest; later records are never read.
        first_record = next(iter(SeqIO.parse(handle, 'pdb-atom')), None)
        if first_record is None:
            return None
        return str(first_record.seq)

def get_structure_df(structure_path):
    """Build a table with one row per structure file.

    Parameters
    ----------
    structure_path : iterable of str
        Paths to PDB files laid out as ``.../<family>/<id>.<ext>``.

    Returns
    -------
    pandas.DataFrame
        Columns ``id`` (file name up to its first dot), ``seq`` (result of
        ``get_seq`` for the file) and ``family`` (name of the directory that
        contains the file). Empty input yields an empty frame with the same
        three columns.
    """
    # Collect plain records and build the frame once; appending with
    # df.loc[len(df)] re-allocates the frame on every row.
    records = []
    for structure in structure_path:
        file_name = os.path.basename(structure)
        records.append({
            "id": file_name.split(".")[0],
            "seq": get_seq(structure),
            "family": os.path.basename(os.path.dirname(structure)),
        })
    return pd.DataFrame(records, columns=["id", "seq", "family"])

# Real sequence
def get_real_sequence(path_fasta):
    """Load every record of the given FASTA files into one table.

    Parameters
    ----------
    path_fasta : iterable of str
        Paths to FASTA files named ``<family>.<ext>``.

    Returns
    -------
    pandas.DataFrame
        One row per FASTA record with columns ``id`` (record id), ``seq``
        (record sequence as a string) and ``family`` (file name up to its
        first dot). Empty input yields an empty frame with the same columns.
    """
    # Collect plain records and build the frame once; appending with
    # df.loc[len(df)] re-allocates the frame on every row.
    records = []
    for reference in path_fasta:
        # The family only depends on the file, so compute it once per file.
        family = os.path.basename(reference).split(".")[0]
        for record in SeqIO.parse(reference, "fasta"):
            records.append({"id": record.id, "seq": str(record.seq), "family": family})
    return pd.DataFrame(records, columns=["id", "seq", "family"])

def calculate_percentage_identity(seq1, seq2):
    """Return the percentage of positions at which two sequences are identical.

    Parameters
    ----------
    seq1, seq2 : str or missing value
        Sequences to compare position by position (no alignment is done).

    Returns
    -------
    int or float
        ``0`` when either sequence is missing (NaN/None), when the lengths
        differ, or when the sequences are empty; otherwise the share of
        identical positions scaled to 0-100.
    """
    # A missing sequence on either side cannot match anything.
    if pd.isna(seq1) or pd.isna(seq2):
        return 0
    # Sequences of different length are treated as non-matching. Return here:
    # falling through would score only the zip-truncated common prefix.
    if len(seq1) != len(seq2):
        return 0
    # Two empty sequences: avoid dividing by zero below.
    if len(seq1) == 0:
        return 0

    identical_count = sum(1 for a, b in zip(seq1, seq2) if a == b)
    return (identical_count / len(seq1)) * 100

In [9]:
# Earlier glob pattern for the fetched structures, kept for reference (it relies on a `family` variable):
#path_structures = glob.glob("../../../titration_pdb_leila/pred_pdbs/structures/fetched_preprocessed/UniProtKB/id_0.99_cov_1.0/"+family+"*/pdbs/*.pdb")
# Fetched structure files, laid out as <family>/<id>.pdb
path_structures = glob.glob("../../../pred_pdbs/FETCHED_STRUCTURES/0.99_1/*/*.pdb")
# Reference FASTA files, one per family (<family>.fasta)
path_fasta = glob.glob("../../../fasta/*.fasta")
# One row per FASTA record: id, seq, family
df_sequences  = get_real_sequence(path_fasta)
# One row per structure file: id, seq, family
df_structures = get_structure_df(path_structures)

In [11]:
# Pair each structure sequence (seq_af2) with its reference sequence (seq_ref).
# The outer join keeps entries that are present on only one side.
df_structures = df_structures.merge(
    df_sequences,
    on=["family", "id"],
    how="outer",
    suffixes=("_af2", "_ref"),
)
# Row-wise identity between the reference and the structure sequence.
df_structures["perc_id"] = df_structures.apply(
    lambda row: calculate_percentage_identity(row.seq_ref, row.seq_af2),
    axis=1,
)

In [12]:
# Check whether any entry falls below the identity threshold
IDENTITY_THRESHOLD = 99
missing_sequences = (
    df_structures
    .loc[lambda frame: frame.perc_id < IDENTITY_THRESHOLD]
    .sort_values(by="perc_id", ascending=False)
)
# Families with at least one entry below the threshold
missing_sequences.family.unique()

array(['PF00698', 'PF03727', 'PF00740', 'PF00759', 'PF00285', 'PF01425',
       'PF00331'], dtype=object)

Prepare template files for titration 

In [13]:
# Families in which no entry fell below the identity threshold
full_families = set(df_structures.family.unique()).difference(missing_sequences.family.unique())

In [19]:
# Base directory that the following cells prefix to every output path they write
outdir_for_prep_files = "../../.."

In [20]:
# list_families_fully_retrieved
# Write one fully retrieved family per line. The path is joined with a
# separator so the file lands inside outdir_for_prep_files, and the families
# are sorted so the file content is stable across runs.
families_list_path = os.path.join(outdir_for_prep_files, "list_families_fully_retrieved_uniprot")
with open(families_list_path, "w") as f:
    for family in sorted(full_families):
        f.write(family+"\n")

In [22]:
# Input list pred
# Create the input_list_pred directory if it does not exist
input_list_pred_dir = os.path.join(outdir_for_prep_files, "02-POSTFETCHING", "input_list_pred")
os.makedirs(input_list_pred_dir, exist_ok=True)
for family in full_families:
    df_fam = df_structures[df_structures.family == family]
    with open(os.path.join(input_list_pred_dir, family+"_input_list"), "w") as f:
        # One line per entry: <family>.<id>.pdb
        for family_name, seq_id in zip(df_fam["family"], df_fam["id"]):
            f.write(family_name+"."+seq_id+".pdb")
            f.write("\n")

In [23]:
# Templates for pred
# Create the templates_for_pred directory if it does not exist
templates_for_pred_dir = os.path.join(outdir_for_prep_files, "02-POSTFETCHING", "templates_for_pred")
os.makedirs(templates_for_pred_dir, exist_ok=True)
for family in full_families:
    df_fam = df_structures[df_structures.family == family]
    with open(os.path.join(templates_for_pred_dir, family+".template_list"), "w") as f:
        # One line per entry: ><id>.pdb _P_ <family>.<id>.pdb
        for family_name, seq_id in zip(df_fam["family"], df_fam["id"]):
            f.write(">"+seq_id+".pdb _P_ "+family_name+"."+seq_id+".pdb")
            f.write("\n")

In [26]:
# Prepare pdbs pred
# Copy every structure of a fully retrieved family to pred_pdbs/<family>.<file name>.
# Pure-Python directory creation and copy: no shell quoting issues, and a
# failed copy raises instead of being silently ignored.
pred_pdbs_dir = os.path.join(outdir_for_prep_files, "02-POSTFETCHING", "pred_pdbs")
os.makedirs(pred_pdbs_dir, exist_ok=True)
for structure in path_structures:
    family = os.path.basename(os.path.dirname(structure))
    # Named pdb_filename to avoid shadowing the built-in id()
    pdb_filename = os.path.basename(structure)
    if family in full_families:
        shutil.copy(structure, os.path.join(pred_pdbs_dir, family+"."+pdb_filename))
        print(family)


PF13714
PF13714
PF13714
PF13714
PF13714
PF13714
PF13714
PF13714
PF13714
PF13714
PF13714
PF01026
PF01026
PF01026
PF01026
PF01026
PF01026
PF01026
PF01026
PF01026
PF01026
PF00464
PF00464
PF00464
PF00464
PF00464
PF00464
PF00464
PF00464
PF00464
PF00464
PF00464
PF05544
PF05544
PF05544
PF05544
PF05544
PF05544
PF05544
PF05544
PF05544
PF05544
PF00180
PF00180
PF00180
PF00180
PF00180
PF00180
PF00180
PF00180
PF00180
PF00180
PF00180
PF00180
PF00180
PF00180
PF00180
PF00180
PF00180
PF00180
PF00180
PF00180
PF00180
PF00180
PF00180
PF00180
PF00180
PF00180
PF00180
PF00180
PF00180
PF00180
PF00180
PF00180
PF00180
PF03480
PF03480
PF03480
PF03480
PF03480
PF03480
PF03480
PF03480
PF03480
PF03480
PF03480
PF03480
PF03480
PF03480
PF03480
PF03480
PF03480
PF03480
PF03480
PF03480
PF03480
PF03480
PF03480
PF03480
PF03480
PF03480
PF03480
PF03480
PF03480
PF03480
PF03480
PF03480
PF03480
PF03480
PF03480
PF03480
PF03480
PF03480
PF03480
PF03480
PF03480
PF03480
PF03480
PF03480
PF03480
PF03480
PF03480
PF03480
PF03480
PF03480


In [28]:
# Subset real FASTA/PDB/template and input
# Copy the FASTA file of every fully retrieved family into 02-POSTFETCHING/fasta.
# Pure-Python directory creation and copy: no shell quoting issues, and a
# failed copy raises instead of being silently ignored.
fasta_outdir = os.path.join(outdir_for_prep_files, "02-POSTFETCHING", "fasta")
os.makedirs(fasta_outdir, exist_ok=True)
for fasta in glob.glob("../../../fasta/*.fasta"):
    filename = os.path.basename(fasta)
    family = filename.split(".")[0]
    if family in full_families:
        shutil.copy(fasta, os.path.join(fasta_outdir, filename))

In [29]:
# Copy the PDB files of every fully retrieved family into 02-POSTFETCHING/pdb.
# Pure-Python directory creation and copy: no shell quoting issues, and a
# failed copy raises instead of being silently ignored.
pdb_outdir = os.path.join(outdir_for_prep_files, "02-POSTFETCHING", "pdb")
os.makedirs(pdb_outdir, exist_ok=True)
for pdb in glob.glob("../../../pdb/*.pdb"):
    filename = os.path.basename(pdb)
    family = filename.split(".")[0]
    if family in full_families:
        shutil.copy(pdb, os.path.join(pdb_outdir, filename))

In [30]:
# Copy the template files of every fully retrieved family into 02-POSTFETCHING/templates.
# Pure-Python directory creation and copy: no shell quoting issues, and a
# failed copy raises instead of being silently ignored.
templates_outdir = os.path.join(outdir_for_prep_files, "02-POSTFETCHING", "templates")
os.makedirs(templates_outdir, exist_ok=True)
for template in glob.glob("../../../templates/*.pdb"):
    filename = os.path.basename(template)
    family = filename.split(".")[0]
    if family in full_families:
        shutil.copy(template, os.path.join(templates_outdir, filename))


In [31]:
# Copy the input list of every fully retrieved family into 02-POSTFETCHING/input_list.
# Pure-Python directory creation and copy: no shell quoting issues, and a
# failed copy raises instead of being silently ignored.
input_list_outdir = os.path.join(outdir_for_prep_files, "02-POSTFETCHING", "input_list")
os.makedirs(input_list_outdir, exist_ok=True)
# Loop variable named input_list_file to avoid shadowing the built-in input()
for input_list_file in glob.glob("../../../input_list/*.list"):
    filename = os.path.basename(input_list_file)
    family = filename.split(".")[0]
    if family in full_families:
        shutil.copy(input_list_file, os.path.join(input_list_outdir, filename))