In [46]:
from functions_analysis import *
import glob
import pandas as pd
from DockQ.DockQ import load_PDB, run_on_all_native_interfaces
from download_functions import *
import os

In [47]:
# Root directory of the HPC prediction results.
# NOTE(review): within this notebook the constant is only referenced by the
# commented-out summary-file scan in the next cell — confirm it is still needed.
HPC_RESULT_DIR = "/home/markus/MPI_local/HPC_results_full"

In [48]:
# # Create DataFrame from all job data
# results_df_uc = pd.DataFrame(data=find_summary_files(HPC_RESULT_DIR))

# # Print basic information about the DataFrame
# print(f"Total jobs processed: {len(results_df_uc)}")

# results_df_uc['pair_id'] = results_df_uc.apply(create_pair_id, axis=1)

# print(f"jobs before cleaning: {len(results_df_uc)}")
# results_df = clean_results(results_df_uc)
# print(f"jobs after cleaning: {len(results_df)}")

In [49]:
# Hand-curated list of ID pairs (they look like UniProt accessions — TODO confirm).
# NOTE(review): ("P04637", "A0A8I5KU01") and ("Q9UJU2", "A0A2R8YCH5") each appear
# twice; confirm whether the duplicates are intentional.
# NOTE(review): no code visible in this notebook reads this list.
id_list_looks_good = [
    ("Q13285", "A0A2R8YCH5"),
    ("P04637", "A0A8I5KU01"),
    ("P04637", "A0A8I5KU01"),
    ("Q9H3D4", "A0A8I5KU01"),
    ("Q8NHM5", "A1YPR0"),
    ("Q9UJU2", "A0A2R8YCH5"),
    ("Q9UJU2", "A0A2R8YCH5"),
    ("Q6SJ96", "O14981"),
    ("Q9NRY4", "O00750"),
    ("Q6ZRS2", "A0A8V8TQN3")
]

In [50]:
# Load every structure-review CSV and stack them into a single DataFrame.
# glob returns matches in arbitrary order, so the list is sorted to keep the
# row order of reviews_df reproducible between runs.
review_files = sorted(glob.glob('/home/markus/MPI_local/production1/structure_reviews/*.csv'))
if not review_files:
    # pd.concat on an empty list fails with an unhelpful "No objects to concatenate".
    raise FileNotFoundError(
        "No review CSV files found in /home/markus/MPI_local/production1/structure_reviews"
    )
reviews_df = pd.concat([pd.read_csv(f) for f in review_files], ignore_index=True)

In [51]:
# Keep only the reviews whose comment is exactly 'looks good'.
good_structs = reviews_df[reviews_df['comment'] == 'looks good']

# Unique PDB IDs among the good structures. Missing IDs are dropped up front so
# the "Found ..." count matches the number of downloads actually attempted.
unique_pdb_ids = good_structs['pdb_id'].dropna().unique()
print(f"Found {len(unique_pdb_ids)} unique PDB IDs to download")

# Define download directory
pdb_download_dir = "/home/markus/MPI_local/data/PDB"

# Download each PDB structure (function handles duplicate checking)
downloaded_count = 0
failed_count = 0

for pdb_id in unique_pdb_ids:
    # A truthy result is counted as a success, anything else as a failure.
    result = download_pdb_structure(pdb_id, pdb_download_dir)
    if result:
        downloaded_count += 1
    else:
        failed_count += 1

print("\nSummary:")
print(f"Successfully processed: {downloaded_count}")
print(f"Failed downloads: {failed_count}")
print(f"Total processed: {len(unique_pdb_ids)}")

Found 12 unique PDB IDs to download
File 3tx7.cif already exists at /home/markus/MPI_local/data/PDB/3tx7.cif
File 8rci.cif already exists at /home/markus/MPI_local/data/PDB/8rci.cif
File 7z71.cif already exists at /home/markus/MPI_local/data/PDB/7z71.cif
File 9gnb.cif already exists at /home/markus/MPI_local/data/PDB/9gnb.cif
File 8uah.cif already exists at /home/markus/MPI_local/data/PDB/8uah.cif
File 3ouw.cif already exists at /home/markus/MPI_local/data/PDB/3ouw.cif
File 1g3j.cif already exists at /home/markus/MPI_local/data/PDB/1g3j.cif
File 3oc3.cif already exists at /home/markus/MPI_local/data/PDB/3oc3.cif
File 5m6u.cif already exists at /home/markus/MPI_local/data/PDB/5m6u.cif
File 9caf.cif already exists at /home/markus/MPI_local/data/PDB/9caf.cif
File 1jpw.cif already exists at /home/markus/MPI_local/data/PDB/1jpw.cif
File 7yui.cif already exists at /home/markus/MPI_local/data/PDB/7yui.cif

Summary:
Successfully processed: 12
Failed downloads: 0
Total processed: 12


In [52]:
# Proteome table, read as a tab-separated file. Its 'Entry' and 'Length'
# columns are what get_job_name looks up.
PROTEOME_PATH = '/home/markus/MPI_local/data/Proteome/uniprotkb_proteome_UP000005640_2025_05_28.tsv'
all_uniprot = pd.read_csv(
    PROTEOME_PATH,
    sep='\t',
    low_memory=False,
)

In [53]:
def get_job_name(id_1: str, id_2: str, df: pd.DataFrame):
    """Build the job name for a pair of IDs.

    Both IDs are upper-cased and put in sorted order. Each one is then looked
    up in the 'Entry' column of ``df`` and the 'Length' of its first matching
    row is taken.

    Args:
        id_1: First ID (any letter case).
        id_2: Second ID (any letter case).
        df: Table with at least the columns 'Entry' and 'Length'.

    Returns:
        ``"<a>_1-<len_a>_<b>_1-<len_b>"`` where ``a`` and ``b`` are the
        lower-cased IDs in sorted order.

    Raises:
        Exception: If one of the IDs has no row in ``df``; the ID that sorts
            first is checked first.
    """
    first_id, second_id = sorted((id_1.upper(), id_2.upper()))

    # Resolve both IDs before touching 'Length', so a missing ID is always
    # reported in preference to any later problem.
    matches = []
    for accession in (first_id, second_id):
        hits = df[df['Entry'] == accession]
        if hits.empty:
            raise Exception(f"Not found in Uniprot df: {accession}")
        matches.append(hits)

    len_first, len_second = (hits['Length'].iloc[0] for hits in matches)
    return f"{first_id.lower()}_1-{len_first}_{second_id.lower()}_1-{len_second}"

In [93]:
import itertools
from typing import Dict, List  # imported explicitly instead of relying on a star import


def get_all_chain_mappings(native_chains, model_chains) -> List[Dict]:
    """Enumerate every one-to-one native -> model chain assignment.

    When the two structures have a different number of chains, every subset of
    the larger chain list with the size of the smaller one is considered, and
    each subset is paired with the smaller list in every possible order.

    Args:
        native_chains: Sized sequence of chain IDs of the native structure.
        model_chains: Sized sequence of chain IDs of the model structure.

    Returns:
        A list of dicts whose keys are native chain IDs and whose values are
        model chain IDs.
    """
    all_mappings = []

    if len(native_chains) >= len(model_chains):
        # Pick which native chains take part. For equal lengths there is
        # exactly one subset (all chains, in their given order).
        for native_subset in itertools.combinations(native_chains, len(model_chains)):
            all_mappings.extend(all_bijective_mappings(native_subset, model_chains))
    else:
        # More model chains than native chains: pick which model chains take part.
        for model_subset in itertools.combinations(model_chains, len(native_chains)):
            all_mappings.extend(all_bijective_mappings(native_chains, model_subset))
    return all_mappings

def all_bijective_mappings(A, B):
    """
    Build one dictionary per ordering of B, pairing the elements of A with the
    elements of that ordering position by position.

    A and B are expected to have the same length; this is not checked, and
    zip() stops at the shorter of the two.
    """
    mappings = []
    for ordering in itertools.permutations(B):
        # Pair the i-th element of A with the i-th element of this ordering.
        mappings.append(dict(zip(A, ordering)))
    return mappings

In [None]:
# Sanity checks for get_all_chain_mappings (keys are native chains, values are model chains).

# Case 1: native_chains and model_chains have the same length
native_chains = ['A', 'B']
model_chains = ['X', 'Y']
mappings = get_all_chain_mappings(native_chains, model_chains)
assert mappings == [{'A': 'X', 'B': 'Y'}, {'A': 'Y', 'B': 'X'}]

# Case 2: more native chains than model chains -> every 2-chain subset of the native chains
native_chains = ['A', 'B', 'C']
model_chains = ['X', 'Y']
mappings = get_all_chain_mappings(native_chains, model_chains)
assert mappings == [{'A': 'X', 'B': 'Y'}, {'A': 'Y', 'B': 'X'}, {'A': 'X', 'C': 'Y'}, {'A': 'Y', 'C': 'X'}, {'B': 'X', 'C': 'Y'}, {'B': 'Y', 'C': 'X'}]

# Case 3: more model chains than native chains -> every 2-chain subset of the model chains
native_chains = ['A', 'B']
model_chains = ['X', 'Y', 'Z']
mappings = get_all_chain_mappings(native_chains, model_chains)
assert mappings == [{'A': 'X', 'B': 'Y'}, {'A': 'Y', 'B': 'X'}, {'A': 'X', 'B': 'Z'}, {'A': 'Z', 'B': 'X'}, {'A': 'Y', 'B': 'Z'}, {'A': 'Z', 'B': 'Y'}]

In [94]:
# Directory with the native structures, stored as <pdb_id>.cif.
NATIVE_PATH_PREFIX = "/home/markus/MPI_local/data/PDB/"
# Directory with one <job_name>/<job_name>/<job_name>_model.cif per prediction job.
MODEL_PATH_PREFIX = "/home/markus/MPI_local/HPC_results_full/all/"

# For every 'looks good' review, score the predicted model against its native
# structure with DockQ, once per possible native -> model chain assignment.
for index, row in good_structs.iterrows():
    # Rows without a PDB ID have no native structure to compare against;
    # the download cell skips them for the same reason.
    if pd.isna(row['pdb_id']):
        continue

    # The first '|'-separated field of query_x / query_y is used as the lookup ID.
    job_name = get_job_name(row['query_x'].split('|')[0], row['query_y'].split('|')[0], all_uniprot)
    model_path = f'{MODEL_PATH_PREFIX}{job_name}/{job_name}/{job_name}_model.cif'
    native_path = f'{NATIVE_PATH_PREFIX}{row["pdb_id"].lower()}.cif'

    # Check if both paths exist before loading
    if not os.path.exists(model_path):
        print(f"Model path does not exist: {model_path}")
        continue

    if not os.path.exists(native_path):
        print(f"Native path does not exist: {native_path}")
        continue

    model = load_PDB(model_path)
    native = load_PDB(native_path)
    # Each list is read from the structure it is named after. Previously the
    # two names were crossed and then passed swapped again, which only worked
    # by double negation.
    native_chains = [chain.id for chain in native]
    model_chains = [chain.id for chain in model]
    if len(native_chains) > 2:
        print("Warning: Native structure has more than two chains!")
    # chain_map keys are native chain IDs, values are model chain IDs.
    for chain_map in get_all_chain_mappings(native_chains, model_chains):
        print(run_on_all_native_interfaces(model, native, chain_map=chain_map))
    print('------------------')

({'AB': {'DockQ': 0.01934927647516137, 'F1': 0.024390243902439025, 'iRMSD': 17.445872039673517, 'LRMSD': 49.21055972888123, 'fnat': 0.021739130434782608, 'nat_correct': 1, 'nat_total': 46, 'fnonnat': 0.9722222222222222, 'nonnat_count': 35, 'model_total': 36, 'clashes': 0, 'len1': 504, 'len2': 218, 'class1': 'receptor', 'class2': 'ligand', 'is_het': False, 'chain1': 'A', 'chain2': 'B', 'chain_map': {'A': 'A', 'B': 'B'}}}, 0.01934927647516137)
({'AB': {'DockQ': 0.0078480513048142, 'F1': 0.0, 'iRMSD': 44.554167068765736, 'LRMSD': 56.137978143520385, 'fnat': 0, 'nat_correct': 0, 'nat_total': 46, 'fnonnat': 1.0, 'nonnat_count': 4, 'model_total': 4, 'clashes': 0, 'len1': 504, 'len2': 218, 'class1': 'receptor', 'class2': 'ligand', 'is_het': False, 'chain1': 'B', 'chain2': 'A', 'chain_map': {'A': 'B', 'B': 'A'}}}, 0.0078480513048142)
------------------
({'AE': {'DockQ': 0.00796453151989812, 'F1': 0.0, 'iRMSD': 39.2178085620992, 'LRMSD': 56.111286120394226, 'fnat': 0, 'nat_correct': 0, 'nat_tot

In [12]:
# Manual spot check of a single model / native pair.

# native:model chain map dictionary for two interfaces
chain_map = {"A": "A", "B": "B"}

native = load_PDB("/home/markus/MPI_local/data/PDB/3ouw.cif")
model = load_PDB(
    "/home/markus/MPI_local/HPC_results_full/batch_37/a0a2r8ych5_1-779_q9uju2_1-399/a0a2r8ych5_1-779_q9uju2_1-399/a0a2r8ych5_1-779_q9uju2_1-399_model.cif"
)

# returns a dictionary containing the results and the total DockQ score
run_on_all_native_interfaces(model, native, chain_map=chain_map)

({'AB': {'DockQ': 0.661056892654276,
   'F1': 0.7236180904522613,
   'iRMSD': 2.024376351186563,
   'LRMSD': 3.157620619347018,
   'fnat': 0.75,
   'nat_correct': 72,
   'nat_total': 96,
   'fnonnat': 0.30097087378640774,
   'nonnat_count': 31,
   'model_total': 103,
   'clashes': 0,
   'len1': 496,
   'len2': 28,
   'class1': 'receptor',
   'class2': 'ligand',
   'is_het': False,
   'chain1': 'A',
   'chain2': 'B',
   'chain_map': {'A': 'A', 'B': 'B'}}},
 0.661056892654276)