In [1]:
import sys
from pathlib import Path
import pickle
import pandas as pd
import mdtraj as md
import networkx as nx
import logging
from collections import namedtuple
from concurrent.futures import ProcessPoolExecutor
import concurrent.futures as cf
import glob
import csv
from collections import defaultdict
import pandas as pd
import subprocess as sp
import shutil as sh
import MDAnalysis as mda
from itertools import combinations_with_replacement
from MDAnalysis.core.groups import AtomGroup
from MDAnalysis.analysis import distances

from Bio.PDB import PDBParser
from Bio.PDB.PDBIO import PDBIO
from Bio.SeqUtils import seq1, seq3
sys.path.insert(0, "/home/pbarletta/freesasa-python/build/lib.linux-x86_64-3.10")
import freesasa

# Lightweight record types used by the cells below.
# Chains: the two binding partners of a complex, each stored under its role.
Chains = namedtuple("Chains", "antibody antigen")
# InterfaceAtoms: same two roles as Chains (not used in the cells visible here).
InterfaceAtoms = namedtuple("InterfaceAtoms", "antibody antigen")
# Atom: per-atom record; the cluster cell reads its `resSeq_str` and `chain_ID` fields.
Atom = namedtuple(
    "Atom",
    "index serial element is_sidechain resSeq resSeq_str resname chain_ID chain_type CDR",
)

from abag_interactions_hydrophobic import *
from interactions_hydrophobic import *
           
# Filesystem layout of the project. Child directories are derived from their
# parent so that each absolute location is written only once; the resulting
# paths are identical to the previously hard-coded ones.
project_dir = Path("/home/pbarletta/labo/22/migue")
data_dir = project_dir / "data"
expdata_dir = data_dir / "AB-Bind-Database-master"
rtdo_dir = project_dir / "rtdos"
hydro_dir = rtdo_dir / "hydro"
aux_dir = project_dir / "aux"
pdbs_dir = project_dir / "run" / "pdbs"
mutpdbs_dir = project_dir / "run" / "mut_pdbs"
bin_dir = Path("/home/pbarletta/labo/22/locuaz/bin")
evo_bin = bin_dir / "evoef2" / "EvoEF2"

# Three-letter residue names. This list used to be bound to `AA_LIST` and was
# immediately overwritten by the one-letter tuple below (a dead store); it is
# kept under its own name so that both alphabets remain available.
AA_LIST_3LETTER = (
    "ALA", "ARG", "ASN", "ASP", "CYS", "GLU", "GLN", "GLY", "HIS", "ILE",
    "LEU", "LYS", "MET", "PHE", "PRO", "SER", "THR", "TRP", "TYR", "VAL",
)
# One-letter residue codes: the value `AA_LIST` has always ended up with.
AA_LIST = (
    "D", "E", "S", "T", "R", "N", "Q", "H", "K", "A",
    "G", "I", "M", "L", "V", "P", "F", "W", "Y", "C",
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# AB-Bind experimental data table (the CSV is read as latin-1).
expdata_df = pd.read_csv(expdata_dir / "AB-Bind_experimental_data.csv", encoding='latin-1')
# Sorted, de-duplicated PDB identifiers found in the table.
pdb_list = tuple(sorted(set(expdata_df['#PDB'])))

# PDB id -> Chains. "Partners(A_B)" holds two chain groups joined by "_":
# the first is stored as the antigen, the second as the antibody.
# Rows sharing a PDB id overwrite each other, so the last row wins.
abag_chains = {}
for pdb_id, partners in zip(expdata_df["#PDB"], expdata_df["Partners(A_B)"]):
    antigen_group, antibody_group = partners.split("_")
    abag_chains[pdb_id] = Chains(antibody=antibody_group, antigen=antigen_group)

# Mutant identifiers, one per line of mut_list.txt.
with open(mutpdbs_dir / "mut_list.txt", "r") as f:
    mut_list = [line.strip() for line in f]

# Group the mutants by parent structure: the text before the first '-'.
pdbs_mut = defaultdict(list)
for mut in mut_list:
    pdbs_mut[mut.split('-', 1)[0]].append(mut)

In [19]:
# Pick one complex, by its position in `pdbs_mut`, for quick trial runs.
test_id = 25
test_pdb = list(pdbs_mut)[test_id]
# Single-entry mapping holding only that complex and its mutants.
prueba = defaultdict(list, {test_pdb: pdbs_mut[test_pdb]})
test_pdb

'3NPS'

In [14]:
# For every complex, collect the mutants with at least one mutated residue
# whose SASA in the whole complex (truncated to int) is lower than 95% of its
# SASA in its own partner alone (also truncated to int).
# To try a single complex, iterate over `prueba.items()` instead.
pdbs_interface_mut = {}
for pdb_id, mutas in pdbs_mut.items():
    print(f"{pdb_id=}", flush=True)
    pdb_filename = pdbs_dir / (pdb_id + ".pdb")
    pdbs_interface_mut[pdb_id] = []
    interface_mutants = pdbs_interface_mut[pdb_id]
    complex_chains = abag_chains[pdb_id]
    ag_chains = complex_chains.antigen
    ab_chains = complex_chains.antibody

    # Chain groups of both partners, joined with '+'.
    grupos = f"{ag_chains}+{ab_chains}"

    # NOTE(review): `getStructures` comes from the locally built freesasa; the
    # code below treats structs[0] as the whole complex, structs[1] as the
    # antigen group and structs[2] as the antibody group -- confirm.
    structs = freesasa.getStructures(str(pdb_filename), {"separate-chains": False},
        selection=grupos)
    wh_sasa = freesasa.calc(structs[0])
    ag_sasa = freesasa.calc(structs[1])
    ab_sasa = freesasa.calc(structs[2])

    # chain ID -> (structure, SASA result) of the partner that owns the chain.
    # Antibody entries are written last, so they win on a shared chain ID.
    sasa_by_chainID = {chainID: (structs[1], ag_sasa) for chainID in ag_chains}
    sasa_by_chainID.update({chainID: (structs[2], ab_sasa) for chainID in ab_chains})

    for muta in mutas:
        # The part after the first '-' lists one or more "chain:residue"
        # entries separated by '_'.
        for mut_string in muta.split('-')[1].split('_'):
            chainID, mut_residue = mut_string.split(':')
            # Drop the first and last characters (presumably the wild-type
            # and mutant residue letters), keeping the residue number.
            resSeq_str = mut_residue[1:-1]
            residue_query = f"s1, resi {resSeq_str} and chain {chainID}"

            cpx_rtdo = freesasa.selectArea([residue_query], structs[0], wh_sasa)
            struct, sasa = sasa_by_chainID[chainID]
            mono_rtdo = freesasa.selectArea([residue_query], struct, sasa)

            if int(cpx_rtdo['s1']) >= int(mono_rtdo['s1'] * .95):
                continue
            # One buried residue is enough: record the mutant once and move on.
            interface_mutants.append(muta)
            break

pdb_id='1AK4'
pdb_id='1BJ1'
pdb_id='1CZ8'
pdb_id='1DQJ'
pdb_id='1DVF'
pdb_id='1FFW'
pdb_id='1JRH'
pdb_id='1JTG'
pdb_id='1KTZ'
pdb_id='1MHP'
pdb_id='1MLC'
pdb_id='1N8Z'
pdb_id='1T83'
pdb_id='1VFB'
pdb_id='1YY9'
pdb_id='2JEL'
pdb_id='2NY7'
pdb_id='2NYY'
pdb_id='2NZ9'
pdb_id='3BDY'
pdb_id='3BE1'
pdb_id='3BN9'
pdb_id='3HFM'
pdb_id='3K2M'
pdb_id='3NGB'
pdb_id='3NPS'
pdb_id='3WJJ'
pdb_id='HM_1KTZ'
pdb_id='HM_1YY9'
pdb_id='HM_2NYY'
pdb_id='HM_2NZ9'
pdb_id='HM_3BN9'


-----

In [15]:
# Load the previously pickled hydrophobic-cluster results.
# `pdb_hydrophobic` is indexed by PDB id further down; `mut_hydrophobic` is
# presumably indexed by mutant id -- TODO confirm.
# NOTE: unpickling can execute arbitrary code; only load files you produced yourself.
with (hydro_dir / "pdb_hydrophobic.pkl").open("rb") as file:
    pdb_hydrophobic = pickle.load(file)
with (hydro_dir / "mut_hydrophobic.pkl").open("rb") as file:
    mut_hydrophobic = pickle.load(file)

In [16]:
# For every complex, collect the mutants that change at least one residue
# belonging to a hydrophobic cluster of the original structure.
pdbs_cluster_mut = {}
for pdb_id, mutas in pdbs_mut.items():
    cluster_mutants = pdbs_cluster_mut[pdb_id] = []

    # (residue number string, chain ID) of every atom in every cluster.
    hydro_residues = set()
    for cluster in pdb_hydrophobic[pdb_id]:
        hydro_residues.update((atom.resSeq_str, atom.chain_ID) for atom in cluster)

    for muta in mutas:
        # The part after the first '-' lists one or more "chain:residue"
        # entries separated by '_'.
        for mut_string in muta.split('-')[1].split('_'):
            chainID, mut_residue = mut_string.split(':')
            # Residue number: the entry without its first and last characters.
            if (mut_residue[1:-1], chainID) not in hydro_residues:
                continue
            # A single hit is enough: record the mutant once and move on.
            cluster_mutants.append(muta)
            break

In [108]:
# See how much the full set of mutants differs from the subset whose
# mutations fall in a hydrophobic cluster, one line per complex.
for pdb_id, mutas in pdbs_mut.items():
    orig, clus = len(mutas), len(pdbs_cluster_mut[pdb_id])

    print(f"{orig=} -- {clus=}")

orig=17 -- clus=12
orig=19 -- clus=12
orig=19 -- clus=13
orig=34 -- clus=29
orig=38 -- clus=17
orig=9 -- clus=2
orig=9 -- clus=1
orig=31 -- clus=16
orig=22 -- clus=11
orig=93 -- clus=34
orig=27 -- clus=17
orig=36 -- clus=14
orig=244 -- clus=93
orig=55 -- clus=36
orig=8 -- clus=0
orig=43 -- clus=13
orig=31 -- clus=10
orig=28 -- clus=13
orig=19 -- clus=11
orig=34 -- clus=13
orig=34 -- clus=11
orig=35 -- clus=9
orig=48 -- clus=35
orig=7 -- clus=6
orig=15 -- clus=10
orig=27 -- clus=3
orig=25 -- clus=12
orig=22 -- clus=11
orig=16 -- clus=1
orig=25 -- clus=13
orig=16 -- clus=7
orig=8 -- clus=3


In [10]:
# Inspection only: in the visible code these names are assigned solely inside
# the per-complex SASA loop, so this presumably shows whatever its last
# iteration left behind -- not reproducible on a fresh kernel run in order.
ag_chains, ab_chains

('HL', 'VW')

In [11]:
# Inspection only: `structs` is assigned solely inside the per-complex SASA
# loop in the visible code, so this presumably shows the structures of the
# last complex processed there.
structs

[<freesasa.Structure at 0x7f5f1c737750>,
 <freesasa.Structure at 0x7f5f1c736bb0>,
 <freesasa.Structure at 0x7f5f1c736730>]