# Summary

Generate adjacency matrices for the Protherm training set directly from PDBs.

---

# Imports

In [1]:
import concurrent.futures
import importlib
import logging
import os
import os.path as op
import shutil
import sys
from collections import Counter
from pathlib import Path

import kmbio.PDB
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import psutil
import pyarrow as pa
import pyarrow.parquet as pq
from kmtools import structure_tools



In [2]:
%matplotlib inline

In [3]:
# Configure the root logger so that INFO-level (and more severe) records are emitted.
logging.basicConfig(level=logging.INFO)

In [4]:
# Locate the sibling `src` directory (it must exist: `strict=True`) and put it
# at the front of `sys.path` so that the local `helper` package is importable.
SRC_PATH = (Path.cwd() / '..' / 'src').resolve(strict=True)

src_dir = SRC_PATH.as_posix()
if src_dir not in sys.path:
    sys.path.insert(0, src_dir)

# Reload after importing so that edits to `helper` are picked up on re-run.
import helper
importlib.reload(helper)

<module 'helper' from '/home/kimlab1/database_data/datapkg/adjacency-net-v2/src/helper/__init__.py'>

# Parameters

In [5]:
# Name of this notebook; also used as the name of its working directory.
NOTEBOOK_NAME = '01_protherm_dataset'
# Relative path, so it is interpreted against the current working directory.
NOTEBOOK_PATH = Path(NOTEBOOK_NAME)

# Create the working directory up front; a no-op if it already exists.
NOTEBOOK_PATH.mkdir(parents=True, exist_ok=True)

In [6]:
# Output directory: taken from the OUTPUT_DIR environment variable when set,
# otherwise a directory named after the notebook. Resolved to an absolute path.
OUTPUT_PATH = Path(os.getenv('OUTPUT_DIR', NOTEBOOK_PATH.name)).resolve()
# Make sure the directory exists before anything is written to it.
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
OUTPUT_PATH

PosixPath('/home/kimlab1/database_data/datapkg/adjacency-net-v2/notebooks/protherm_dataset')

In [7]:
# Alternative (disabled): enable debug mode whenever "CI" is absent from the environment.
# DEBUG = "CI" not in os.environ
# When True, the dataset is truncated to its first 10 rows before processing.
DEBUG = False
# SLURM array-job index as a string, or None when the variable is not set.
TASK_ID = os.getenv("SLURM_ARRAY_TASK_ID")
# ORIGINAL_ARRAY_TASK_COUNT wins when it is set and non-empty; otherwise fall
# back to SLURM_ARRAY_TASK_COUNT (None when neither is set).
TASK_COUNT = os.getenv("ORIGINAL_ARRAY_TASK_COUNT") or os.getenv("SLURM_ARRAY_TASK_COUNT")

DEBUG, TASK_ID, TASK_COUNT

(False, None, None)

In [8]:
# Root of the shared data directory. DATABIN_DIR is mandatory: a KeyError is
# raised here if the environment variable is not set.
DATABIN_PATH = Path(os.environ['DATABIN_DIR'])  # e.g. /home/kimlab2/database_data/databin

# Load data

In [9]:
# Read every table stored in the Rosetta HDF5 file into a dict keyed by the
# table name with its surrounding "/" removed.
ROSETTA_RESULTS = {}

rosetta_h5_file = DATABIN_PATH.joinpath('elapsam_feature_engineering/v0.1.0/rosetta.h5')
with pd.HDFStore(rosetta_h5_file.as_posix(), 'r') as store:
    ROSETTA_RESULTS.update(
        (key.strip('/'), store[key][:]) for key in store
    )

In [10]:
ROSETTA_RESULTS.keys()

dict_keys(['cartesian_ddg-beta_nov15_cart-1', 'cartesian_ddg-beta_nov16_cart-1', 'cartesian_ddg-score12_cart-1', 'cartesian_ddg-talaris2013_cart-1', 'cartesian_ddg-talaris2014_cart-1', 'ddg_monomer-soft_rep_design-1'])

In [11]:
ROSETTA_RESULTS['cartesian_ddg-talaris2014_cart-1'].head()

Unnamed: 0_level_0,filename-wt,pdb_chain,mutation,ddg,ddg_exp
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,/home/kimlab2/database_data/biological-data-wa...,A,G44S,-0.633667,-0.53
1,/home/kimlab2/database_data/biological-data-wa...,A,A120M,-0.188,-0.2
2,/home/kimlab2/database_data/biological-data-wa...,A,A116N,0.114,0.17
3,/home/kimlab2/database_data/biological-data-wa...,A,A122Q,0.508667,-0.24
4,/home/kimlab2/database_data/biological-data-wa...,A,A123Q,-0.577667,-0.22


In [12]:
# Combine the per-protocol tables into one wide frame: each protocol's `ddg`
# column is renamed to the protocol key and outer-merged on the columns that
# identify a mutation. `ddg_exp` is kept from the first table only, after
# checking that every other table carries the same values in the same order.
rosetta_results_df = None
merge_keys = ['filename-wt', 'pdb_chain', 'mutation']

for key, df in ROSETTA_RESULTS.items():
    df = df.rename(columns={'ddg': key})
    if rosetta_results_df is None:
        # The first table seeds the merged frame.
        rosetta_results_df = df
        continue
    assert (rosetta_results_df['ddg_exp'].values == df['ddg_exp'].values).all()
    rosetta_results_df = rosetta_results_df.merge(
        df.drop(columns='ddg_exp'), on=merge_keys, how='outer')

# Turn column names into valid identifiers: "-" becomes "_" and any leading or
# trailing "_" is removed.
rosetta_results_df = rosetta_results_df.rename(columns=lambda c: c.replace('-', '_').strip('_'))
display(rosetta_results_df.head())
print(rosetta_results_df.shape)

Unnamed: 0,filename_wt,pdb_chain,mutation,cartesian_ddg_beta_nov15_cart_1,ddg_exp,cartesian_ddg_beta_nov16_cart_1,cartesian_ddg_score12_cart_1,cartesian_ddg_talaris2013_cart_1,cartesian_ddg_talaris2014_cart_1,ddg_monomer_soft_rep_design_1
0,/home/kimlab2/database_data/biological-data-wa...,A,G44S,-1.808667,-0.53,-0.701,0.088,-0.289667,-0.633667,-2.384
1,/home/kimlab2/database_data/biological-data-wa...,A,A120M,2.617667,-0.2,0.354,0.56,-0.069,-0.188,2.472
2,/home/kimlab2/database_data/biological-data-wa...,A,A116N,0.502667,0.17,-0.131333,0.419,0.392,0.114,0.652
3,/home/kimlab2/database_data/biological-data-wa...,A,A122Q,1.760333,-0.24,1.999,3.205667,0.605667,0.508667,12.54
4,/home/kimlab2/database_data/biological-data-wa...,A,A123Q,0.110333,-0.22,-1.153,1.313,-0.334333,-0.577667,-3.73


(3471, 10)


## Copy structures

In [13]:
# Directory (inside the notebook's working directory) holding local copies of
# the structure files.
STRUCTURE_PATH = NOTEBOOK_PATH.joinpath('structures')
# Only the leaf directory is created here; NOTEBOOK_PATH must already exist.
STRUCTURE_PATH.mkdir(exist_ok=True)

In [14]:
os.listdir(STRUCTURE_PATH)[:10]

['pdb1stn.entA_0--K131G-wt.pdb',
 'pdb1stn.entA_0--T39I-wt.pdb',
 'pdb1stn.entA_0--K65Q-wt.pdb',
 'pdb1ten.entA_0--Y68A-wt.pdb',
 'pdb1aye.entA_0--K38A-wt.pdb',
 'pdb1shf.entA_0--S41A-wt.pdb',
 'pdb1stn.entA_0--T115V-wt.pdb',
 'pdb1fmk.entA_0--Q47A-wt.pdb',
 'pdb1stn.entA_0--V61T-wt.pdb',
 'pdb1shf.entA_0--S41I-wt.pdb']

In [15]:
def get_local_filename(filename):
    """Return the absolute POSIX path of `filename`'s copy inside STRUCTURE_PATH.

    Only the basename of `filename` is used; its directory part is discarded.
    """
    local_path = STRUCTURE_PATH / op.basename(filename)
    return local_path.absolute().as_posix()

get_local_filename(rosetta_results_df['filename_wt'].iloc[0])

'/home/kimlab1/database_data/datapkg/adjacency-net-v2/notebooks/protherm_dataset/structures/pdb107l.entA_0--G44S-wt.pdb'

In [16]:
# Copy every distinct wild-type structure into STRUCTURE_PATH (files that are
# already present are not copied again) and record the local path on each row.
file_list = rosetta_results_df['filename_wt'].drop_duplicates().tolist()

# The recorded paths point at the original warehouse location; the files are
# read from the equivalent location under ~/datapkg instead.
datapkg_root = Path("~/datapkg").expanduser().as_posix()

local_filename_by_source = {}
for i, filename in enumerate(file_list):
    if i % 200 == 0:
        print(i)
    source_filename = filename.replace(
        "/home/kimlab2/database_data/biological-data-warehouse",
        datapkg_root,
    )
    local_filename = get_local_filename(source_filename)
    if not op.isfile(local_filename):
        shutil.copy(source_filename, local_filename)
    local_filename_by_source[filename] = local_filename

# Look the local path up per row rather than assigning the per-unique-file list
# directly: the two only have the same length when no `filename_wt` repeats.
local_filename_wt = [
    local_filename_by_source[filename] for filename in rosetta_results_df['filename_wt']
]
rosetta_results_df['local_filename_wt'] = local_filename_wt

0
200
400
600
800
1000
1200
1400
1600
1800
2000
2200
2400
2600
2800
3000
3200
3400


In [17]:
rosetta_results_df.head()

Unnamed: 0,filename_wt,pdb_chain,mutation,cartesian_ddg_beta_nov15_cart_1,ddg_exp,cartesian_ddg_beta_nov16_cart_1,cartesian_ddg_score12_cart_1,cartesian_ddg_talaris2013_cart_1,cartesian_ddg_talaris2014_cart_1,ddg_monomer_soft_rep_design_1,local_filename_wt
0,/home/kimlab2/database_data/biological-data-wa...,A,G44S,-1.808667,-0.53,-0.701,0.088,-0.289667,-0.633667,-2.384,/home/kimlab1/database_data/datapkg/adjacency-...
1,/home/kimlab2/database_data/biological-data-wa...,A,A120M,2.617667,-0.2,0.354,0.56,-0.069,-0.188,2.472,/home/kimlab1/database_data/datapkg/adjacency-...
2,/home/kimlab2/database_data/biological-data-wa...,A,A116N,0.502667,0.17,-0.131333,0.419,0.392,0.114,0.652,/home/kimlab1/database_data/datapkg/adjacency-...
3,/home/kimlab2/database_data/biological-data-wa...,A,A122Q,1.760333,-0.24,1.999,3.205667,0.605667,0.508667,12.54,/home/kimlab1/database_data/datapkg/adjacency-...
4,/home/kimlab2/database_data/biological-data-wa...,A,A123Q,0.110333,-0.22,-1.153,1.313,-0.334333,-0.577667,-3.73,/home/kimlab1/database_data/datapkg/adjacency-...


# Process data

In [18]:
# In debug mode, keep only the first 10 rows so the remaining cells run quickly.
# Note that this rebinds `rosetta_results_df`, so the full frame is no longer
# available under that name afterwards.
if DEBUG:
    rosetta_results_df = rosetta_results_df.iloc[:10]

## Extract adjacencies

In [19]:
def extract_seq_and_adj(row, r_cutoff=12):
    """Extract the chain sequence and residue-pair distances for one structure.

    Args:
        row: Object exposing `local_filename_wt` and `pdb_chain` attributes.
        r_cutoff: Value forwarded as `r_cutoff` to
            `helper.get_interaction_dataset_wdistances` (presumably a distance
            cutoff; confirm units against the helper). Defaults to 12, the
            value that was previously hard-coded.

    Returns:
        dict with keys `sequence`, `residue_idx_1`, `residue_idx_2` and
        `distances`; the last three are the `.values` of the corresponding
        columns of the helper's result frame.

    Raises:
        AssertionError: If any residue index is not smaller than the length
            of the extracted sequence.
    """
    # NOTE(review): the positional `0` is presumably the model index; confirm
    # against the signature of `helper.get_interaction_dataset_wdistances`.
    domain, result_df = helper.get_interaction_dataset_wdistances(
        row.local_filename_wt, 0, row.pdb_chain, r_cutoff=r_cutoff)
    domain_sequence = structure_tools.get_chain_sequence(domain)

    residue_idx_1 = result_df['residue_idx_1'].values
    residue_idx_2 = result_df['residue_idx_2'].values
    for column, residue_idx in (('residue_idx_1', residue_idx_1), ('residue_idx_2', residue_idx_2)):
        # Array-level `.max()` avoids iterating element by element in Python,
        # and the size guard keeps an empty result from raising ValueError.
        assert residue_idx.size == 0 or residue_idx.max() < len(domain_sequence), column

    return {
        'sequence': domain_sequence,
        'residue_idx_1': residue_idx_1,
        'residue_idx_2': residue_idx_2,
        'distances': result_df['distance'].values,
    }

In [20]:
def worker(row_dict):
    """Run `extract_seq_and_adj` on a row supplied as a plain dict.

    The dict is converted with `helper.to_namedtuple` so that its fields can
    be accessed as attributes.
    """
    return extract_seq_and_adj(helper.to_namedtuple(row_dict))

In [21]:
# Raise this logger's threshold to WARNING so that its INFO/DEBUG records are
# not emitted while the structures are being processed.
logging.getLogger("kmbio.PDB.core.atom").setLevel(logging.WARNING)

In [22]:
# Run `worker` over every row in a pool of processes, one per physical core.
# Rows are passed as plain dicts; `pool.map` returns results in input order.
columns = ["local_filename_wt", "pdb_chain"]
row_dicts = (t._asdict() for t in rosetta_results_df[columns].itertuples())

with concurrent.futures.ProcessPoolExecutor(psutil.cpu_count(logical=False)) as pool:
    results = list(pool.map(worker, row_dicts))

In [23]:
# Start the output dataset from a copy of the merged Rosetta results, with the
# chain column renamed to `chain_id`.
protherm_validaton_dataset = (
    rosetta_results_df
    .copy()
    .rename(columns={'pdb_chain': 'chain_id'})
)

# The structure id is characters 3..6 of the file name
# (e.g. "pdb107l.entA_0--G44S-wt.pdb" -> "107l").
structure_ids = []
for filename in protherm_validaton_dataset["filename_wt"]:
    structure_ids.append(Path(filename).name[3:7])
protherm_validaton_dataset['structure_id'] = structure_ids

protherm_validaton_dataset['model_id'] = 0

In [24]:
# Attach the per-row extraction results as columns. `results` is in the same
# order as the rows, so positional assignment lines them up.
result_key_by_column = [
    ('qseq', "sequence"),
    ('residue_idx_1_corrected', "residue_idx_1"),
    ('residue_idx_2_corrected', "residue_idx_2"),
    ('distances', "distances"),
]
for target_column, result_key in result_key_by_column:
    protherm_validaton_dataset[target_column] = [result[result_key] for result in results]

In [25]:
def mutation_matches_sequence(mutation, sequence):
    """Check that the wild-type residue of `mutation` is found in `sequence`.

    Args:
        mutation: Mutation string of the form `<wt><position><mut>`, e.g.
            "G44S", where `position` is 1-based.
        sequence: Sequence the mutation refers to.

    Returns:
        True if `position` lies inside `sequence` and the residue there equals
        the wild-type residue; False otherwise.
    """
    pos = int(mutation[1:-1])
    # Reject out-of-range positions explicitly: position 0 (or a negative one)
    # would otherwise index from the end of the sequence and could match by
    # accident, and a position past the end would raise IndexError.
    if not 1 <= pos <= len(sequence):
        return False
    return sequence[pos - 1] == mutation[0]


# Flag, per row, whether the wild-type residue named in the mutation is what
# the extracted sequence has at that position; every row is required to match.
protherm_validaton_dataset['mutation_matches_sequence'] = [
    mutation_matches_sequence(mutation, sequence)
    for mutation, sequence in zip(
        protherm_validaton_dataset['mutation'], protherm_validaton_dataset['qseq']
    )
]
assert protherm_validaton_dataset['mutation_matches_sequence'].all()

In [26]:
def apply_mutation(sequence, mutation):
    """Return `sequence` with the single substitution `mutation` applied.

    `mutation` has the form `<wt><position><mut>` with a 1-based position.
    The residue found at that position must equal `wt`, and the result must
    have the same length as the input; both are checked with assertions.
    """
    wt = mutation[0]
    mut = mutation[-1]
    pos = int(mutation[1:-1])
    idx = pos - 1  # 0-based index of the mutated residue

    assert sequence[idx] == wt
    sequence_mut = sequence[:idx] + mut + sequence[idx + 1:]
    assert sequence_mut[idx] == mut
    assert len(sequence) == len(sequence_mut)
    return sequence_mut

# Mutated sequence for every row: the row's `mutation` applied to its `qseq`.
protherm_validaton_dataset['qseq_mutation'] = [
    apply_mutation(sequence, mutation)
    for mutation, sequence in zip(
        protherm_validaton_dataset['mutation'], protherm_validaton_dataset['qseq']
    )
]

In [27]:
# The dataset must be complete: no null value is allowed in any column.
assert not protherm_validaton_dataset.isnull().any().any()

In [28]:
# Columns that must be present in the dataset; the assertion message names the
# first one that is missing.
columns = [
    'structure_id',
    'model_id',
    'chain_id',
    'qseq',
    'qseq_mutation',
    'ddg_exp',
    'residue_idx_1_corrected',
    'residue_idx_2_corrected',
    'distances',
]

missing_columns = [c for c in columns if c not in protherm_validaton_dataset.columns]
assert not missing_columns, missing_columns[0]

In [29]:
# Write the whole dataset (all columns, not only the ones checked in
# `columns`) to a Parquet file in OUTPUT_PATH; the DataFrame index is dropped.
# NOTE(review): newer pyarrow releases appear to deprecate version='2.0' in
# favour of '2.4'/'2.6'; confirm against the installed pyarrow before upgrading.
pq.write_table(
    pa.Table.from_pandas(protherm_validaton_dataset, preserve_index=False),
    OUTPUT_PATH.joinpath('protherm_validaton_dataset.parquet').as_posix(),
    version='2.0', flavor='spark'
)

# Explore

In [30]:
protherm_validaton_dataset.head(2)

Unnamed: 0,filename_wt,chain_id,mutation,cartesian_ddg_beta_nov15_cart_1,ddg_exp,cartesian_ddg_beta_nov16_cart_1,cartesian_ddg_score12_cart_1,cartesian_ddg_talaris2013_cart_1,cartesian_ddg_talaris2014_cart_1,ddg_monomer_soft_rep_design_1,local_filename_wt,structure_id,model_id,qseq,residue_idx_1_corrected,residue_idx_2_corrected,distances,mutation_matches_sequence,qseq_mutation
0,/home/kimlab2/database_data/biological-data-wa...,A,G44S,-1.808667,-0.53,-0.701,0.088,-0.289667,-0.633667,-2.384,/home/kimlab1/database_data/datapkg/adjacency-...,107l,0,MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKGEL...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 66, 70, 91...","[1.3463743907249561, 4.728744865183572, 6.3895...",True,MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSEL...
1,/home/kimlab2/database_data/biological-data-wa...,A,A120M,2.617667,-0.2,0.354,0.56,-0.069,-0.188,2.472,/home/kimlab1/database_data/datapkg/adjacency-...,160l,0,MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSEL...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 66, 70, 91, 92...","[1.3457596367851132, 4.727237459658657, 6.4231...",True,MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSEL...


In [32]:
# Last "/"-separated component of the original structure path. This column is
# added after the Parquet file was written, so it is not part of that file.
protherm_validaton_dataset["filename"] = protherm_validaton_dataset["filename_wt"].str.split("/").str[-1]

In [37]:
# Export a slim table to the notebook directory. The file is tab-separated
# despite its ".csv" extension, and the DataFrame index is written as the
# first column.
protherm_validaton_dataset[["filename", "chain_id", "mutation", "ddg_exp"]].to_csv(NOTEBOOK_PATH.joinpath("for_carles.csv"), sep="\t")

In [None]:
# Count how often each amino acid occurs as the wild-type residue (first
# character of the mutation string) and as the mutant residue (last character).
aa_wt_counter = Counter(protherm_validaton_dataset['mutation'].str[0])
aa_mut_counter = Counter(protherm_validaton_dataset['mutation'].str[-1])

# Take the labels from both counters so that amino acids occurring only as
# mutants are plotted too; sorting gives a stable axis order across runs.
# A Counter returns 0 for a missing key, so the lookups below are safe.
labels = sorted(set(aa_wt_counter) | set(aa_mut_counter))
aa_wt = [aa_wt_counter[l] for l in labels]
aa_mut = [aa_mut_counter[l] for l in labels]

indexes = np.arange(len(labels))
width = 0.3

with plt.rc_context(rc={'figure.figsize': (8, 5), 'font.size': 14}):
    # Shift each series by half a bar width so the pairs sit side by side.
    plt.bar(indexes - width / 2, aa_wt, width, label="wt")
    plt.bar(indexes + width / 2, aa_mut, width, label="mut")
    plt.xticks(indexes, labels)
    plt.xlabel("Amino acid")
    plt.ylabel("Number of occurrences")
    plt.legend()