In [1]:
import os
import gzip

from pypdb.clients.pdb import pdb_client
import pandas as pd
from Bio.PDB import *
from Bio.PDB.PDBExceptions import PDBConstructionWarning
from tqdm import tqdm

import warnings
warnings.simplefilter('ignore', PDBConstructionWarning)
warnings.simplefilter('ignore', UserWarning)

In [2]:
## This code for pdb file manipulation is taken is taken from https://github.com/compbiomed-unito/acdc-nn/blob/master/acdc_nn/util.py

def magic_open(path):
    return (gzip.open if path.endswith('.gz') else open)(path, 'rt')

def pdb2seq(pp):
    ''' pdb2seq(pp) takes a pdb_structure_chain 
    and return its sequence '''
    seq = [] # pp.get_sequence()
    reslist = []
    for ppc  in pp:
        reslist += [res for res in ppc]
        seq += [str(ppc.get_sequence())]
    return "".join(seq)

def map_pdb_pos(pp):
    ''' map_pdb_pos
    Returns two dicts seq2pdb[seq_pos], pdb2seq[pdb_pos]'''
    reslist = []
    for ppc  in pp:
        reslist += [res for res in ppc]
    seq2pdb = dict(zip( map(str,range(1,len(reslist)+1)), [str(r.get_id()[1])+r.get_id()[2].strip() for r in reslist]))
    pdb2seq = dict(zip( [str(r.get_id()[1])+r.get_id()[2].strip() for r in reslist], map(str,range(1,len(reslist)+1)) ))
    return seq2pdb, pdb2seq

def pdb2info(pdb_file, chain):
    ''' pdb2info(pdb_file) 
    Returns structure, polypeptide '''
    parser=PDBParser(QUIET=True)
    assert pdb_file.endswith(".pdb")
    pdb_file = pdb_file[:-4].upper() + ".pdb"
    with magic_open(pdb_file) as f:
        structure = parser.get_structure('X', f)
    pchain=structure[0][chain]
    ppb=PPBuilder()
    pp = ppb.build_peptides(pchain, aa_only=False) #[0]
    return (structure, pchain, pdb2seq(pp), *map_pdb_pos(pp)) 

# S2648

In [3]:
df_S2648 = pd.read_csv('DATA/S2648.csv')

In [4]:
print('Total dataset length', len(df_S2648))
pdb_ids = list(set([t.split()[0].upper() for t in df_S2648.PDB_CHAIN.to_list()]))
print('Total number of different chains in dataset', len(pdb_ids))

Total dataset length 2648
Total number of different chains in dataset 132


In [5]:
for pdb_id in pdb_ids:
    if not os.path.isfile(f"PDB/{pdb_id[:4]}.pdb"):
        with open(f"PDB/{pdb_id[:4]}.pdb", "w") as fh:
            fh.write(pdb_client.get_pdb_file(f"{pdb_id[:4]}", compression=False))

In [6]:
wt = []
mut = []
ddg = []
pdb_ids = []
mut_infos = []
poss = []

verbatim_pdb_ids = {'1LVEA'}


print('Processing s2648')

for idx in tqdm(range(len(df_S2648))):
    pdb_id = df_S2648.iloc[idx]['PDB_CHAIN'].upper()
    wild_aa = df_S2648.iloc[idx]['WILD_RES']
    pos = str(df_S2648.iloc[idx]['POSITION'])
    mutant_aa = df_S2648.iloc[idx]['MUTANT_RES']
    exp_ddg = df_S2648.iloc[idx]['EXP_DDG']
        
    _, _, sequence, pdb2seq_pos, seq2pdb_pos = pdb2info(f'PDB/{pdb_id[:4]}.pdb', pdb_id[-1])
    
    if pdb_id in verbatim_pdb_ids:
        seq2pdb_pos = {str(i):str(i) for i in range(len(sequence))}
    
    if pos not in seq2pdb_pos:
        print(f'Indexing error for {pdb_id} position {pos} not present in mapping {seq2pdb_pos}')
        
    else:
        if sequence[int(seq2pdb_pos[pos])-1]!=wild_aa:
            print(f'Error for {pdb_id} expected {wild_aa} at position {pos} ')
            print(f'Sequence is {sequence}')
            print(f'Mapping is {seq2pdb_pos}')
        
        else:
            wt.append(sequence)
            tt = list(sequence)
            tt[int(seq2pdb_pos[pos])-1] = mutant_aa
            poss.append(int(seq2pdb_pos[pos])-1)
            mut.append(''.join(tt))
            ddg.append(exp_ddg)
            pdb_ids.append(pdb_id)
            mut_infos.append(str(wild_aa) + pos + str(mutant_aa))

Processing s2648


  0%|          | 0/2648 [00:00<?, ?it/s]

100%|██████████| 2648/2648 [01:47<00:00, 24.63it/s] 


In [7]:
pd.DataFrame({'wt_seq': wt, 
              'mut_seq': mut ,
              'ddg': ddg, 
              'pdb_id': pdb_ids, 
              'mut_info': mut_infos,
              'pos': poss}).to_csv('DATASETS/S2648.csv')

# S3488

In [8]:
df_1744 = pd.read_csv('DATA/Q1744.txt', sep = ' ', names = ['PDB_CHAIN', 'POSITION', 'WILD_RES', 'MUTANT_RES', 'EXP_DDG'])

In [9]:
print('Total dataset length', len(df_1744))
pdb_ids = list(set([t.split()[0].upper() for t in df_1744.PDB_CHAIN.to_list()]))
print('Total number of different chains in dataset', len(pdb_ids))

Total dataset length 1744
Total number of different chains in dataset 127


In [10]:
for pdb_id in pdb_ids:
    if not os.path.isfile(f"PDB/{pdb_id[:4]}.pdb"):
        with open(f"PDB/{pdb_id[:4]}.pdb", "w") as fh:
            fh.write(pdb_client.get_pdb_file(f"{pdb_id[:4]}", compression=False))

In [11]:
wt = []
mut = []
ddg = []
pdb_ids = []
mut_infos = []
poss = []

verbatim_pdb_ids = {'1LVEA'}


print('Processing S3488')

for idx in tqdm(range(len(df_1744))):
    pdb_id = df_1744.iloc[idx]['PDB_CHAIN']
    wild_aa = df_1744.iloc[idx]['WILD_RES']
    pos = str(df_1744.iloc[idx]['POSITION'])
    mutant_aa = df_1744.iloc[idx]['MUTANT_RES']
    exp_ddg = df_1744.iloc[idx]['EXP_DDG']
        
    _, _, sequence, pdb2seq_pos, seq2pdb_pos = pdb2info(f'PDB/{pdb_id[:4]}.pdb', pdb_id[-1])
    
    if pdb_id in verbatim_pdb_ids:
        seq2pdb_pos = {str(i):str(i) for i in range(len(sequence))}
    
    if pos not in seq2pdb_pos:
        print(f'Indexing error for {pdb_id} position {pos} not present in mapping {seq2pdb_pos}')
        
    else:
        if sequence[int(seq2pdb_pos[pos])-1]!=wild_aa:
            print(f'Error for {pdb_id} expected {wild_aa} at position {pos} ')
            print(f'Sequence is {sequence}')
            print(f'Mapping is {seq2pdb_pos}')
        
        else:
            wt.append(sequence)
            tt = list(sequence)
            tt[int(seq2pdb_pos[pos])-1] = mutant_aa
            poss.append(int(seq2pdb_pos[pos])-1)
            mut.append(''.join(tt))
            ddg.append(-1*exp_ddg)
            pdb_ids.append(pdb_id)
            mut_infos.append(str(wild_aa) + pos + str(mutant_aa))
            
            mut.append(sequence)
            tt = list(sequence)
            tt[int(seq2pdb_pos[pos])-1] = mutant_aa
            poss.append(int(seq2pdb_pos[pos])-1)
            wt.append(''.join(tt))
            ddg.append(exp_ddg)
            pdb_ids.append(pdb_id)
            mut_infos.append(str(wild_aa) + pos + str(mutant_aa))

Processing S3488


  0%|          | 0/1744 [00:00<?, ?it/s]

100%|██████████| 1744/1744 [01:27<00:00, 19.82it/s]


In [12]:
pd.DataFrame({'wt_seq': wt, 
              'mut_seq': mut ,
              'ddg': ddg, 
              'pdb_id': pdb_ids, 
              'mut_info': mut_infos,
              'pos': poss}).to_csv('DATASETS/S3488.csv')

# S3421

In [13]:
df_3421 = pd.read_csv('DATA/Q3421.txt', sep = '\t', skiprows = 2, index_col=False,
                      names = ['PDB_ID', 'PDB_CHAIN', 'POSITION', 'WILD_RES', 'MUTANT_RES', 'EXP_DDG', 'T', 'PH', 'POS2'])

  df_3421 = pd.read_csv('DATA/Q3421.txt', sep = '\t', skiprows = 2, index_col=False,


In [14]:
print('Total dataset length', len(df_3421))
pdb_ids = list(set([t.split()[0].upper() for t in df_3421.PDB_ID.to_list()]))
print('Total number of different chains in dataset', len(pdb_ids))

Total dataset length 3421
Total number of different chains in dataset 148


In [15]:
for pdb_id in pdb_ids:
    if not os.path.isfile(f"PDB/{pdb_id[:4]}.pdb"):
        with open(f"PDB/{pdb_id[:4]}.pdb", "w") as fh:
            fh.write(pdb_client.get_pdb_file(f"{pdb_id[:4]}", compression=False))

In [16]:
wt = []
mut = []
ddg = []
pdb_ids = []
mut_infos = []
poss = []

verbatim_pdb_ids = {'1LVEA'}


print('Processing S3421')

for idx in tqdm(range(len(df_3421))):
    pdb_id = df_3421.iloc[idx]['PDB_ID'].upper() + df_3421.iloc[idx]['PDB_CHAIN'].upper()
    wild_aa = df_3421.iloc[idx]['WILD_RES']
    pos = str(df_3421.iloc[idx]['POSITION'])
    mutant_aa = df_3421.iloc[idx]['MUTANT_RES']
    exp_ddg = df_3421.iloc[idx]['EXP_DDG']
        
    _, _, sequence, pdb2seq_pos, seq2pdb_pos = pdb2info(f'PDB/{pdb_id[:4]}.pdb', pdb_id[-1])
    
    if pdb_id in verbatim_pdb_ids:
        seq2pdb_pos = {str(i):str(i) for i in range(len(sequence))}
    
    if pos not in seq2pdb_pos:
        print(f'Indexing error for {pdb_id} position {pos} not present in mapping {seq2pdb_pos}')
        
    else:
        if sequence[int(seq2pdb_pos[pos])-1]!=wild_aa:
            print(f'Error for {pdb_id} expected {wild_aa} at position {pos} ')
        
        else:
            wt.append(sequence)
            tt = list(sequence)
            tt[int(seq2pdb_pos[pos])-1] = mutant_aa
            poss.append(int(seq2pdb_pos[pos])-1)
            mut.append(''.join(tt))
            ddg.append(exp_ddg)
            pdb_ids.append(pdb_id)
            mut_infos.append(str(wild_aa) + pos + str(mutant_aa))

Processing S3421


  0%|          | 3/3421 [00:00<02:05, 27.25it/s]

 31%|███       | 1048/3421 [00:53<02:53, 13.71it/s]

Error for 1LVEA expected Q at position 89 
Error for 1LVEA expected Q at position 89 
Error for 1LVEA expected Q at position 38 
Error for 1LVEA expected S at position 97 
Error for 1LVEA expected I at position 106 
Error for 1LVEA expected N at position 28 


 31%|███       | 1060/3421 [00:54<01:46, 22.08it/s]

Error for 1LVEA expected K at position 39 
Error for 1LVEA expected K at position 30 
Error for 1LVEA expected K at position 30 
Error for 1LVEA expected S at position 29 
Error for 1LVEA expected T at position 94 
Error for 1LVEA expected Y at position 96 
Error for 1LVEA expected Y at position 96 
Error for 1LVEA expected P at position 40 


100%|██████████| 3421/3421 [02:08<00:00, 26.71it/s] 


In [17]:
pd.DataFrame({'wt_seq': wt, 
              'mut_seq': mut ,
              'ddg': ddg, 
              'pdb_id': pdb_ids, 
              'mut_info': mut_infos,
              'pos': poss}).to_csv('DATASETS/S3421.csv')

# ACDC-varibench

In [22]:
df_acdc_varibench = pd.concat([pd.read_csv(os.path.join('DATA/varibench/', f), sep = ' ',
            names = ['PDB_CHAIN', 'MUTATION', 'EXP_DDG']) for f in sorted(os.listdir('DATA/varibench/'))]).drop_duplicates()

In [23]:
print('Total dataset length', len(df_acdc_varibench))
pdb_ids = list(set([t.split()[0].upper() for t in df_acdc_varibench.PDB_CHAIN.to_list()]))
print('Total number of different chains in dataset', len(pdb_ids))

Total dataset length 1387
Total number of different chains in dataset 78


In [24]:
for pdb_id in pdb_ids:
    if not os.path.isfile(f"PDB/{pdb_id[:4]}.pdb"):
        with open(f"PDB/{pdb_id[:4]}.pdb", "w") as fh:
            fh.write(pdb_client.get_pdb_file(f"{pdb_id[:4]}", compression=False))

In [25]:
wt = []
mut = []
ddg = []
pdb_ids = []
mut_infos = []
poss = []

no_verbatim_pdb_ids = {'1C9OA', '1VQBA'}


print('Processing ACDC-varibench')

for idx in tqdm(range(len(df_acdc_varibench))):
    pdb_id = df_acdc_varibench.iloc[idx]['PDB_CHAIN'].upper()
    wild_aa = df_acdc_varibench.iloc[idx]['MUTATION'][0]
    pos = df_acdc_varibench.iloc[idx]['MUTATION'][1:-1]
    mutant_aa = df_acdc_varibench.iloc[idx]['MUTATION'][-1]
    exp_ddg = df_acdc_varibench.iloc[idx]['EXP_DDG']
    
    #if pdb_id!= '1CLWA':
    #    continue
        
    _, _, sequence, pdb2seq_pos, seq2pdb_pos = pdb2info(f'PDB/{pdb_id[:4]}.pdb', pdb_id[-1])
    
    if pdb_id not in no_verbatim_pdb_ids:
        seq2pdb_pos = {str(i):str(i) for i in range(len(sequence))}
    
    if pos not in seq2pdb_pos:
        print(f'Indexing error for {pdb_id} position {pos} not present in mapping {seq2pdb_pos}')
        
    else:
        if sequence[int(seq2pdb_pos[pos])-1]!=wild_aa:
            print(f'Error for {pdb_id} expected {wild_aa} at position {pos} ')
        
        else:
            wt.append(sequence)
            tt = list(sequence)
            tt[int(seq2pdb_pos[pos])-1] = mutant_aa
            poss.append(int(seq2pdb_pos[pos])-1)
            mut.append(''.join(tt))
            ddg.append(exp_ddg)
            pdb_ids.append(pdb_id)
            mut_infos.append(str(wild_aa) + pos + str(mutant_aa))

Processing ACDC-varibench


  2%|▏         | 30/1387 [00:00<00:39, 34.64it/s]

Error for 1AM7A expected H at position 30 
Error for 1AM7A expected H at position 47 


  9%|▉         | 126/1387 [00:03<00:35, 35.29it/s]

Indexing error for 1BNIA position 108 not present in mapping {'0': '0', '1': '1', '2': '2', '3': '3', '4': '4', '5': '5', '6': '6', '7': '7', '8': '8', '9': '9', '10': '10', '11': '11', '12': '12', '13': '13', '14': '14', '15': '15', '16': '16', '17': '17', '18': '18', '19': '19', '20': '20', '21': '21', '22': '22', '23': '23', '24': '24', '25': '25', '26': '26', '27': '27', '28': '28', '29': '29', '30': '30', '31': '31', '32': '32', '33': '33', '34': '34', '35': '35', '36': '36', '37': '37', '38': '38', '39': '39', '40': '40', '41': '41', '42': '42', '43': '43', '44': '44', '45': '45', '46': '46', '47': '47', '48': '48', '49': '49', '50': '50', '51': '51', '52': '52', '53': '53', '54': '54', '55': '55', '56': '56', '57': '57', '58': '58', '59': '59', '60': '60', '61': '61', '62': '62', '63': '63', '64': '64', '65': '65', '66': '66', '67': '67', '68': '68', '69': '69', '70': '70', '71': '71', '72': '72', '73': '73', '74': '74', '75': '75', '76': '76', '77': '77', '78': '78', '79': '79'

 27%|██▋       | 377/1387 [00:09<00:18, 54.30it/s]

Error for 1ONCA expected M at position 22 


 43%|████▎     | 596/1387 [00:13<00:10, 74.28it/s]

Indexing error for 1STNA position 136 not present in mapping {'0': '0', '1': '1', '2': '2', '3': '3', '4': '4', '5': '5', '6': '6', '7': '7', '8': '8', '9': '9', '10': '10', '11': '11', '12': '12', '13': '13', '14': '14', '15': '15', '16': '16', '17': '17', '18': '18', '19': '19', '20': '20', '21': '21', '22': '22', '23': '23', '24': '24', '25': '25', '26': '26', '27': '27', '28': '28', '29': '29', '30': '30', '31': '31', '32': '32', '33': '33', '34': '34', '35': '35', '36': '36', '37': '37', '38': '38', '39': '39', '40': '40', '41': '41', '42': '42', '43': '43', '44': '44', '45': '45', '46': '46', '47': '47', '48': '48', '49': '49', '50': '50', '51': '51', '52': '52', '53': '53', '54': '54', '55': '55', '56': '56', '57': '57', '58': '58', '59': '59', '60': '60', '61': '61', '62': '62', '63': '63', '64': '64', '65': '65', '66': '66', '67': '67', '68': '68', '69': '69', '70': '70', '71': '71', '72': '72', '73': '73', '74': '74', '75': '75', '76': '76', '77': '77', '78': '78', '79': '79'

 47%|████▋     | 645/1387 [00:16<00:36, 20.45it/s]

Error for 1YCCA expected C at position 106 
Error for 1YCCA expected C at position 106 
Error for 1YCCA expected C at position 106 
Error for 1YCCA expected F at position 86 
Error for 1YCCA expected L at position 89 


 47%|████▋     | 656/1387 [00:16<00:21, 33.88it/s]

Error for 1YCCA expected P at position 80 
Error for 1YCCA expected P at position 80 
Error for 1YCCA expected P at position 80 
Error for 1YCCA expected P at position 80 
Error for 1YCCA expected P at position 80 
Error for 1YCCA expected P at position 80 
Error for 1YCCA expected P at position 80 


100%|██████████| 1387/1387 [00:40<00:00, 34.02it/s] 


In [26]:
pd.DataFrame({'wt_seq': wt, 
              'mut_seq': mut ,
              'ddg': ddg, 
              'pdb_id': pdb_ids, 
              'mut_info': mut_infos,
              'pos': poss}).to_csv('DATASETS/ACDC_varibench.csv')

# Ssym

In [27]:
df_ssym = pd.read_csv('DATA/s_sym.txt', sep= ' ', names = ['PDB_ID', '_', 'POSITION', 'WILD_RES', 'MUTANT_RES', 'EXP_DDG'])

In [28]:
print('Total dataset length', len(df_ssym))
pdb_ids = list(set([t.split()[0].upper() for t in df_ssym['PDB_ID'].to_list()]))
print('Total number of different chains in dataset', len(pdb_ids))

Total dataset length 342
Total number of different chains in dataset 15


In [29]:
for pdb_id in pdb_ids:
    if not os.path.isfile(f"PDB/{pdb_id[:4]}.pdb"):
        with open(f"PDB/{pdb_id[:4]}.pdb", "w") as fh:
            fh.write(pdb_client.get_pdb_file(f"{pdb_id[:4]}", compression=False))

In [30]:
wt = []
mut = []
ddg = []
pdb_ids = []
mut_infos = []
poss = []

print('Processing Ssym')

for idx in tqdm(range(len(df_ssym))):
    pdb_id = df_ssym.iloc[idx]['PDB_ID'].upper()
    wild_aa = df_ssym.iloc[idx]['WILD_RES']
    pos = str(df_ssym.iloc[idx]['POSITION'])
    mutant_aa = df_ssym.iloc[idx]['MUTANT_RES']
    exp_ddg = df_ssym.iloc[idx]['EXP_DDG']
    
        
    _, _, sequence, pdb2seq_pos, seq2pdb_pos = pdb2info(f'PDB/{pdb_id[:4]}.pdb', pdb_id[-1])
    
    #if pdb_id not in no_verbatim_pdb_ids:
    #  seq2pdb_pos = {str(i):str(i) for i in range(len(sequence))}
    
    if pos not in seq2pdb_pos:
        print(f'Indexing error for {pdb_id} position {pos} not present in mapping {seq2pdb_pos}')
        
    else:
        if sequence[int(seq2pdb_pos[pos])-1]!=wild_aa:
            print(f'Error for {pdb_id} expected {wild_aa} at position {pos} ')
            print(f'Sequence is {sequence}')
            print(f'Mapping is {seq2pdb_pos}')
        
        else:
            wt.append(sequence)
            tt = list(sequence)
            tt[int(seq2pdb_pos[pos])-1] = mutant_aa
            poss.append(int(seq2pdb_pos[pos])-1)
            mut.append(''.join(tt))
            ddg.append(exp_ddg)
            pdb_ids.append(pdb_id)
            mut_infos.append(str(wild_aa) + pos + str(mutant_aa))

Processing Ssym


100%|██████████| 342/342 [00:08<00:00, 41.72it/s]


In [31]:
pd.DataFrame({'wt_seq': wt, 
              'mut_seq': mut ,
              'ddg': [-t for t in ddg], 
              'pdb_id': pdb_ids, 
              'mut_info': mut_infos,
              'pos': poss}).to_csv('DATASETS/ssym.csv')

In [32]:
pd.DataFrame({'wt_seq': mut, 
              'mut_seq': wt ,
              'ddg': ddg, 
              'pdb_id': pdb_ids, 
              'mut_info': [t[-1] + t[1:-1] + t[0] for t in mut_infos],
              'pos': poss}).to_csv('DATASETS/ssym_r.csv')

# Myoglobin

In [34]:
df_myoglobin = pd.read_csv('DATA/myoglobin.txt', sep= ' ', names = ['PDB_ID', 'POSITION', 'WILD_RES', 'MUTANT_RES', 'EXP_DDG'])

In [35]:
print('Total dataset length', len(df_myoglobin))
pdb_ids = list(set([t.split()[0].upper() for t in df_myoglobin['PDB_ID'].to_list()]))
print('Total number of different chains in dataset', len(pdb_ids))

Total dataset length 134
Total number of different chains in dataset 1


In [36]:
for pdb_id in pdb_ids:
    if not os.path.isfile(f"PDB/{pdb_id[:4]}.pdb"):
        with open(f"PDB/{pdb_id[:4]}.pdb", "w") as fh:
            fh.write(pdb_client.get_pdb_file(f"{pdb_id[:4]}", compression=False))

In [37]:
wt = []
mut = []
ddg = []
pdb_ids = []
mut_infos = []
poss = []

#no_verbatim_pdb_ids = {'1C9OA', '1VQBA'}


print('Processing myoglobin')

for idx in tqdm(range(len(df_myoglobin))):
    pdb_id = df_myoglobin.iloc[idx]['PDB_ID'].upper()
    wild_aa = df_myoglobin.iloc[idx]['WILD_RES']
    pos = str(df_myoglobin.iloc[idx]['POSITION'])
    mutant_aa = df_myoglobin.iloc[idx]['MUTANT_RES']
    exp_ddg = df_myoglobin.iloc[idx]['EXP_DDG']
    
    #if pdb_id!= '1CLWA':
    #    continue
        
    
    pdb = PDBParser().get_structure(pdb_id[:4], f'PDB/{pdb_id[:4]}.pdb')
    chain = next(pdb.get_chains()).get_id()
        
    _, _, sequence, pdb2seq_pos, seq2pdb_pos = pdb2info(f'PDB/{pdb_id[:4]}.pdb', pdb_id[-1])
    
    #if pdb_id not in no_verbatim_pdb_ids:
    #  seq2pdb_pos = {str(i):str(i) for i in range(len(sequence))}
    
    if pos not in seq2pdb_pos:
        print(f'Indexing error for {pdb_id} position {pos} not present in mapping {seq2pdb_pos}')
        
    else:
        if sequence[int(seq2pdb_pos[pos])-1]!=wild_aa:
            print(f'Error for {pdb_id} expected {wild_aa} at position {pos} ')
            print(f'Sequence is {sequence}')
            print(f'Mapping is {seq2pdb_pos}')
        
        else:
            wt.append(sequence)
            tt = list(sequence)
            tt[int(seq2pdb_pos[pos])-1] = mutant_aa
            poss.append(int(seq2pdb_pos[pos])-1)
            mut.append(''.join(tt))
            ddg.append(exp_ddg)
            pdb_ids.append(pdb_id)
            mut_infos.append(str(wild_aa) + pos + str(mutant_aa))

Processing myoglobin


100%|██████████| 134/134 [00:07<00:00, 18.76it/s]


In [38]:
pd.DataFrame({'wt_seq': wt, 
              'mut_seq': mut ,
              'ddg': [-t for t in ddg], 
              'pdb_id': pdb_ids, 
              'mut_info': mut_infos,
              'pos': poss}).to_csv('DATASETS/myoglobin.csv')

In [39]:
pd.DataFrame({'wt_seq': mut, 
              'mut_seq': wt ,
              'ddg': ddg, 
              'pdb_id': pdb_ids, 
              'mut_info': [t[-1] + t[1:-1] + t[0] for t in mut_infos],
              'pos': poss}).to_csv('DATASETS/myoglobin_r.csv')

# P53

In [40]:
df_p53 = pd.read_csv('DATA/p53.txt', sep= ' ', names = ['PDB_ID', 'POSITION', 'WILD_RES', 'MUTANT_RES', 'EXP_DDG'])

In [41]:
print('Total dataset length', len(df_p53))
pdb_ids = list(set([t.split()[0].upper() for t in df_p53['PDB_ID'].to_list()]))
print('Total number of different chains in dataset', len(pdb_ids))

Total dataset length 42
Total number of different chains in dataset 1


In [42]:
for pdb_id in pdb_ids:
    if not os.path.isfile(f"PDB/{pdb_id[:4]}.pdb"):
        with open(f"PDB/{pdb_id[:4]}.pdb", "w") as fh:
            fh.write(pdb_client.get_pdb_file(f"{pdb_id[:4]}", compression=False))

In [43]:
wt = []
mut = []
ddg = []
pdb_ids = []
mut_infos = []
poss = []

print('Processing p53')

for idx in tqdm(range(len(df_p53))):
    pdb_id = df_p53.iloc[idx]['PDB_ID'].upper()
    wild_aa = df_p53.iloc[idx]['WILD_RES']
    pos = str(df_p53.iloc[idx]['POSITION'])
    mutant_aa = df_p53.iloc[idx]['MUTANT_RES']
    exp_ddg = df_p53.iloc[idx]['EXP_DDG']
    
        
    
    pdb = PDBParser().get_structure(pdb_id[:4], f'PDB/{pdb_id[:4]}.pdb')
    chain = next(pdb.get_chains()).get_id()
        
    _, _, sequence, pdb2seq_pos, seq2pdb_pos = pdb2info(f'PDB/{pdb_id[:4]}.pdb', pdb_id[-1])
    
    
    if pos not in seq2pdb_pos:
        print(f'Indexing error for {pdb_id} position {pos} not present in mapping {seq2pdb_pos}')
        
    else:
        if sequence[int(seq2pdb_pos[pos])-1]!=wild_aa:
            print(f'Error for {pdb_id} expected {wild_aa} at position {pos} ')
            print(f'Sequence is {sequence}')
            print(f'Mapping is {seq2pdb_pos}')
        
        else:
            wt.append(sequence)
            tt = list(sequence)
            tt[int(seq2pdb_pos[pos])-1] = mutant_aa
            poss.append(int(seq2pdb_pos[pos])-1)
            mut.append(''.join(tt))
            ddg.append(exp_ddg)
            pdb_ids.append(pdb_id)
            mut_infos.append(str(wild_aa) + pos + str(mutant_aa))

Processing p53


  0%|          | 0/42 [00:00<?, ?it/s]

100%|██████████| 42/42 [00:05<00:00,  8.01it/s]


In [44]:
pd.DataFrame({'wt_seq': wt, 
              'mut_seq': mut ,
              'ddg': [-t for t in ddg], 
              'pdb_id': pdb_ids, 
              'mut_info': mut_infos,
              'pos': poss}).to_csv('DATASETS/p53.csv')

# PROSTATA dataset

In [45]:
df_article_dataset = pd.read_pickle('DATA/dataset_our_w_clusters_v2.5.pkl')

In [46]:
print('Total dataset length', len(df_article_dataset))
pdb_ids = list(set([t.split()[0].upper() for t in df_article_dataset['pdb_id'].to_list()]))
print('Total number of different chains in dataset', len(pdb_ids))

Total dataset length 6547
Total number of different chains in dataset 663


In [47]:
for pdb_id in pdb_ids:
    if not os.path.isfile(f"PDB/{pdb_id[:4]}.pdb"):
        with open(f"PDB/{pdb_id[:4]}.pdb", "w") as fh:
            fh.write(pdb_client.get_pdb_file(f"{pdb_id[:4]}", compression=False))

In [48]:
wt = []
mut = []
ddg = []
pdb_ids = []
mut_infos = []
poss = []
ssym_splits = []
s669_splits = []
clusters = []

print('Processing article dataset')

for idx in tqdm(range(len(df_article_dataset))):
    pdb_id = df_article_dataset.iloc[idx]['pdb_id'].upper() + df_article_dataset.iloc[idx]['pdb_chain'].upper()
    wild_aa = df_article_dataset.iloc[idx]['Mut_code'][0]
    pos = df_article_dataset.iloc[idx]['Mut_code'][1:-1]
    mutant_aa = df_article_dataset.iloc[idx]['Mut_code'][-1]
    exp_ddg = df_article_dataset.iloc[idx]['mean_ddG']
    
    ssym_split = df_article_dataset.iloc[idx]['ssym_split']
    s669_split = df_article_dataset.iloc[idx]['s669_split']
    cluster = df_article_dataset.iloc[idx]['cluster_id'] 
    
        
    
    pdb = PDBParser().get_structure(pdb_id[:4], f'PDB/{pdb_id[:4]}.pdb')
    chain = next(pdb.get_chains()).get_id()
        
    _, _, sequence, pdb2seq_pos, seq2pdb_pos = pdb2info(f'PDB/{pdb_id[:4]}.pdb', pdb_id[-1])
    
    
    if pos not in seq2pdb_pos:
        print(f'Indexing error for {pdb_id} position {pos} not present in mapping {seq2pdb_pos}')
        
    else:
        if sequence[int(seq2pdb_pos[pos])-1]!=wild_aa:
            print(f'Error for {pdb_id} expected {wild_aa} at position {pos} ')
        
        else:
            wt.append(sequence)
            tt = list(sequence)
            tt[int(seq2pdb_pos[pos])-1] = mutant_aa
            poss.append(int(seq2pdb_pos[pos])-1)
            mut.append(''.join(tt))
            ddg.append(exp_ddg)
            pdb_ids.append(pdb_id)
            mut_infos.append(str(wild_aa) + pos + str(mutant_aa))
            ssym_splits.append(ssym_split)
            s669_splits.append(s669_split)
            clusters.append(cluster)

Processing article dataset


  0%|          | 0/6547 [00:00<?, ?it/s]

  5%|▍         | 313/6547 [00:55<03:09, 32.82it/s]

Error for 1AYEA expected A at position 31 
Error for 1AYEA expected E at position 14 
Error for 1AYEA expected E at position 20 
Error for 1AYEA expected I at position 15 


  5%|▍         | 318/6547 [00:55<03:56, 26.37it/s]

Error for 1AYEA expected I at position 23 
Error for 1AYEA expected V at position 12 
Error for 1AYEA expected V at position 64 


 37%|███▋      | 2443/6547 [02:57<03:34, 19.10it/s] 

Indexing error for 1LRPA position 15 not present in mapping {}
Indexing error for 1LRPA position 20 not present in mapping {}
Indexing error for 1LRPA position 37 not present in mapping {}
Indexing error for 1LRPA position 49 not present in mapping {}
Indexing error for 1LRPA position 49 not present in mapping {}
Indexing error for 1LRPA position 63 not present in mapping {}
Indexing error for 1LRPA position 66 not present in mapping {}
Indexing error for 1LRPA position 66 not present in mapping {}
Indexing error for 1LRPA position 81 not present in mapping {}
Indexing error for 1LRPA position 46 not present in mapping {}
Indexing error for 1LRPA position 48 not present in mapping {}
Indexing error for 1LRPA position 48 not present in mapping {}
Indexing error for 1LRPA position 48 not present in mapping {}
Indexing error for 1LRPA position 84 not present in mapping {}
Indexing error for 1LRPA position 4 not present in mapping {}
Indexing error for 1LRPA position 40 not present in mapp

 38%|███▊      | 2456/6547 [02:57<01:31, 44.75it/s]

Indexing error for 1LRPA position 78 not present in mapping {}
Indexing error for 1LRPA position 33 not present in mapping {}
Indexing error for 1LRPA position 44 not present in mapping {}
Indexing error for 1LRPA position 36 not present in mapping {}
Indexing error for 1LRPA position 22 not present in mapping {}
Indexing error for 1LRPA position 88 not present in mapping {}


100%|██████████| 6547/6547 [09:00<00:00, 12.10it/s]


In [49]:
article_dataset = pd.DataFrame({'wt_seq': wt, 
              'mut_seq': mut ,
              'ddg': [t for t in ddg], 
              'pdb_id': pdb_ids, 
              'mut_info': mut_infos,
              'pos': poss,
              'ssym_split': ssym_splits,
              's669_split': s669_splits,
              'cluster': clusters})

In [50]:
article_dataset_rev = pd.DataFrame({'wt_seq': mut, 
              'mut_seq': wt ,
              'ddg': [-t for t in ddg], 
              'pdb_id': pdb_ids, 
              'mut_info': [t[-1] + t[1:-1] + t[0] for t in mut_infos],
              'pos': poss,
              'ssym_split': ssym_splits,
              's669_split': s669_splits,
              'cluster': clusters})

In [51]:
article_dataset_with_rev = pd.concat([article_dataset, article_dataset_rev]).drop_duplicates(['wt_seq', 'mut_seq'])
print(len(article_dataset_with_rev))
#10542

10542


In [52]:
article_dataset_with_rev[article_dataset_with_rev.ssym_split == 'train'][['wt_seq', 'mut_seq', 'ddg', 'pdb_id', 'mut_info', 'pos']].to_csv('DATASETS/new_ds_for_ssym.csv')
article_dataset_with_rev[article_dataset_with_rev.s669_split == 'train'][['wt_seq', 'mut_seq', 'ddg', 'pdb_id', 'mut_info', 'pos']].to_csv('DATASETS/new_ds_for_s669.csv')

In [53]:
from sklearn.model_selection import GroupKFold
group_kfold = GroupKFold(n_splits=5)
article_dataset_with_rev = article_dataset_with_rev.reset_index(drop = True)
article_dataset_with_rev['fold'] = None
for fold, (train_index, test_index) in enumerate(group_kfold.split(article_dataset_with_rev.cluster, 
                                                                   article_dataset_with_rev.cluster, 
                                                                   article_dataset_with_rev.cluster)):
    article_dataset_with_rev.loc[test_index, 'fold'] = fold

In [54]:
article_dataset_with_rev[['wt_seq', 'mut_seq', 'ddg', 'pdb_id', 'mut_info', 'pos', 'fold']].to_csv('DATASETS/new_ds_with_folds.csv')

# Case studies datasets

In [55]:
dimer_clusters = {'cluster_162_ref_1AV1_A', 'cluster_4_ref_1BFM_A', 'cluster_161_ref_1ARR_A', 'cluster_34_ref_1R6R_A', 'cluster_139_ref_1SAK_A', 'cluster_222_ref_1ZNJ_B', 'cluster_223_ref_1ZNJ_A', 'cluster_210_ref_1UWO_A', 'cluster_69_ref_3MON_B', 'cluster_47_ref_2KJ3_A', 'cluster_183_ref_1CDC_B', 'cluster_140_ref_1SCE_A'}
gem_clusters = {'cluster_2_ref_1A7V_A', 'cluster_189_ref_1CYC_A', 'cluster_217_ref_1YCC_A', 'cluster_92_ref_1I5T_A', 'cluster_257_ref_451C_A', 'cluster_168_ref_1B5M_A', 'cluster_190_ref_1CYO_A', 'cluster_179_ref_1C52_A', 'cluster_178_ref_1C2R_A', 'cluster_218_ref_1YEA_A', 'cluster_177_ref_1BVC_A'}
transmembrane_clusters = {'cluster_145_ref_1THQ_A', 'cluster_200_ref_1FEP_A', 'cluster_230_ref_2BRD_A', 'cluster_129_ref_1QJP_A'}
        

In [56]:
article_dataset_full = article_dataset_with_rev[['wt_seq', 'mut_seq', 'ddg', 'pdb_id', 'mut_info', 'pos', 'cluster']]

In [57]:
article_dataset_full[article_dataset_full.cluster.isin(dimer_clusters)].to_csv('DATASETS/case_study_dimer_test.csv')
article_dataset_full[article_dataset_full.cluster.isin(gem_clusters)].to_csv('DATASETS/case_study_gem_test.csv')
article_dataset_full[article_dataset_full.cluster.isin(transmembrane_clusters)].to_csv('DATASETS/case_study_transmembrane_test.csv')