In [1]:
import pypdb
import os
import pandas as pd
import pickle
from pypdb.clients.pdb import pdb_client
import tqdm
from tqdm import tqdm

import gzip
import numpy as np
from Bio.PDB import *
from Bio.PDB.Polypeptide import three_to_one, is_aa

In [2]:
## This code for pdb file manipulation is taken is taken from https://github.com/compbiomed-unito/acdc-nn/blob/master/acdc_nn/util.py

def magic_open(path):
    return (gzip.open if path.endswith('.gz') else open)(path, 'rt')

def pdb2seq(pp):
    ''' pdb2seq(pp) takes a pdb_structure_chain 
    and return its sequence '''
    seq = [] # pp.get_sequence()
    reslist = []
    for ppc  in pp:
        reslist += [res for res in ppc]
        seq += [str(ppc.get_sequence())]
    return "".join(seq)

def map_pdb_pos(pp):
    ''' map_pdb_pos
    Returns two dicts seq2pdb[seq_pos], pdb2seq[pdb_pos]'''
    reslist = []
    for ppc  in pp:
        reslist += [res for res in ppc]
    seq2pdb = dict(zip( map(str,range(1,len(reslist)+1)), [str(r.get_id()[1])+r.get_id()[2].strip() for r in reslist]))
    pdb2seq = dict(zip( [str(r.get_id()[1])+r.get_id()[2].strip() for r in reslist], map(str,range(1,len(reslist)+1)) ))
    return seq2pdb, pdb2seq

def pdb2info(pdb_file, chain):
    ''' pdb2info(pdb_file) 
    Returns structure, polypeptide '''
    parser=PDBParser(QUIET=True)
    with magic_open(pdb_file) as f:
        structure = parser.get_structure('X', f)
    pchain=structure[0][chain]
    ppb=PPBuilder()
    pp = ppb.build_peptides(pchain, aa_only=False) #[0]
    return (structure, pchain, pdb2seq(pp), *map_pdb_pos(pp)) 

# S2648

In [3]:
df_S2648 = pd.read_csv('DATA/S2648.csv')

In [4]:
print('Total dataset length', len(df_S2648))
pdb_ids = list(set([t.split()[0] for t in df_S2648.PDB_CHAIN.to_list()]))
print('Total number of different chains in dataset', len(pdb_ids))

Total dataset length 2648
Total number of different chains in dataset 132


In [None]:
for pdb_id in pdb_ids:
    if not os.path.isfile(f"PDB/{pdb_id[:4]}.pdb"):
        with open(f"PDB/{pdb_id[:4]}.pdb", "w") as fh:
            fh.write(pdb_client.get_pdb_file(f"{pdb_id[:4]}", compression=False))

Sending GET request to https://files.rcsb.org/download/1B26.pdb to fetch 1B26's pdb file as a string.
Sending GET request to https://files.rcsb.org/download/1A5E.pdb to fetch 1A5E's pdb file as a string.
Sending GET request to https://files.rcsb.org/download/1CEY.pdb to fetch 1CEY's pdb file as a string.
Sending GET request to https://files.rcsb.org/download/1B8E.pdb to fetch 1B8E's pdb file as a string.
Sending GET request to https://files.rcsb.org/download/1IMQ.pdb to fetch 1IMQ's pdb file as a string.
Sending GET request to https://files.rcsb.org/download/3PGK.pdb to fetch 3PGK's pdb file as a string.
Sending GET request to https://files.rcsb.org/download/1ARR.pdb to fetch 1ARR's pdb file as a string.
Sending GET request to https://files.rcsb.org/download/2NVH.pdb to fetch 2NVH's pdb file as a string.
Sending GET request to https://files.rcsb.org/download/1MBG.pdb to fetch 1MBG's pdb file as a string.
Sending GET request to https://files.rcsb.org/download/1N0J.pdb to fetch 1N0J's pd

In [None]:
wt = []
mut = []
ddg = []
pdb_ids = []
mut_infos = []
poss = []

verbatim_pdb_ids = {'1LVEA'}


print('Processing s2648')

for idx in tqdm(range(len(df_S2648))):
    pdb_id = df_S2648.iloc[idx]['PDB_CHAIN']
    wild_aa = df_S2648.iloc[idx]['WILD_RES']
    pos = str(df_S2648.iloc[idx]['POSITION'])
    mutant_aa = df_S2648.iloc[idx]['MUTANT_RES']
    exp_ddg = df_S2648.iloc[idx]['EXP_DDG']
        
    _, _, sequence, pdb2seq_pos, seq2pdb_pos = pdb2info(f'PDB/{pdb_id[:4]}.pdb', pdb_id[-1])
    
    if pdb_id in verbatim_pdb_ids:
        seq2pdb_pos = {str(i):str(i) for i in range(len(sequence))}
    
    if pos not in seq2pdb_pos:
        print(f'Indexing error for {pdb_id} position {pos} not present in mapping {seq2pdb_pos}')
        
    else:
        if sequence[int(seq2pdb_pos[pos])-1]!=wild_aa:
            print(f'Error for {pdb_id} expected {wild_aa} at position {pos} ')
            print(f'Sequence is {sequence}')
            print(f'Mapping is {seq2pdb_pos}')
        
        else:
            wt.append(sequence)
            tt = list(sequence)
            tt[int(seq2pdb_pos[pos])-1] = mutant_aa
            poss.append(int(seq2pdb_pos[pos])-1)
            mut.append(''.join(tt))
            ddg.append(exp_ddg)
            pdb_ids.append(pdb_id)
            mut_infos.append(str(wild_aa) + pos + str(mutant_aa))

In [None]:
pd.DataFrame({'wt_seq': wt, 
              'mut_seq': mut ,
              'ddg': ddg, 
              'pdb_id': pdb_ids, 
              'mut_info': mut_infos,
              'pos': poss}).to_csv('DATASETS/S2648.csv')

In [39]:
df_s669 = pd.read_csv('datasets/Data_s669_with_predictions.csv')

for pdb_id in list(set(df_s669.Protein.to_list())):
    with open(f"PDB/{pdb_id[:4]}.pdb", "w") as fh:
        fh.write(pdb_client.get_pdb_file(f"{pdb_id[:4]}", compression=False))

Sending GET request to https://files.rcsb.org/download/1EKG.pdb to fetch 1EKG's pdb file as a string.
Sending GET request to https://files.rcsb.org/download/1GLU.pdb to fetch 1GLU's pdb file as a string.
Sending GET request to https://files.rcsb.org/download/1F8I.pdb to fetch 1F8I's pdb file as a string.
Sending GET request to https://files.rcsb.org/download/2ZTA.pdb to fetch 2ZTA's pdb file as a string.
Sending GET request to https://files.rcsb.org/download/1NM1.pdb to fetch 1NM1's pdb file as a string.
Sending GET request to https://files.rcsb.org/download/3S4M.pdb to fetch 3S4M's pdb file as a string.
Sending GET request to https://files.rcsb.org/download/1FH5.pdb to fetch 1FH5's pdb file as a string.
Sending GET request to https://files.rcsb.org/download/4HE7.pdb to fetch 4HE7's pdb file as a string.
Sending GET request to https://files.rcsb.org/download/1BA3.pdb to fetch 1BA3's pdb file as a string.
Sending GET request to https://files.rcsb.org/download/1A7V.pdb to fetch 1A7V's pd

Sending GET request to https://files.rcsb.org/download/1OSI.pdb to fetch 1OSI's pdb file as a string.
Sending GET request to https://files.rcsb.org/download/1BNL.pdb to fetch 1BNL's pdb file as a string.
Sending GET request to https://files.rcsb.org/download/2C9Q.pdb to fetch 2C9Q's pdb file as a string.
Sending GET request to https://files.rcsb.org/download/5OAQ.pdb to fetch 5OAQ's pdb file as a string.
Sending GET request to https://files.rcsb.org/download/2KS4.pdb to fetch 2KS4's pdb file as a string.
Sending GET request to https://files.rcsb.org/download/2PR5.pdb to fetch 2PR5's pdb file as a string.
Sending GET request to https://files.rcsb.org/download/1XZO.pdb to fetch 1XZO's pdb file as a string.
Sending GET request to https://files.rcsb.org/download/2KJ3.pdb to fetch 2KJ3's pdb file as a string.
Sending GET request to https://files.rcsb.org/download/1IV7.pdb to fetch 1IV7's pdb file as a string.
Sending GET request to https://files.rcsb.org/download/1A0F.pdb to fetch 1A0F's pd

In [129]:
df_q3214 = pd.read_csv('ThermoNet/data/datasets/Q3214_direct.csv')
df_q1744 = pd.read_csv('ThermoNet/data/datasets/Q1744_direct.csv')

In [15]:
pdb2info('PDB/1A43.pdb', 'A')

(<Structure id=X>,
 <Chain id=A>,
 'TSILDIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNWMTETLLVQNANPDCKTILKALGPGATLEEMMTACQ',
 {'1': '148',
  '2': '149',
  '3': '150',
  '4': '151',
  '5': '152',
  '6': '153',
  '7': '154',
  '8': '155',
  '9': '156',
  '10': '157',
  '11': '158',
  '12': '159',
  '13': '160',
  '14': '161',
  '15': '162',
  '16': '163',
  '17': '164',
  '18': '165',
  '19': '166',
  '20': '167',
  '21': '168',
  '22': '169',
  '23': '170',
  '24': '171',
  '25': '172',
  '26': '173',
  '27': '174',
  '28': '175',
  '29': '176',
  '30': '177',
  '31': '178',
  '32': '179',
  '33': '180',
  '34': '181',
  '35': '182',
  '36': '183',
  '37': '184',
  '38': '185',
  '39': '186',
  '40': '187',
  '41': '188',
  '42': '189',
  '43': '190',
  '44': '191',
  '45': '192',
  '46': '193',
  '47': '194',
  '48': '195',
  '49': '196',
  '50': '197',
  '51': '198',
  '52': '199',
  '53': '200',
  '54': '201',
  '55': '202',
  '56': '203',
  '57': '204',
  '58': '205',
  '59': '206',
  '60': '207'

In [125]:
wt = []
mut = []
ddg = []
dataset = []
pdb_ids = []
mut_infos = []
poss = []


print('Processing s2648')

for idx in tqdm(range(len(df_S2648))):
    pdb_id = df_S2648.iloc[idx]['PDB_CHAIN']
    wild_aa = df_S2648.iloc[idx]['WILD_RES']
    pos = str(df_S2648.iloc[idx]['POSITION'])
    mutant_aa = df_S2648.iloc[idx]['MUTANT_RES']
    exp_ddg = df_S2648.iloc[idx]['EXP_DDG']
    
    _, _, sequence, pdb2seq_pos, seq2pdb_pos = pdb2info(f'PDB/{pdb_id[:4]}.pdb', pdb_id[-1])
    
     
    if pos not in seq2pdb_pos:
        print(f'Indexing error for {pdb_id} position {pos} not present in mapping {seq2pdb_pos}')
        
    else:
        if sequence[int(seq2pdb_pos[pos])-1]!=wild_aa:
            print(f'Error for {pdb_id} expected {wild_aa} at position {pos} ')
            print(f'Sequence is {sequence}')
            print(f'Mapping is {seq2pdb_pos}')
        
        else:
            wt.append(sequence)
            tt = list(sequence)
            tt[int(seq2pdb_pos[pos])-1] = mutant_aa
            poss.append(int(seq2pdb_pos[pos])-1)
            mut.append(''.join(tt))
            ddg.append(exp_ddg)
            dataset.append('s2648')
            pdb_ids.append(pdb_id)
            mut_infos.append(str(wild_aa) + pos + str(mutant_aa))
            
            mut.append(sequence)
            tt = list(sequence)
            tt[int(seq2pdb_pos[pos])-1] = mutant_aa
            poss.append(int(seq2pdb_pos[pos])-1)
            wt.append(''.join(tt))
            ddg.append(-1*exp_ddg)
            dataset.append('s2648_rev')
            pdb_ids.append(pdb_id)
            mut_infos.append(str(wild_aa) + pos + str(mutant_aa))

Processing s2648


 50%|██████████████████████████████████████████████████████████████████████████████▉                                                                               | 1322/2648 [02:42<01:28, 15.04it/s]

Error for 1LVEA expected L at position 30 
Sequence is DIVMTQSPDSLAVSLGERATINCKSSQSVLYSSNSKNYLAWYQQKPGQPPKLLIYWASTRESGVPDRFSGSGSGTDFTLTISSLQAEDVAVYYCQQYYSTPYSFGQGTKLEIKR
Mapping is {'1': '1', '2': '2', '3': '3', '4': '4', '5': '5', '6': '6', '7': '7', '8': '8', '9': '9', '10': '10', '11': '11', '12': '12', '13': '13', '14': '14', '15': '15', '16': '16', '17': '17', '18': '18', '19': '19', '20': '20', '21': '21', '22': '22', '23': '23', '24': '24', '25': '25', '26': '26', '27': '27', '27A': '28', '27B': '29', '27C': '30', '27D': '31', '27E': '32', '27F': '33', '28': '34', '29': '35', '30': '36', '31': '37', '32': '38', '33': '39', '34': '40', '35': '41', '36': '42', '37': '43', '38': '44', '39': '45', '40': '46', '41': '47', '42': '48', '43': '49', '44': '50', '45': '51', '46': '52', '47': '53', '48': '54', '49': '55', '50': '56', '51': '57', '52': '58', '53': '59', '54': '60', '55': '61', '56': '62', '57': '63', '58': '64', '59': '65', '60': '66', '61': '67', '62': '68', '63': '69', '6

 50%|███████████████████████████████████████████████████████████████████████████████▎                                                                              | 1330/2648 [02:42<00:56, 23.46it/s]

Error for 1LVEA expected Q at position 44 
Sequence is DIVMTQSPDSLAVSLGERATINCKSSQSVLYSSNSKNYLAWYQQKPGQPPKLLIYWASTRESGVPDRFSGSGSGTDFTLTISSLQAEDVAVYYCQQYYSTPYSFGQGTKLEIKR
Mapping is {'1': '1', '2': '2', '3': '3', '4': '4', '5': '5', '6': '6', '7': '7', '8': '8', '9': '9', '10': '10', '11': '11', '12': '12', '13': '13', '14': '14', '15': '15', '16': '16', '17': '17', '18': '18', '19': '19', '20': '20', '21': '21', '22': '22', '23': '23', '24': '24', '25': '25', '26': '26', '27': '27', '27A': '28', '27B': '29', '27C': '30', '27D': '31', '27E': '32', '27F': '33', '28': '34', '29': '35', '30': '36', '31': '37', '32': '38', '33': '39', '34': '40', '35': '41', '36': '42', '37': '43', '38': '44', '39': '45', '40': '46', '41': '47', '42': '48', '43': '49', '44': '50', '45': '51', '46': '52', '47': '53', '48': '54', '49': '55', '50': '56', '51': '57', '52': '58', '53': '59', '54': '60', '55': '61', '56': '62', '57': '63', '58': '64', '59': '65', '60': '66', '61': '67', '62': '68', '63': '69', '6

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2648/2648 [05:19<00:00,  8.29it/s]


In [126]:
for idx in tqdm(range(len(df_s669))):
    pdb_id = df_s669.iloc[idx]['Protein']
    ms = df_s669.iloc[idx]['PDB_Mut']
    wild_aa = ms[0]
    pos = ms[1:-1]
    mutant_aa = ms[-1]
    exp_ddg = df_s669.iloc[idx]['DDG_checked_dir']
    
    _, _, sequence, pdb2seq_pos, seq2pdb_pos = pdb2info(f'PDB/{pdb_id[:4]}.pdb', pdb_id[-1])
    
    if pos not in seq2pdb_pos:
        print(f'Indexing error for {pdb_id} position {pos} not present in mapping {seq2pdb_pos}')
        
    else:
        if sequence[int(seq2pdb_pos[pos])-1]!=wild_aa:
            print(f'Error for {pdb_id} expected {wild_aa} at position {pos} ')
            print(f'Sequence is {sequence}')
            print(f'Mapping is {seq2pdb_pos}')
        
        else:
            wt.append(sequence)
            tt = list(sequence)
            tt[int(seq2pdb_pos[pos])-1] = mutant_aa
            poss.append(int(seq2pdb_pos[pos])-1)
            mut.append(''.join(tt))
            ddg.append(exp_ddg)
            dataset.append('s669')
            pdb_ids.append(pdb_id)
            mut_infos.append(str(wild_aa) + pos + str(mutant_aa))



100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 669/669 [03:18<00:00,  3.37it/s]


In [58]:
all_muts = df_q3214.pdb_id.to_list()
print(len(all_muts))
pdb_ids = list(set([t[:4] for t in all_muts]))
print(len(pdb_ids))
for pdb_id in pdb_ids:
    with open(f"PDB/{pdb_id[:4]}.pdb", "w") as fh:
        fh.write(pdb_client.get_pdb_file(f"{pdb_id[:4]}", compression=False))

3214
147
Sending GET request to https://files.rcsb.org/download/3mbp.pdb to fetch 3mbp's pdb file as a string.
Sending GET request to https://files.rcsb.org/download/1sak.pdb to fetch 1sak's pdb file as a string.
Sending GET request to https://files.rcsb.org/download/2q98.pdb to fetch 2q98's pdb file as a string.
Sending GET request to https://files.rcsb.org/download/1blc.pdb to fetch 1blc's pdb file as a string.
Sending GET request to https://files.rcsb.org/download/1hng.pdb to fetch 1hng's pdb file as a string.
Sending GET request to https://files.rcsb.org/download/1msi.pdb to fetch 1msi's pdb file as a string.
Sending GET request to https://files.rcsb.org/download/2trx.pdb to fetch 2trx's pdb file as a string.
Sending GET request to https://files.rcsb.org/download/1am7.pdb to fetch 1am7's pdb file as a string.
Sending GET request to https://files.rcsb.org/download/4lyz.pdb to fetch 4lyz's pdb file as a string.
Sending GET request to https://files.rcsb.org/download/1b0o.pdb to fetch 

Sending GET request to https://files.rcsb.org/download/1pga.pdb to fetch 1pga's pdb file as a string.
Sending GET request to https://files.rcsb.org/download/3pgk.pdb to fetch 3pgk's pdb file as a string.
Sending GET request to https://files.rcsb.org/download/1oh0.pdb to fetch 1oh0's pdb file as a string.
Sending GET request to https://files.rcsb.org/download/1bnl.pdb to fetch 1bnl's pdb file as a string.
Sending GET request to https://files.rcsb.org/download/2a36.pdb to fetch 2a36's pdb file as a string.
Sending GET request to https://files.rcsb.org/download/1dil.pdb to fetch 1dil's pdb file as a string.
Sending GET request to https://files.rcsb.org/download/1frd.pdb to fetch 1frd's pdb file as a string.
Sending GET request to https://files.rcsb.org/download/3d2a.pdb to fetch 3d2a's pdb file as a string.
Sending GET request to https://files.rcsb.org/download/1thq.pdb to fetch 1thq's pdb file as a string.
Sending GET request to https://files.rcsb.org/download/2hmb.pdb to fetch 2hmb's pd

In [127]:
print('Processing q3214')

for idx in tqdm(range(len(df_q3214))):
    pdb_id = df_q3214.iloc[idx]['pdb_id']
    wild_aa = df_q3214.iloc[idx]['wild_type']
    pos = str(df_q3214.iloc[idx]['position'])
    mutant_aa = df_q3214.iloc[idx]['mutant']
    exp_ddg = -1 * df_q3214.iloc[idx]['ddg']
    
    _, _, sequence, pdb2seq_pos, seq2pdb_pos = pdb2info(f'PDB/{pdb_id[:4]}.pdb', pdb_id[-1])
    
     
    if pos not in seq2pdb_pos:
        print(f'Indexing error for {pdb_id} position {pos} not present in mapping {seq2pdb_pos}')
        
    else:
        if sequence[int(seq2pdb_pos[pos])-1]!=wild_aa:
            print(f'Error for {pdb_id} expected {wild_aa} at position {pos} ')
            print(f'Sequence is {sequence}')
            print(f'Mapping is {seq2pdb_pos}')
        
        else:
            wt.append(sequence)
            tt = list(sequence)
            tt[int(seq2pdb_pos[pos])-1] = mutant_aa
            poss.append(int(seq2pdb_pos[pos])-1)
            mut.append(''.join(tt))
            ddg.append(exp_ddg)
            dataset.append('q3214')
            pdb_ids.append(pdb_id)
            mut_infos.append(str(wild_aa) + pos + str(mutant_aa))
            
            
            mut.append(sequence)
            tt = list(sequence)
            tt[int(seq2pdb_pos[pos])-1] = mutant_aa
            poss.append(int(seq2pdb_pos[pos])-1)
            wt.append(''.join(tt))
            ddg.append(-1 * exp_ddg)
            dataset.append('q3214_rev')
            pdb_ids.append(pdb_id)
            mut_infos.append(str(wild_aa) + pos + str(mutant_aa))

Processing q3214








100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3214/3214 [05:37<00:00,  9.51it/s]


In [131]:
pd.DataFrame({'wt_seq': wt, 
              'mut_seq': mut ,
              'ddg': ddg, 
              'dataset': dataset, 
              'pdb_id': pdb_ids, 
              'mut_info': mut_infos,
              'pos': poss}).to_csv('ddg_v5.csv')

In [130]:
for idx in tqdm(range(len(df_q1744))):
    pdb_id = df_q3214.iloc[idx]['pdb_id']
    wild_aa = df_q3214.iloc[idx]['wild_type']
    pos = str(df_q3214.iloc[idx]['position'])
    mutant_aa = df_q3214.iloc[idx]['mutant']
    exp_ddg = -1 * df_q3214.iloc[idx]['ddg']
    
    _, _, sequence, pdb2seq_pos, seq2pdb_pos = pdb2info(f'PDB/{pdb_id[:4]}.pdb', pdb_id[-1])
    
     
    if pos not in seq2pdb_pos:
        print(f'Indexing error for {pdb_id} position {pos} not present in mapping {seq2pdb_pos}')
        
    else:
        if sequence[int(seq2pdb_pos[pos])-1]!=wild_aa:
            print(f'Error for {pdb_id} expected {wild_aa} at position {pos} ')
            print(f'Sequence is {sequence}')
            print(f'Mapping is {seq2pdb_pos}')
        
        else:
            wt.append(sequence)
            tt = list(sequence)
            tt[int(seq2pdb_pos[pos])-1] = mutant_aa
            poss.append(int(seq2pdb_pos[pos])-1)
            mut.append(''.join(tt))
            ddg.append(exp_ddg)
            dataset.append('q1744')
            pdb_ids.append(pdb_id)
            mut_infos.append(str(wild_aa) + pos + str(mutant_aa))
            
            
            mut.append(sequence)
            tt = list(sequence)
            tt[int(seq2pdb_pos[pos])-1] = mutant_aa
            poss.append(int(seq2pdb_pos[pos])-1)
            wt.append(''.join(tt))
            ddg.append(-1 * exp_ddg)
            dataset.append('q1744_rev')
            pdb_ids.append(pdb_id)
            mut_infos.append(str(wild_aa) + pos + str(mutant_aa))





100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1744/1744 [02:02<00:00, 14.20it/s]


In [95]:
df = pd.DataFrame({'wt_seq': wt, 
              'mut_seq': mut ,
              'ddg': ddg, 
              'dataset': dataset, 
              'pdb_id': pdb_ids, 
              'mut_info': mut_infos,
              'pos': poss})

In [96]:
len(set(df[df.dataset == 's2648'].mut_seq.to_list()).intersection(set(df[df.dataset == 'q3214'].mut_seq.to_list())))

1964

In [97]:
len(set(df[df.dataset == 's2648'].mut_seq.to_list()).intersection(set(df[df.dataset == 's669'].mut_seq.to_list())))

0

In [98]:
len(set(df[df.dataset == 'q3214'].mut_seq.to_list()).intersection(set(df[df.dataset == 's669'].mut_seq.to_list())))

54

In [99]:
intersect = set(df[df.dataset == 'q3214'].mut_seq.to_list()).intersection(set(df[df.dataset == 's669'].mut_seq.to_list()))

In [100]:
df[df.mut_seq.isin(intersect)].sort_values(['mut_info']).head(20)

Unnamed: 0,wt_seq,mut_seq,ddg,dataset,pdb_id,mut_info,pos
2656,MKVIFLKDVKGKGKKGEIKNVADGYANNFLFKQGLAIEATPANLKA...,MKVIFLKDVKGKGKKGEIKNVAAGYANNFLFKQGLAIEATPANLKA...,-0.46,s669,1DIVA,D23A,22
6197,MKVIFLKDVKGKGKKGEIKNVADGYANNFLFKQGLAIEATPANLKA...,MKVIFLKDVKGKGKKGEIKNVAAGYANNFLFKQGLAIEATPANLKA...,0.35,q3214,1divA,D23A,22
2657,MKVIFLKDVKGKGKKGEIKNVADGYANNFLFKQGLAIEATPANLKA...,MKVIFLKDVKGKGKKGEIKNVANGYANNFLFKQGLAIEATPANLKA...,-0.58,s669,1DIVA,D23N,22
6439,MKVIFLKDVKGKGKKGEIKNVADGYANNFLFKQGLAIEATPANLKA...,MKVIFLKDVKGKGKKGEIKNVANGYANNFLFKQGLAIEATPANLKA...,-0.44,q3214,1divA,D23N,22
2653,HSHRDFQPVLHLVALNAPLSGGMRGIRGADFQCFQQARAVGLAGTF...,HSHRDFQPVLHLVALNAPLSGGMRGIRGADFQCFQQARAVGLAGTF...,-0.821,s669,1BNLA,D76A,75
5940,HSHRDFQPVLHLVALNAPLSGGMRGIRGADFQCFQQARAVGLAGTF...,HSHRDFQPVLHLVALNAPLSGGMRGIRGADFQCFQQARAVGLAGTF...,-3.88,q3214,1bnlA,D76A,75
3325,MKVIFLKDVKGKGKKGEIKNVADGYANNFLFKQGLAIEATPANLKA...,MKVIFLKNVKGKGKKGEIKNVADGYANNFLFKQGLAIEATPANLKA...,-0.3,q3214,1divA,D8N,7
2658,MKVIFLKDVKGKGKKGEIKNVADGYANNFLFKQGLAIEATPANLKA...,MKVIFLKNVKGKGKKGEIKNVADGYANNFLFKQGLAIEATPANLKA...,-0.3,s669,1DIVA,D8N,7
3197,RMKQLEDKVEELLSKNYHLENEVARLKKLVG,RMKQLEDKVEQLLSKNYHLENEVARLKKLVG,-0.74,s669,2ZTAA,E11Q,10
4181,RMKQLEDKVEELLSKNYHLENEVARLKKLVG,RMKQLEDKVEQLLSKNYHLENEVARLKKLVG,-0.74,q3214,2ztaA,E11Q,10


In [74]:
df

Unnamed: 0,wt_seq,mut_seq,ddg,dataset,pdb_id,mut_info,pos
0,TSILDIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNWMTETLLVQN...,TSILDIRQAPKEPFRDYVDRFYKTLRAEQASQEVKNWMTETLLVQN...,-2.40,s2648,1A43A,G156A,8
1,TSILDIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNWMTETLLVQN...,TSILDIRQGPKDPFRDYVDRFYKTLRAEQASQEVKNWMTETLLVQN...,-4.55,s2648,1A43A,E159D,11
2,TSILDIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNWMTETLLVQN...,TSILDIRQGPKEPFRDYVDAFYKTLRAEQASQEVKNWMTETLLVQN...,-4.55,s2648,1A43A,R167A,19
3,TSILDIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNWMTETLLVQN...,TSILDIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNAMTETLLVQN...,-0.70,s2648,1A43A,W184A,36
4,TSILDIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNWMTETLLVQN...,TSILDIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNWMTETLLVQN...,-3.70,s2648,1A43A,C218S,70
...,...,...,...,...,...,...,...
6511,MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSEL...,MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAKKSEL...,3.70,q3214,2lzmA,A42K,41
6512,AKESTGFKPGSAKKGATLFKTRCQQCHTIEEGGPNKVGPNLHGIFG...,AKESTGFKPGSAKKGATLFKTRCQQCHTIEEGGPNKVGPNLHGIFG...,1.20,q3214,1yeaA,P76G,84
6513,KLHKEPATLIKAIDGDTVKLMYKGQPMTFRLLLVDTPETKHPKKGV...,KLHKEPATLIKAIDGDTVKLMYKGQPMTFRLLLVDTPETKHPKKGV...,2.50,q3214,1stnA,V104T,98
6514,MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSEL...,MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPSLNAAKSEL...,1.50,q3214,2lzmA,V71A,70


In [55]:
df_q3214

Unnamed: 0,pdb_id,position,wild_type,mutant,ddg
0,1otrB,34,E,A,-0.07
1,1a5eA,121,L,R,-0.66
2,1rtbA,4,A,S,0.47
3,4lyzA,102,G,R,-0.38
4,1thqA,157,M,A,0.77
...,...,...,...,...,...
3209,2lzmA,42,A,K,3.70
3210,1yeaA,76,P,G,1.20
3211,1stnA,104,V,T,2.50
3212,2lzmA,71,V,A,1.50


In [105]:
comp_df = pd.read_csv('novozymes-enzyme-stability-prediction/test.csv')

In [106]:
_, _, wt_sequence, _, _ = pdb2info(f'novozymes-enzyme-stability-prediction/wildtype_structure_prediction_af2.pdb', 'A')

In [107]:
wt_sequence

'VPVNPEPDATSVENVALKTGSGDSQSDPIKADLEVKGQSALPFDVDCWAILCKGAPNVLQRVNEKTKNSNRDRSGANKGPFKDPQKWGIKALPPKNPSWSAQDFKSPEEYAFASSLQGGTNAILAPVNLASQNSQGGVLNGFYSANKVAQFDPSKPQQTKGTWFQITKFTGAAGPYCKALGSNDKSVCDKNKNIAGDWGFDPAKWAYQYDEKNNKFNYVGK'

In [108]:
len(wt_sequence)

221

In [114]:
wt = []
mut = []
ddg = []
pdb_ids = []

for idx in tqdm(range(len(comp_df))):
    
    wt.append(wt_sequence)
    mut.append(comp_df.iloc[idx]['protein_sequence'])
    ddg.append(0)
    pdb_ids.append(comp_df.iloc[idx]['seq_id'])

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2413/2413 [00:00<00:00, 8615.09it/s]


In [122]:
poss = []
for wt_seq, mut_seq in zip(wt, mut):
    if len(wt_seq)!=len(mut_seq):
        poss.append(-1)
    else:
        first = False
        for p, (w,m) in enumerate(zip(wt_seq, mut_seq)):
            if w!=m:
                if first:
                    print('Err')
                else:
                    poss.append(p)
                    first = True
        if not first:
            poss.append(-1)

In [123]:
len(poss)

2413

In [124]:
pd.DataFrame({'wt_seq': wt, 
              'mut_seq': mut ,
              'ddg': ddg, 
              'pdb_id': pdb_ids,
              'pos': poss}).to_csv('ddg_competition.csv')

In [37]:
df_s669

Unnamed: 0.1,Unnamed: 0,Protein,PDB_Mut,Mut_seq,TEMP,pH,DDG_checked_dir,DOI,nmr_xray,resolution,...,SEC_STR_dir,SEC_STR_inv,ThermoNet_dir,ThermoNet_inv,ACDC-NN-Seq_inv,ACDC-NN_inv,PDB_wild,DDGun_inv,DDG_checked_inv,DDGun3D_inv
0,0,1A0FA,S11A,S11A,329.83,6.5,-1.800,10.1042/BJ20061707,EXPDTA X-RAY DIFFRACTION,REMARK 2 RESOLUTION. 2.10 ANGSTROMS.,...,T,T,0.0209,-0.1772,-0.041723,-0.319539,1A0F,-0.0,1.800,-0.5
1,1,1A7VA,A104H,A104H,298.15,6.5,-2.690,10.1016/j.jmb.2009.07.074,EXPDTA X-RAY DIFFRACTION,REMARK 2 RESOLUTION. 2.30 ANGSTROMS.,...,H,H,0.1795,-0.0441,0.495499,0.308649,1A7V,0.5,2.690,0.1
2,2,1A7VA,A66H,A66H,298.15,6.5,-1.980,10.1016/j.jmb.2009.07.074,EXPDTA X-RAY DIFFRACTION,REMARK 2 RESOLUTION. 2.30 ANGSTROMS.,...,C,C,0.0017,-0.2933,-0.114885,-0.392607,1A7V,-0.8,1.980,-0.5
3,3,1A7VA,A91H,A91H,298.15,6.5,-1.700,10.1016/j.jmb.2009.07.074,EXPDTA X-RAY DIFFRACTION,REMARK 2 RESOLUTION. 2.30 ANGSTROMS.,...,H,H,0.2384,-0.0203,0.264991,0.183003,1A7V,0.2,1.700,0.1
4,4,1A7VA,D3H,D3H,298.15,6.5,-1.360,10.1016/j.jmb.2009.07.074,EXPDTA X-RAY DIFFRACTION,REMARK 2 RESOLUTION. 2.30 ANGSTROMS.,...,C,H,0.0510,-0.1593,0.000000,0.126173,1A7V,-0.5,1.360,-0.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
664,689,5JXBA,D329P,D25P,,7.5,-1.440,10.1371/journal.pone.0098124,EXPDTA X-RAY DIFFRACTION,REMARK 2 RESOLUTION. 2.90 ANGSTROMS.,...,S,T,-0.1373,-0.0681,-0.000536,-0.134216,5JXB,0.1,1.440,0.0
665,690,5OAQA,Y429H,Y199H,298.00,7.4,-2.990,10.1111/febs.14817,EXPDTA X-RAY DIFFRACTION,REMARK 2 RESOLUTION. 1.95 ANGSTROMS.,...,E,E,-0.2299,1.5324,1.387815,1.590481,5OAQ,2.6,2.990,2.4
666,691,5VP3A,R39K,R39K,298.15,8.0,0.413,10.1016/j.saa.2016.01.020,EXPDTA X-RAY DIFFRACTION,REMARK 2 RESOLUTION. 2.15 ANGSTROMS.,...,H,H,0.0667,-0.7076,0.406377,0.528053,5VP3,1.3,-0.413,1.0
667,692,5VP3A,S128G,S128G,298.15,8.0,-0.378,10.1016/j.saa.2016.01.020,EXPDTA X-RAY DIFFRACTION,REMARK 2 RESOLUTION. 2.15 ANGSTROMS.,...,H,H,-0.0753,-0.1563,0.701056,0.537406,5VP3,0.8,0.378,0.8


In [22]:
pdb2info(f'PDB/1A43.pdb', 'A')

(<Structure id=X>,
 <Chain id=A>,
 'TSILDIRQGPKEPFRDYVDRFYKTLRAEQASQEVKNWMTETLLVQNANPDCKTILKALGPGATLEEMMTACQ',
 {'1': '148',
  '2': '149',
  '3': '150',
  '4': '151',
  '5': '152',
  '6': '153',
  '7': '154',
  '8': '155',
  '9': '156',
  '10': '157',
  '11': '158',
  '12': '159',
  '13': '160',
  '14': '161',
  '15': '162',
  '16': '163',
  '17': '164',
  '18': '165',
  '19': '166',
  '20': '167',
  '21': '168',
  '22': '169',
  '23': '170',
  '24': '171',
  '25': '172',
  '26': '173',
  '27': '174',
  '28': '175',
  '29': '176',
  '30': '177',
  '31': '178',
  '32': '179',
  '33': '180',
  '34': '181',
  '35': '182',
  '36': '183',
  '37': '184',
  '38': '185',
  '39': '186',
  '40': '187',
  '41': '188',
  '42': '189',
  '43': '190',
  '44': '191',
  '45': '192',
  '46': '193',
  '47': '194',
  '48': '195',
  '49': '196',
  '50': '197',
  '51': '198',
  '52': '199',
  '53': '200',
  '54': '201',
  '55': '202',
  '56': '203',
  '57': '204',
  '58': '205',
  '59': '206',
  '60': '207'

In [20]:
df_S2648

Unnamed: 0,PDB_CHAIN,WILD_RES,POSITION,MUTANT_RES,PH,TEMPERATURE,EXP_DDG
0,1A43A,G,156,A,7.3,25.0,-2.40
1,1A43A,E,159,D,7.3,25.0,-4.55
2,1A43A,R,167,A,7.3,25.0,-4.55
3,1A43A,W,184,A,7.3,25.0,-0.70
4,1A43A,C,218,S,7.3,25.0,-3.70
...,...,...,...,...,...,...,...
2643,5PTIA,G,36,D,8.7,25.0,-2.80
2644,5PTIA,G,36,S,4.6,72.6,-0.70
2645,5PTIA,G,37,A,5.5,23.9,-3.01
2646,5PTIA,G,37,D,8.7,25.0,-1.70


In [17]:
pdb2seq_pos

{'1': '148',
 '2': '149',
 '3': '150',
 '4': '151',
 '5': '152',
 '6': '153',
 '7': '154',
 '8': '155',
 '9': '156',
 '10': '157',
 '11': '158',
 '12': '159',
 '13': '160',
 '14': '161',
 '15': '162',
 '16': '163',
 '17': '164',
 '18': '165',
 '19': '166',
 '20': '167',
 '21': '168',
 '22': '169',
 '23': '170',
 '24': '171',
 '25': '172',
 '26': '173',
 '27': '174',
 '28': '175',
 '29': '176',
 '30': '177',
 '31': '178',
 '32': '179',
 '33': '180',
 '34': '181',
 '35': '182',
 '36': '183',
 '37': '184',
 '38': '185',
 '39': '186',
 '40': '187',
 '41': '188',
 '42': '189',
 '43': '190',
 '44': '191',
 '45': '192',
 '46': '193',
 '47': '194',
 '48': '195',
 '49': '196',
 '50': '197',
 '51': '198',
 '52': '199',
 '53': '200',
 '54': '201',
 '55': '202',
 '56': '203',
 '57': '204',
 '58': '205',
 '59': '206',
 '60': '207',
 '61': '208',
 '62': '209',
 '63': '210',
 '64': '211',
 '65': '212',
 '66': '213',
 '67': '214',
 '68': '215',
 '69': '216',
 '70': '217',
 '71': '218',
 '72': '219'}