In [1]:
import pypdb
import os
import pandas as pd
import pickle
from pypdb.clients.pdb import pdb_client
import tqdm
from tqdm import tqdm

import gzip
import numpy as np
from Bio.PDB import *
from Bio.PDB.Polypeptide import three_to_one, is_aa

In [2]:
## This code for pdb file manipulation is taken is taken from https://github.com/compbiomed-unito/acdc-nn/blob/master/acdc_nn/util.py

def magic_open(path):
    return (gzip.open if path.endswith('.gz') else open)(path, 'rt')

def pdb2seq(pp):
    ''' pdb2seq(pp) takes a pdb_structure_chain 
    and return its sequence '''
    seq = [] # pp.get_sequence()
    reslist = []
    for ppc  in pp:
        reslist += [res for res in ppc]
        seq += [str(ppc.get_sequence())]
    return "".join(seq)

def map_pdb_pos(pp):
    ''' map_pdb_pos
    Returns two dicts seq2pdb[seq_pos], pdb2seq[pdb_pos]'''
    reslist = []
    for ppc  in pp:
        reslist += [res for res in ppc]
    seq2pdb = dict(zip( map(str,range(1,len(reslist)+1)), [str(r.get_id()[1])+r.get_id()[2].strip() for r in reslist]))
    pdb2seq = dict(zip( [str(r.get_id()[1])+r.get_id()[2].strip() for r in reslist], map(str,range(1,len(reslist)+1)) ))
    return seq2pdb, pdb2seq

def pdb2info(pdb_file, chain):
    ''' pdb2info(pdb_file) 
    Returns structure, polypeptide '''
    parser=PDBParser(QUIET=True)
    with magic_open(pdb_file) as f:
        structure = parser.get_structure('X', f)
    pchain=structure[0][chain]
    ppb=PPBuilder()
    pp = ppb.build_peptides(pchain, aa_only=False) #[0]
    return (structure, pchain, pdb2seq(pp), *map_pdb_pos(pp)) 

# S2648

In [3]:
df_S2648 = pd.read_csv('DATA/S2648.csv')

In [4]:
print('Total dataset length', len(df_S2648))
pdb_ids = list(set([t.split()[0].upper() for t in df_S2648.PDB_CHAIN.to_list()]))
print('Total number of different chains in dataset', len(pdb_ids))

Total dataset length 2648
Total number of different chains in dataset 132


In [5]:
for pdb_id in pdb_ids:
    if not os.path.isfile(f"PDB/{pdb_id[:4]}.pdb"):
        with open(f"PDB/{pdb_id[:4]}.pdb", "w") as fh:
            fh.write(pdb_client.get_pdb_file(f"{pdb_id[:4]}", compression=False))

In [6]:
wt = []
mut = []
ddg = []
pdb_ids = []
mut_infos = []
poss = []

verbatim_pdb_ids = {'1LVEA'}


print('Processing s2648')

for idx in tqdm(range(len(df_S2648))):
    pdb_id = df_S2648.iloc[idx]['PDB_CHAIN'].upper()
    wild_aa = df_S2648.iloc[idx]['WILD_RES']
    pos = str(df_S2648.iloc[idx]['POSITION'])
    mutant_aa = df_S2648.iloc[idx]['MUTANT_RES']
    exp_ddg = df_S2648.iloc[idx]['EXP_DDG']
        
    _, _, sequence, pdb2seq_pos, seq2pdb_pos = pdb2info(f'PDB/{pdb_id[:4]}.pdb', pdb_id[-1])
    
    if pdb_id in verbatim_pdb_ids:
        seq2pdb_pos = {str(i):str(i) for i in range(len(sequence))}
    
    if pos not in seq2pdb_pos:
        print(f'Indexing error for {pdb_id} position {pos} not present in mapping {seq2pdb_pos}')
        
    else:
        if sequence[int(seq2pdb_pos[pos])-1]!=wild_aa:
            print(f'Error for {pdb_id} expected {wild_aa} at position {pos} ')
            print(f'Sequence is {sequence}')
            print(f'Mapping is {seq2pdb_pos}')
        
        else:
            wt.append(sequence)
            tt = list(sequence)
            tt[int(seq2pdb_pos[pos])-1] = mutant_aa
            poss.append(int(seq2pdb_pos[pos])-1)
            mut.append(''.join(tt))
            ddg.append(exp_ddg)
            pdb_ids.append(pdb_id)
            mut_infos.append(str(wild_aa) + pos + str(mutant_aa))

Processing s2648






100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2648/2648 [03:18<00:00, 13.33it/s]


In [7]:
pd.DataFrame({'wt_seq': wt, 
              'mut_seq': mut ,
              'ddg': ddg, 
              'pdb_id': pdb_ids, 
              'mut_info': mut_infos,
              'pos': poss}).to_csv('DATASETS/S2648.csv')

# S3488

In [8]:
df_1744 = pd.read_csv('DATA/Q1744.txt', sep = ' ', names = ['PDB_CHAIN', 'POSITION', 'WILD_RES', 'MUTANT_RES', 'EXP_DDG'])

In [9]:
print('Total dataset length', len(df_1744))
pdb_ids = list(set([t.split()[0].upper() for t in df_1744.PDB_CHAIN.to_list()]))
print('Total number of different chains in dataset', len(pdb_ids))

Total dataset length 1744
Total number of different chains in dataset 127


In [10]:
for pdb_id in pdb_ids:
    if not os.path.isfile(f"PDB/{pdb_id[:4]}.pdb"):
        with open(f"PDB/{pdb_id[:4]}.pdb", "w") as fh:
            fh.write(pdb_client.get_pdb_file(f"{pdb_id[:4]}", compression=False))

In [11]:
wt = []
mut = []
ddg = []
pdb_ids = []
mut_infos = []
poss = []

verbatim_pdb_ids = {'1LVEA'}


print('Processing S3488')

for idx in tqdm(range(len(df_1744))):
    pdb_id = df_1744.iloc[idx]['PDB_CHAIN'].upper()
    wild_aa = df_1744.iloc[idx]['WILD_RES']
    pos = str(df_1744.iloc[idx]['POSITION'])
    mutant_aa = df_1744.iloc[idx]['MUTANT_RES']
    exp_ddg = df_1744.iloc[idx]['EXP_DDG']
        
    _, _, sequence, pdb2seq_pos, seq2pdb_pos = pdb2info(f'PDB/{pdb_id[:4]}.pdb', pdb_id[-1])
    
    if pdb_id in verbatim_pdb_ids:
        seq2pdb_pos = {str(i):str(i) for i in range(len(sequence))}
    
    if pos not in seq2pdb_pos:
        print(f'Indexing error for {pdb_id} position {pos} not present in mapping {seq2pdb_pos}')
        
    else:
        if sequence[int(seq2pdb_pos[pos])-1]!=wild_aa:
            print(f'Error for {pdb_id} expected {wild_aa} at position {pos} ')
            print(f'Sequence is {sequence}')
            print(f'Mapping is {seq2pdb_pos}')
        
        else:
            wt.append(sequence)
            tt = list(sequence)
            tt[int(seq2pdb_pos[pos])-1] = mutant_aa
            poss.append(int(seq2pdb_pos[pos])-1)
            mut.append(''.join(tt))
            ddg.append(exp_ddg)
            pdb_ids.append(pdb_id)
            mut_infos.append(str(wild_aa) + pos + str(mutant_aa))
            
            mut.append(sequence)
            tt = list(sequence)
            tt[int(seq2pdb_pos[pos])-1] = mutant_aa
            poss.append(int(seq2pdb_pos[pos])-1)
            wt.append(''.join(tt))
            ddg.append(-1*exp_ddg)
            pdb_ids.append(pdb_id)
            mut_infos.append(str(wild_aa) + pos + str(mutant_aa))

Processing S3488


  8%|████████████▎                                                                                                                                                  | 135/1744 [00:11<01:41, 15.79it/s]

Error for 1LVEA expected Q at position 89 
Sequence is DIVMTQSPDSLAVSLGERATINCKSSQSVLYSSNSKNYLAWYQQKPGQPPKLLIYWASTRESGVPDRFSGSGSGTDFTLTISSLQAEDVAVYYCQQYYSTPYSFGQGTKLEIKR
Mapping is {'0': '0', '1': '1', '2': '2', '3': '3', '4': '4', '5': '5', '6': '6', '7': '7', '8': '8', '9': '9', '10': '10', '11': '11', '12': '12', '13': '13', '14': '14', '15': '15', '16': '16', '17': '17', '18': '18', '19': '19', '20': '20', '21': '21', '22': '22', '23': '23', '24': '24', '25': '25', '26': '26', '27': '27', '28': '28', '29': '29', '30': '30', '31': '31', '32': '32', '33': '33', '34': '34', '35': '35', '36': '36', '37': '37', '38': '38', '39': '39', '40': '40', '41': '41', '42': '42', '43': '43', '44': '44', '45': '45', '46': '46', '47': '47', '48': '48', '49': '49', '50': '50', '51': '51', '52': '52', '53': '53', '54': '54', '55': '55', '56': '56', '57': '57', '58': '58', '59': '59', '60': '60', '61': '61', '62': '62', '63': '63', '64': '64', '65': '65', '66': '66', '67': '67', '68': '68', '69': '69'



 28%|████████████████████████████████████████████▊                                                                                                                  | 492/1744 [00:39<01:18, 15.89it/s]

Error for 1LVEA expected I at position 106 
Sequence is DIVMTQSPDSLAVSLGERATINCKSSQSVLYSSNSKNYLAWYQQKPGQPPKLLIYWASTRESGVPDRFSGSGSGTDFTLTISSLQAEDVAVYYCQQYYSTPYSFGQGTKLEIKR
Mapping is {'0': '0', '1': '1', '2': '2', '3': '3', '4': '4', '5': '5', '6': '6', '7': '7', '8': '8', '9': '9', '10': '10', '11': '11', '12': '12', '13': '13', '14': '14', '15': '15', '16': '16', '17': '17', '18': '18', '19': '19', '20': '20', '21': '21', '22': '22', '23': '23', '24': '24', '25': '25', '26': '26', '27': '27', '28': '28', '29': '29', '30': '30', '31': '31', '32': '32', '33': '33', '34': '34', '35': '35', '36': '36', '37': '37', '38': '38', '39': '39', '40': '40', '41': '41', '42': '42', '43': '43', '44': '44', '45': '45', '46': '46', '47': '47', '48': '48', '49': '49', '50': '50', '51': '51', '52': '52', '53': '53', '54': '54', '55': '55', '56': '56', '57': '57', '58': '58', '59': '59', '60': '60', '61': '61', '62': '62', '63': '63', '64': '64', '65': '65', '66': '66', '67': '67', '68': '68', '69': '69

 29%|██████████████████████████████████████████████▋                                                                                                                | 512/1744 [00:40<00:53, 22.86it/s]

Error for 1LVEA expected Y at position 96 
Sequence is DIVMTQSPDSLAVSLGERATINCKSSQSVLYSSNSKNYLAWYQQKPGQPPKLLIYWASTRESGVPDRFSGSGSGTDFTLTISSLQAEDVAVYYCQQYYSTPYSFGQGTKLEIKR
Mapping is {'0': '0', '1': '1', '2': '2', '3': '3', '4': '4', '5': '5', '6': '6', '7': '7', '8': '8', '9': '9', '10': '10', '11': '11', '12': '12', '13': '13', '14': '14', '15': '15', '16': '16', '17': '17', '18': '18', '19': '19', '20': '20', '21': '21', '22': '22', '23': '23', '24': '24', '25': '25', '26': '26', '27': '27', '28': '28', '29': '29', '30': '30', '31': '31', '32': '32', '33': '33', '34': '34', '35': '35', '36': '36', '37': '37', '38': '38', '39': '39', '40': '40', '41': '41', '42': '42', '43': '43', '44': '44', '45': '45', '46': '46', '47': '47', '48': '48', '49': '49', '50': '50', '51': '51', '52': '52', '53': '53', '54': '54', '55': '55', '56': '56', '57': '57', '58': '58', '59': '59', '60': '60', '61': '61', '62': '62', '63': '63', '64': '64', '65': '65', '66': '66', '67': '67', '68': '68', '69': '69'

 34%|█████████████████████████████████████████████████████▌                                                                                                         | 588/1744 [00:47<01:57,  9.84it/s]

Error for 1LVEA expected N at position 28 
Sequence is DIVMTQSPDSLAVSLGERATINCKSSQSVLYSSNSKNYLAWYQQKPGQPPKLLIYWASTRESGVPDRFSGSGSGTDFTLTISSLQAEDVAVYYCQQYYSTPYSFGQGTKLEIKR
Mapping is {'0': '0', '1': '1', '2': '2', '3': '3', '4': '4', '5': '5', '6': '6', '7': '7', '8': '8', '9': '9', '10': '10', '11': '11', '12': '12', '13': '13', '14': '14', '15': '15', '16': '16', '17': '17', '18': '18', '19': '19', '20': '20', '21': '21', '22': '22', '23': '23', '24': '24', '25': '25', '26': '26', '27': '27', '28': '28', '29': '29', '30': '30', '31': '31', '32': '32', '33': '33', '34': '34', '35': '35', '36': '36', '37': '37', '38': '38', '39': '39', '40': '40', '41': '41', '42': '42', '43': '43', '44': '44', '45': '45', '46': '46', '47': '47', '48': '48', '49': '49', '50': '50', '51': '51', '52': '52', '53': '53', '54': '54', '55': '55', '56': '56', '57': '57', '58': '58', '59': '59', '60': '60', '61': '61', '62': '62', '63': '63', '64': '64', '65': '65', '66': '66', '67': '67', '68': '68', '69': '69'

 43%|████████████████████████████████████████████████████████████████████▌                                                                                          | 752/1744 [01:02<00:45, 21.80it/s]

Error for 1LVEA expected K at position 39 
Sequence is DIVMTQSPDSLAVSLGERATINCKSSQSVLYSSNSKNYLAWYQQKPGQPPKLLIYWASTRESGVPDRFSGSGSGTDFTLTISSLQAEDVAVYYCQQYYSTPYSFGQGTKLEIKR
Mapping is {'0': '0', '1': '1', '2': '2', '3': '3', '4': '4', '5': '5', '6': '6', '7': '7', '8': '8', '9': '9', '10': '10', '11': '11', '12': '12', '13': '13', '14': '14', '15': '15', '16': '16', '17': '17', '18': '18', '19': '19', '20': '20', '21': '21', '22': '22', '23': '23', '24': '24', '25': '25', '26': '26', '27': '27', '28': '28', '29': '29', '30': '30', '31': '31', '32': '32', '33': '33', '34': '34', '35': '35', '36': '36', '37': '37', '38': '38', '39': '39', '40': '40', '41': '41', '42': '42', '43': '43', '44': '44', '45': '45', '46': '46', '47': '47', '48': '48', '49': '49', '50': '50', '51': '51', '52': '52', '53': '53', '54': '54', '55': '55', '56': '56', '57': '57', '58': '58', '59': '59', '60': '60', '61': '61', '62': '62', '63': '63', '64': '64', '65': '65', '66': '66', '67': '67', '68': '68', '69': '69'

 48%|███████████████████████████████████████████████████████████████████████████▊                                                                                   | 832/1744 [01:10<00:48, 18.80it/s]

Error for 1LVEA expected S at position 97 
Sequence is DIVMTQSPDSLAVSLGERATINCKSSQSVLYSSNSKNYLAWYQQKPGQPPKLLIYWASTRESGVPDRFSGSGSGTDFTLTISSLQAEDVAVYYCQQYYSTPYSFGQGTKLEIKR
Mapping is {'0': '0', '1': '1', '2': '2', '3': '3', '4': '4', '5': '5', '6': '6', '7': '7', '8': '8', '9': '9', '10': '10', '11': '11', '12': '12', '13': '13', '14': '14', '15': '15', '16': '16', '17': '17', '18': '18', '19': '19', '20': '20', '21': '21', '22': '22', '23': '23', '24': '24', '25': '25', '26': '26', '27': '27', '28': '28', '29': '29', '30': '30', '31': '31', '32': '32', '33': '33', '34': '34', '35': '35', '36': '36', '37': '37', '38': '38', '39': '39', '40': '40', '41': '41', '42': '42', '43': '43', '44': '44', '45': '45', '46': '46', '47': '47', '48': '48', '49': '49', '50': '50', '51': '51', '52': '52', '53': '53', '54': '54', '55': '55', '56': '56', '57': '57', '58': '58', '59': '59', '60': '60', '61': '61', '62': '62', '63': '63', '64': '64', '65': '65', '66': '66', '67': '67', '68': '68', '69': '69'

 48%|████████████████████████████████████████████████████████████████████████████▊                                                                                  | 843/1744 [01:11<00:37, 23.88it/s]

Error for 1LVEA expected Q at position 38 
Sequence is DIVMTQSPDSLAVSLGERATINCKSSQSVLYSSNSKNYLAWYQQKPGQPPKLLIYWASTRESGVPDRFSGSGSGTDFTLTISSLQAEDVAVYYCQQYYSTPYSFGQGTKLEIKR
Mapping is {'0': '0', '1': '1', '2': '2', '3': '3', '4': '4', '5': '5', '6': '6', '7': '7', '8': '8', '9': '9', '10': '10', '11': '11', '12': '12', '13': '13', '14': '14', '15': '15', '16': '16', '17': '17', '18': '18', '19': '19', '20': '20', '21': '21', '22': '22', '23': '23', '24': '24', '25': '25', '26': '26', '27': '27', '28': '28', '29': '29', '30': '30', '31': '31', '32': '32', '33': '33', '34': '34', '35': '35', '36': '36', '37': '37', '38': '38', '39': '39', '40': '40', '41': '41', '42': '42', '43': '43', '44': '44', '45': '45', '46': '46', '47': '47', '48': '48', '49': '49', '50': '50', '51': '51', '52': '52', '53': '53', '54': '54', '55': '55', '56': '56', '57': '57', '58': '58', '59': '59', '60': '60', '61': '61', '62': '62', '63': '63', '64': '64', '65': '65', '66': '66', '67': '67', '68': '68', '69': '69'

 57%|██████████████████████████████████████████████████████████████████████████████████████████▏                                                                    | 989/1744 [01:23<00:46, 16.27it/s]

Error for 1LVEA expected K at position 30 
Sequence is DIVMTQSPDSLAVSLGERATINCKSSQSVLYSSNSKNYLAWYQQKPGQPPKLLIYWASTRESGVPDRFSGSGSGTDFTLTISSLQAEDVAVYYCQQYYSTPYSFGQGTKLEIKR
Mapping is {'0': '0', '1': '1', '2': '2', '3': '3', '4': '4', '5': '5', '6': '6', '7': '7', '8': '8', '9': '9', '10': '10', '11': '11', '12': '12', '13': '13', '14': '14', '15': '15', '16': '16', '17': '17', '18': '18', '19': '19', '20': '20', '21': '21', '22': '22', '23': '23', '24': '24', '25': '25', '26': '26', '27': '27', '28': '28', '29': '29', '30': '30', '31': '31', '32': '32', '33': '33', '34': '34', '35': '35', '36': '36', '37': '37', '38': '38', '39': '39', '40': '40', '41': '41', '42': '42', '43': '43', '44': '44', '45': '45', '46': '46', '47': '47', '48': '48', '49': '49', '50': '50', '51': '51', '52': '52', '53': '53', '54': '54', '55': '55', '56': '56', '57': '57', '58': '58', '59': '59', '60': '60', '61': '61', '62': '62', '63': '63', '64': '64', '65': '65', '66': '66', '67': '67', '68': '68', '69': '69'



Error for 1LVEA expected P at position 40 
Sequence is DIVMTQSPDSLAVSLGERATINCKSSQSVLYSSNSKNYLAWYQQKPGQPPKLLIYWASTRESGVPDRFSGSGSGTDFTLTISSLQAEDVAVYYCQQYYSTPYSFGQGTKLEIKR
Mapping is {'0': '0', '1': '1', '2': '2', '3': '3', '4': '4', '5': '5', '6': '6', '7': '7', '8': '8', '9': '9', '10': '10', '11': '11', '12': '12', '13': '13', '14': '14', '15': '15', '16': '16', '17': '17', '18': '18', '19': '19', '20': '20', '21': '21', '22': '22', '23': '23', '24': '24', '25': '25', '26': '26', '27': '27', '28': '28', '29': '29', '30': '30', '31': '31', '32': '32', '33': '33', '34': '34', '35': '35', '36': '36', '37': '37', '38': '38', '39': '39', '40': '40', '41': '41', '42': '42', '43': '43', '44': '44', '45': '45', '46': '46', '47': '47', '48': '48', '49': '49', '50': '50', '51': '51', '52': '52', '53': '53', '54': '54', '55': '55', '56': '56', '57': '57', '58': '58', '59': '59', '60': '60', '61': '61', '62': '62', '63': '63', '64': '64', '65': '65', '66': '66', '67': '67', '68': '68', '69': '69'

 63%|██████████████████████████████████████████████████████████████████████████████████████████████████▉                                                           | 1092/1744 [01:31<00:36, 17.75it/s]

Error for 1LVEA expected S at position 29 
Sequence is DIVMTQSPDSLAVSLGERATINCKSSQSVLYSSNSKNYLAWYQQKPGQPPKLLIYWASTRESGVPDRFSGSGSGTDFTLTISSLQAEDVAVYYCQQYYSTPYSFGQGTKLEIKR
Mapping is {'0': '0', '1': '1', '2': '2', '3': '3', '4': '4', '5': '5', '6': '6', '7': '7', '8': '8', '9': '9', '10': '10', '11': '11', '12': '12', '13': '13', '14': '14', '15': '15', '16': '16', '17': '17', '18': '18', '19': '19', '20': '20', '21': '21', '22': '22', '23': '23', '24': '24', '25': '25', '26': '26', '27': '27', '28': '28', '29': '29', '30': '30', '31': '31', '32': '32', '33': '33', '34': '34', '35': '35', '36': '36', '37': '37', '38': '38', '39': '39', '40': '40', '41': '41', '42': '42', '43': '43', '44': '44', '45': '45', '46': '46', '47': '47', '48': '48', '49': '49', '50': '50', '51': '51', '52': '52', '53': '53', '54': '54', '55': '55', '56': '56', '57': '57', '58': '58', '59': '59', '60': '60', '61': '61', '62': '62', '63': '63', '64': '64', '65': '65', '66': '66', '67': '67', '68': '68', '69': '69'

 63%|███████████████████████████████████████████████████████████████████████████████████████████████████▌                                                          | 1099/1744 [01:31<00:30, 21.36it/s]

Error for 1LVEA expected K at position 30 
Sequence is DIVMTQSPDSLAVSLGERATINCKSSQSVLYSSNSKNYLAWYQQKPGQPPKLLIYWASTRESGVPDRFSGSGSGTDFTLTISSLQAEDVAVYYCQQYYSTPYSFGQGTKLEIKR
Mapping is {'0': '0', '1': '1', '2': '2', '3': '3', '4': '4', '5': '5', '6': '6', '7': '7', '8': '8', '9': '9', '10': '10', '11': '11', '12': '12', '13': '13', '14': '14', '15': '15', '16': '16', '17': '17', '18': '18', '19': '19', '20': '20', '21': '21', '22': '22', '23': '23', '24': '24', '25': '25', '26': '26', '27': '27', '28': '28', '29': '29', '30': '30', '31': '31', '32': '32', '33': '33', '34': '34', '35': '35', '36': '36', '37': '37', '38': '38', '39': '39', '40': '40', '41': '41', '42': '42', '43': '43', '44': '44', '45': '45', '46': '46', '47': '47', '48': '48', '49': '49', '50': '50', '51': '51', '52': '52', '53': '53', '54': '54', '55': '55', '56': '56', '57': '57', '58': '58', '59': '59', '60': '60', '61': '61', '62': '62', '63': '63', '64': '64', '65': '65', '66': '66', '67': '67', '68': '68', '69': '69'

 73%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                                          | 1277/1744 [01:50<00:40, 11.50it/s]

Error for 1LVEA expected T at position 94 
Sequence is DIVMTQSPDSLAVSLGERATINCKSSQSVLYSSNSKNYLAWYQQKPGQPPKLLIYWASTRESGVPDRFSGSGSGTDFTLTISSLQAEDVAVYYCQQYYSTPYSFGQGTKLEIKR
Mapping is {'0': '0', '1': '1', '2': '2', '3': '3', '4': '4', '5': '5', '6': '6', '7': '7', '8': '8', '9': '9', '10': '10', '11': '11', '12': '12', '13': '13', '14': '14', '15': '15', '16': '16', '17': '17', '18': '18', '19': '19', '20': '20', '21': '21', '22': '22', '23': '23', '24': '24', '25': '25', '26': '26', '27': '27', '28': '28', '29': '29', '30': '30', '31': '31', '32': '32', '33': '33', '34': '34', '35': '35', '36': '36', '37': '37', '38': '38', '39': '39', '40': '40', '41': '41', '42': '42', '43': '43', '44': '44', '45': '45', '46': '46', '47': '47', '48': '48', '49': '49', '50': '50', '51': '51', '52': '52', '53': '53', '54': '54', '55': '55', '56': '56', '57': '57', '58': '58', '59': '59', '60': '60', '61': '61', '62': '62', '63': '63', '64': '64', '65': '65', '66': '66', '67': '67', '68': '68', '69': '69'

 75%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                        | 1301/1744 [01:52<00:21, 20.43it/s]

Error for 1LVEA expected Y at position 96 
Sequence is DIVMTQSPDSLAVSLGERATINCKSSQSVLYSSNSKNYLAWYQQKPGQPPKLLIYWASTRESGVPDRFSGSGSGTDFTLTISSLQAEDVAVYYCQQYYSTPYSFGQGTKLEIKR
Mapping is {'0': '0', '1': '1', '2': '2', '3': '3', '4': '4', '5': '5', '6': '6', '7': '7', '8': '8', '9': '9', '10': '10', '11': '11', '12': '12', '13': '13', '14': '14', '15': '15', '16': '16', '17': '17', '18': '18', '19': '19', '20': '20', '21': '21', '22': '22', '23': '23', '24': '24', '25': '25', '26': '26', '27': '27', '28': '28', '29': '29', '30': '30', '31': '31', '32': '32', '33': '33', '34': '34', '35': '35', '36': '36', '37': '37', '38': '38', '39': '39', '40': '40', '41': '41', '42': '42', '43': '43', '44': '44', '45': '45', '46': '46', '47': '47', '48': '48', '49': '49', '50': '50', '51': '51', '52': '52', '53': '53', '54': '54', '55': '55', '56': '56', '57': '57', '58': '58', '59': '59', '60': '60', '61': '61', '62': '62', '63': '63', '64': '64', '65': '65', '66': '66', '67': '67', '68': '68', '69': '69'

 81%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                             | 1418/1744 [02:01<00:20, 16.23it/s]

Error for 1LVEA expected Q at position 89 
Sequence is DIVMTQSPDSLAVSLGERATINCKSSQSVLYSSNSKNYLAWYQQKPGQPPKLLIYWASTRESGVPDRFSGSGSGTDFTLTISSLQAEDVAVYYCQQYYSTPYSFGQGTKLEIKR
Mapping is {'0': '0', '1': '1', '2': '2', '3': '3', '4': '4', '5': '5', '6': '6', '7': '7', '8': '8', '9': '9', '10': '10', '11': '11', '12': '12', '13': '13', '14': '14', '15': '15', '16': '16', '17': '17', '18': '18', '19': '19', '20': '20', '21': '21', '22': '22', '23': '23', '24': '24', '25': '25', '26': '26', '27': '27', '28': '28', '29': '29', '30': '30', '31': '31', '32': '32', '33': '33', '34': '34', '35': '35', '36': '36', '37': '37', '38': '38', '39': '39', '40': '40', '41': '41', '42': '42', '43': '43', '44': '44', '45': '45', '46': '46', '47': '47', '48': '48', '49': '49', '50': '50', '51': '51', '52': '52', '53': '53', '54': '54', '55': '55', '56': '56', '57': '57', '58': '58', '59': '59', '60': '60', '61': '61', '62': '62', '63': '63', '64': '64', '65': '65', '66': '66', '67': '67', '68': '68', '69': '69'



100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1744/1744 [02:30<00:00, 11.59it/s]


In [12]:
pd.DataFrame({'wt_seq': wt, 
              'mut_seq': mut ,
              'ddg': ddg, 
              'pdb_id': pdb_ids, 
              'mut_info': mut_infos,
              'pos': poss}).to_csv('DATASETS/S3488.csv')

# S3421

In [13]:
df_3421 = pd.read_csv('DATA/Q3421.txt', sep = '\t', skiprows = 2, index_col=False,
                      names = ['PDB_ID', 'PDB_CHAIN', 'POSITION', 'WILD_RES', 'MUTANT_RES', 'EXP_DDG', 'T', 'PH', 'POS2'])

  df_3421 = pd.read_csv('DATA/Q3421.txt', sep = '\t', skiprows = 2, index_col=False,


In [14]:
print('Total dataset length', len(df_3421))
pdb_ids = list(set([t.split()[0].upper() for t in df_3421.PDB_ID.to_list()]))
print('Total number of different chains in dataset', len(pdb_ids))

Total dataset length 3421
Total number of different chains in dataset 148


In [15]:
for pdb_id in pdb_ids:
    if not os.path.isfile(f"PDB/{pdb_id[:4]}.pdb"):
        with open(f"PDB/{pdb_id[:4]}.pdb", "w") as fh:
            fh.write(pdb_client.get_pdb_file(f"{pdb_id[:4]}", compression=False))

In [16]:
wt = []
mut = []
ddg = []
pdb_ids = []
mut_infos = []
poss = []

verbatim_pdb_ids = {'1LVEA'}


print('Processing S3421')

for idx in tqdm(range(len(df_3421))):
    pdb_id = df_3421.iloc[idx]['PDB_ID'].upper() + df_3421.iloc[idx]['PDB_CHAIN'].upper()
    wild_aa = df_3421.iloc[idx]['WILD_RES']
    pos = str(df_3421.iloc[idx]['POSITION'])
    mutant_aa = df_3421.iloc[idx]['MUTANT_RES']
    exp_ddg = df_3421.iloc[idx]['EXP_DDG']
        
    _, _, sequence, pdb2seq_pos, seq2pdb_pos = pdb2info(f'PDB/{pdb_id[:4]}.pdb', pdb_id[-1])
    
    if pdb_id in verbatim_pdb_ids:
        seq2pdb_pos = {str(i):str(i) for i in range(len(sequence))}
    
    if pos not in seq2pdb_pos:
        print(f'Indexing error for {pdb_id} position {pos} not present in mapping {seq2pdb_pos}')
        
    else:
        if sequence[int(seq2pdb_pos[pos])-1]!=wild_aa:
            print(f'Error for {pdb_id} expected {wild_aa} at position {pos} ')
            print(f'Sequence is {sequence}')
            print(f'Mapping is {seq2pdb_pos}')
        
        else:
            wt.append(sequence)
            tt = list(sequence)
            tt[int(seq2pdb_pos[pos])-1] = mutant_aa
            poss.append(int(seq2pdb_pos[pos])-1)
            mut.append(''.join(tt))
            ddg.append(exp_ddg)
            pdb_ids.append(pdb_id)
            mut_infos.append(str(wild_aa) + pos + str(mutant_aa))

Processing S3421


 31%|████████████████████████████████████████████████▍                                                                                                             | 1048/3421 [01:33<05:56,  6.66it/s]

Error for 1LVEA expected Q at position 89 
Sequence is DIVMTQSPDSLAVSLGERATINCKSSQSVLYSSNSKNYLAWYQQKPGQPPKLLIYWASTRESGVPDRFSGSGSGTDFTLTISSLQAEDVAVYYCQQYYSTPYSFGQGTKLEIKR
Mapping is {'0': '0', '1': '1', '2': '2', '3': '3', '4': '4', '5': '5', '6': '6', '7': '7', '8': '8', '9': '9', '10': '10', '11': '11', '12': '12', '13': '13', '14': '14', '15': '15', '16': '16', '17': '17', '18': '18', '19': '19', '20': '20', '21': '21', '22': '22', '23': '23', '24': '24', '25': '25', '26': '26', '27': '27', '28': '28', '29': '29', '30': '30', '31': '31', '32': '32', '33': '33', '34': '34', '35': '35', '36': '36', '37': '37', '38': '38', '39': '39', '40': '40', '41': '41', '42': '42', '43': '43', '44': '44', '45': '45', '46': '46', '47': '47', '48': '48', '49': '49', '50': '50', '51': '51', '52': '52', '53': '53', '54': '54', '55': '55', '56': '56', '57': '57', '58': '58', '59': '59', '60': '60', '61': '61', '62': '62', '63': '63', '64': '64', '65': '65', '66': '66', '67': '67', '68': '68', '69': '69'

 31%|████████████████████████████████████████████████▊                                                                                                             | 1058/3421 [01:33<02:56, 13.36it/s]

Error for 1LVEA expected S at position 29 
Sequence is DIVMTQSPDSLAVSLGERATINCKSSQSVLYSSNSKNYLAWYQQKPGQPPKLLIYWASTRESGVPDRFSGSGSGTDFTLTISSLQAEDVAVYYCQQYYSTPYSFGQGTKLEIKR
Mapping is {'0': '0', '1': '1', '2': '2', '3': '3', '4': '4', '5': '5', '6': '6', '7': '7', '8': '8', '9': '9', '10': '10', '11': '11', '12': '12', '13': '13', '14': '14', '15': '15', '16': '16', '17': '17', '18': '18', '19': '19', '20': '20', '21': '21', '22': '22', '23': '23', '24': '24', '25': '25', '26': '26', '27': '27', '28': '28', '29': '29', '30': '30', '31': '31', '32': '32', '33': '33', '34': '34', '35': '35', '36': '36', '37': '37', '38': '38', '39': '39', '40': '40', '41': '41', '42': '42', '43': '43', '44': '44', '45': '45', '46': '46', '47': '47', '48': '48', '49': '49', '50': '50', '51': '51', '52': '52', '53': '53', '54': '54', '55': '55', '56': '56', '57': '57', '58': '58', '59': '59', '60': '60', '61': '61', '62': '62', '63': '63', '64': '64', '65': '65', '66': '66', '67': '67', '68': '68', '69': '69'







100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3421/3421 [03:52<00:00, 14.71it/s]


In [17]:
pd.DataFrame({'wt_seq': wt, 
              'mut_seq': mut ,
              'ddg': ddg, 
              'pdb_id': pdb_ids, 
              'mut_info': mut_infos,
              'pos': poss}).to_csv('DATASETS/S3421.csv')

# ACDC-varibench

In [18]:
df_acdc_varibench = pd.concat([pd.read_csv(os.path.join('DATA/varibench/', f), sep = ' ',
            names = ['PDB_CHAIN', 'MUTATION', 'EXP_DDG']) for f in os.listdir('DATA/varibench/')]).drop_duplicates()

In [19]:
print('Total dataset length', len(df_acdc_varibench))
pdb_ids = list(set([t.split()[0].upper() for t in df_acdc_varibench.PDB_CHAIN.to_list()]))
print('Total number of different chains in dataset', len(pdb_ids))

Total dataset length 1387
Total number of different chains in dataset 78


In [20]:
for pdb_id in pdb_ids:
    if not os.path.isfile(f"PDB/{pdb_id[:4]}.pdb"):
        with open(f"PDB/{pdb_id[:4]}.pdb", "w") as fh:
            fh.write(pdb_client.get_pdb_file(f"{pdb_id[:4]}", compression=False))

In [21]:
wt = []
mut = []
ddg = []
pdb_ids = []
mut_infos = []
poss = []

no_verbatim_pdb_ids = {'1C9OA', '1VQBA'}


print('Processing ACDC-varibench')

for idx in tqdm(range(len(df_acdc_varibench))):
    pdb_id = df_acdc_varibench.iloc[idx]['PDB_CHAIN'].upper()
    wild_aa = df_acdc_varibench.iloc[idx]['MUTATION'][0]
    pos = df_acdc_varibench.iloc[idx]['MUTATION'][1:-1]
    mutant_aa = df_acdc_varibench.iloc[idx]['MUTATION'][-1]
    exp_ddg = df_acdc_varibench.iloc[idx]['EXP_DDG']
    
    #if pdb_id!= '1CLWA':
    #    continue
        
    _, _, sequence, pdb2seq_pos, seq2pdb_pos = pdb2info(f'PDB/{pdb_id[:4]}.pdb', pdb_id[-1])
    
    if pdb_id not in no_verbatim_pdb_ids:
        seq2pdb_pos = {str(i):str(i) for i in range(len(sequence))}
    
    if pos not in seq2pdb_pos:
        print(f'Indexing error for {pdb_id} position {pos} not present in mapping {seq2pdb_pos}')
        
    else:
        if sequence[int(seq2pdb_pos[pos])-1]!=wild_aa:
            print(f'Error for {pdb_id} expected {wild_aa} at position {pos} ')
            print(f'Sequence is {sequence}')
            print(f'Mapping is {seq2pdb_pos}')
        
        else:
            wt.append(sequence)
            tt = list(sequence)
            tt[int(seq2pdb_pos[pos])-1] = mutant_aa
            poss.append(int(seq2pdb_pos[pos])-1)
            mut.append(''.join(tt))
            ddg.append(exp_ddg)
            pdb_ids.append(pdb_id)
            mut_infos.append(str(wild_aa) + pos + str(mutant_aa))

Processing ACDC-varibench


  1%|█▊                                                                                                                                                              | 16/1387 [00:00<01:45, 12.94it/s]

Error for 1AM7A expected H at position 30 
Sequence is MVEINNQRKAFLDMLAWSEGTDNGRQKTRNHGYDVIVGGELFTDYSDHPRKLVTLNPKLKSTGAGRYQLLSRWWDAYRKQLGLKDFSPKSQDAVALQQIKERGALPMIDRGDIRQAIDRCSNIWASLPGAGYGQFEHKADSLIAKFKEAGGTVR
Mapping is {'0': '0', '1': '1', '2': '2', '3': '3', '4': '4', '5': '5', '6': '6', '7': '7', '8': '8', '9': '9', '10': '10', '11': '11', '12': '12', '13': '13', '14': '14', '15': '15', '16': '16', '17': '17', '18': '18', '19': '19', '20': '20', '21': '21', '22': '22', '23': '23', '24': '24', '25': '25', '26': '26', '27': '27', '28': '28', '29': '29', '30': '30', '31': '31', '32': '32', '33': '33', '34': '34', '35': '35', '36': '36', '37': '37', '38': '38', '39': '39', '40': '40', '41': '41', '42': '42', '43': '43', '44': '44', '45': '45', '46': '46', '47': '47', '48': '48', '49': '49', '50': '50', '51': '51', '52': '52', '53': '53', '54': '54', '55': '55', '56': '56', '57': '57', '58': '58', '59': '59', '60': '60', '61': '61', '62': '62', '63': '63', '64': '64', '65': '65', '66': 

 22%|███████████████████████████████████▏                                                                                                                           | 307/1387 [00:13<00:42, 25.14it/s]

Error for 1ONCA expected M at position 22 
Sequence is EDWLTFQKKHITNTRDVDCDNIMSTNLFHCKDKNTFIYSRPEPVKAICKGIIASKNVLTTSEFYLSDCNVTSRPCKYKLKKSTNKFCVTCENQAPVHFVGVGSC
Mapping is {'0': '0', '1': '1', '2': '2', '3': '3', '4': '4', '5': '5', '6': '6', '7': '7', '8': '8', '9': '9', '10': '10', '11': '11', '12': '12', '13': '13', '14': '14', '15': '15', '16': '16', '17': '17', '18': '18', '19': '19', '20': '20', '21': '21', '22': '22', '23': '23', '24': '24', '25': '25', '26': '26', '27': '27', '28': '28', '29': '29', '30': '30', '31': '31', '32': '32', '33': '33', '34': '34', '35': '35', '36': '36', '37': '37', '38': '38', '39': '39', '40': '40', '41': '41', '42': '42', '43': '43', '44': '44', '45': '45', '46': '46', '47': '47', '48': '48', '49': '49', '50': '50', '51': '51', '52': '52', '53': '53', '54': '54', '55': '55', '56': '56', '57': '57', '58': '58', '59': '59', '60': '60', '61': '61', '62': '62', '63': '63', '64': '64', '65': '65', '66': '66', '67': '67', '68': '68', '69': '69', '70': '7

 37%|██████████████████████████████████████████████████████████▊                                                                                                    | 513/1387 [00:21<00:25, 34.81it/s]

Indexing error for 1STNA position 136 not present in mapping {'0': '0', '1': '1', '2': '2', '3': '3', '4': '4', '5': '5', '6': '6', '7': '7', '8': '8', '9': '9', '10': '10', '11': '11', '12': '12', '13': '13', '14': '14', '15': '15', '16': '16', '17': '17', '18': '18', '19': '19', '20': '20', '21': '21', '22': '22', '23': '23', '24': '24', '25': '25', '26': '26', '27': '27', '28': '28', '29': '29', '30': '30', '31': '31', '32': '32', '33': '33', '34': '34', '35': '35', '36': '36', '37': '37', '38': '38', '39': '39', '40': '40', '41': '41', '42': '42', '43': '43', '44': '44', '45': '45', '46': '46', '47': '47', '48': '48', '49': '49', '50': '50', '51': '51', '52': '52', '53': '53', '54': '54', '55': '55', '56': '56', '57': '57', '58': '58', '59': '59', '60': '60', '61': '61', '62': '62', '63': '63', '64': '64', '65': '65', '66': '66', '67': '67', '68': '68', '69': '69', '70': '70', '71': '71', '72': '72', '73': '73', '74': '74', '75': '75', '76': '76', '77': '77', '78': '78', '79': '79'

 45%|███████████████████████████████████████████████████████████████████████▉                                                                                       | 627/1387 [00:24<00:18, 41.64it/s]

Error for 1YCCA expected C at position 106 
Sequence is TEFKAGSAKKGATLFKTRCLQCHTVEKGGPHKVGPNLHGIFGRHSGQAEGYSYTDANIKKNVLWDENNMSEYLTNPKKYIPGTKMAFGGLKKEKDRNDLITYLKKACE
Mapping is {'0': '0', '1': '1', '2': '2', '3': '3', '4': '4', '5': '5', '6': '6', '7': '7', '8': '8', '9': '9', '10': '10', '11': '11', '12': '12', '13': '13', '14': '14', '15': '15', '16': '16', '17': '17', '18': '18', '19': '19', '20': '20', '21': '21', '22': '22', '23': '23', '24': '24', '25': '25', '26': '26', '27': '27', '28': '28', '29': '29', '30': '30', '31': '31', '32': '32', '33': '33', '34': '34', '35': '35', '36': '36', '37': '37', '38': '38', '39': '39', '40': '40', '41': '41', '42': '42', '43': '43', '44': '44', '45': '45', '46': '46', '47': '47', '48': '48', '49': '49', '50': '50', '51': '51', '52': '52', '53': '53', '54': '54', '55': '55', '56': '56', '57': '57', '58': '58', '59': '59', '60': '60', '61': '61', '62': '62', '63': '63', '64': '64', '65': '65', '66': '66', '67': '67', '68': '68', '69': '69', '70

 46%|█████████████████████████████████████████████████████████████████████████                                                                                      | 637/1387 [00:25<00:19, 37.69it/s]

Error for 1YCCA expected L at position 89 
Sequence is TEFKAGSAKKGATLFKTRCLQCHTVEKGGPHKVGPNLHGIFGRHSGQAEGYSYTDANIKKNVLWDENNMSEYLTNPKKYIPGTKMAFGGLKKEKDRNDLITYLKKACE
Mapping is {'0': '0', '1': '1', '2': '2', '3': '3', '4': '4', '5': '5', '6': '6', '7': '7', '8': '8', '9': '9', '10': '10', '11': '11', '12': '12', '13': '13', '14': '14', '15': '15', '16': '16', '17': '17', '18': '18', '19': '19', '20': '20', '21': '21', '22': '22', '23': '23', '24': '24', '25': '25', '26': '26', '27': '27', '28': '28', '29': '29', '30': '30', '31': '31', '32': '32', '33': '33', '34': '34', '35': '35', '36': '36', '37': '37', '38': '38', '39': '39', '40': '40', '41': '41', '42': '42', '43': '43', '44': '44', '45': '45', '46': '46', '47': '47', '48': '48', '49': '49', '50': '50', '51': '51', '52': '52', '53': '53', '54': '54', '55': '55', '56': '56', '57': '57', '58': '58', '59': '59', '60': '60', '61': '61', '62': '62', '63': '63', '64': '64', '65': '65', '66': '66', '67': '67', '68': '68', '69': '69', '70'

 46%|█████████████████████████████████████████████████████████████████████████▍                                                                                     | 641/1387 [00:25<00:21, 34.29it/s]

Error for 1YCCA expected P at position 80 
Sequence is TEFKAGSAKKGATLFKTRCLQCHTVEKGGPHKVGPNLHGIFGRHSGQAEGYSYTDANIKKNVLWDENNMSEYLTNPKKYIPGTKMAFGGLKKEKDRNDLITYLKKACE
Mapping is {'0': '0', '1': '1', '2': '2', '3': '3', '4': '4', '5': '5', '6': '6', '7': '7', '8': '8', '9': '9', '10': '10', '11': '11', '12': '12', '13': '13', '14': '14', '15': '15', '16': '16', '17': '17', '18': '18', '19': '19', '20': '20', '21': '21', '22': '22', '23': '23', '24': '24', '25': '25', '26': '26', '27': '27', '28': '28', '29': '29', '30': '30', '31': '31', '32': '32', '33': '33', '34': '34', '35': '35', '36': '36', '37': '37', '38': '38', '39': '39', '40': '40', '41': '41', '42': '42', '43': '43', '44': '44', '45': '45', '46': '46', '47': '47', '48': '48', '49': '49', '50': '50', '51': '51', '52': '52', '53': '53', '54': '54', '55': '55', '56': '56', '57': '57', '58': '58', '59': '59', '60': '60', '61': '61', '62': '62', '63': '63', '64': '64', '65': '65', '66': '66', '67': '67', '68': '68', '69': '69', '70'

 78%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                  | 1087/1387 [01:00<00:15, 19.86it/s]

Indexing error for 1BNIA position 108 not present in mapping {'0': '0', '1': '1', '2': '2', '3': '3', '4': '4', '5': '5', '6': '6', '7': '7', '8': '8', '9': '9', '10': '10', '11': '11', '12': '12', '13': '13', '14': '14', '15': '15', '16': '16', '17': '17', '18': '18', '19': '19', '20': '20', '21': '21', '22': '22', '23': '23', '24': '24', '25': '25', '26': '26', '27': '27', '28': '28', '29': '29', '30': '30', '31': '31', '32': '32', '33': '33', '34': '34', '35': '35', '36': '36', '37': '37', '38': '38', '39': '39', '40': '40', '41': '41', '42': '42', '43': '43', '44': '44', '45': '45', '46': '46', '47': '47', '48': '48', '49': '49', '50': '50', '51': '51', '52': '52', '53': '53', '54': '54', '55': '55', '56': '56', '57': '57', '58': '58', '59': '59', '60': '60', '61': '61', '62': '62', '63': '63', '64': '64', '65': '65', '66': '66', '67': '67', '68': '68', '69': '69', '70': '70', '71': '71', '72': '72', '73': '73', '74': '74', '75': '75', '76': '76', '77': '77', '78': '78', '79': '79'

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1387/1387 [01:15<00:00, 18.34it/s]


In [22]:
pd.DataFrame({'wt_seq': wt, 
              'mut_seq': mut ,
              'ddg': ddg, 
              'pdb_id': pdb_ids, 
              'mut_info': mut_infos,
              'pos': poss}).to_csv('DATASETS/ACDC_varibench.csv')

# DeepDDG train

In [23]:
df_deepddg_train = pd.read_csv('DATA/deep_ddg_train.csv', sep = ';')

In [24]:
print('Total dataset length', len(df_deepddg_train))
pdb_ids = list(set([t.split()[0].upper() for t in df_deepddg_train['PDB ID with modifications to be made'].to_list()]))
print('Total number of different chains in dataset', len(pdb_ids))

Total dataset length 5444
Total number of different chains in dataset 209


In [25]:
for pdb_id in pdb_ids:
    if not os.path.isfile(f"PDB/{pdb_id[:4]}.pdb"):
        with open(f"PDB/{pdb_id[:4]}.pdb", "w") as fh:
            fh.write(pdb_client.get_pdb_file(f"{pdb_id[:4]}", compression=False))

In [None]:
wt = []
mut = []
ddg = []
pdb_ids = []
mut_infos = []
poss = []

#no_verbatim_pdb_ids = {'1C9OA', '1VQBA'}


print('Processing DeepDDG train')

for idx in tqdm(range(len(df_deepddg_train))):
    pdb_id = df_deepddg_train.iloc[idx]['PDB ID with modifications to be made'].upper()
    wild_aa = df_deepddg_train.iloc[idx]['Mutation'][0]
    pos = df_deepddg_train.iloc[idx]['Mutation'][2:-2]
    mutant_aa = df_deepddg_train.iloc[idx]['Mutation'][-1]
    exp_ddg = df_deepddg_train.iloc[idx]['ΔΔG (kcal/mol) positive is stable']
    
    #if pdb_id!= '1CLWA':
    #    continue
        
    
    pdb = PDBParser().get_structure("pdb_id[:4]", f'PDB/{pdb_id[:4]}.pdb')
    chain = next(pdb.get_chains()).get_id()
        
    _, _, sequence, pdb2seq_pos, seq2pdb_pos = pdb2info(f'PDB/{pdb_id[:4]}.pdb', chain)
    
    #if pdb_id not in no_verbatim_pdb_ids:
    #    seq2pdb_pos = {str(i):str(i) for i in range(len(sequence))}
    
    if pos not in seq2pdb_pos:
        print(f'Indexing error for {pdb_id} position {pos} not present in mapping {seq2pdb_pos}')
        
    else:
        if sequence[int(seq2pdb_pos[pos])-1]!=wild_aa:
            print(f'Error for {pdb_id} expected {wild_aa} at position {pos} ')
            print(f'Sequence is {sequence}')
            print(f'Mapping is {seq2pdb_pos}')
        
        else:
            wt.append(sequence)
            tt = list(sequence)
            tt[int(seq2pdb_pos[pos])-1] = mutant_aa
            poss.append(int(seq2pdb_pos[pos])-1)
            mut.append(''.join(tt))
            ddg.append(exp_ddg)
            pdb_ids.append(pdb_id)
            mut_infos.append(str(wild_aa) + pos + str(mutant_aa))

Processing DeepDDG train








Error for 1ACB:I:F10W expected V at position 18 
Sequence is CGVPAIQPVLSGLIVNGEEAVPGSWPWQVSLQDKTGFHFCGGSLINENWVVTAAHCGVTTSDVVVAGEFDQGSSSEKIQKLKIAKVFKNSKYNSLTINNDITLLKLSTAASFSQTVSAVCLPSASDDFAAGTTCVTTGWGLTRYANTPDRLQQASLPLLSNTNCKKYWGTKIKDAMICAGASGVSSCMGDSGGPLVCKKNGAWTLVGIVSWGSSTCSTSTPGVYARVTALVNWVQQTLAAN
Mapping is {'1': '1', '2': '2', '3': '3', '4': '4', '5': '5', '6': '6', '7': '7', '8': '8', '9': '9', '10': '10', '11': '11', '12': '12', '13': '13', '16': '14', '17': '15', '18': '16', '19': '17', '20': '18', '21': '19', '22': '20', '23': '21', '24': '22', '25': '23', '26': '24', '27': '25', '28': '26', '29': '27', '30': '28', '31': '29', '32': '30', '33': '31', '34': '32', '35': '33', '36': '34', '37': '35', '38': '36', '39': '37', '40': '38', '41': '39', '42': '40', '43': '41', '44': '42', '45': '43', '46': '44', '47': '45', '48': '46', '49': '47', '50': '48', '51': '49', '52': '50', '53': '51', '54': '52', '55': '53', '56': '54', '57': '55', '58': '56', '59': '57', '60': '58', '61': '



Error for 1ACB:I:F10W expected P at position 58 
Sequence is CGVPAIQPVLSGLIVNGEEAVPGSWPWQVSLQDKTGFHFCGGSLINENWVVTAAHCGVTTSDVVVAGEFDQGSSSEKIQKLKIAKVFKNSKYNSLTINNDITLLKLSTAASFSQTVSAVCLPSASDDFAAGTTCVTTGWGLTRYANTPDRLQQASLPLLSNTNCKKYWGTKIKDAMICAGASGVSSCMGDSGGPLVCKKNGAWTLVGIVSWGSSTCSTSTPGVYARVTALVNWVQQTLAAN
Mapping is {'1': '1', '2': '2', '3': '3', '4': '4', '5': '5', '6': '6', '7': '7', '8': '8', '9': '9', '10': '10', '11': '11', '12': '12', '13': '13', '16': '14', '17': '15', '18': '16', '19': '17', '20': '18', '21': '19', '22': '20', '23': '21', '24': '22', '25': '23', '26': '24', '27': '25', '28': '26', '29': '27', '30': '28', '31': '29', '32': '30', '33': '31', '34': '32', '35': '33', '36': '34', '37': '35', '38': '36', '39': '37', '40': '38', '41': '39', '42': '40', '43': '41', '44': '42', '45': '43', '46': '44', '47': '45', '48': '46', '49': '47', '50': '48', '51': '49', '52': '50', '53': '51', '54': '52', '55': '53', '56': '54', '57': '55', '58': '56', '59': '57', '60': '58', '61': '



Indexing error for 1ACB:I:F10W position 14 not present in mapping {'1': '1', '2': '2', '3': '3', '4': '4', '5': '5', '6': '6', '7': '7', '8': '8', '9': '9', '10': '10', '11': '11', '12': '12', '13': '13', '16': '14', '17': '15', '18': '16', '19': '17', '20': '18', '21': '19', '22': '20', '23': '21', '24': '22', '25': '23', '26': '24', '27': '25', '28': '26', '29': '27', '30': '28', '31': '29', '32': '30', '33': '31', '34': '32', '35': '33', '36': '34', '37': '35', '38': '36', '39': '37', '40': '38', '41': '39', '42': '40', '43': '41', '44': '42', '45': '43', '46': '44', '47': '45', '48': '46', '49': '47', '50': '48', '51': '49', '52': '50', '53': '51', '54': '52', '55': '53', '56': '54', '57': '55', '58': '56', '59': '57', '60': '58', '61': '59', '62': '60', '63': '61', '64': '62', '65': '63', '66': '64', '67': '65', '68': '66', '69': '67', '70': '68', '71': '69', '72': '70', '73': '71', '74': '72', '75': '73', '76': '74', '77': '75', '78': '76', '79': '77', '80': '78', '81': '79', '82



Error for 1ACB:I:F10W expected L at position 27 
Sequence is CGVPAIQPVLSGLIVNGEEAVPGSWPWQVSLQDKTGFHFCGGSLINENWVVTAAHCGVTTSDVVVAGEFDQGSSSEKIQKLKIAKVFKNSKYNSLTINNDITLLKLSTAASFSQTVSAVCLPSASDDFAAGTTCVTTGWGLTRYANTPDRLQQASLPLLSNTNCKKYWGTKIKDAMICAGASGVSSCMGDSGGPLVCKKNGAWTLVGIVSWGSSTCSTSTPGVYARVTALVNWVQQTLAAN
Mapping is {'1': '1', '2': '2', '3': '3', '4': '4', '5': '5', '6': '6', '7': '7', '8': '8', '9': '9', '10': '10', '11': '11', '12': '12', '13': '13', '16': '14', '17': '15', '18': '16', '19': '17', '20': '18', '21': '19', '22': '20', '23': '21', '24': '22', '25': '23', '26': '24', '27': '25', '28': '26', '29': '27', '30': '28', '31': '29', '32': '30', '33': '31', '34': '32', '35': '33', '36': '34', '37': '35', '38': '36', '39': '37', '40': '38', '41': '39', '42': '40', '43': '41', '44': '42', '45': '43', '46': '44', '47': '45', '48': '46', '49': '47', '50': '48', '51': '49', '52': '50', '53': '51', '54': '52', '55': '53', '56': '54', '57': '55', '58': '56', '59': '57', '60': '58', '61': '



Indexing error for 1ACB:I:F10W position 14 not present in mapping {'1': '1', '2': '2', '3': '3', '4': '4', '5': '5', '6': '6', '7': '7', '8': '8', '9': '9', '10': '10', '11': '11', '12': '12', '13': '13', '16': '14', '17': '15', '18': '16', '19': '17', '20': '18', '21': '19', '22': '20', '23': '21', '24': '22', '25': '23', '26': '24', '27': '25', '28': '26', '29': '27', '30': '28', '31': '29', '32': '30', '33': '31', '34': '32', '35': '33', '36': '34', '37': '35', '38': '36', '39': '37', '40': '38', '41': '39', '42': '40', '43': '41', '44': '42', '45': '43', '46': '44', '47': '45', '48': '46', '49': '47', '50': '48', '51': '49', '52': '50', '53': '51', '54': '52', '55': '53', '56': '54', '57': '55', '58': '56', '59': '57', '60': '58', '61': '59', '62': '60', '63': '61', '64': '62', '65': '63', '66': '64', '67': '65', '68': '66', '69': '67', '70': '68', '71': '69', '72': '70', '73': '71', '74': '72', '75': '73', '76': '74', '77': '75', '78': '76', '79': '77', '80': '78', '81': '79', '82



Error for 1ACB:I:F10W expected V at position 54 
Sequence is CGVPAIQPVLSGLIVNGEEAVPGSWPWQVSLQDKTGFHFCGGSLINENWVVTAAHCGVTTSDVVVAGEFDQGSSSEKIQKLKIAKVFKNSKYNSLTINNDITLLKLSTAASFSQTVSAVCLPSASDDFAAGTTCVTTGWGLTRYANTPDRLQQASLPLLSNTNCKKYWGTKIKDAMICAGASGVSSCMGDSGGPLVCKKNGAWTLVGIVSWGSSTCSTSTPGVYARVTALVNWVQQTLAAN
Mapping is {'1': '1', '2': '2', '3': '3', '4': '4', '5': '5', '6': '6', '7': '7', '8': '8', '9': '9', '10': '10', '11': '11', '12': '12', '13': '13', '16': '14', '17': '15', '18': '16', '19': '17', '20': '18', '21': '19', '22': '20', '23': '21', '24': '22', '25': '23', '26': '24', '27': '25', '28': '26', '29': '27', '30': '28', '31': '29', '32': '30', '33': '31', '34': '32', '35': '33', '36': '34', '37': '35', '38': '36', '39': '37', '40': '38', '41': '39', '42': '40', '43': '41', '44': '42', '45': '43', '46': '44', '47': '45', '48': '46', '49': '47', '50': '48', '51': '49', '52': '50', '53': '51', '54': '52', '55': '53', '56': '54', '57': '55', '58': '56', '59': '57', '60': '58', '61': '

  1%|█▊                                                                                                                                                              | 60/5444 [00:11<08:42, 10.31it/s]

Error for 1ACB:I:F10W expected V at position 54 
Sequence is CGVPAIQPVLSGLIVNGEEAVPGSWPWQVSLQDKTGFHFCGGSLINENWVVTAAHCGVTTSDVVVAGEFDQGSSSEKIQKLKIAKVFKNSKYNSLTINNDITLLKLSTAASFSQTVSAVCLPSASDDFAAGTTCVTTGWGLTRYANTPDRLQQASLPLLSNTNCKKYWGTKIKDAMICAGASGVSSCMGDSGGPLVCKKNGAWTLVGIVSWGSSTCSTSTPGVYARVTALVNWVQQTLAAN
Mapping is {'1': '1', '2': '2', '3': '3', '4': '4', '5': '5', '6': '6', '7': '7', '8': '8', '9': '9', '10': '10', '11': '11', '12': '12', '13': '13', '16': '14', '17': '15', '18': '16', '19': '17', '20': '18', '21': '19', '22': '20', '23': '21', '24': '22', '25': '23', '26': '24', '27': '25', '28': '26', '29': '27', '30': '28', '31': '29', '32': '30', '33': '31', '34': '32', '35': '33', '36': '34', '37': '35', '38': '36', '39': '37', '40': '38', '41': '39', '42': '40', '43': '41', '44': '42', '45': '43', '46': '44', '47': '45', '48': '46', '49': '47', '50': '48', '51': '49', '52': '50', '53': '51', '54': '52', '55': '53', '56': '54', '57': '55', '58': '56', '59': '57', '60': '58', '61': '

  1%|█▊                                                                                                                                                              | 62/5444 [00:11<07:57, 11.27it/s]

Indexing error for 1ACB:I:F10W position 14 not present in mapping {'1': '1', '2': '2', '3': '3', '4': '4', '5': '5', '6': '6', '7': '7', '8': '8', '9': '9', '10': '10', '11': '11', '12': '12', '13': '13', '16': '14', '17': '15', '18': '16', '19': '17', '20': '18', '21': '19', '22': '20', '23': '21', '24': '22', '25': '23', '26': '24', '27': '25', '28': '26', '29': '27', '30': '28', '31': '29', '32': '30', '33': '31', '34': '32', '35': '33', '36': '34', '37': '35', '38': '36', '39': '37', '40': '38', '41': '39', '42': '40', '43': '41', '44': '42', '45': '43', '46': '44', '47': '45', '48': '46', '49': '47', '50': '48', '51': '49', '52': '50', '53': '51', '54': '52', '55': '53', '56': '54', '57': '55', '58': '56', '59': '57', '60': '58', '61': '59', '62': '60', '63': '61', '64': '62', '65': '63', '66': '64', '67': '65', '68': '66', '69': '67', '70': '68', '71': '69', '72': '70', '73': '71', '74': '72', '75': '73', '76': '74', '77': '75', '78': '76', '79': '77', '80': '78', '81': '79', '82





  5%|████████▍                                                                                                                                                      | 288/5444 [01:22<04:26, 19.36it/s]

Indexing error for 1AZP position 30 not present in mapping {}




Indexing error for 1BF4:Y34W position 45 not present in mapping {}
Indexing error for 1BF4:Y34W position 51 not present in mapping {}




Indexing error for 1BF4:Y34W position 35 not present in mapping {}
Indexing error for 1BF4:Y34W position 12 not present in mapping {}
Indexing error for 1BF4:Y34W position 32 not present in mapping {}
Indexing error for 1BF4:Y34W position 27 not present in mapping {}
Indexing error for 1BF4:Y34W position 37 not present in mapping {}




Indexing error for 1BF4:Y34W position 17 not present in mapping {}
Indexing error for 1BF4:Y34W position 30 not present in mapping {}
Indexing error for 1BF4:Y34W position 7 not present in mapping {}
Indexing error for 1BF4:Y34W position 55 not present in mapping {}


  6%|████████▉                                                                                                                                                      | 307/5444 [01:28<09:34,  8.94it/s]

Indexing error for 1BF4:Y34W position 56 not present in mapping {}
Indexing error for 1BF4:Y34W position 59 not present in mapping {}
Indexing error for 1BF4:Y34W position 57 not present in mapping {}
Indexing error for 1BF4:Y34W position 25 not present in mapping {}




Indexing error for 1BF4:Y34W position 43 not present in mapping {}
Indexing error for 1BF4:Y34W position 18 not present in mapping {}
Indexing error for 1BF4:Y34W position 31 not present in mapping {}
Indexing error for 1BF4:Y34W position 47 not present in mapping {}
Indexing error for 1BF4:Y34W position 41 not present in mapping {}
Indexing error for 1BF4:Y34W position 15 not present in mapping {}




Indexing error for 1BF4:Y34W position 23 not present in mapping {}
Indexing error for 1BF4:Y34W position 46 not present in mapping {}
Indexing error for 1BF4:Y34W position 4 not present in mapping {}












































































Error for 1JIW:I expected A at position 8 
Sequence is GRSDAYTQVDNFLHAYARGGDELVNGHPSYTVDQAAEQILREQASWQKAPGDSVLTLSYSFLTKPNDFFNTPWKYVSDIYSLGKFSAFSAQQQAQAKLSLQSWSDVTNIHFVDAGQGDQGDLTFGNFSSSVGGAAFAFLPDVPDALKGQSWYLINSSYSANVNPANGNYGRQTLTHEIGHTLGLSHPGDYNAGEGDPTYADATYAEDTRAYSVMSYWEEQNTGQDFKGAYSSAPLLDDIAAIQKLYGANLTTRTGDTVYGFNSNTERDFYSATSSSSKLVFSVWDAGGNDTLDFSGFSQNQKINLNEKALSDVGGLKGNVSIAAGVTVENAIGGSGSDLLIGNDVANVLKGGAGNDILYGGLGADQLWGGAGADTFVYGDIAESSAAAPDTLRDFVSGQDKIDLSGLDAFVNGGLVLQYVDAFAGKAGQAILSYDAASKAGSLAIDFSGDAHADFAINLIGQATQADIVV
Mapping is {'1': '1', '2': '2', '3': '3', '4': '4', '5': '5', '6': '6', '7': '7', '8': '8', '9': '9', '10': '10', '11': '11', '12': '12', '13': '13', '14': '14', '15': '15', '16': '16', '17': '17', '18': '18', '19': '19', '20': '20', '21': '21', '22': '22', '23': '23', '24': '24', '25': '25', '26': '26', '27': '27', '28': '28', '29': '29', '30': '30', '31': '31', '32': '32', '33': '33', '34': '34', '35': '35', '36': '36', '37': '37', '38': '38', '39': '39', '40': '40', 



Error for 1JIW:I expected W at position 15 
Sequence is GRSDAYTQVDNFLHAYARGGDELVNGHPSYTVDQAAEQILREQASWQKAPGDSVLTLSYSFLTKPNDFFNTPWKYVSDIYSLGKFSAFSAQQQAQAKLSLQSWSDVTNIHFVDAGQGDQGDLTFGNFSSSVGGAAFAFLPDVPDALKGQSWYLINSSYSANVNPANGNYGRQTLTHEIGHTLGLSHPGDYNAGEGDPTYADATYAEDTRAYSVMSYWEEQNTGQDFKGAYSSAPLLDDIAAIQKLYGANLTTRTGDTVYGFNSNTERDFYSATSSSSKLVFSVWDAGGNDTLDFSGFSQNQKINLNEKALSDVGGLKGNVSIAAGVTVENAIGGSGSDLLIGNDVANVLKGGAGNDILYGGLGADQLWGGAGADTFVYGDIAESSAAAPDTLRDFVSGQDKIDLSGLDAFVNGGLVLQYVDAFAGKAGQAILSYDAASKAGSLAIDFSGDAHADFAINLIGQATQADIVV
Mapping is {'1': '1', '2': '2', '3': '3', '4': '4', '5': '5', '6': '6', '7': '7', '8': '8', '9': '9', '10': '10', '11': '11', '12': '12', '13': '13', '14': '14', '15': '15', '16': '16', '17': '17', '18': '18', '19': '19', '20': '20', '21': '21', '22': '22', '23': '23', '24': '24', '25': '25', '26': '26', '27': '27', '28': '28', '29': '29', '30': '30', '31': '31', '32': '32', '33': '33', '34': '34', '35': '35', '36': '36', '37': '37', '38': '38', '39': '39', '40': '40',







Indexing error for 1LMB position 36 not present in mapping {}
Indexing error for 1LMB position 40 not present in mapping {}
Indexing error for 1LMB position 36 not present in mapping {}




Indexing error for 1LMB position 40 not present in mapping {}
Indexing error for 1LMB position 78 not present in mapping {}
Indexing error for 1LMB position 46 not present in mapping {}


 27%|██████████████████████████████████████████▎                                                                                                                   | 1460/5444 [05:14<07:11,  9.23it/s]

Indexing error for 1LMB position 48 not present in mapping {}
Indexing error for 1LMB position 33 not present in mapping {}
Indexing error for 1LMB position 44 not present in mapping {}




Indexing error for 1LMB position 49 not present in mapping {}
Indexing error for 1LMB position 22 not present in mapping {}
Indexing error for 1LMB position 66 not present in mapping {}




Indexing error for 1LMB position 84 not present in mapping {}
Indexing error for 1LMB position 48 not present in mapping {}
Indexing error for 1LMB position 48 not present in mapping {}




Indexing error for 1LMB position 88 not present in mapping {}










Error for 1NFI_67-206 expected V at position 93 
Sequence is YVEIIEQPKQRGMRFRYKCEGRSAGSIPGERSTDTTKTHPTIKINGYTGPGTVRISLVTKDPPHRPHPHELVGKDCRDGFYEAELCPDRCIHSFQNLGIQCVKKRDLEQAISQRIQTNNNPFQVPIEEQRGDYDLNAVRLCFQVTVRDPSGRPLRLPPVLPHPIFDNRAPNTAELKICRVNRNSGSCLGGDEIFLLCDKVQKEDIEVYFTGPGWEARGSFSQADVHRQVAIVFRTPPYADPSLQAPVRVSMQLRRPSDRELSEPMEFQYLPDTDDRHRIEEKRKRTYETFKSIMK
Mapping is {'20': '1', '21': '2', '22': '3', '23': '4', '24': '5', '25': '6', '26': '7', '27': '8', '28': '9', '29': '10', '30': '11', '31': '12', '32': '13', '33': '14', '34': '15', '35': '16', '36': '17', '37': '18', '38': '19', '39': '20', '40': '21', '41': '22', '42': '23', '43': '24', '44': '25', '45': '26', '46': '27', '47': '28', '48': '29', '49': '30', '50': '31', '51': '32', '52': '33', '53': '34', '54': '35', '55': '36', '56': '37', '57': '38', '58': '39', '59': '40', '60': '41', '61': '42', '62': '43', '63': '44', '64': '45', '65': '46', '66': '47', '67': '48', '68': '49', '69': '50', '70': '51', '71': '52', '72': '53', '73'

 31%|█████████████████████████████████████████████████▎                                                                                                            | 1697/5444 [05:59<52:39,  1.19it/s]

Error for 1NFI_67-206 expected Q at position 111 
Sequence is YVEIIEQPKQRGMRFRYKCEGRSAGSIPGERSTDTTKTHPTIKINGYTGPGTVRISLVTKDPPHRPHPHELVGKDCRDGFYEAELCPDRCIHSFQNLGIQCVKKRDLEQAISQRIQTNNNPFQVPIEEQRGDYDLNAVRLCFQVTVRDPSGRPLRLPPVLPHPIFDNRAPNTAELKICRVNRNSGSCLGGDEIFLLCDKVQKEDIEVYFTGPGWEARGSFSQADVHRQVAIVFRTPPYADPSLQAPVRVSMQLRRPSDRELSEPMEFQYLPDTDDRHRIEEKRKRTYETFKSIMK
Mapping is {'20': '1', '21': '2', '22': '3', '23': '4', '24': '5', '25': '6', '26': '7', '27': '8', '28': '9', '29': '10', '30': '11', '31': '12', '32': '13', '33': '14', '34': '15', '35': '16', '36': '17', '37': '18', '38': '19', '39': '20', '40': '21', '41': '22', '42': '23', '43': '24', '44': '25', '45': '26', '46': '27', '47': '28', '48': '29', '49': '30', '50': '31', '51': '32', '52': '33', '53': '34', '54': '35', '55': '36', '56': '37', '57': '38', '58': '39', '59': '40', '60': '41', '61': '42', '62': '43', '63': '44', '64': '45', '65': '46', '66': '47', '67': '48', '68': '49', '69': '50', '70': '51', '71': '52', '72': '53', '73

 31%|█████████████████████████████████████████████████▎                                                                                                            | 1698/5444 [05:59<42:18,  1.48it/s]

Error for 1NFI_67-206 expected T at position 113 
Sequence is YVEIIEQPKQRGMRFRYKCEGRSAGSIPGERSTDTTKTHPTIKINGYTGPGTVRISLVTKDPPHRPHPHELVGKDCRDGFYEAELCPDRCIHSFQNLGIQCVKKRDLEQAISQRIQTNNNPFQVPIEEQRGDYDLNAVRLCFQVTVRDPSGRPLRLPPVLPHPIFDNRAPNTAELKICRVNRNSGSCLGGDEIFLLCDKVQKEDIEVYFTGPGWEARGSFSQADVHRQVAIVFRTPPYADPSLQAPVRVSMQLRRPSDRELSEPMEFQYLPDTDDRHRIEEKRKRTYETFKSIMK
Mapping is {'20': '1', '21': '2', '22': '3', '23': '4', '24': '5', '25': '6', '26': '7', '27': '8', '28': '9', '29': '10', '30': '11', '31': '12', '32': '13', '33': '14', '34': '15', '35': '16', '36': '17', '37': '18', '38': '19', '39': '20', '40': '21', '41': '22', '42': '23', '43': '24', '44': '25', '45': '26', '46': '27', '47': '28', '48': '29', '49': '30', '50': '31', '51': '32', '52': '33', '53': '34', '54': '35', '55': '36', '56': '37', '57': '38', '58': '39', '59': '40', '60': '41', '61': '42', '62': '43', '63': '44', '64': '45', '65': '46', '66': '47', '67': '48', '68': '49', '69': '50', '70': '51', '71': '52', '72': '53', '73

 31%|█████████████████████████████████████████████████▎                                                                                                            | 1699/5444 [05:59<35:01,  1.78it/s]

Error for 1NFI_67-206 expected L at position 117 
Sequence is YVEIIEQPKQRGMRFRYKCEGRSAGSIPGERSTDTTKTHPTIKINGYTGPGTVRISLVTKDPPHRPHPHELVGKDCRDGFYEAELCPDRCIHSFQNLGIQCVKKRDLEQAISQRIQTNNNPFQVPIEEQRGDYDLNAVRLCFQVTVRDPSGRPLRLPPVLPHPIFDNRAPNTAELKICRVNRNSGSCLGGDEIFLLCDKVQKEDIEVYFTGPGWEARGSFSQADVHRQVAIVFRTPPYADPSLQAPVRVSMQLRRPSDRELSEPMEFQYLPDTDDRHRIEEKRKRTYETFKSIMK
Mapping is {'20': '1', '21': '2', '22': '3', '23': '4', '24': '5', '25': '6', '26': '7', '27': '8', '28': '9', '29': '10', '30': '11', '31': '12', '32': '13', '33': '14', '34': '15', '35': '16', '36': '17', '37': '18', '38': '19', '39': '20', '40': '21', '41': '22', '42': '23', '43': '24', '44': '25', '45': '26', '46': '27', '47': '28', '48': '29', '49': '30', '50': '31', '51': '32', '52': '33', '53': '34', '54': '35', '55': '36', '56': '37', '57': '38', '58': '39', '59': '40', '60': '41', '61': '42', '62': '43', '63': '44', '64': '45', '65': '46', '66': '47', '67': '48', '68': '49', '69': '50', '70': '51', '71': '52', '72': '53', '73

 31%|█████████████████████████████████████████████████▎                                                                                                            | 1700/5444 [05:59<30:10,  2.07it/s]

Error for 1NFI_67-206 expected N at position 122 
Sequence is YVEIIEQPKQRGMRFRYKCEGRSAGSIPGERSTDTTKTHPTIKINGYTGPGTVRISLVTKDPPHRPHPHELVGKDCRDGFYEAELCPDRCIHSFQNLGIQCVKKRDLEQAISQRIQTNNNPFQVPIEEQRGDYDLNAVRLCFQVTVRDPSGRPLRLPPVLPHPIFDNRAPNTAELKICRVNRNSGSCLGGDEIFLLCDKVQKEDIEVYFTGPGWEARGSFSQADVHRQVAIVFRTPPYADPSLQAPVRVSMQLRRPSDRELSEPMEFQYLPDTDDRHRIEEKRKRTYETFKSIMK
Mapping is {'20': '1', '21': '2', '22': '3', '23': '4', '24': '5', '25': '6', '26': '7', '27': '8', '28': '9', '29': '10', '30': '11', '31': '12', '32': '13', '33': '14', '34': '15', '35': '16', '36': '17', '37': '18', '38': '19', '39': '20', '40': '21', '41': '22', '42': '23', '43': '24', '44': '25', '45': '26', '46': '27', '47': '28', '48': '29', '49': '30', '50': '31', '51': '32', '52': '33', '53': '34', '54': '35', '55': '36', '56': '37', '57': '38', '58': '39', '59': '40', '60': '41', '61': '42', '62': '43', '63': '44', '64': '45', '65': '46', '66': '47', '67': '48', '68': '49', '69': '50', '70': '51', '71': '52', '72': '53', '73

 31%|█████████████████████████████████████████████████▎                                                                                                            | 1701/5444 [06:00<26:40,  2.34it/s]

Error for 1NFI_67-206 expected A at position 127 
Sequence is YVEIIEQPKQRGMRFRYKCEGRSAGSIPGERSTDTTKTHPTIKINGYTGPGTVRISLVTKDPPHRPHPHELVGKDCRDGFYEAELCPDRCIHSFQNLGIQCVKKRDLEQAISQRIQTNNNPFQVPIEEQRGDYDLNAVRLCFQVTVRDPSGRPLRLPPVLPHPIFDNRAPNTAELKICRVNRNSGSCLGGDEIFLLCDKVQKEDIEVYFTGPGWEARGSFSQADVHRQVAIVFRTPPYADPSLQAPVRVSMQLRRPSDRELSEPMEFQYLPDTDDRHRIEEKRKRTYETFKSIMK
Mapping is {'20': '1', '21': '2', '22': '3', '23': '4', '24': '5', '25': '6', '26': '7', '27': '8', '28': '9', '29': '10', '30': '11', '31': '12', '32': '13', '33': '14', '34': '15', '35': '16', '36': '17', '37': '18', '38': '19', '39': '20', '40': '21', '41': '22', '42': '23', '43': '24', '44': '25', '45': '26', '46': '27', '47': '28', '48': '29', '49': '30', '50': '31', '51': '32', '52': '33', '53': '34', '54': '35', '55': '36', '56': '37', '57': '38', '58': '39', '59': '40', '60': '41', '61': '42', '62': '43', '63': '44', '64': '45', '65': '46', '66': '47', '67': '48', '68': '49', '69': '50', '70': '51', '71': '52', '72': '53', '73

 31%|█████████████████████████████████████████████████▍                                                                                                            | 1702/5444 [06:00<24:08,  2.58it/s]

Error for 1NFI_67-206 expected L at position 131 
Sequence is YVEIIEQPKQRGMRFRYKCEGRSAGSIPGERSTDTTKTHPTIKINGYTGPGTVRISLVTKDPPHRPHPHELVGKDCRDGFYEAELCPDRCIHSFQNLGIQCVKKRDLEQAISQRIQTNNNPFQVPIEEQRGDYDLNAVRLCFQVTVRDPSGRPLRLPPVLPHPIFDNRAPNTAELKICRVNRNSGSCLGGDEIFLLCDKVQKEDIEVYFTGPGWEARGSFSQADVHRQVAIVFRTPPYADPSLQAPVRVSMQLRRPSDRELSEPMEFQYLPDTDDRHRIEEKRKRTYETFKSIMK
Mapping is {'20': '1', '21': '2', '22': '3', '23': '4', '24': '5', '25': '6', '26': '7', '27': '8', '28': '9', '29': '10', '30': '11', '31': '12', '32': '13', '33': '14', '34': '15', '35': '16', '36': '17', '37': '18', '38': '19', '39': '20', '40': '21', '41': '22', '42': '23', '43': '24', '44': '25', '45': '26', '46': '27', '47': '28', '48': '29', '49': '30', '50': '31', '51': '32', '52': '33', '53': '34', '54': '35', '55': '36', '56': '37', '57': '38', '58': '39', '59': '40', '60': '41', '61': '42', '62': '43', '63': '44', '64': '45', '65': '46', '66': '47', '67': '48', '68': '49', '69': '50', '70': '51', '71': '52', '72': '53', '73

 31%|█████████████████████████████████████████████████▍                                                                                                            | 1703/5444 [06:00<22:30,  2.77it/s]

Error for 1NFI_67-206 expected T at position 146 
Sequence is YVEIIEQPKQRGMRFRYKCEGRSAGSIPGERSTDTTKTHPTIKINGYTGPGTVRISLVTKDPPHRPHPHELVGKDCRDGFYEAELCPDRCIHSFQNLGIQCVKKRDLEQAISQRIQTNNNPFQVPIEEQRGDYDLNAVRLCFQVTVRDPSGRPLRLPPVLPHPIFDNRAPNTAELKICRVNRNSGSCLGGDEIFLLCDKVQKEDIEVYFTGPGWEARGSFSQADVHRQVAIVFRTPPYADPSLQAPVRVSMQLRRPSDRELSEPMEFQYLPDTDDRHRIEEKRKRTYETFKSIMK
Mapping is {'20': '1', '21': '2', '22': '3', '23': '4', '24': '5', '25': '6', '26': '7', '27': '8', '28': '9', '29': '10', '30': '11', '31': '12', '32': '13', '33': '14', '34': '15', '35': '16', '36': '17', '37': '18', '38': '19', '39': '20', '40': '21', '41': '22', '42': '23', '43': '24', '44': '25', '45': '26', '46': '27', '47': '28', '48': '29', '49': '30', '50': '31', '51': '32', '52': '33', '53': '34', '54': '35', '55': '36', '56': '37', '57': '38', '58': '39', '59': '40', '60': '41', '61': '42', '62': '43', '63': '44', '64': '45', '65': '46', '66': '47', '67': '48', '68': '49', '69': '50', '70': '51', '71': '52', '72': '53', '73

 31%|█████████████████████████████████████████████████▍                                                                                                            | 1704/5444 [06:01<21:25,  2.91it/s]

Error for 1NFI_67-206 expected V at position 160 
Sequence is YVEIIEQPKQRGMRFRYKCEGRSAGSIPGERSTDTTKTHPTIKINGYTGPGTVRISLVTKDPPHRPHPHELVGKDCRDGFYEAELCPDRCIHSFQNLGIQCVKKRDLEQAISQRIQTNNNPFQVPIEEQRGDYDLNAVRLCFQVTVRDPSGRPLRLPPVLPHPIFDNRAPNTAELKICRVNRNSGSCLGGDEIFLLCDKVQKEDIEVYFTGPGWEARGSFSQADVHRQVAIVFRTPPYADPSLQAPVRVSMQLRRPSDRELSEPMEFQYLPDTDDRHRIEEKRKRTYETFKSIMK
Mapping is {'20': '1', '21': '2', '22': '3', '23': '4', '24': '5', '25': '6', '26': '7', '27': '8', '28': '9', '29': '10', '30': '11', '31': '12', '32': '13', '33': '14', '34': '15', '35': '16', '36': '17', '37': '18', '38': '19', '39': '20', '40': '21', '41': '22', '42': '23', '43': '24', '44': '25', '45': '26', '46': '27', '47': '28', '48': '29', '49': '30', '50': '31', '51': '32', '52': '33', '53': '34', '54': '35', '55': '36', '56': '37', '57': '38', '58': '39', '59': '40', '60': '41', '61': '42', '62': '43', '63': '44', '64': '45', '65': '46', '66': '47', '67': '48', '68': '49', '69': '50', '70': '51', '71': '52', '72': '53', '73

 31%|█████████████████████████████████████████████████▍                                                                                                            | 1705/5444 [06:01<20:36,  3.02it/s]

Error for 1NFI_67-206 expected L at position 163 
Sequence is YVEIIEQPKQRGMRFRYKCEGRSAGSIPGERSTDTTKTHPTIKINGYTGPGTVRISLVTKDPPHRPHPHELVGKDCRDGFYEAELCPDRCIHSFQNLGIQCVKKRDLEQAISQRIQTNNNPFQVPIEEQRGDYDLNAVRLCFQVTVRDPSGRPLRLPPVLPHPIFDNRAPNTAELKICRVNRNSGSCLGGDEIFLLCDKVQKEDIEVYFTGPGWEARGSFSQADVHRQVAIVFRTPPYADPSLQAPVRVSMQLRRPSDRELSEPMEFQYLPDTDDRHRIEEKRKRTYETFKSIMK
Mapping is {'20': '1', '21': '2', '22': '3', '23': '4', '24': '5', '25': '6', '26': '7', '27': '8', '28': '9', '29': '10', '30': '11', '31': '12', '32': '13', '33': '14', '34': '15', '35': '16', '36': '17', '37': '18', '38': '19', '39': '20', '40': '21', '41': '22', '42': '23', '43': '24', '44': '25', '45': '26', '46': '27', '47': '28', '48': '29', '49': '30', '50': '31', '51': '32', '52': '33', '53': '34', '54': '35', '55': '36', '56': '37', '57': '38', '58': '39', '59': '40', '60': '41', '61': '42', '62': '43', '63': '44', '64': '45', '65': '46', '66': '47', '67': '48', '68': '49', '69': '50', '70': '51', '71': '52', '72': '53', '73

 31%|█████████████████████████████████████████████████▌                                                                                                            | 1707/5444 [06:02<19:43,  3.16it/s]

Error for 1NFI_67-206 expected T at position 185 
Sequence is YVEIIEQPKQRGMRFRYKCEGRSAGSIPGERSTDTTKTHPTIKINGYTGPGTVRISLVTKDPPHRPHPHELVGKDCRDGFYEAELCPDRCIHSFQNLGIQCVKKRDLEQAISQRIQTNNNPFQVPIEEQRGDYDLNAVRLCFQVTVRDPSGRPLRLPPVLPHPIFDNRAPNTAELKICRVNRNSGSCLGGDEIFLLCDKVQKEDIEVYFTGPGWEARGSFSQADVHRQVAIVFRTPPYADPSLQAPVRVSMQLRRPSDRELSEPMEFQYLPDTDDRHRIEEKRKRTYETFKSIMK
Mapping is {'20': '1', '21': '2', '22': '3', '23': '4', '24': '5', '25': '6', '26': '7', '27': '8', '28': '9', '29': '10', '30': '11', '31': '12', '32': '13', '33': '14', '34': '15', '35': '16', '36': '17', '37': '18', '38': '19', '39': '20', '40': '21', '41': '22', '42': '23', '43': '24', '44': '25', '45': '26', '46': '27', '47': '28', '48': '29', '49': '30', '50': '31', '51': '32', '52': '33', '53': '34', '54': '35', '55': '36', '56': '37', '57': '38', '58': '39', '59': '40', '60': '41', '61': '42', '62': '43', '63': '44', '64': '45', '65': '46', '66': '47', '67': '48', '68': '49', '69': '50', '70': '51', '71': '52', '72': '53', '73

 31%|█████████████████████████████████████████████████▌                                                                                                            | 1708/5444 [06:02<19:16,  3.23it/s]

Error for 1NFI_67-206 expected C at position 186 
Sequence is YVEIIEQPKQRGMRFRYKCEGRSAGSIPGERSTDTTKTHPTIKINGYTGPGTVRISLVTKDPPHRPHPHELVGKDCRDGFYEAELCPDRCIHSFQNLGIQCVKKRDLEQAISQRIQTNNNPFQVPIEEQRGDYDLNAVRLCFQVTVRDPSGRPLRLPPVLPHPIFDNRAPNTAELKICRVNRNSGSCLGGDEIFLLCDKVQKEDIEVYFTGPGWEARGSFSQADVHRQVAIVFRTPPYADPSLQAPVRVSMQLRRPSDRELSEPMEFQYLPDTDDRHRIEEKRKRTYETFKSIMK
Mapping is {'20': '1', '21': '2', '22': '3', '23': '4', '24': '5', '25': '6', '26': '7', '27': '8', '28': '9', '29': '10', '30': '11', '31': '12', '32': '13', '33': '14', '34': '15', '35': '16', '36': '17', '37': '18', '38': '19', '39': '20', '40': '21', '41': '22', '42': '23', '43': '24', '44': '25', '45': '26', '46': '27', '47': '28', '48': '29', '49': '30', '50': '31', '51': '32', '52': '33', '53': '34', '54': '35', '55': '36', '56': '37', '57': '38', '58': '39', '59': '40', '60': '41', '61': '42', '62': '43', '63': '44', '64': '45', '65': '46', '66': '47', '67': '48', '68': '49', '69': '50', '70': '51', '71': '52', '72': '53', '73

 31%|█████████████████████████████████████████████████▌                                                                                                            | 1709/5444 [06:02<19:03,  3.27it/s]

Error for 1NFI_67-206 expected G at position 194 
Sequence is YVEIIEQPKQRGMRFRYKCEGRSAGSIPGERSTDTTKTHPTIKINGYTGPGTVRISLVTKDPPHRPHPHELVGKDCRDGFYEAELCPDRCIHSFQNLGIQCVKKRDLEQAISQRIQTNNNPFQVPIEEQRGDYDLNAVRLCFQVTVRDPSGRPLRLPPVLPHPIFDNRAPNTAELKICRVNRNSGSCLGGDEIFLLCDKVQKEDIEVYFTGPGWEARGSFSQADVHRQVAIVFRTPPYADPSLQAPVRVSMQLRRPSDRELSEPMEFQYLPDTDDRHRIEEKRKRTYETFKSIMK
Mapping is {'20': '1', '21': '2', '22': '3', '23': '4', '24': '5', '25': '6', '26': '7', '27': '8', '28': '9', '29': '10', '30': '11', '31': '12', '32': '13', '33': '14', '34': '15', '35': '16', '36': '17', '37': '18', '38': '19', '39': '20', '40': '21', '41': '22', '42': '23', '43': '24', '44': '25', '45': '26', '46': '27', '47': '28', '48': '29', '49': '30', '50': '31', '51': '32', '52': '33', '53': '34', '54': '35', '55': '36', '56': '37', '57': '38', '58': '39', '59': '40', '60': '41', '61': '42', '62': '43', '63': '44', '64': '45', '65': '46', '66': '47', '67': '48', '68': '49', '69': '50', '70': '51', '71': '52', '72': '53', '73

 31%|█████████████████████████████████████████████████▋                                                                                                            | 1710/5444 [06:03<20:36,  3.02it/s]

Error for 1NFI_67-206 expected V at position 203 
Sequence is YVEIIEQPKQRGMRFRYKCEGRSAGSIPGERSTDTTKTHPTIKINGYTGPGTVRISLVTKDPPHRPHPHELVGKDCRDGFYEAELCPDRCIHSFQNLGIQCVKKRDLEQAISQRIQTNNNPFQVPIEEQRGDYDLNAVRLCFQVTVRDPSGRPLRLPPVLPHPIFDNRAPNTAELKICRVNRNSGSCLGGDEIFLLCDKVQKEDIEVYFTGPGWEARGSFSQADVHRQVAIVFRTPPYADPSLQAPVRVSMQLRRPSDRELSEPMEFQYLPDTDDRHRIEEKRKRTYETFKSIMK
Mapping is {'20': '1', '21': '2', '22': '3', '23': '4', '24': '5', '25': '6', '26': '7', '27': '8', '28': '9', '29': '10', '30': '11', '31': '12', '32': '13', '33': '14', '34': '15', '35': '16', '36': '17', '37': '18', '38': '19', '39': '20', '40': '21', '41': '22', '42': '23', '43': '24', '44': '25', '45': '26', '46': '27', '47': '28', '48': '29', '49': '30', '50': '31', '51': '32', '52': '33', '53': '34', '54': '35', '55': '36', '56': '37', '57': '38', '58': '39', '59': '40', '60': '41', '61': '42', '62': '43', '63': '44', '64': '45', '65': '46', '66': '47', '67': '48', '68': '49', '69': '50', '70': '51', '71': '52', '72': '53', '73

 31%|█████████████████████████████████████████████████▋                                                                                                            | 1711/5444 [06:03<20:06,  3.09it/s]

Error for 1NFI_67-287 expected W at position 258 
Sequence is YVEIIEQPKQRGMRFRYKCEGRSAGSIPGERSTDTTKTHPTIKINGYTGPGTVRISLVTKDPPHRPHPHELVGKDCRDGFYEAELCPDRCIHSFQNLGIQCVKKRDLEQAISQRIQTNNNPFQVPIEEQRGDYDLNAVRLCFQVTVRDPSGRPLRLPPVLPHPIFDNRAPNTAELKICRVNRNSGSCLGGDEIFLLCDKVQKEDIEVYFTGPGWEARGSFSQADVHRQVAIVFRTPPYADPSLQAPVRVSMQLRRPSDRELSEPMEFQYLPDTDDRHRIEEKRKRTYETFKSIMK
Mapping is {'20': '1', '21': '2', '22': '3', '23': '4', '24': '5', '25': '6', '26': '7', '27': '8', '28': '9', '29': '10', '30': '11', '31': '12', '32': '13', '33': '14', '34': '15', '35': '16', '36': '17', '37': '18', '38': '19', '39': '20', '40': '21', '41': '22', '42': '23', '43': '24', '44': '25', '45': '26', '46': '27', '47': '28', '48': '29', '49': '30', '50': '31', '51': '32', '52': '33', '53': '34', '54': '35', '55': '36', '56': '37', '57': '38', '58': '39', '59': '40', '60': '41', '61': '42', '62': '43', '63': '44', '64': '45', '65': '46', '66': '47', '67': '48', '68': '49', '69': '50', '70': '51', '71': '52', '72': '53', '73

 31%|█████████████████████████████████████████████████▋                                                                                                            | 1712/5444 [06:03<19:43,  3.15it/s]

Error for 1NFI_67-287 expected Q at position 111 
Sequence is YVEIIEQPKQRGMRFRYKCEGRSAGSIPGERSTDTTKTHPTIKINGYTGPGTVRISLVTKDPPHRPHPHELVGKDCRDGFYEAELCPDRCIHSFQNLGIQCVKKRDLEQAISQRIQTNNNPFQVPIEEQRGDYDLNAVRLCFQVTVRDPSGRPLRLPPVLPHPIFDNRAPNTAELKICRVNRNSGSCLGGDEIFLLCDKVQKEDIEVYFTGPGWEARGSFSQADVHRQVAIVFRTPPYADPSLQAPVRVSMQLRRPSDRELSEPMEFQYLPDTDDRHRIEEKRKRTYETFKSIMK
Mapping is {'20': '1', '21': '2', '22': '3', '23': '4', '24': '5', '25': '6', '26': '7', '27': '8', '28': '9', '29': '10', '30': '11', '31': '12', '32': '13', '33': '14', '34': '15', '35': '16', '36': '17', '37': '18', '38': '19', '39': '20', '40': '21', '41': '22', '42': '23', '43': '24', '44': '25', '45': '26', '46': '27', '47': '28', '48': '29', '49': '30', '50': '31', '51': '32', '52': '33', '53': '34', '54': '35', '55': '36', '56': '37', '57': '38', '58': '39', '59': '40', '60': '41', '61': '42', '62': '43', '63': '44', '64': '45', '65': '46', '66': '47', '67': '48', '68': '49', '69': '50', '70': '51', '71': '52', '72': '53', '73

 31%|█████████████████████████████████████████████████▋                                                                                                            | 1714/5444 [06:04<19:18,  3.22it/s]

Error for 1NFI_67-287 expected V at position 203 
Sequence is YVEIIEQPKQRGMRFRYKCEGRSAGSIPGERSTDTTKTHPTIKINGYTGPGTVRISLVTKDPPHRPHPHELVGKDCRDGFYEAELCPDRCIHSFQNLGIQCVKKRDLEQAISQRIQTNNNPFQVPIEEQRGDYDLNAVRLCFQVTVRDPSGRPLRLPPVLPHPIFDNRAPNTAELKICRVNRNSGSCLGGDEIFLLCDKVQKEDIEVYFTGPGWEARGSFSQADVHRQVAIVFRTPPYADPSLQAPVRVSMQLRRPSDRELSEPMEFQYLPDTDDRHRIEEKRKRTYETFKSIMK
Mapping is {'20': '1', '21': '2', '22': '3', '23': '4', '24': '5', '25': '6', '26': '7', '27': '8', '28': '9', '29': '10', '30': '11', '31': '12', '32': '13', '33': '14', '34': '15', '35': '16', '36': '17', '37': '18', '38': '19', '39': '20', '40': '21', '41': '22', '42': '23', '43': '24', '44': '25', '45': '26', '46': '27', '47': '28', '48': '29', '49': '30', '50': '31', '51': '32', '52': '33', '53': '34', '54': '35', '55': '36', '56': '37', '57': '38', '58': '39', '59': '40', '60': '41', '61': '42', '62': '43', '63': '44', '64': '45', '65': '46', '66': '47', '67': '48', '68': '49', '69': '50', '70': '51', '71': '52', '72': '53', '73



 32%|███████████████████████████████████████████████████                                                                                                           | 1759/5444 [06:12<11:15,  5.46it/s]

Indexing error for 1OTR position 5 not present in mapping {'6': '1', '7': '2', '8': '3', '9': '4', '10': '5', '11': '6', '12': '7', '13': '8', '14': '9', '15': '10', '16': '11', '17': '12', '18': '13', '19': '14', '20': '15', '21': '16', '22': '17', '23': '18', '24': '19', '25': '20', '26': '21', '27': '22', '28': '23', '29': '24', '30': '25', '31': '26', '32': '27', '33': '28', '34': '29', '35': '30', '36': '31', '37': '32', '38': '33', '39': '34', '40': '35', '41': '36', '42': '37', '43': '38', '44': '39', '45': '40', '46': '41', '47': '42', '48': '43', '49': '44', '50': '45', '51': '46', '52': '47', '53': '48', '54': '49'}
Error for 1OTR expected I at position 30 
Sequence is NDDHESKLSILMDMFPAISKSKLQVHLLENNNDLDLTIGLLLKENDDKS
Mapping is {'6': '1', '7': '2', '8': '3', '9': '4', '10': '5', '11': '6', '12': '7', '13': '8', '14': '9', '15': '10', '16': '11', '17': '12', '18': '13', '19': '14', '20': '15', '21': '16', '22': '17', '23': '18', '24': '19', '25': '20', '26': '21', '27': '22',

 32%|███████████████████████████████████████████████████                                                                                                           | 1761/5444 [06:14<25:01,  2.45it/s]

Error for 1OTR expected I at position 36 
Sequence is NDDHESKLSILMDMFPAISKSKLQVHLLENNNDLDLTIGLLLKENDDKS
Mapping is {'6': '1', '7': '2', '8': '3', '9': '4', '10': '5', '11': '6', '12': '7', '13': '8', '14': '9', '15': '10', '16': '11', '17': '12', '18': '13', '19': '14', '20': '15', '21': '16', '22': '17', '23': '18', '24': '19', '25': '20', '26': '21', '27': '22', '28': '23', '29': '24', '30': '25', '31': '26', '32': '27', '33': '28', '34': '29', '35': '30', '36': '31', '37': '32', '38': '33', '39': '34', '40': '35', '41': '36', '42': '37', '43': '38', '44': '39', '45': '40', '46': '41', '47': '42', '48': '43', '49': '44', '50': '45', '51': '46', '52': '47', '53': '48', '54': '49'}


 32%|███████████████████████████████████████████████████▏                                                                                                          | 1762/5444 [06:15<31:24,  1.95it/s]

Error for 1OTR expected I at position 36 
Sequence is NDDHESKLSILMDMFPAISKSKLQVHLLENNNDLDLTIGLLLKENDDKS
Mapping is {'6': '1', '7': '2', '8': '3', '9': '4', '10': '5', '11': '6', '12': '7', '13': '8', '14': '9', '15': '10', '16': '11', '17': '12', '18': '13', '19': '14', '20': '15', '21': '16', '22': '17', '23': '18', '24': '19', '25': '20', '26': '21', '27': '22', '28': '23', '29': '24', '30': '25', '31': '26', '32': '27', '33': '28', '34': '29', '35': '30', '36': '31', '37': '32', '38': '33', '39': '34', '40': '35', '41': '36', '42': '37', '43': '38', '44': '39', '45': '40', '46': '41', '47': '42', '48': '43', '49': '44', '50': '45', '51': '46', '52': '47', '53': '48', '54': '49'}


 32%|███████████████████████████████████████████████████▏                                                                                                          | 1763/5444 [06:16<37:28,  1.64it/s]

Error for 1OTR expected I at position 36 
Sequence is NDDHESKLSILMDMFPAISKSKLQVHLLENNNDLDLTIGLLLKENDDKS
Mapping is {'6': '1', '7': '2', '8': '3', '9': '4', '10': '5', '11': '6', '12': '7', '13': '8', '14': '9', '15': '10', '16': '11', '17': '12', '18': '13', '19': '14', '20': '15', '21': '16', '22': '17', '23': '18', '24': '19', '25': '20', '26': '21', '27': '22', '28': '23', '29': '24', '30': '25', '31': '26', '32': '27', '33': '28', '34': '29', '35': '30', '36': '31', '37': '32', '38': '33', '39': '34', '40': '35', '41': '36', '42': '37', '43': '38', '44': '39', '45': '40', '46': '41', '47': '42', '48': '43', '49': '44', '50': '45', '51': '46', '52': '47', '53': '48', '54': '49'}


 32%|███████████████████████████████████████████████████▏                                                                                                          | 1764/5444 [06:17<42:57,  1.43it/s]

Indexing error for 1OTR position 67 not present in mapping {'6': '1', '7': '2', '8': '3', '9': '4', '10': '5', '11': '6', '12': '7', '13': '8', '14': '9', '15': '10', '16': '11', '17': '12', '18': '13', '19': '14', '20': '15', '21': '16', '22': '17', '23': '18', '24': '19', '25': '20', '26': '21', '27': '22', '28': '23', '29': '24', '30': '25', '31': '26', '32': '27', '33': '28', '34': '29', '35': '30', '36': '31', '37': '32', '38': '33', '39': '34', '40': '35', '41': '36', '42': '37', '43': '38', '44': '39', '45': '40', '46': '41', '47': '42', '48': '43', '49': '44', '50': '45', '51': '46', '52': '47', '53': '48', '54': '49'}


 32%|███████████████████████████████████████████████████▏                                                                                                          | 1765/5444 [06:18<50:43,  1.21it/s]

Indexing error for 1OTR position 67 not present in mapping {'6': '1', '7': '2', '8': '3', '9': '4', '10': '5', '11': '6', '12': '7', '13': '8', '14': '9', '15': '10', '16': '11', '17': '12', '18': '13', '19': '14', '20': '15', '21': '16', '22': '17', '23': '18', '24': '19', '25': '20', '26': '21', '27': '22', '28': '23', '29': '24', '30': '25', '31': '26', '32': '27', '33': '28', '34': '29', '35': '30', '36': '31', '37': '32', '38': '33', '39': '34', '40': '35', '41': '36', '42': '37', '43': '38', '44': '39', '45': '40', '46': '41', '47': '42', '48': '43', '49': '44', '50': '45', '51': '46', '52': '47', '53': '48', '54': '49'}


 32%|███████████████████████████████████████████████████▎                                                                                                          | 1766/5444 [06:20<54:29,  1.12it/s]

Indexing error for 1OTR position 67 not present in mapping {'6': '1', '7': '2', '8': '3', '9': '4', '10': '5', '11': '6', '12': '7', '13': '8', '14': '9', '15': '10', '16': '11', '17': '12', '18': '13', '19': '14', '20': '15', '21': '16', '22': '17', '23': '18', '24': '19', '25': '20', '26': '21', '27': '22', '28': '23', '29': '24', '30': '25', '31': '26', '32': '27', '33': '28', '34': '29', '35': '30', '36': '31', '37': '32', '38': '33', '39': '34', '40': '35', '41': '36', '42': '37', '43': '38', '44': '39', '45': '40', '46': '41', '47': '42', '48': '43', '49': '44', '50': '45', '51': '46', '52': '47', '53': '48', '54': '49'}


 32%|███████████████████████████████████████████████████▎                                                                                                          | 1767/5444 [06:21<58:14,  1.05it/s]

Indexing error for 1OTR position 67 not present in mapping {'6': '1', '7': '2', '8': '3', '9': '4', '10': '5', '11': '6', '12': '7', '13': '8', '14': '9', '15': '10', '16': '11', '17': '12', '18': '13', '19': '14', '20': '15', '21': '16', '22': '17', '23': '18', '24': '19', '25': '20', '26': '21', '27': '22', '28': '23', '29': '24', '30': '25', '31': '26', '32': '27', '33': '28', '34': '29', '35': '30', '36': '31', '37': '32', '38': '33', '39': '34', '40': '35', '41': '36', '42': '37', '43': '38', '44': '39', '45': '40', '46': '41', '47': '42', '48': '43', '49': '44', '50': '45', '51': '46', '52': '47', '53': '48', '54': '49'}


 32%|██████████████████████████████████████████████████▋                                                                                                         | 1768/5444 [06:22<1:00:36,  1.01it/s]

Error for 1OTR expected Q at position 41 
Sequence is NDDHESKLSILMDMFPAISKSKLQVHLLENNNDLDLTIGLLLKENDDKS
Mapping is {'6': '1', '7': '2', '8': '3', '9': '4', '10': '5', '11': '6', '12': '7', '13': '8', '14': '9', '15': '10', '16': '11', '17': '12', '18': '13', '19': '14', '20': '15', '21': '16', '22': '17', '23': '18', '24': '19', '25': '20', '26': '21', '27': '22', '28': '23', '29': '24', '30': '25', '31': '26', '32': '27', '33': '28', '34': '29', '35': '30', '36': '31', '37': '32', '38': '33', '39': '34', '40': '35', '41': '36', '42': '37', '43': '38', '44': '39', '45': '40', '46': '41', '47': '42', '48': '43', '49': '44', '50': '45', '51': '46', '52': '47', '53': '48', '54': '49'}


 32%|███████████████████████████████████████████████████▎                                                                                                          | 1769/5444 [06:23<58:59,  1.04it/s]

Error for 1OTR expected Q at position 41 
Sequence is NDDHESKLSILMDMFPAISKSKLQVHLLENNNDLDLTIGLLLKENDDKS
Mapping is {'6': '1', '7': '2', '8': '3', '9': '4', '10': '5', '11': '6', '12': '7', '13': '8', '14': '9', '15': '10', '16': '11', '17': '12', '18': '13', '19': '14', '20': '15', '21': '16', '22': '17', '23': '18', '24': '19', '25': '20', '26': '21', '27': '22', '28': '23', '29': '24', '30': '25', '31': '26', '32': '27', '33': '28', '34': '29', '35': '30', '36': '31', '37': '32', '38': '33', '39': '34', '40': '35', '41': '36', '42': '37', '43': '38', '44': '39', '45': '40', '46': '41', '47': '42', '48': '43', '49': '44', '50': '45', '51': '46', '52': '47', '53': '48', '54': '49'}


 33%|██████████████████████████████████████████████████▋                                                                                                         | 1770/5444 [06:24<1:05:28,  1.07s/it]

Error for 1OTR expected Q at position 41 
Sequence is NDDHESKLSILMDMFPAISKSKLQVHLLENNNDLDLTIGLLLKENDDKS
Mapping is {'6': '1', '7': '2', '8': '3', '9': '4', '10': '5', '11': '6', '12': '7', '13': '8', '14': '9', '15': '10', '16': '11', '17': '12', '18': '13', '19': '14', '20': '15', '21': '16', '22': '17', '23': '18', '24': '19', '25': '20', '26': '21', '27': '22', '28': '23', '29': '24', '30': '25', '31': '26', '32': '27', '33': '28', '34': '29', '35': '30', '36': '31', '37': '32', '38': '33', '39': '34', '40': '35', '41': '36', '42': '37', '43': '38', '44': '39', '45': '40', '46': '41', '47': '42', '48': '43', '49': '44', '50': '45', '51': '46', '52': '47', '53': '48', '54': '49'}


 33%|██████████████████████████████████████████████████▋                                                                                                         | 1771/5444 [06:25<1:02:17,  1.02s/it]

Error for 1OTR expected Q at position 41 
Sequence is NDDHESKLSILMDMFPAISKSKLQVHLLENNNDLDLTIGLLLKENDDKS
Mapping is {'6': '1', '7': '2', '8': '3', '9': '4', '10': '5', '11': '6', '12': '7', '13': '8', '14': '9', '15': '10', '16': '11', '17': '12', '18': '13', '19': '14', '20': '15', '21': '16', '22': '17', '23': '18', '24': '19', '25': '20', '26': '21', '27': '22', '28': '23', '29': '24', '30': '25', '31': '26', '32': '27', '33': '28', '34': '29', '35': '30', '36': '31', '37': '32', '38': '33', '39': '34', '40': '35', '41': '36', '42': '37', '43': '38', '44': '39', '45': '40', '46': '41', '47': '42', '48': '43', '49': '44', '50': '45', '51': '46', '52': '47', '53': '48', '54': '49'}


 33%|██████████████████████████████████████████████████▊                                                                                                         | 1772/5444 [06:26<1:07:48,  1.11s/it]

Error for 1OTR expected Q at position 41 
Sequence is NDDHESKLSILMDMFPAISKSKLQVHLLENNNDLDLTIGLLLKENDDKS
Mapping is {'6': '1', '7': '2', '8': '3', '9': '4', '10': '5', '11': '6', '12': '7', '13': '8', '14': '9', '15': '10', '16': '11', '17': '12', '18': '13', '19': '14', '20': '15', '21': '16', '22': '17', '23': '18', '24': '19', '25': '20', '26': '21', '27': '22', '28': '23', '29': '24', '30': '25', '31': '26', '32': '27', '33': '28', '34': '29', '35': '30', '36': '31', '37': '32', '38': '33', '39': '34', '40': '35', '41': '36', '42': '37', '43': '38', '44': '39', '45': '40', '46': '41', '47': '42', '48': '43', '49': '44', '50': '45', '51': '46', '52': '47', '53': '48', '54': '49'}


 33%|██████████████████████████████████████████████████▊                                                                                                         | 1773/5444 [06:27<1:03:50,  1.04s/it]

Error for 1OTR expected V at position 17 
Sequence is NDDHESKLSILMDMFPAISKSKLQVHLLENNNDLDLTIGLLLKENDDKS
Mapping is {'6': '1', '7': '2', '8': '3', '9': '4', '10': '5', '11': '6', '12': '7', '13': '8', '14': '9', '15': '10', '16': '11', '17': '12', '18': '13', '19': '14', '20': '15', '21': '16', '22': '17', '23': '18', '24': '19', '25': '20', '26': '21', '27': '22', '28': '23', '29': '24', '30': '25', '31': '26', '32': '27', '33': '28', '34': '29', '35': '30', '36': '31', '37': '32', '38': '33', '39': '34', '40': '35', '41': '36', '42': '37', '43': '38', '44': '39', '45': '40', '46': '41', '47': '42', '48': '43', '49': '44', '50': '45', '51': '46', '52': '47', '53': '48', '54': '49'}


 33%|██████████████████████████████████████████████████▊                                                                                                         | 1774/5444 [06:28<1:08:57,  1.13s/it]

Error for 1OTR expected V at position 17 
Sequence is NDDHESKLSILMDMFPAISKSKLQVHLLENNNDLDLTIGLLLKENDDKS
Mapping is {'6': '1', '7': '2', '8': '3', '9': '4', '10': '5', '11': '6', '12': '7', '13': '8', '14': '9', '15': '10', '16': '11', '17': '12', '18': '13', '19': '14', '20': '15', '21': '16', '22': '17', '23': '18', '24': '19', '25': '20', '26': '21', '27': '22', '28': '23', '29': '24', '30': '25', '31': '26', '32': '27', '33': '28', '34': '29', '35': '30', '36': '31', '37': '32', '38': '33', '39': '34', '40': '35', '41': '36', '42': '37', '43': '38', '44': '39', '45': '40', '46': '41', '47': '42', '48': '43', '49': '44', '50': '45', '51': '46', '52': '47', '53': '48', '54': '49'}


 33%|██████████████████████████████████████████████████▊                                                                                                         | 1775/5444 [06:29<1:04:30,  1.05s/it]

Error for 1OTR expected V at position 17 
Sequence is NDDHESKLSILMDMFPAISKSKLQVHLLENNNDLDLTIGLLLKENDDKS
Mapping is {'6': '1', '7': '2', '8': '3', '9': '4', '10': '5', '11': '6', '12': '7', '13': '8', '14': '9', '15': '10', '16': '11', '17': '12', '18': '13', '19': '14', '20': '15', '21': '16', '22': '17', '23': '18', '24': '19', '25': '20', '26': '21', '27': '22', '28': '23', '29': '24', '30': '25', '31': '26', '32': '27', '33': '28', '34': '29', '35': '30', '36': '31', '37': '32', '38': '33', '39': '34', '40': '35', '41': '36', '42': '37', '43': '38', '44': '39', '45': '40', '46': '41', '47': '42', '48': '43', '49': '44', '50': '45', '51': '46', '52': '47', '53': '48', '54': '49'}


 33%|██████████████████████████████████████████████████▉                                                                                                         | 1776/5444 [06:31<1:09:33,  1.14s/it]

Indexing error for 1OTR position 5 not present in mapping {'6': '1', '7': '2', '8': '3', '9': '4', '10': '5', '11': '6', '12': '7', '13': '8', '14': '9', '15': '10', '16': '11', '17': '12', '18': '13', '19': '14', '20': '15', '21': '16', '22': '17', '23': '18', '24': '19', '25': '20', '26': '21', '27': '22', '28': '23', '29': '24', '30': '25', '31': '26', '32': '27', '33': '28', '34': '29', '35': '30', '36': '31', '37': '32', '38': '33', '39': '34', '40': '35', '41': '36', '42': '37', '43': '38', '44': '39', '45': '40', '46': '41', '47': '42', '48': '43', '49': '44', '50': '45', '51': '46', '52': '47', '53': '48', '54': '49'}


 33%|██████████████████████████████████████████████████▉                                                                                                         | 1777/5444 [06:32<1:05:06,  1.07s/it]

Indexing error for 1OTR position 5 not present in mapping {'6': '1', '7': '2', '8': '3', '9': '4', '10': '5', '11': '6', '12': '7', '13': '8', '14': '9', '15': '10', '16': '11', '17': '12', '18': '13', '19': '14', '20': '15', '21': '16', '22': '17', '23': '18', '24': '19', '25': '20', '26': '21', '27': '22', '28': '23', '29': '24', '30': '25', '31': '26', '32': '27', '33': '28', '34': '29', '35': '30', '36': '31', '37': '32', '38': '33', '39': '34', '40': '35', '41': '36', '42': '37', '43': '38', '44': '39', '45': '40', '46': '41', '47': '42', '48': '43', '49': '44', '50': '45', '51': '46', '52': '47', '53': '48', '54': '49'}


 33%|███████████████████████████████████████████████████▌                                                                                                        | 1798/5444 [06:55<1:07:44,  1.11s/it]

Error for 1OTR expected K at position 11 
Sequence is NDDHESKLSILMDMFPAISKSKLQVHLLENNNDLDLTIGLLLKENDDKS
Mapping is {'6': '1', '7': '2', '8': '3', '9': '4', '10': '5', '11': '6', '12': '7', '13': '8', '14': '9', '15': '10', '16': '11', '17': '12', '18': '13', '19': '14', '20': '15', '21': '16', '22': '17', '23': '18', '24': '19', '25': '20', '26': '21', '27': '22', '28': '23', '29': '24', '30': '25', '31': '26', '32': '27', '33': '28', '34': '29', '35': '30', '36': '31', '37': '32', '38': '33', '39': '34', '40': '35', '41': '36', '42': '37', '43': '38', '44': '39', '45': '40', '46': '41', '47': '42', '48': '43', '49': '44', '50': '45', '51': '46', '52': '47', '53': '48', '54': '49'}


 33%|███████████████████████████████████████████████████▌                                                                                                        | 1799/5444 [06:56<1:07:38,  1.11s/it]

Error for 1OTR_F45W expected P at position 37 
Sequence is NDDHESKLSILMDMFPAISKSKLQVHLLENNNDLDLTIGLLLKENDDKS
Mapping is {'6': '1', '7': '2', '8': '3', '9': '4', '10': '5', '11': '6', '12': '7', '13': '8', '14': '9', '15': '10', '16': '11', '17': '12', '18': '13', '19': '14', '20': '15', '21': '16', '22': '17', '23': '18', '24': '19', '25': '20', '26': '21', '27': '22', '28': '23', '29': '24', '30': '25', '31': '26', '32': '27', '33': '28', '34': '29', '35': '30', '36': '31', '37': '32', '38': '33', '39': '34', '40': '35', '41': '36', '42': '37', '43': '38', '44': '39', '45': '40', '46': '41', '47': '42', '48': '43', '49': '44', '50': '45', '51': '46', '52': '47', '53': '48', '54': '49'}


 33%|███████████████████████████████████████████████████▌                                                                                                        | 1800/5444 [06:57<1:07:21,  1.11s/it]

Error for 1OTR_F45W expected P at position 38 
Sequence is NDDHESKLSILMDMFPAISKSKLQVHLLENNNDLDLTIGLLLKENDDKS
Mapping is {'6': '1', '7': '2', '8': '3', '9': '4', '10': '5', '11': '6', '12': '7', '13': '8', '14': '9', '15': '10', '16': '11', '17': '12', '18': '13', '19': '14', '20': '15', '21': '16', '22': '17', '23': '18', '24': '19', '25': '20', '26': '21', '27': '22', '28': '23', '29': '24', '30': '25', '31': '26', '32': '27', '33': '28', '34': '29', '35': '30', '36': '31', '37': '32', '38': '33', '39': '34', '40': '35', '41': '36', '42': '37', '43': '38', '44': '39', '45': '40', '46': '41', '47': '42', '48': '43', '49': '44', '50': '45', '51': '46', '52': '47', '53': '48', '54': '49'}


 33%|████████████████████████████████████████████████████▎                                                                                                         | 1804/5444 [06:58<30:46,  1.97it/s]

Error for 1OTR_F45W expected S at position 19 
Sequence is NDDHESKLSILMDMFPAISKSKLQVHLLENNNDLDLTIGLLLKENDDKS
Mapping is {'6': '1', '7': '2', '8': '3', '9': '4', '10': '5', '11': '6', '12': '7', '13': '8', '14': '9', '15': '10', '16': '11', '17': '12', '18': '13', '19': '14', '20': '15', '21': '16', '22': '17', '23': '18', '24': '19', '25': '20', '26': '21', '27': '22', '28': '23', '29': '24', '30': '25', '31': '26', '32': '27', '33': '28', '34': '29', '35': '30', '36': '31', '37': '32', '38': '33', '39': '34', '40': '35', '41': '36', '42': '37', '43': '38', '44': '39', '45': '40', '46': '41', '47': '42', '48': '43', '49': '44', '50': '45', '51': '46', '52': '47', '53': '48', '54': '49'}
































Indexing error for 1TUP position 282 not present in mapping {}
Indexing error for 1TUP position 129 not present in mapping {}


 56%|████████████████████████████████████████████████████████████████████████████████████████▋                                                                     | 3054/5444 [08:35<03:52, 10.28it/s]

Indexing error for 1TUP position 129 not present in mapping {}


 56%|████████████████████████████████████████████████████████████████████████████████████████▋                                                                     | 3056/5444 [08:35<05:11,  7.67it/s]

Indexing error for 1TUP position 129 not present in mapping {}
Indexing error for 1TUP position 182 not present in mapping {}


 56%|████████████████████████████████████████████████████████████████████████████████████████▊                                                                     | 3058/5444 [08:36<05:27,  7.29it/s]

Indexing error for 1TUP position 242 not present in mapping {}
Indexing error for 1TUP position 148 not present in mapping {}




Indexing error for 1TUP position 148 not present in mapping {}




Indexing error for 1TUP position 228 not present in mapping {}


 56%|████████████████████████████████████████████████████████████████████████████████████████▊                                                                     | 3062/5444 [08:36<06:36,  6.01it/s]

Indexing error for 1TUP position 134 not present in mapping {}
Indexing error for 1TUP position 270 not present in mapping {}




Indexing error for 1TUP position 245 not present in mapping {}




Indexing error for 1TUP position 168 not present in mapping {}


 56%|████████████████████████████████████████████████████████████████████████████████████████▉                                                                     | 3066/5444 [08:37<06:54,  5.74it/s]

Indexing error for 1TUP position 195 not present in mapping {}
Indexing error for 1TUP position 232 not present in mapping {}




Indexing error for 1TUP position 255 not present in mapping {}




Indexing error for 1TUP position 145 not present in mapping {}


 56%|█████████████████████████████████████████████████████████████████████████████████████████                                                                     | 3070/5444 [08:38<07:04,  5.59it/s]

Indexing error for 1TUP position 201 not present in mapping {}
Indexing error for 1TUP position 206 not present in mapping {}




Indexing error for 1TUP position 133 not present in mapping {}




Indexing error for 1TUP position 237 not present in mapping {}


 56%|█████████████████████████████████████████████████████████████████████████████████████████▏                                                                    | 3074/5444 [08:39<07:02,  5.61it/s]

Indexing error for 1TUP position 239 not present in mapping {}
Indexing error for 1TUP position 268 not present in mapping {}


 57%|█████████████████████████████████████████████████████████████████████████████████████████▎                                                                    | 3076/5444 [08:39<08:05,  4.88it/s]

Indexing error for 1TUP position 151 not present in mapping {}
Indexing error for 1TUP position 104 not present in mapping {}


 57%|█████████████████████████████████████████████████████████████████████████████████████████▎                                                                    | 3078/5444 [08:39<07:09,  5.51it/s]

Indexing error for 1TUP position 104 not present in mapping {}
Indexing error for 1TUP position 165 not present in mapping {}




Indexing error for 1TUP position 167 not present in mapping {}




Indexing error for 1TUP position 174 not present in mapping {}
Indexing error for 1TUP position 175 not present in mapping {}


 57%|█████████████████████████████████████████████████████████████████████████████████████████▍                                                                    | 3082/5444 [08:40<07:07,  5.53it/s]

Indexing error for 1TUP position 175 not present in mapping {}




Indexing error for 1TUP position 248 not present in mapping {}


 57%|█████████████████████████████████████████████████████████████████████████████████████████▌                                                                    | 3085/5444 [08:41<07:50,  5.01it/s]

Indexing error for 1TUP position 249 not present in mapping {}
Indexing error for 1TUP position 273 not present in mapping {}


 57%|█████████████████████████████████████████████████████████████████████████████████████████▌                                                                    | 3087/5444 [08:41<07:03,  5.57it/s]

Indexing error for 1TUP position 282 not present in mapping {}
Indexing error for 1TUP position 260 not present in mapping {}


 57%|█████████████████████████████████████████████████████████████████████████████████████████▋                                                                    | 3089/5444 [08:42<07:44,  5.06it/s]

Indexing error for 1TUP position 123 not present in mapping {}
Indexing error for 1TUP position 150 not present in mapping {}


 57%|█████████████████████████████████████████████████████████████████████████████████████████▋                                                                    | 3091/5444 [08:42<06:52,  5.70it/s]

Indexing error for 1TUP position 143 not present in mapping {}
Indexing error for 1TUP position 157 not present in mapping {}




Indexing error for 1TUP position 203 not present in mapping {}




Indexing error for 1TUP position 220 not present in mapping {}


 57%|█████████████████████████████████████████████████████████████████████████████████████████▉                                                                    | 3097/5444 [08:43<04:10,  9.37it/s]

Indexing error for 1TUP position 236 not present in mapping {}


 58%|███████████████████████████████████████████████████████████████████████████████████████████▌                                                                  | 3157/5444 [09:04<27:59,  1.36it/s]

In [None]:
pd.DataFrame({'wt_seq': wt, 
              'mut_seq': mut ,
              'ddg': [float(t.replace(',','.')) for t in ddg], 
              'pdb_id': pdb_ids, 
              'mut_info': mut_infos,
              'pos': poss}).to_csv('DATASETS/deepddg_train.csv')

# DeepDDG test

In [None]:
df_deepddg_test = pd.read_csv('DATA/deep_ddg_test.csv', sep = ';')

In [None]:
print('Total dataset length', len(df_deepddg_test))
pdb_ids = list(set([t.split()[0].upper() for t in df_deepddg_test['PDB ID with modifications to be made'].to_list()]))
print('Total number of different chains in dataset', len(pdb_ids))

In [None]:
for pdb_id in pdb_ids:
    if not os.path.isfile(f"PDB/{pdb_id[:4]}.pdb"):
        with open(f"PDB/{pdb_id[:4]}.pdb", "w") as fh:
            fh.write(pdb_client.get_pdb_file(f"{pdb_id[:4]}", compression=False))

In [None]:
wt = []
mut = []
ddg = []
pdb_ids = []
mut_infos = []
poss = []

#no_verbatim_pdb_ids = {'1C9OA', '1VQBA'}


print('Processing DeepDDG test')

for idx in tqdm(range(len(df_deepddg_test))):
    pdb_id = df_deepddg_test.iloc[idx]['PDB ID with modifications to be made'].upper()
    wild_aa = df_deepddg_test.iloc[idx]['Mutation'][0]
    pos = df_deepddg_test.iloc[idx]['Mutation'][2:-2]
    mutant_aa = df_deepddg_test.iloc[idx]['Mutation'][-1]
    exp_ddg = df_deepddg_test.iloc[idx]['ΔΔG (kcal/mol) positive is stable']
    
    #if pdb_id!= '1CLWA':
    #    continue
        
    
    pdb = PDBParser().get_structure("pdb_id[:4]", f'PDB/{pdb_id[:4]}.pdb')
    chain = next(pdb.get_chains()).get_id()
        
    _, _, sequence, pdb2seq_pos, seq2pdb_pos = pdb2info(f'PDB/{pdb_id[:4]}.pdb', chain)
    
    #if pdb_id not in no_verbatim_pdb_ids:
    #    seq2pdb_pos = {str(i):str(i) for i in range(len(sequence))}
    
    if pos not in seq2pdb_pos:
        print(f'Indexing error for {pdb_id} position {pos} not present in mapping {seq2pdb_pos}')
        
    else:
        if sequence[int(seq2pdb_pos[pos])-1]!=wild_aa:
            print(f'Error for {pdb_id} expected {wild_aa} at position {pos} ')
            print(f'Sequence is {sequence}')
            print(f'Mapping is {seq2pdb_pos}')
        
        else:
            wt.append(sequence)
            tt = list(sequence)
            tt[int(seq2pdb_pos[pos])-1] = mutant_aa
            poss.append(int(seq2pdb_pos[pos])-1)
            mut.append(''.join(tt))
            ddg.append(exp_ddg)
            pdb_ids.append(pdb_id)
            mut_infos.append(str(wild_aa) + pos + str(mutant_aa))

In [None]:
pd.DataFrame({'wt_seq': wt, 
              'mut_seq': mut ,
              'ddg': [float(t.replace(',','.')) for t in ddg], 
              'pdb_id': pdb_ids, 
              'mut_info': mut_infos,
              'pos': poss}).to_csv('DATASETS/deepddg_test.csv')

# Ssym

In [None]:
df_ssym = pd.read_csv('DATA/s_sym.txt', sep= ' ', names = ['PDB_ID', '_', 'POSITION', 'WILD_RES', 'MUTANT_RES', 'EXP_DDG'])

In [None]:
print('Total dataset length', len(df_ssym))
pdb_ids = list(set([t.split()[0].upper() for t in df_ssym['PDB_ID'].to_list()]))
print('Total number of different chains in dataset', len(pdb_ids))

In [None]:
for pdb_id in pdb_ids:
    if not os.path.isfile(f"PDB/{pdb_id[:4]}.pdb"):
        with open(f"PDB/{pdb_id[:4]}.pdb", "w") as fh:
            fh.write(pdb_client.get_pdb_file(f"{pdb_id[:4]}", compression=False))

In [None]:
wt = []
mut = []
ddg = []
pdb_ids = []
mut_infos = []
poss = []

print('Processing Ssym')

for idx in tqdm(range(len(df_ssym))):
    pdb_id = df_ssym.iloc[idx]['PDB_ID'].upper()
    wild_aa = df_ssym.iloc[idx]['WILD_RES']
    pos = str(df_ssym.iloc[idx]['POSITION'])
    mutant_aa = df_ssym.iloc[idx]['MUTANT_RES']
    exp_ddg = df_ssym.iloc[idx]['EXP_DDG']
    
        
    _, _, sequence, pdb2seq_pos, seq2pdb_pos = pdb2info(f'PDB/{pdb_id[:4]}.pdb', pdb_id[-1])
    
    #if pdb_id not in no_verbatim_pdb_ids:
    #  seq2pdb_pos = {str(i):str(i) for i in range(len(sequence))}
    
    if pos not in seq2pdb_pos:
        print(f'Indexing error for {pdb_id} position {pos} not present in mapping {seq2pdb_pos}')
        
    else:
        if sequence[int(seq2pdb_pos[pos])-1]!=wild_aa:
            print(f'Error for {pdb_id} expected {wild_aa} at position {pos} ')
            print(f'Sequence is {sequence}')
            print(f'Mapping is {seq2pdb_pos}')
        
        else:
            wt.append(sequence)
            tt = list(sequence)
            tt[int(seq2pdb_pos[pos])-1] = mutant_aa
            poss.append(int(seq2pdb_pos[pos])-1)
            mut.append(''.join(tt))
            ddg.append(exp_ddg)
            pdb_ids.append(pdb_id)
            mut_infos.append(str(wild_aa) + pos + str(mutant_aa))

In [None]:
pd.DataFrame({'wt_seq': wt, 
              'mut_seq': mut ,
              'ddg': [-t for t in ddg], 
              'pdb_id': pdb_ids, 
              'mut_info': mut_infos,
              'pos': poss}).to_csv('DATASETS/ssym.csv')

In [None]:
pd.DataFrame({'wt_seq': mut, 
              'mut_seq': wt ,
              'ddg': ddg, 
              'pdb_id': pdb_ids, 
              'mut_info': [t[-1] + t[1:-1] + t[0] for t in mut_infos],
              'pos': poss}).to_csv('DATASETS/ssym_r.csv')

# Myoglobin

In [None]:
df_myoglobin = pd.read_csv('DATA/myoglobin.txt', sep= ' ', names = ['PDB_ID', 'POSITION', 'WILD_RES', 'MUTANT_RES', 'EXP_DDG'])

In [None]:
print('Total dataset length', len(df_myoglobin))
pdb_ids = list(set([t.split()[0].upper() for t in df_myoglobin['PDB_ID'].to_list()]))
print('Total number of different chains in dataset', len(pdb_ids))

In [None]:
for pdb_id in pdb_ids:
    if not os.path.isfile(f"PDB/{pdb_id[:4]}.pdb"):
        with open(f"PDB/{pdb_id[:4]}.pdb", "w") as fh:
            fh.write(pdb_client.get_pdb_file(f"{pdb_id[:4]}", compression=False))

In [None]:
wt = []
mut = []
ddg = []
pdb_ids = []
mut_infos = []
poss = []

#no_verbatim_pdb_ids = {'1C9OA', '1VQBA'}


print('Processing myoglobin')

for idx in tqdm(range(len(df_myoglobin))):
    pdb_id = df_myoglobin.iloc[idx]['PDB_ID'].upper()
    wild_aa = df_myoglobin.iloc[idx]['WILD_RES']
    pos = str(df_myoglobin.iloc[idx]['POSITION'])
    mutant_aa = df_myoglobin.iloc[idx]['MUTANT_RES']
    exp_ddg = df_myoglobin.iloc[idx]['EXP_DDG']
    
    #if pdb_id!= '1CLWA':
    #    continue
        
    
    pdb = PDBParser().get_structure(pdb_id[:4], f'PDB/{pdb_id[:4]}.pdb')
    chain = next(pdb.get_chains()).get_id()
        
    _, _, sequence, pdb2seq_pos, seq2pdb_pos = pdb2info(f'PDB/{pdb_id[:4]}.pdb', pdb_id[-1])
    
    #if pdb_id not in no_verbatim_pdb_ids:
    #  seq2pdb_pos = {str(i):str(i) for i in range(len(sequence))}
    
    if pos not in seq2pdb_pos:
        print(f'Indexing error for {pdb_id} position {pos} not present in mapping {seq2pdb_pos}')
        
    else:
        if sequence[int(seq2pdb_pos[pos])-1]!=wild_aa:
            print(f'Error for {pdb_id} expected {wild_aa} at position {pos} ')
            print(f'Sequence is {sequence}')
            print(f'Mapping is {seq2pdb_pos}')
        
        else:
            wt.append(sequence)
            tt = list(sequence)
            tt[int(seq2pdb_pos[pos])-1] = mutant_aa
            poss.append(int(seq2pdb_pos[pos])-1)
            mut.append(''.join(tt))
            ddg.append(exp_ddg)
            pdb_ids.append(pdb_id)
            mut_infos.append(str(wild_aa) + pos + str(mutant_aa))

In [None]:
pd.DataFrame({'wt_seq': wt, 
              'mut_seq': mut ,
              'ddg': [-t for t in ddg], 
              'pdb_id': pdb_ids, 
              'mut_info': mut_infos,
              'pos': poss}).to_csv('DATASETS/myoglobin.csv')

In [None]:
pd.DataFrame({'wt_seq': mut, 
              'mut_seq': wt ,
              'ddg': ddg, 
              'pdb_id': pdb_ids, 
              'mut_info': [t[-1] + t[1:-1] + t[0] for t in mut_infos],
              'pos': poss}).to_csv('DATASETS/myoglobin_r.csv')

# P53

In [None]:
df_p53 = pd.read_csv('DATA/p53.txt', sep= ' ', names = ['PDB_ID', 'POSITION', 'WILD_RES', 'MUTANT_RES', 'EXP_DDG'])

In [None]:
print('Total dataset length', len(df_p53))
pdb_ids = list(set([t.split()[0].upper() for t in df_p53['PDB_ID'].to_list()]))
print('Total number of different chains in dataset', len(pdb_ids))

In [None]:
for pdb_id in pdb_ids:
    if not os.path.isfile(f"PDB/{pdb_id[:4]}.pdb"):
        with open(f"PDB/{pdb_id[:4]}.pdb", "w") as fh:
            fh.write(pdb_client.get_pdb_file(f"{pdb_id[:4]}", compression=False))

In [None]:
wt = []
mut = []
ddg = []
pdb_ids = []
mut_infos = []
poss = []

print('Processing p53')

for idx in tqdm(range(len(df_myoglobin))):
    pdb_id = df_myoglobin.iloc[idx]['PDB_ID'].upper()
    wild_aa = df_myoglobin.iloc[idx]['WILD_RES']
    pos = str(df_myoglobin.iloc[idx]['POSITION'])
    mutant_aa = df_myoglobin.iloc[idx]['MUTANT_RES']
    exp_ddg = df_myoglobin.iloc[idx]['EXP_DDG']
    
        
    
    pdb = PDBParser().get_structure(pdb_id[:4], f'PDB/{pdb_id[:4]}.pdb')
    chain = next(pdb.get_chains()).get_id()
        
    _, _, sequence, pdb2seq_pos, seq2pdb_pos = pdb2info(f'PDB/{pdb_id[:4]}.pdb', pdb_id[-1])
    
    
    if pos not in seq2pdb_pos:
        print(f'Indexing error for {pdb_id} position {pos} not present in mapping {seq2pdb_pos}')
        
    else:
        if sequence[int(seq2pdb_pos[pos])-1]!=wild_aa:
            print(f'Error for {pdb_id} expected {wild_aa} at position {pos} ')
            print(f'Sequence is {sequence}')
            print(f'Mapping is {seq2pdb_pos}')
        
        else:
            wt.append(sequence)
            tt = list(sequence)
            tt[int(seq2pdb_pos[pos])-1] = mutant_aa
            poss.append(int(seq2pdb_pos[pos])-1)
            mut.append(''.join(tt))
            ddg.append(exp_ddg)
            pdb_ids.append(pdb_id)
            mut_infos.append(str(wild_aa) + pos + str(mutant_aa))

In [None]:
pd.DataFrame({'wt_seq': wt, 
              'mut_seq': mut ,
              'ddg': [-t for t in ddg], 
              'pdb_id': pdb_ids, 
              'mut_info': mut_infos,
              'pos': poss}).to_csv('DATASETS/p53.csv')