In [1]:
from michelanglo_protein import ProteinCore, global_settings, Structure, ProteinAnalyser, Mutation
from michelanglo_protein.generate import ProteinGatherer
import os

# Folder holding the michelanglo_protein data. The default is the original
# location; set the MICHELANGLO_PROTEIN_DATA environment variable to run the
# notebook on another machine without editing this cell.
PROTEIN_DATA_FOLDER = os.environ.get('MICHELANGLO_PROTEIN_DATA',
                                     '/users/brc/matteo/michelanglo/protein-data')
global_settings.startup(PROTEIN_DATA_FOLDER)

Folder path set to /users/brc/matteo/michelanglo/protein-data


<michelanglo_protein.settings_handler.GlobalSettings at 0x7f45f821d410>

In [2]:
# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7579412/#TS1
import pandas as pd
import os, json

reference_data = pd.read_csv('Data_Sheet_2.CSV')

# Uniprot accession -> list of PDB chains, e.g. uni2pdb['P24565'] = ['1PNB_A']
with open(os.path.join(global_settings.dictionary_folder,
                       'uniprot2pdb.json')) as fh:
    uni2pdb = json.load(fh)

# Reverse lookup, PDB chain -> Uniprot accession, e.g. pdb2uni['1PNB_A'] = 'P24565'
pdb2uni = {}
for uni, pdbs in uni2pdb.items():
    pdb2uni.update(dict.fromkeys(pdbs, uni))

# Derived columns: '<PDB ID>_<Chain>', '<Wild Type><Residue Number><Mutation>'
# and the Uniprot accession looked up from the PDB chain.
reference_data['PDB_chain'] = reference_data['PDB ID'] + '_' + reference_data['Chain']
reference_data['mutation'] = (reference_data['Wild Type']
                              + reference_data['Residue Number']
                              + reference_data['Mutation'])
# Chains absent from pdb2uni get the placeholder accession 'P404'.
reference_data['uniprot'] = reference_data['PDB_chain'].map(pdb2uni).fillna('P404')
reference_data

Unnamed: 0,PDB ID,Protherm ID,Residue Number,Chain,Wild Type,Mutation,SASA,Experimental DDG,Classifiers,PDB_chain,mutation,uniprot
0,1STN,2199,122,A,E,F,0.224900,0.800000,'polar to hydrophobic' 'negative to hydrophobi...,1STN_A,E122F,P00644
1,1STN,1711,139,A,I,A,0.326741,3.500000,'hydrophobic to hydrophobic' 'buried',1STN_A,I139A,P00644
2,1AAR,3,45,A,F,W,0.150026,-0.600000,'hydrophobic to hydrophobic' 'buried',1AAR_A,F45W,P0CH28
3,1TUP,2256,273,A,R,H,0.468525,0.350000,'positive to non-charged polar' 'buried',1TUP_A,R273H,P04637
4,1LZ1,1076,110,A,V,L,0.429977,-0.071702,'hydrophobic to hydrophobic' 'buried',1LZ1_A,V110L,P61626
...,...,...,...,...,...,...,...,...,...,...,...,...
763,1STN,2014,78,A,K,C,0.792768,0.400000,'large to small' 'involves cysteine' 'surface',1STN_A,K78C,P00644
764,1STN,2046,82,A,T,C,0.524742,0.100000,'involves cysteine' 'surface',1STN_A,T82C,P00644
765,1SUP,2207,206,A,Q,C,0.328425,-1.250000,'large to small' 'involves cysteine' 'buried',1SUP_A,Q206C,P00782
766,1TUP,2253,242,A,C,S,0.119379,2.940000,'involves cysteine' 'buried',1TUP_A,C242S,P04637


In [3]:
from michelanglo_protein import Structure
import time, pymol2

def check_sequence(protein):
    """
    Check that the structure has the expected wild-type residue at the mutated position.

    ``protein.pdbblock`` is loaded into a temporary PyMOL session and the CA atom
    at ``protein.mutation.residue_index`` is looked up. Its three-letter residue
    name is converted to one letter via ``protein.mutation.names`` (iterated as
    ``(name1, name3, fullname)`` triples) and compared with
    ``protein.mutation.from_residue``.

    :param protein: object providing ``pdbblock`` and ``mutation``.
    :raises ValueError: no CA atom exists at that residue index.
    :raises AssertionError: the residue found differs from ``from_residue``.
    :raises KeyError: the residue name found is not among ``protein.mutation.names``.
    """
    with pymol2.PyMOL() as pymol:
        pymol.cmd.read_pdbstr(protein.pdbblock, 'xxx')
        selection = f'resi {protein.mutation.residue_index} and name CA'
        ca_atoms = pymol.cmd.get_model(selection).atom
        # Guard clause: nothing to compare if the position is not in the structure.
        if not ca_atoms:
            raise ValueError(f'{selection} does not exist')
        three_to_one = {}
        for name1, name3, fullname in protein.mutation.names:
            three_to_one[name3.upper()] = name1
        found = three_to_one[ca_atoms[0].resn]
        assert found == protein.mutation.from_residue, 'mismatch'
        

def get_protein(row):
    """
    Build a ``ProteinAnalyser`` for one row of the reference table.

    The analyser is created from ``row['uniprot']``, given the mutation in
    ``row['mutation']`` and a single RCSB ``Structure`` built from
    ``row['Protherm ID']``, ``row['PDB ID']`` and ``row['Chain']``. The structure
    is then analysed and the wild-type residue is verified with ``check_sequence``.

    :param row: mapping with the keys 'uniprot', 'mutation', 'Protherm ID',
        'PDB ID' and 'Chain'.
    :return: the prepared ``ProteinAnalyser``.
    :raises: whatever ``check_sequence`` raises when the structure does not match.
    """
    analyser = ProteinAnalyser(uniprot=row['uniprot'])
    analyser.mutation = Mutation(row['mutation'])

    # x/y presumably give the residue span covered by the model and offset the
    # numbering shift between structure and sequence -- TODO confirm.
    analyser.pdbs.append(
        Structure(id=row['Protherm ID'],
                  description='',
                  x=0,
                  y=99999,
                  code=row['PDB ID'],
                  type='rcsb',
                  chain=row['Chain'],
                  offset=0)
    )
    analyser.analyse_structure()
    # protein.check_mutation() is not used for validation here (it used to raise
    # a ValueError "Discrepancy for <PDB ID>"); check_sequence checks the residue
    # against the structure itself instead.
    check_sequence(analyser)
    return analyser

def analyse(protein, radius, scorefxn_name, use_pymol_for_neighbours = False):
    """
    Run ``protein.analyse_FF()`` with the given settings and summarise the outcome.

    :param protein: analyser object; ``radius``, ``scorefxn_name``,
        ``use_pymol_for_neighbours`` and ``energetics`` are set on it.
    :param radius: value assigned to ``protein.radius``.
    :param scorefxn_name: value assigned to ``protein.scorefxn_name``.
    :param use_pymol_for_neighbours: value assigned to
        ``protein.use_pymol_for_neighbours``.
    :return: dict with the keys ``radius``, ``use_pymol_for_neighbours``, ``ddG``,
        ``neighs`` (number of entries in ``data['neighbours']``),
        ``scorefxn_name`` and ``time`` (seconds elapsed, via ``time.time()``).
    :raises KeyError: ``analyse_FF`` returned no ``'ddG'`` entry; the message
        lists the keys that were returned so the failure can be diagnosed.
    """
    protein.use_pymol_for_neighbours = use_pymol_for_neighbours
    protein.radius = radius
    protein.scorefxn_name = scorefxn_name
    tick = time.time()
    # Presumably clears a cached result so analyse_FF recomputes -- TODO confirm.
    protein.energetics = None
    data = protein.analyse_FF()
    tock = time.time()
    if 'ddG' not in data:
        # Fail with a message that says what came back, rather than printing
        # the keys and then dying on a bare KeyError('ddG').
        raise KeyError(f"'ddG' missing from analyse_FF output; available keys: {list(data.keys())}")
    return dict(radius=protein.radius,
                use_pymol_for_neighbours=protein.use_pymol_for_neighbours,
                ddG=data['ddG'],
                neighs=len(data['neighbours']),
                scorefxn_name=protein.scorefxn_name,
                time=tock - tick)

def analyse_row(row):
    """
    Score one reference row for every radius / score function combination.

    Results are stored in the ``scores.db`` SqliteDict under the key
    ``'<Protherm ID>_<scorefxn_name>_<radius>'``; each value is the row's fields
    merged with the dict returned by ``analyse``. Combinations already present
    are skipped, so an interrupted run can be resumed.

    Best effort: any ``Exception`` is caught and printed together with the row's
    Protherm ID, so one failing row does not stop the others.

    ``SqliteDict`` and ``json`` are taken from the notebook namespace.

    :param row: row of the reference table (needs ``to_dict()`` and the keys
        read by ``get_protein``).
    :return: None
    """
    radii = (8, 10, 12)
    scorefxn_names = ('ref2015', 'beta_nov16', 'ref2015_cart', 'beta_nov16_cart')
    try:
        # Open the store once per row and close it on exit, instead of opening
        # a new, never-closed SqliteDict for every combination.
        with SqliteDict('scores.db', encode=json.dumps, decode=json.loads, autocommit=True) as results:
            pending = []
            for radius in radii:
                for scorefxn_name in scorefxn_names:
                    acc = f"{row['Protherm ID']}_{scorefxn_name}_{radius}"
                    if acc not in results:
                        pending.append((acc, radius, scorefxn_name))
            if not pending:
                # Everything is already stored: skip building the protein.
                return
            protein = get_protein(row)
            for acc, radius, scorefxn_name in pending:
                results[acc] = {**row.to_dict(),
                                **analyse(protein, radius = radius, scorefxn_name = scorefxn_name)}
    except Exception as error:
        # Name the row so the failure can be traced back to the table.
        print(f"Protherm ID {row.get('Protherm ID', '?')}:", error.__class__.__name__, str(error))

In [4]:
import logging, sys

log = logging.getLogger()
# The root logger defaults to WARNING, which discards INFO records before they
# reach the handler below; lower it only if it is currently stricter than INFO.
if log.getEffectiveLevel() > logging.INFO:
    log.setLevel(logging.INFO)

# Reuse the handler named 'stdout' if this cell has already been run, so that
# re-running the cell does not attach a second handler and duplicate each message.
handler = next((h for h in log.handlers if h.get_name() == 'stdout'), None)
if handler is None:
    handler = logging.StreamHandler(sys.stdout)
    handler.set_name('stdout')
    log.addHandler(handler)
handler.setLevel(logging.INFO)
handler.setFormatter(logging.Formatter('[%(asctime)s] %(levelname)s - %(message)s'))

In [5]:
from sqlitedict import SqliteDict
# Handle on the scores.db store that analyse_row also writes to; values are
# JSON-encoded on write and decoded on read, and every write is committed at once.
results = SqliteDict(
    'scores.db',
    encode=json.dumps,
    decode=json.loads,
    autocommit=True,
)

In [6]:
# Sanity check: every row has its own 'Protherm ID', so it can serve as the
# prefix of the result keys. Evaluates to True when there are no duplicates.
reference_data['Protherm ID'].size == reference_data['Protherm ID'].nunique()

True

In [7]:
# from multiprocessing import Pool
# # Fails with "AssertionError: daemonic processes are not allowed to have children".

# with Pool(10) as p:
#     p.map(analyse_row, [row for i, row in reference_data.iloc[120:150].iterrows()])

In [None]:
from concurrent.futures import ThreadPoolExecutor

# Run analyse_row over every reference row on 15 worker threads.
with ThreadPoolExecutor(max_workers = 15) as pool:
    # Executor.map only re-raises a worker's exception when its result is
    # retrieved, so consume the iterator: anything that escapes analyse_row's
    # own error handling then surfaces here instead of vanishing silently.
    list(pool.map(analyse_row, [row for _, row in reference_data.iterrows()]))

definitionless structure: 1STN
definitionless structure: 1STN
definitionless structure: 1AAR
definitionless structure: 1TUP
definitionless structure: 1LZ1
definitionless structure: 1STN
definitionless structure: 1CYC
definitionless structure: 1PIN
definitionless structure: 1LZ1
definitionless structure: 1L63
definitionless structure: 1BVC
definitionless structure: 1STN
definitionless structure: 1A2P
definitionless structure: 2LZM
definitionless structure: 2BQA
definitionless structure: 5AZU
definitionless structure: 4LYZ
definitionless structure: 1QLP
definitionless structure: 1DYJ
definitionless structure: 4LYZ
definitionless structure: 1DYJ
definitionless structure: 1PIN
definitionless structure: 1L63
definitionless structure: 1STN
definitionless structure: 1FXA
definitionless structure: 1VQB
definitionless structure: 1PGA
definitionless structure: 1VQB
definitionless structure: 1STN
definitionless structure: 1QLP
definitionless structure: 1LZ1
definitionless structure: 1FTG
definiti

In [10]:
# One row per stored result key ('<Protherm ID>_<scorefxn_name>_<radius>'); show the first 20.
pd.DataFrame({key: value for key, value in results.items()}).T.head(20)

Unnamed: 0,PDB ID,Protherm ID,Residue Number,Chain,Wild Type,Mutation,SASA,Experimental DDG,Classifiers,PDB_chain,mutation,uniprot,radius,use_pymol_for_neighbours,ddG,neighs,scorefxn_name,time
2199_ref2015_8,1STN,2199,122,A,E,F,0.2249,0.8,'polar to hydrophobic' 'negative to hydrophobi...,1STN_A,E122F,P00644,8,False,2.5621,11,ref2015,9.55795
2199_beta_nov16_8,1STN,2199,122,A,E,F,0.2249,0.8,'polar to hydrophobic' 'negative to hydrophobi...,1STN_A,E122F,P00644,8,False,-0.0440791,11,beta_nov16,9.33401
2199_ref2015_10,1STN,2199,122,A,E,F,0.2249,0.8,'polar to hydrophobic' 'negative to hydrophobi...,1STN_A,E122F,P00644,10,False,1.79129,17,ref2015,11.8185
2199_beta_nov16_10,1STN,2199,122,A,E,F,0.2249,0.8,'polar to hydrophobic' 'negative to hydrophobi...,1STN_A,E122F,P00644,10,False,1.34752,17,beta_nov16,11.038
2199_ref2015_12,1STN,2199,122,A,E,F,0.2249,0.8,'polar to hydrophobic' 'negative to hydrophobi...,1STN_A,E122F,P00644,12,False,1.89625,28,ref2015,14.157
2199_beta_nov16_12,1STN,2199,122,A,E,F,0.2249,0.8,'polar to hydrophobic' 'negative to hydrophobi...,1STN_A,E122F,P00644,12,False,-0.696345,28,beta_nov16,15.73
1711_ref2015_8,1STN,1711,139,A,I,A,0.326741,3.5,'hydrophobic to hydrophobic' 'buried',1STN_A,I139A,P00644,8,False,0.765898,12,ref2015,9.74753
1711_beta_nov16_8,1STN,1711,139,A,I,A,0.326741,3.5,'hydrophobic to hydrophobic' 'buried',1STN_A,I139A,P00644,8,False,3.29626,12,beta_nov16,9.09404
1711_ref2015_10,1STN,1711,139,A,I,A,0.326741,3.5,'hydrophobic to hydrophobic' 'buried',1STN_A,I139A,P00644,10,False,0.514132,23,ref2015,11.7462
1711_beta_nov16_10,1STN,1711,139,A,I,A,0.326741,3.5,'hydrophobic to hydrophobic' 'buried',1STN_A,I139A,P00644,10,False,1.60971,23,beta_nov16,11.5977
