# OpenEye Structural Featurizer

In [1]:
from importlib import resources
import inspect

from appdirs import user_cache_dir

from kinoml.core.ligands import Ligand
from kinoml.core.proteins import Protein
from kinoml.core.systems import ProteinSystem, ProteinLigandComplex
from kinoml.features.protein import OEProteinStructureFeaturizer
from kinoml.features.complexes import OEComplexFeaturizer, OEDockingFeaturizer

  setattr(self, word, getattr(machar, word).flat[0])
  return self._float_to_str(self.smallest_subnormal)
  setattr(self, word, getattr(machar, word).flat[0])
  return self._float_to_str(self.smallest_subnormal)
  class NCDFPicklable(scipy.io.netcdf.netcdf_file):


## OEProteinStructureFeaturizer

In [2]:
# Show the class docstring of OEProteinStructureFeaturizer; `inspect.getdoc`
# returns it with the indentation cleaned up, and `print` renders the newlines.
print(inspect.getdoc(OEProteinStructureFeaturizer))

Given systems with exactly one protein, prepare the protein structure by:

 - modeling missing loops
 - building missing side chains
 - mutations, if `uniprot_id` or `sequence` attribute is provided for the protein component
   (see below)
 - removing everything but protein and water
 - protonation at pH 7.4

The protein component of each system must be a `core.proteins.Protein` or a subclass thereof
and must be initialized with toolkit='OpenEye'.

Additionally, the protein component can have the following optional attributes to customize
the protein modeling:

 - `name`: A string specifying the name of the protein, will be used for generating the
   output file name.
 - `chain_id`: A string specifying which chain should be used.
 - `alternate_location`: A string specifying which alternate location should be used.
 - `expo_id`: A string specifying a ligand bound to the protein of interest. This is especially
   useful if multiple proteins are found in one PDB structure.
 - `uniprot_id`

In [3]:
# First system: the protein is described only by its PDB ID and a name.
protein = Protein(pdb_id="4f8o", name="PsaA")
system = ProteinSystem(components=[protein])
# Start the collection of systems that the following cells extend.
systems = [system]

In [4]:
# Second system: same PDB entry, now with the optional attributes set that the
# featurizer docstring above lists for customizing the protein modeling.
protein = Protein.from_pdb(pdb_id="4f8o", name="PsaA")
protein.uniprot_id = "P31522"
protein.chain_id = "A"  # which chain should be used
protein.alternate_location = "B"  # which alternate location should be used
protein.expo_id = "AES"  # ligand bound to the protein of interest
system = ProteinSystem(components=[protein])
# Appends to the `systems` list created in the previous cell.
systems.append(system)

In [5]:
# Third system: the protein is read from a PDB file shipped with kinoml.
# `resources.path` only guarantees that the returned path exists while the
# context manager is open (it may point to a temporary file that is removed on
# exit), so `Protein.from_file` is called inside the `with` block.
# NOTE(review): this assumes `Protein.from_file` reads the file eagerly - confirm.
with resources.path("kinoml.data.proteins", "4f8o_edit.pdb") as structure_path:
    protein = Protein.from_file(file_path=structure_path, name="PsaA")
protein.uniprot_id = "P31522"
system = ProteinSystem(components=[protein])
# Appends to the `systems` list built up by the previous cells.
systems.append(system)

In [6]:
# Locate the loop database shipped with kinoml and configure the featurizer.
# NOTE(review): `loop_db` is used after the `with` block has exited, but
# `resources.path` only guarantees the path while the context is open. This
# presumably works because the package data are regular files on disk - confirm.
with resources.path("kinoml.data.proteins", "kinoml_tests_4f8o_spruce.loop_db") as loop_db:
    pass
featurizer = OEProteinStructureFeaturizer(
    loop_db=loop_db,
    # output goes to a "protein" subdirectory of the user cache directory
    output_dir=user_cache_dir() + "/protein",
)

In [7]:
# Featurize all collected systems. `systems` is rebound to the return value,
# so re-running this cell passes the already featurized systems in again.
systems = featurizer.featurize(systems)
# Display the returned systems as the cell output.
systems



[<ProteinSystem with 1 components (<Protein name=PsaA>)>,
 <ProteinSystem with 1 components (<Protein name=PsaA>)>,
 <ProteinSystem with 1 components (<Protein name=PsaA>)>]

In [8]:
# Display the first system returned by the featurizer.
systems[0]

<ProteinSystem with 1 components (<Protein name=PsaA>)>

In [9]:
# Look up the entry stored under the "last" key of the system's featurizations
# (shown as a Universe object in the recorded output below).
systems[0].featurizations["last"]

<Universe with 2381 atoms>

In [10]:
# Expected values, one entry per system in the order the systems were added.
expected_residue_counts = (239, 216, 109)
expected_first_resids = (1, 44, 47)

# check number of residues
for index, n_residues in enumerate(expected_residue_counts):
    assert len(systems[index].featurizations["last"].residues) == n_residues

# check numbering of first residue
for index, first_resid in enumerate(expected_first_resids):
    assert systems[index].featurizations["last"].residues[0].resid == first_resid

## OEComplexFeaturizer

In [11]:
# First complex: protein given by PDB ID, ligand given by name only.
protein = Protein(pdb_id="4f8o", name="PsaA")
ligand = Ligand(name="AEBSF")
system = ProteinLigandComplex(components=[protein, ligand])
# Start a fresh collection of systems for the complex featurizer.
systems = [system]

In [12]:
# Second complex: same PDB entry and ligand, with the optional protein
# attributes (UniProt ID, chain, alternate location, bound ligand) set.
protein = Protein.from_pdb(pdb_id="4f8o", name="PsaA")
protein.uniprot_id = "P31522"
protein.chain_id = "A"
protein.alternate_location = "B"
protein.expo_id = "AES"
ligand = Ligand(name="AEBSF")
system = ProteinLigandComplex(components=[protein, ligand])
# Appends to the `systems` list created in the previous cell.
systems.append(system)

In [13]:
# Configure the complex featurizer; output goes to a "complex" subdirectory
# of the user cache directory.
featurizer = OEComplexFeaturizer(output_dir=user_cache_dir() + "/complex")

In [14]:
# Featurize both complexes. `systems` is rebound to the return value, so
# re-running this cell passes the already featurized systems in again.
systems = featurizer.featurize(systems)
# Display the returned systems as the cell output.
systems



[<ProteinLigandComplex with 2 components (<Protein name=PsaA>, <Ligand name=AEBSF>)>,
 <ProteinLigandComplex with 2 components (<Protein name=PsaA>, <Ligand name=AEBSF>)>]

## OEDockingFeaturizer

### Fred

In [15]:
# First docking system: the protein carries an `expo_id`, the ligand to dock
# is given as SMILES.
protein = Protein(pdb_id="4yne", name="NTRK1")
protein.expo_id = "4EK"
ligand = Ligand(
    smiles="C1CC(N(C1)C2=NC3=C(C=NN3C=C2)NC(=O)N4CCC(C4)O)C5=C(C=CC(=C5)F)F",
    name="larotrectinib",
)
system = ProteinLigandComplex(components=[protein, ligand])
# Start a fresh collection of systems for the Fred docking run.
systems = [system]

In [16]:
# Second docking system: the pocket is given as explicit residue IDs; no
# `expo_id` is set on this protein.
protein = Protein(pdb_id="4yne", name="NTRK1")
protein.pocket_resids = [
    516, 517, 521, 524,
    542, 544, 573,
    589, 590, 591, 592, 595, 596,
    654, 655, 656, 657, 667, 668,
]
ligand = Ligand(
    smiles="C1CC(N(C1)C2=NC3=C(C=NN3C=C2)NC(=O)N4CCC(C4)O)C5=C(C=CC(=C5)F)F",
    name="larotrectinib_2",
)
system = ProteinLigandComplex(components=[protein, ligand])
# Appends to the `systems` list created in the previous cell.
systems.append(system)

In [17]:
# Docking featurizer using the "Fred" method; output goes to a "Fred"
# subdirectory of the user cache directory.
featurizer = OEDockingFeaturizer(output_dir=user_cache_dir() + "/Fred", method="Fred")

In [18]:
# Run the docking featurizer. `systems` is rebound to the return value, so
# re-running this cell passes the already featurized systems in again.
# NOTE(review): the recorded output below lists chiral atoms as "problematic",
# presumably because the ligand SMILES carries no stereo annotations - confirm.
systems = featurizer.featurize(systems)
# Display the returned systems as the cell output.
systems

Problematic atoms are:
Atom atomic num: 6, name: , idx: 2, aromatic: False, chiral: True with bonds:
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 1, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 7, name: , idx: 3, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 23, aromatic: True, chiral: False
bond order: 1, chiral: False to atom atomic num: 1, name: , idx: 35, aromatic: False, chiral: False
Atom atomic num: 6, name: , idx: 20, aromatic: False, chiral: True with bonds:
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 19, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 21, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 8, name: , idx: 22, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 1, name: , idx: 46, aromatic: False, chiral: False

Problematic atoms are:

[<ProteinLigandComplex with 2 components (<Protein name=NTRK1>, <Ligand name=larotrectinib>)>,
 <ProteinLigandComplex with 2 components (<Protein name=NTRK1>, <Ligand name=larotrectinib_2>)>]

### Hybrid

In [19]:
# Docking system for the Hybrid run: the protein carries an `expo_id`, the
# ligand to dock is given as SMILES.
protein = Protein(pdb_id="4yne", name="NTRK1")
protein.expo_id = "4EK"
ligand = Ligand(
    smiles="C1CC(N(C1)C2=NC3=C(C=NN3C=C2)NC(=O)N4CCC(C4)O)C5=C(C=CC(=C5)F)F",
    name="larotrectinib",
)
system = ProteinLigandComplex(components=[protein, ligand])
# Start a fresh collection of systems for the Hybrid docking run.
systems = [system]

In [20]:
# Docking featurizer using the "Hybrid" method; output goes to a "Hybrid"
# subdirectory of the user cache directory.
featurizer = OEDockingFeaturizer(output_dir=user_cache_dir() + "/Hybrid", method="Hybrid")

In [21]:
# Run the Hybrid docking featurizer. `systems` is rebound to the return value,
# so re-running this cell passes the already featurized systems in again.
systems = featurizer.featurize(systems)
# Display the returned systems as the cell output.
systems

Problematic atoms are:
Atom atomic num: 6, name: , idx: 2, aromatic: False, chiral: True with bonds:
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 1, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 7, name: , idx: 3, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 23, aromatic: True, chiral: False
bond order: 1, chiral: False to atom atomic num: 1, name: , idx: 35, aromatic: False, chiral: False
Atom atomic num: 6, name: , idx: 20, aromatic: False, chiral: True with bonds:
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 19, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 21, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 8, name: , idx: 22, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 1, name: , idx: 46, aromatic: False, chiral: False



[<ProteinLigandComplex with 2 components (<Protein name=NTRK1>, <Ligand name=larotrectinib>)>]

### Posit

In [22]:
# Docking system for the Posit run: the protein carries an `expo_id`, the
# ligand to dock is given as SMILES.
protein = Protein(pdb_id="4yne", name="NTRK1")
protein.expo_id = "4EK"
ligand = Ligand(
    smiles="C1CC(N(C1)C2=NC3=C(C=NN3C=C2)NC(=O)N4CCC(C4)O)C5=C(C=CC(=C5)F)F",
    name="larotrectinib",
)
system = ProteinLigandComplex(components=[protein, ligand])
# Start a fresh collection of systems for the Posit docking run.
systems = [system]

In [23]:
# Docking featurizer using the "Posit" method; output goes to a "Posit"
# subdirectory of the user cache directory.
featurizer = OEDockingFeaturizer(output_dir=user_cache_dir() + "/Posit", method="Posit")

In [24]:
# Run the Posit docking featurizer. `systems` is rebound to the return value,
# so re-running this cell passes the already featurized systems in again.
systems = featurizer.featurize(systems)
# Display the returned systems as the cell output.
systems

Problematic atoms are:
Atom atomic num: 6, name: , idx: 2, aromatic: False, chiral: True with bonds:
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 1, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 7, name: , idx: 3, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 23, aromatic: True, chiral: False
bond order: 1, chiral: False to atom atomic num: 1, name: , idx: 35, aromatic: False, chiral: False
Atom atomic num: 6, name: , idx: 20, aromatic: False, chiral: True with bonds:
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 19, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 6, name: , idx: 21, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 8, name: , idx: 22, aromatic: False, chiral: False
bond order: 1, chiral: False to atom atomic num: 1, name: , idx: 46, aromatic: False, chiral: False



[<ProteinLigandComplex with 2 components (<Protein name=NTRK1>, <Ligand name=larotrectinib>)>]