In [None]:
import src
from src import oic_dwic
from src import ecif
from src import utils

In [None]:
dwic = oic_dwic.InterAtomicContact(
    pathfiles="./test/",
    filename="dwic_fv.csv",
    ligand_format="mol2",
    amino_acid_classes=utils.amino_acid_classes_DWIC,
    cutoff=12.0,
    feature_type="DWIC",
    exp=2,
)

oic = oic_dwic.InterAtomicContact(
    pathfiles="./test/",
    filename="oic_fv.csv",
    ligand_format="mol2",
    amino_acid_classes=utils.amino_acid_classes_OIC,
    cutoff=12.0,
    feature_type="OIC",
    exp=None,
)

ecif = ecif.ECIF(
    pathfiles="./test/", filename="ecif_fv.csv", ligand_format="sdf", cutoff=6.0
)

In [None]:
dwic.generate_features(n_jobs=-1)
oic.generate_features(n_jobs=-1)
ecif.generate_features(n_jobs=-1)

In [105]:
import itertools
import os
import re
from collections import OrderedDict

import numpy as np
import pandas as pd
from Bio.PDB import PDBParser
from scipy.spatial.distance import cdist

from src.script import mol2parser

In [4]:
class ParseProtein():
    def __init__(self, rec_fpath):
        with open(rec_fpath) as f:
            self.lines = [x.strip() for x in f.readlines() if x[:4] in ["ATOM", "HETA"]]
        self.defined_res = ['GLY', 'ALA', 'VAL', 'LEU', 'ILE', 'PRO', 'PHE', 'TYR', 'TRP', 'SER',
               'THR', 'CYS', 'MET', 'ASN', 'GLN', 'ASP', 'GLU', 'LYS', 'ARG', 'HIS', 'OTH']
        #self.rec_ele_list = []
        self.all_res_xyz_list = []
        self.res_list = []
          
    def get_res(self, res):
        if res in self.defined_res:
            return res
        else:
            return "OTH"
        
    def extract_letter(self, ele):
        pattern = re.compile(r'([A-Za-z]+)\d+[+-]')
        match = pattern.match(ele)
        if match:
            letter_part = match.group(1)
            return letter_part
        else:
            return ele
            
    def parse_receptor(self): 
        sym_pool = []
        num = -1      
        _temp_res_xyz = []
        for line in self.lines:
            ele = line.split()[-1]
            ele = self.extract_letter(ele)
            if ele == "H":
                continue
            num += 1
            
            res = line[17:20].strip()
            res = self.get_res(res)
            sym = line[17:27].strip()
            x = float(line[30:38].strip())
            y = float(line[38:46].strip())
            z = float(line[46:54].strip())
            if num == 0:
                self.res_list.append(res)
                sym_pool.append(sym)
                _temp_res_xyz.append([x, y, z])
            else:
                #if res == self.res_list[-1]:
                if sym == sym_pool[-1]:
                    _temp_res_xyz.append([x, y, z])
                else:
                    self.all_res_xyz_list.append(np.array(_temp_res_xyz) * 0.1)
                    _temp_res_xyz = [[x, y, z]]
                    sym_pool.append(sym)
                    self.res_list.append(res)
                    
        self.all_res_xyz_list.append(np.array(_temp_res_xyz) * 0.1)
        return self

class ParseLigand():
    """Read the atoms of a ligand stored in PDB format.

    Only ``ATOM`` and ``HETATM`` records are kept. After :meth:`parse_ligand`
    has run:

    * ``lig_ele_list[i]`` is the element class of atom ``i`` (one of
      ``defined_eles``; F/Cl/Br/I collapse to ``"Hal"``, anything unknown
      becomes ``"DU"``);
    * ``lig_xyz_array`` is the ``(n_atoms, 3)`` coordinate array, multiplied
      by 0.1 (presumably Angstrom -> nm; TODO confirm).

    Hydrogens are kept (``"H"`` is one of the element classes).

    Parameters
    ----------
    lig_fpath : str
        Path to the ligand file in PDB format.
    """

    # Element tokens that carry a charge suffix, e.g. "N1+" or "O1-".
    _CHARGED_ELEMENT_PATTERN = r'([A-Za-z]+)\d+[+-]'

    def __init__(self, lig_fpath):
        with open(lig_fpath) as f:
            # x[:4] == "HETA" keeps HETATM records next to the ATOM records.
            self.lines = [x.strip() for x in f.readlines() if x[:4] in ["ATOM", "HETA"]]
        self.defined_eles = ['H', 'C',  'O', 'N', 'P', 'S', 'Hal', 'DU']
        self.hal_ele = ["F", "Cl", "Br", "I"]
        self.lig_ele_list = []
        self.lig_xyz_array = np.array([])

    def get_ele(self, ele):
        """Map an element symbol to its element class.

        NOTE(review): the comparison is case-sensitive, so an upper-case
        ``"CL"``/``"BR"`` token is classified as ``"DU"`` -- confirm that the
        input files spell halogens as ``Cl``/``Br``.
        """
        if ele in self.defined_eles:
            return ele
        if ele in self.hal_ele:
            return "Hal"
        return "DU"

    def extract_letter(self, ele):
        """Strip a trailing charge annotation (``"N1+"`` -> ``"N"``).

        Tokens that do not match the charged-element pattern are returned
        unchanged.
        """
        match = re.match(self._CHARGED_ELEMENT_PATTERN, ele)
        return match.group(1) if match else ele

    def parse_ligand(self):
        """Fill ``lig_ele_list`` and ``lig_xyz_array`` from the stored records.

        The method is idempotent: both attributes are rebuilt from scratch, so
        the element list and the coordinate array always have the same length
        even when the method is called more than once.

        Returns
        -------
        ParseLigand
            ``self``, to allow call chaining.
        """
        elements = []
        lig_xyz = []
        for line in self.lines:
            lig_xyz.append([float(line[30:38]), float(line[38:46]), float(line[46:54])])
            # The element is taken from the last whitespace-separated token.
            raw_ele = self.extract_letter(line.split()[-1])
            elements.append(self.get_ele(raw_ele))

        self.lig_ele_list = elements
        # reshape keeps an (n, 3) layout even for a ligand without atoms.
        self.lig_xyz_array = np.array(lig_xyz, dtype=float).reshape(-1, 3) * 0.1

        return self
    
class GetFeatures():
    """Count residue/ligand-atom contacts in concentric distance shells.

    Parameters
    ----------
    rec : object
        Parsed receptor exposing ``res_list``, ``all_res_xyz_list`` and
        ``defined_res`` (e.g. a ``ParseProtein`` after ``parse_receptor``).
    lig : object
        Parsed ligand exposing ``lig_ele_list``, ``lig_xyz_array`` and
        ``defined_eles`` (e.g. a ``ParseLigand`` after ``parse_ligand``).
    shell : int
        Number of distance shells.
    """

    def __init__(self, rec, lig, shell):
        self.rec = rec
        self.lig = lig
        self.N_shell = shell
        self.res_atom_pairs = []
        self.res_atom_dist = []

    def _pair_keys(self):
        """Return every ``"<residue>_<element>"`` label, residue-major.

        Built from the parsers' own vocabularies, so the feature layout does
        not depend on a module-level ``keys`` variable being defined.
        """
        return [
            f"{res}_{ele}"
            for res in self.rec.defined_res
            for ele in self.lig.defined_eles
        ]

    def cal_distance(self):
        """Compute the minimum distance of every (residue, ligand atom) pair.

        Stores the pair labels in ``res_atom_pairs`` and the matching
        distances in ``res_atom_dist`` (residue-major order). Both attributes
        are rebuilt on every call, so repeated calls do not accumulate
        entries.

        Returns
        -------
        GetFeatures
            ``self``, to allow call chaining.
        """
        elements = list(self.lig.lig_ele_list)
        lig_xyz = np.asarray(self.lig.lig_xyz_array)
        # Pair elements and coordinates up to the shorter of the two.
        n_atoms = min(len(elements), len(lig_xyz))
        elements, lig_xyz = elements[:n_atoms], lig_xyz[:n_atoms]

        pairs = []
        dists = []
        if n_atoms:
            for res, res_xyz in zip(self.rec.res_list, self.rec.all_res_xyz_list):
                # One cdist call per residue: row i holds the distances from
                # ligand atom i to every atom of the residue.
                nearest = cdist(lig_xyz, res_xyz, metric='euclidean').min(axis=1)
                pairs.extend(f"{res}_{ele}" for ele in elements)
                dists.append(nearest)

        self.res_atom_pairs = pairs
        self.res_atom_dist = np.concatenate(dists) if dists else np.array([])
        return self

    def count_contacts(self):
        """Return the per-shell contact counts as one flat vector.

        Shell boundaries run from 0.1 to ``0.05 * (N_shell + 1)`` in
        ``N_shell`` evenly spaced steps, in the units of the parsed
        coordinates. The first shell counts every pair with a minimum
        distance <= 0.1; each later shell counts the pairs that fall inside
        its boundary but outside the previous one.

        Returns
        -------
        numpy.ndarray
            Integer vector of length ``N_shell * n_keys``: for each shell in
            turn, the number of contacts per ``"<residue>_<element>"`` label.

        Raises
        ------
        KeyError
            If a pair label is not part of the residue/element vocabulary.
        ValueError
            If ``N_shell`` is 0 (nothing to concatenate).
        """
        self.cal_distance()

        outermost = 0.05 * (self.N_shell + 1)
        ncutoffs = np.linspace(0.1, outermost, self.N_shell)

        keys = self._pair_keys()
        column_of = {key: idx for idx, key in enumerate(keys)}
        pair_columns = np.array(
            [column_of[pair] for pair in self.res_atom_pairs], dtype=int
        )

        results = []
        inside_previous = np.zeros(len(pair_columns), dtype=bool)
        for cutoff in ncutoffs:
            inside = self.res_atom_dist <= cutoff
            # Cutoffs grow monotonically, so "inside now but not before" is
            # exactly the difference of the two cumulative contact masks.
            in_shell = inside & ~inside_previous
            results.append(np.bincount(pair_columns[in_shell], minlength=len(keys)))
            inside_previous = inside

        return np.concatenate(results, axis=0)
    
def generate_features(rec_fpath, lig_fpath, shells):
    """Compute the shell-wise contact features of one receptor/ligand pair.

    Parameters
    ----------
    rec_fpath : str
        Path handed to ``ParseProtein``.
    lig_fpath : str
        Path handed to ``ParseLigand``.
    shells : int
        Number of distance shells handed to ``GetFeatures``.

    Returns
    -------
    Whatever ``GetFeatures.count_contacts`` returns for the parsed pair.
    """
    receptor = ParseProtein(rec_fpath)
    receptor.parse_receptor()

    ligand = ParseLigand(lig_fpath)
    ligand.parse_ligand()

    return GetFeatures(receptor, ligand, shells).count_contacts()

In [141]:
defined_eles = ['H', 'C',  'O', 'N', 'P', 'S', 'Hal', 'DU']
hal_ele = ["F", "Cl", "Br", "I"]


def _ligand_element_class(atom_name: str) -> str:
    """Map a mol2 atom name (e.g. ``"CL1"``) to one of ``defined_eles``."""
    # Leading alphabetic part of the name: "CL1" -> "CL", "C12" -> "C".
    letters = ""
    for char in atom_name:
        if not char.isalpha():
            break
        letters += char

    # Two-letter halogens have to be recognised before looking at the first
    # character alone; otherwise "Cl1" is read as carbon and "Br1" as boron.
    if len(letters) >= 2 and letters[:2].capitalize() in hal_ele:
        return "Hal"

    first = atom_name[:1]
    if first in hal_ele:
        return "Hal"
    if first in ['H', 'C',  'O', 'N', 'P', 'S']:
        return first
    return "DU"


def loadmol2(ligand_file: str) -> tuple:
    """Load the element classes and coordinates of a mol2 ligand.

    Parameters
    ----------
    ligand_file : str
        Path passed to ``mol2parser.Mol2Parser``.

    Returns
    -------
    tuple
        ``(ligand_element_list, ligand_coords_list)``: one element class per
        atom (see ``defined_eles``) and a ``float32`` array built from the
        parser's ``"coords"`` values.

    Notes
    -----
    The element is inferred from the atom *name*, since that is the field
    this function reads from ``molecule_info``. Assumes
    ``molecule_info["atom_name"]`` and ``molecule_info["coords"]`` are
    dict-like and ordered consistently -- TODO confirm against Mol2Parser.
    """
    ligand = mol2parser.Mol2Parser(ligand_file)
    ligand.parse()

    ligand_element_list = [
        _ligand_element_class(atom_name)
        for atom_name in ligand.molecule_info["atom_name"].values()
    ]

    ligand_coords_list = np.array(list(ligand.molecule_info["coords"].values())).astype(np.float32)

    return (ligand_element_list, ligand_coords_list)

In [92]:
defined_residues = ['GLY', 'ALA', 'VAL', 'LEU', 'ILE', 'PRO', 'PHE', 'TYR', 'TRP', 'SER',
               'THR', 'CYS', 'MET', 'ASN', 'GLN', 'ASP', 'GLU', 'LYS', 'ARG', 'HIS', 'OTH']


def loadpdb(protein_file: str) -> tuple:
    """Load residue names and heavy-atom coordinates from a PDB file.

    Parameters
    ----------
    protein_file : str
        Path passed to ``PDBParser.get_structure``.

    Returns
    -------
    tuple
        ``(residue_list, all_residue_coords_list)``. ``residue_list[i]`` is
        the residue name, replaced by ``"OTH"`` when it is not listed in
        ``defined_residues``; ``all_residue_coords_list[i]`` is an array with
        the coordinates of that residue's atoms whose element is not ``"H"``.
    """
    structure = PDBParser(PERMISSIVE=True, QUIET=True).get_structure("", protein_file)

    names = []
    coords_per_residue = []
    for res in structure.get_residues():
        resname = res.get_resname()
        names.append(resname if resname in defined_residues else "OTH")

        # Hydrogens are dropped; a residue without heavy atoms yields an
        # empty array.
        heavy_xyz = [list(atm.get_coord()) for atm in res.get_atoms() if atm.element != "H"]
        coords_per_residue.append(np.array(heavy_xyz))

    return (names, coords_per_residue)


In [145]:
# Load the 1a30 sample complex with the Biopython / mol2 based loaders.
# NOTE(review): both paths are absolute paths on one developer's machine, so
# this cell only runs there -- consider a configurable samples directory.
res_list, all_res_xyz_list = loadpdb(r"C:\Users\Taniyama\Documents\GitHub\OnionNet-2-master\samples\1a30\1a30_protein.pdb")
lig_ele_list, lig_xyz_array = loadmol2(r"C:\Users\Taniyama\Documents\GitHub\OnionNet-2-master\samples\1a30\1a30_ligand.mol2")

In [5]:
# Features for the single 1a30 sample complex.
# The receptor path must point at the protein file and the ligand path at the
# ligand file; they are passed to generate_features in (receptor, ligand) order.
# NOTE(review): absolute, machine-specific paths -- consider a configurable
# samples directory.
rec_fpath = r"C:\Users\Taniyama\Documents\GitHub\OnionNet-2-master\samples\1a30\1a30_protein.pdb"
lig_fpath = r"C:\Users\Taniyama\Documents\GitHub\OnionNet-2-master\samples\1a30\1a30_ligand.pdb"

# Feature vocabulary: every "<residue>_<element>" combination, residue-major.
lig_defined_ele = ['H', 'C',  'O', 'N', 'P', 'S', 'Hal', 'DU']
rec_defined_res = ['GLY', 'ALA', 'VAL', 'LEU', 'ILE', 'PRO', 'PHE', 'TYR', 'TRP', 'SER',
               'THR', 'CYS', 'MET', 'ASN', 'GLN', 'ASP', 'GLU', 'LYS', 'ARG', 'HIS', 'OTH']
keys = ["_".join(x) for x in itertools.product(rec_defined_res, lig_defined_ele)]

# 62 distance shells.
X_feat = generate_features(rec_fpath, lig_fpath, 62)

In [24]:
# Features for several sample complexes; each lives in <path>/<pdbid>/.
# NOTE(review): `path` is an absolute, machine-specific location -- consider a
# configurable samples directory.
path = r"C:\Users\Taniyama\Documents\GitHub\OnionNet-2-master\samples"
pdbids = ['1a30', '1bcu', '1bzc', '1c5z', '1e66']

# Feature vocabulary: every "<residue>_<element>" combination, residue-major.
lig_defined_ele = ['H', 'C',  'O', 'N', 'P', 'S', 'Hal', 'DU']
rec_defined_res = ['GLY', 'ALA', 'VAL', 'LEU', 'ILE', 'PRO', 'PHE', 'TYR', 'TRP', 'SER',
               'THR', 'CYS', 'MET', 'ASN', 'GLN', 'ASP', 'GLU', 'LYS', 'ARG', 'HIS', 'OTH']
keys = ["_".join(x) for x in itertools.product(rec_defined_res, lig_defined_ele)]

output = {}
for pdbid in pdbids:

    # Let os.path.join insert the separators: a literal "\" inside the
    # f-string is an invalid escape sequence ("\{") and is Windows-only.
    lig_fpath = os.path.join(path, pdbid, f"{pdbid}_ligand.pdb")
    rec_fpath = os.path.join(path, pdbid, f"{pdbid}_protein.pdb")
    X_feat = generate_features(rec_fpath, lig_fpath, 62)
    output[pdbid] = X_feat

In [27]:
# One row per complex, one column per feature.
# NOTE(review): index=False drops the pdbid row labels from the CSV, so rows
# can only be matched to complexes by their order -- confirm this is intended.
pd.DataFrame(output).T.to_csv("onionet2_feat_samples.csv", index=False)