<a href="https://colab.research.google.com/github/nataliyah123/ibia-Ersilia/blob/main/checkingersiliadescriptor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture

# Install RDKit and Mordred, which the import cell below relies on. The
# %%capture magic at the top of this cell suppresses pip's output.
!pip install rdkit-pypi
# Disabled: cloning the lazy-qsar repository is not needed by the cells below.
# !git clone https://github.com/ersilia-os/lazy-qsar.git
!pip install mordred

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import VarianceThreshold
from rdkit import Chem
from mordred import Calculator, descriptors
import joblib

In [None]:

# Fraction from which NanFilter.fit derives its per-column NaN-count cutoff.
MAX_NA = 0.2


class NanFilter(object):
    """Column selector that drops features with too many missing values.

    ``fit`` stores the indices of the columns to keep in ``self.col_idxs``;
    ``transform`` slices a 2-D array down to those columns.
    """

    def __init__(self):
        self._name = "nan_filter"

    def fit(self, X):
        """Decide which columns of the 2-D array ``X`` are kept.

        A column is kept when its number of NaN entries does not exceed
        ``int((1 - MAX_NA) * n_rows)``. The kept column indices are stored,
        in ascending order, as a list in ``self.col_idxs``.
        """
        # NOTE(review): with MAX_NA = 0.2 the cutoff is 80% of the rows, i.e. a
        # column survives with up to 80% NaNs. The constant's name suggests the
        # intent may have been "at most 20% NaNs" -- confirm before changing.
        cutoff = int((1 - MAX_NA) * X.shape[0])
        nan_counts = [np.sum(np.isnan(X[:, col])) for col in range(X.shape[1])]
        self.col_idxs = [
            col for col, count in enumerate(nan_counts) if not count > cutoff
        ]

    def transform(self, X):
        """Return only the columns of ``X`` selected by ``fit``."""
        kept = self.col_idxs
        return X[:, kept]

    def save(self, file_name):
        """Serialize this instance to ``file_name`` with joblib."""
        joblib.dump(self, file_name)

    def load(self, file_name):
        """Return the object joblib deserializes from ``file_name``.

        The current instance is not modified. joblib relies on pickle, so only
        load files that come from a trusted source.
        """
        return joblib.load(file_name)


class Scaler(object):
    """Optional robust scaling step whose output is clipped to a fixed range.

    Wraps scikit-learn's ``RobustScaler``. After ``set_skip`` has been called,
    both ``fit`` and ``transform`` become no-ops.
    """

    def __init__(self):
        self._name = "scaler"
        # Scaled values are clipped to [-abs_limit, abs_limit] in transform.
        self.abs_limit = 10
        # When True, fit does nothing and transform returns its input as is.
        self.skip = False

    def set_skip(self):
        """Disable scaling for every later ``fit`` / ``transform`` call."""
        self.skip = True

    def fit(self, X):
        """Fit a fresh ``RobustScaler`` on ``X`` unless scaling is skipped."""
        if not self.skip:
            self.scaler = RobustScaler()
            self.scaler.fit(X)

    def transform(self, X):
        """Scale ``X`` with the fitted scaler and clip the result.

        Returns ``X`` itself, untouched, when scaling is skipped.
        """
        if self.skip:
            return X
        limit = self.abs_limit
        return np.clip(self.scaler.transform(X), -limit, limit)

    def save(self, file_name):
        """Serialize this instance to ``file_name`` with joblib."""
        joblib.dump(self, file_name)

    def load(self, file_name):
        """Return the object joblib deserializes from ``file_name``.

        The current instance is not modified. joblib relies on pickle, so only
        load files that come from a trusted source.
        """
        return joblib.load(file_name)


class Imputer(object):
    """Median imputer for 2-D float arrays.

    ``fit`` learns one replacement value per column; ``transform`` returns a
    copy of its input in which every NaN is replaced by that column's value.
    """

    def __init__(self):
        self._name = "imputer"
        # Replacement used for columns that contain no observed (non-NaN) value.
        self._fallback = 0

    def fit(self, X):
        """Learn the per-column replacement values from ``X``.

        For each column the median of its non-NaN entries is stored; a column
        made only of NaNs gets ``self._fallback``. The values are stored as an
        array in ``self.impute_values``. ``X`` itself is not modified.
        """
        medians = []
        for j in range(X.shape[1]):
            column = X[:, j]
            observed = column[~np.isnan(column)]
            medians.append(np.median(observed) if len(observed) else self._fallback)
        self.impute_values = np.array(medians)

    def transform(self, X):
        """Return a copy of ``X`` with NaNs replaced by the fitted values.

        The caller's array is left untouched: imputing in place would silently
        overwrite the raw data of anyone still holding a reference to ``X``.
        The result has the same shape and dtype as ``X``.
        """
        # Work on a private copy so the input is never mutated.
        X = np.array(X, copy=True)
        for j in range(X.shape[1]):
            missing = np.isnan(X[:, j])
            X[missing, j] = self.impute_values[j]
        return X

    def save(self, file_name):
        """Serialize this instance to ``file_name`` with joblib."""
        joblib.dump(self, file_name)

    def load(self, file_name):
        """Return the object joblib deserializes from ``file_name``.

        The current instance is not modified. joblib relies on pickle, so only
        load files that come from a trusted source.
        """
        return joblib.load(file_name)


class VarianceFilter(object):
    """Feature selector built on scikit-learn's ``VarianceThreshold``.

    The selector is created with its default settings; ``fit`` additionally
    records which input column positions survive the selection.
    """

    def __init__(self):
        self._name = "variance_filter"

    def fit(self, X):
        """Fit the selector on ``X`` and store the surviving column positions.

        The positions are obtained by passing a single row holding
        ``0 .. n_columns - 1`` through the fitted selector, and are stored as a
        flat array in ``self.col_idxs``.
        """
        self.sel = VarianceThreshold()
        self.sel.fit(X)
        positions = [list(range(X.shape[1]))]
        self.col_idxs = self.sel.transform(positions).ravel()

    def transform(self, X):
        """Return ``X`` reduced to the columns selected during ``fit``."""
        selector = self.sel
        return selector.transform(X)

    def save(self, file_name):
        """Serialize this instance to ``file_name`` with joblib."""
        joblib.dump(self, file_name)

    def load(self, file_name):
        """Return the object joblib deserializes from ``file_name``.

        The current instance is not modified. joblib relies on pickle, so only
        load files that come from a trusted source.
        """
        return joblib.load(file_name)


# MORDRED DESCRIPTORS

def mordred_featurizer(smiles):
    """Compute Mordred descriptors (3D descriptors ignored) for SMILES strings.

    Args:
        smiles: iterable of SMILES strings.

    Returns:
        The DataFrame produced by ``Calculator.pandas`` for the parsed
        molecules, in the same order as ``smiles``.

    Raises:
        ValueError: if RDKit cannot parse one or more of the SMILES strings.
    """
    smiles = list(smiles)
    mols = [Chem.MolFromSmiles(smi) for smi in smiles]

    # Chem.MolFromSmiles signals a parse failure by returning None. Report the
    # offending entries up front instead of passing None on to Mordred.
    invalid = [(i, smi) for i, (smi, mol) in enumerate(zip(smiles, mols)) if mol is None]
    if invalid:
        preview = ", ".join("#{}: {!r}".format(i, smi) for i, smi in invalid[:5])
        raise ValueError(
            "{} SMILES could not be parsed by RDKit (first entries: {})".format(
                len(invalid), preview
            )
        )

    calc = Calculator(descriptors, ignore_3D=True)
    return calc.pandas(mols)


In [None]:
class MordredDescriptor(object):
    """Mordred featurization followed by a four-stage preprocessing pipeline.

    The stages are, in order: NaN-based column filtering, imputation,
    variance-based column filtering and scaling.
    """

    def __init__(self):
        # Pipeline stages, created in the order in which fit applies them.
        self.nan_filter = NanFilter()
        self.imputer = Imputer()
        self.variance_filter = VarianceFilter()
        self.scaler = Scaler()

    def fit(self, smiles):
        """Fit every stage on the descriptors of ``smiles``.

        Args:
            smiles: SMILES strings passed on to ``mordred_featurizer``.

        Returns:
            A DataFrame with the fully transformed descriptor matrix; its
            columns are the names of the surviving descriptors, which are also
            stored in ``self.features``.
        """
        df = mordred_featurizer(smiles)
        print('initial df straight from mordred_featurize', df.shape,df.size)
        X = np.array(df, dtype=np.float32)

        # Each stage is fitted on the output of the previous one.
        pipeline = (self.nan_filter, self.imputer, self.variance_filter, self.scaler)
        for stage in pipeline:
            stage.fit(X)
            X = stage.transform(X)

        # Only the two filters drop columns. The variance filter's indices refer
        # to the columns left after the NaN filter, so apply them in sequence.
        names = list(df.columns)
        names = [names[i] for i in self.nan_filter.col_idxs]
        names = [names[i] for i in self.variance_filter.col_idxs]
        self.features = names
        return pd.DataFrame(X, columns=self.features)

In [None]:
# Load the SMILES file, labelling its column 'col_smiles'.
# NOTE(review): assumes the file holds one SMILES per line -- confirm.
smilescsv = pd.read_csv('dilismiles.csv', names = ['col_smiles'])
# Plain Python list of SMILES strings, later passed to MordredDescriptor.fit.
smileslist = smilescsv['col_smiles'].tolist()

In [None]:
# Load the table that holds the labels; only its 'Outcome' column is used here.
data = pd.read_csv('dili_padel_2d.csv')
# read_csv already returns a DataFrame; this wraps it in a second one.
dataframe = pd.DataFrame(data)
# Label column.
# NOTE(review): presumably row-aligned with the SMILES in dilismiles.csv -- confirm.
y = dataframe['Outcome']

  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
# Print the complete SMILES list and the complete label list for inspection.
print(smileslist)
print(y.tolist())

['[H][C@]1([S-])O[C@]([H])(CO)[C@@]([H])(O)[C@]([H])(O)[C@@]1([H])O', 'CN(CCCCCCCCCCN(C)C(=O)Oc1cccc(c1)[N+](C)(C)C)C(=O)Oc1cccc(c1)[N+](C)(C)C', '[H][C@]12Cc3ccc(OC)cc3[C@@]3(CCCC[C@]13[H])CCN2C', '[H][C@@]12CCCN1C(=O)[C@]([H])(Cc1ccccc1)N1C(=O)[C@](C)(NC(=O)[C@@]3([H])CN(C)[C@]4([H])Cc5c[nH]c6cccc(C4=C3)c56)O[C@@]21O', '[H][C@]1(C)C[C@@]2([H])[C@]3([H])CCC4=CC(=O)C=C[C@]4(C)[C@@]3(Cl)[C@@]([H])(O)C[C@]2(C)[C@@]1(O)C(=O)CO', '[H][C@@](C)(CCC([O-])=O)[C@@]1([H])CC[C@@]2([H])[C@]3([H])CC[C@]4([H])C[C@]([H])(O)CC[C@]4(C)[C@@]3([H])CC[C@]12C', '[H][C@@](C)(CCC(=O)NCCS(O)(=O)=O)[C@@]1([H])CCC2C3C(CC[C@]12C)[C@@]1(C)CC[C@@]([H])(O)C[C@@]1([H])C[C@]3([H])O', '[H][C@@]1(C)C[C@@]2([H])[C@]3([H])CCC4=CC(=O)C=C[C@]4(C)[C@@]3(Cl)[C@@]([H])(O)C[C@]2(C)[C@@]1(OC(=O)c1ccco1)C(=O)CCl', '[H][C@@]12Oc3c4c(C[C@@]5([H])N(CC6CC6)CC[C@@]14[C@@]5(O)CCC2=C)ccc3O', '[H][C@@]12N3CC[C@@]11c4cc(c(OC)cc4N(C=O)[C@@]1([H])[C@@](O)(C(=O)OC)[C@]([H])(OC(C)=O)[C@]2(CC)C=CC3)[C@]1(C[C@]2([H])CN(C[C@](O)(CC)C2)CCc2c1[nH

In [None]:
# The paper reported 1447, but Mordred gives 1613 columns initially and 1340
# remain after the transformations.
# Fit the descriptor pipeline on all SMILES. As the last expression of the
# cell, the DataFrame returned by fit is shown as the cell output.
mod = MordredDescriptor()
mod.fit(smileslist)

  8%|▊         | 46/588 [00:46<11:02,  1.22s/it]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 588/588 [06:02<00:00,  1.62it/s]


initial df straight from mordred_featurize (588, 1613) 948444


Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,-0.680344,-0.663439,1.0,0.0,-0.686454,-0.295165,-0.148878,-0.686454,-0.377390,-0.849714,...,-0.507929,-0.771964,-0.556026,0.453941,-0.439500,-0.428571,-0.600,-0.581818,-0.337505,-0.664589
1,1.286134,1.184398,0.0,2.0,1.215080,-0.489457,-0.341646,1.215080,-0.654050,0.887731,...,0.454385,0.570831,1.382810,-0.752019,4.139907,0.857143,1.100,0.935065,1.978200,1.500000
2,-0.003904,-0.135041,0.0,1.0,0.023104,1.046618,1.182374,0.023104,1.462141,0.013416,...,0.557476,-0.288514,-0.147402,-0.762570,-0.148633,0.357143,0.125,0.228571,-0.471538,-0.105985
3,1.769303,1.781803,0.0,1.0,1.710204,1.398751,1.088610,1.710204,0.966049,1.138820,...,1.290641,1.487008,1.516224,-0.055640,3.056137,2.035714,1.900,2.098701,1.196609,1.524938
4,0.571394,0.752067,0.0,0.0,0.480721,1.714119,1.687515,0.480721,0.088347,0.493219,...,1.152815,0.717455,0.587522,-0.200052,0.426120,1.392857,0.775,1.007792,1.051272,0.427681
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
583,-0.667682,-0.631352,0.0,0.0,-0.692527,-0.083155,0.061468,-0.692527,-0.499008,-0.838657,...,-0.357451,-0.756650,-0.636344,-0.030577,-0.444154,-0.357143,-0.575,-0.540260,-0.195398,-0.704489
584,-1.150096,-1.294924,0.0,0.0,-1.129620,-2.882626,-2.716033,-1.129620,-1.411727,-2.002422,...,-2.682226,-1.300148,-0.937134,0.641166,-0.527341,-0.964286,-1.075,-1.090909,-0.905935,-1.063591
585,0.725721,0.711551,0.0,2.0,0.807997,0.860304,0.997522,0.807997,0.398452,0.596091,...,0.582330,0.233092,0.780838,-0.079816,1.197499,0.857143,0.675,0.685714,0.838111,1.011222
586,-0.498231,-0.331637,0.0,0.0,-0.449826,-1.265393,-1.111491,-0.449826,-1.002144,-0.522758,...,-0.864720,-0.621121,-0.014034,0.326958,-0.253345,-0.357143,-0.500,-0.540260,0.101736,-0.175811
