### Imports

In [7]:
import pickle # to open RDKit molecules' lists

import numpy as np # to store data in numpy arrays for future use in ML algorithms
import pandas as pd # to work with Mordred descriptors dataframes

from rdkit import Chem
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw
IPythonConsole.ipython_useSVG=True # all of the above – to draw molecules in case I need it
from rdkit.Chem import Descriptors # to get the names of all the RDKit descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors # the names of all desc-s get passed here

from mordred import Calculator, descriptors # this calculates other descriptors

### Loading data

In [2]:
# Load the pickled lists of RDKit molecules (actives first, then inactives).
# NOTE(review): pickle.load can execute arbitrary code – only load trusted files.
with open("./data/0_all_active_all_inactive/python_lists/all_active.pickle", "rb") as active_pickle:
    all_active_list = pickle.load(active_pickle)

with open("./data/0_all_active_all_inactive/python_lists/all_inactive.pickle", "rb") as inactive_pickle:
    all_inactive_list = pickle.load(inactive_pickle)

### Calculating RDKit descriptors for all active compounds

In [3]:
# Keep only the first element (the descriptor name) of every entry in RDKit's descriptor list.
names_of_all_rdkit_descriptors = [
    descriptor_entry[0] for descriptor_entry in Descriptors._descList
]

In [4]:
# A single calculator, built from the full list of descriptor names, is reused for every molecule set.
rdkit_descriptors_calculator = MoleculeDescriptors.MolecularDescriptorCalculator(
    names_of_all_rdkit_descriptors
)

In [5]:
# One row of RDKit descriptor values per active molecule, stacked into a numpy array.
rdkit_descriptors_for_all_active_molecules = np.array(
    [rdkit_descriptors_calculator.CalcDescriptors(mol) for mol in all_active_list]
)

In [7]:
# Sanity check: display the array shape (rows = molecules, columns = descriptor values).
rdkit_descriptors_for_all_active_molecules.shape

(366, 200)

In [8]:
# Persist the active-molecule descriptor array as a pickle.
with open("./data/0_all_active_all_inactive/all_active_descriptors.pickle", "wb") as active_descriptors_file:
    pickle.dump(rdkit_descriptors_for_all_active_molecules, active_descriptors_file)

### Calculating RDKit descriptors for all inactive compounds

In [9]:
# One row of RDKit descriptor values per inactive molecule, stacked into a numpy array.
rdkit_descriptors_for_all_inactive_molecules = np.array(
    [rdkit_descriptors_calculator.CalcDescriptors(mol) for mol in all_inactive_list]
)

In [10]:
# Sanity check: display the array shape (rows = molecules, columns = descriptor values).
rdkit_descriptors_for_all_inactive_molecules.shape

(59422, 200)

In [11]:
# Persist the inactive-molecule descriptor array as a pickle.
with open("./data/0_all_active_all_inactive/all_inactive_descriptors.pickle", "wb") as inactive_descriptors_file:
    pickle.dump(rdkit_descriptors_for_all_inactive_molecules, inactive_descriptors_file)

### Calculating RDKit descriptors for 366 randomly selected inactive molecules

In [12]:
# Load the pickled list of randomly selected inactive molecules.
# NOTE(review): pickle.load can execute arbitrary code – only load trusted files.
with open("./data/1_all_active_random_inactive/python_lists/random_inactive.pickle", "rb") as random_inactive_pickle:
    random_inactive_list = pickle.load(random_inactive_pickle)

In [14]:
# One row of RDKit descriptor values per randomly selected inactive molecule.
rdkit_descriptors_for_366_random_inactive_molecules = np.array(
    [rdkit_descriptors_calculator.CalcDescriptors(mol) for mol in random_inactive_list]
)

In [15]:
# Sanity check: display the array shape (rows = molecules, columns = descriptor values).
rdkit_descriptors_for_366_random_inactive_molecules.shape

(366, 200)

In [16]:
# Persist the random-inactive descriptor array as a pickle.
with open("./data/1_all_active_random_inactive/random_inactive_descriptors.pickle", "wb") as random_inactive_descriptors_file:
    pickle.dump(rdkit_descriptors_for_366_random_inactive_molecules, random_inactive_descriptors_file)

### Calculating all Mordred descriptors (except for networkx-based ones) for all active molecules 

In [3]:
# Mordred calculator over the whole `descriptors` package, with ignore_3D switched on.
mordred_desc_calculator = Calculator(
    descriptors,
    ignore_3D=True,
)

In [4]:
# Compute the Mordred descriptors for every active molecule; the result is a dataframe.
mordred_desc_for_all_active_df = mordred_desc_calculator.pandas(
    all_active_list
)

100%|██████████| 366/366 [00:37<00:00,  9.67it/s]


The result is a dataframe. Most of its entries are valid descriptor values; however, some are strings instead. These appear, first of all, because networkx 2.3 fails on my machine, and also because some of the counter descriptors cannot be calculated for a given molecule. So let's process the resulting dataframe further.

In [6]:
# Display (rows, columns) of the raw Mordred dataframe before any columns are dropped.
mordred_desc_for_all_active_df.shape

(366, 1613)

Dropping all detour-matrix-based descriptors

In [8]:
# Remove the 14 detour-matrix descriptor columns; `drop` returns a new dataframe.
mordred_desc_for_all_active_dropped = mordred_desc_for_all_active_df.drop(
    columns=[
        "SpAbs_Dt", "SpMax_Dt", "SpDiam_Dt", "SpAD_Dt", "SpMAD_Dt", "LogEE_Dt",
        "SM1_Dt", "VE1_Dt", "VE2_Dt", "VE3_Dt", "VR1_Dt", "VR2_Dt", "VR3_Dt",
        "DetourIndex",
    ],
)

In [10]:
# Display (rows, columns) after the drop; 14 column names were passed to `drop`.
mordred_desc_for_all_active_dropped.shape

(366, 1599)

In [11]:
# Coerce every column to numeric: values that cannot be parsed become NaN,
# and every NaN is then replaced with 0.
mordred_desc_for_all_active_final = (
    mordred_desc_for_all_active_dropped
    .apply(pd.to_numeric, errors='coerce')
    .fillna(0)
)

In [19]:
# Write the cleaned active-molecule Mordred descriptors to CSV, without the index column.
mordred_desc_for_all_active_final.to_csv(
    "./data/0_all_active_all_inactive/Mordred/all_active_mordred.csv",
    index=False,
)

### Calculating all Mordred descriptors (except for networkx-based ones) for all inactive molecules 

In [20]:
# Compute the Mordred descriptors for every inactive molecule.
# NOTE(review): the recorded run of this cell aborted inside the worker pool
# (FileNotFoundError after ~1623 of 59422 molecules), so the cells that depend
# on this dataframe have no recorded execution – root cause still to be confirmed.
mordred_desc_for_all_inactive_df = mordred_desc_calculator.pandas(
    all_inactive_list
)

  3%|▎         | 1623/59422 [03:05<1:50:20,  8.73it/s]Process ForkPoolWorker-8:

Process ForkPoolWorker-7:
Traceback (most recent call last):
  File "/anaconda3/envs/chem/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/anaconda3/envs/chem/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/anaconda3/envs/chem/lib/python3.7/multiprocessing/pool.py", line 121, in worker
    result = (True, func(*args, **kwds))
Traceback (most recent call last):
  File "/anaconda3/envs/chem/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/anaconda3/envs/chem/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/anaconda3/envs/chem/lib/python3.7/multiprocessing/pool.py", line 121, in worker
    result = (True, func(*args, **kwds))
  File "/anaconda3/envs/chem/lib/python3.7/site-packages/mordred/_base/pa

FileNotFoundError: [Errno 2] No such file or directory

In [None]:
# Remove the 14 detour-matrix descriptor columns; `drop` returns a new dataframe.
mordred_desc_for_all_inactive_dropped = mordred_desc_for_all_inactive_df.drop(
    columns=[
        "SpAbs_Dt", "SpMax_Dt", "SpDiam_Dt", "SpAD_Dt", "SpMAD_Dt", "LogEE_Dt",
        "SM1_Dt", "VE1_Dt", "VE2_Dt", "VE3_Dt", "VR1_Dt", "VR2_Dt", "VR3_Dt",
        "DetourIndex",
    ],
)

In [None]:
# Display (rows, columns) after the drop; 14 column names were passed to `drop`.
mordred_desc_for_all_inactive_dropped.shape

In [None]:
# Coerce every column to numeric: values that cannot be parsed become NaN,
# and every NaN is then replaced with 0.
mordred_desc_for_all_inactive_final = (
    mordred_desc_for_all_inactive_dropped
    .apply(pd.to_numeric, errors='coerce')
    .fillna(0)
)

In [None]:
# Write the cleaned inactive-molecule Mordred descriptors to CSV, without the index column.
mordred_desc_for_all_inactive_final.to_csv(
    "./data/0_all_active_all_inactive/Mordred/all_inactive_mordred.csv",
    index=False,
)

### Calculating all Mordred descriptors (except for networkx-based ones) for 366 random inactive molecules 

In [21]:
# Load the pickled list of randomly selected inactive molecules for the Mordred run.
# NOTE(review): pickle.load can execute arbitrary code – only load trusted files.
with open("./data/1_all_active_random_inactive/python_lists/random_inactive.pickle", "rb") as random_inactive_pickle:
    random_inactive_molecules = pickle.load(random_inactive_pickle)

In [23]:
# Step 1: compute the Mordred descriptors for the random inactive molecules.
mordred_desc_for_366_random_inactive_mols_df = mordred_desc_calculator.pandas(
    random_inactive_molecules
)

# Step 2: drop the descriptors that cannot be calculated (the detour-matrix ones).
mordred_desc_for_366_random_inactive_mols_dropped = mordred_desc_for_366_random_inactive_mols_df.drop(
    columns=[
        "SpAbs_Dt", "SpMax_Dt", "SpDiam_Dt", "SpAD_Dt", "SpMAD_Dt", "LogEE_Dt",
        "SM1_Dt", "VE1_Dt", "VE2_Dt", "VE3_Dt", "VR1_Dt", "VR2_Dt", "VR3_Dt",
        "DetourIndex",
    ],
)

# Report (rows, columns) after the drop.
print(mordred_desc_for_366_random_inactive_mols_dropped.shape)

# Step 3: coerce every column to numeric; unparseable values become NaN and are then set to 0.
mordred_desc_for_366_random_inactive_mols_final = (
    mordred_desc_for_366_random_inactive_mols_dropped
    .apply(pd.to_numeric, errors='coerce')
    .fillna(0)
)

# Step 4: write the cleaned descriptors to CSV, without the index column.
mordred_desc_for_366_random_inactive_mols_final.to_csv(
    "./data/1_all_active_random_inactive/Mordred/random_inactive_mordred.csv",
    index=False,
)

 66%|██████▌   | 242/366 [00:30<00:15,  7.81it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 366/366 [00:47<00:00,  7.76it/s]


(366, 1599)
