In [2]:
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem import Lipinski

In [3]:
df_smiles = pd.read_parquet('pubchem_smiles.parquet')
df_properties = pd.read_parquet('qm_properties.parquet')
df_energies = pd.read_parquet('transitions_energies.parquet')

data = df_smiles.merge(df_properties, how='inner', on='cid').merge(df_energies, how='inner', on='cid')

In [4]:
data.head()

Unnamed: 0,cid,canonical smiles,isomeric smiles,charge,total dipole moment,multiplicity,homo,lumo,gap,total energy,...,TD_ET_05,TD_OS_05,TD_ET_06,TD_OS_06,TD_ET_07,TD_OS_07,TD_ET_08,TD_OS_08,TD_ET_09,TD_OS_09
0,1,CC(=O)OC(CC(=O)[O-])C[N+](C)(C)C,CC(=O)OC(CC(=O)[O-])C[N+](C)(C)C,0,12.362788,1,-4.655868,-0.302046,4.353822,-19287.579176,...,37641.451596,0.00637,37976.268471,0.001117,38311.230525,0.001396,40062.494092,0.003212,40525.343416,0.00218
1,2,CC(=O)OC(CC(=O)O)C[N+](C)(C)C,CC(=O)OC(CC(=O)O)C[N+](C)(C)C,1,5.855433,1,-11.06687,-3.551086,7.515785,-19299.202955,...,58598.534406,0.005652,59362.010763,0.005563,60451.940025,0.002308,62136.396688,0.025338,62379.60511,0.017436
2,3,C1=CC(C(C(=C1)C(=O)O)O)O,C1=CC(C(C(=C1)C(=O)O)O)O,0,5.266252,1,-6.821894,-2.239497,4.582397,-15575.874068,...,46837.502902,0.004995,48214.775245,0.008408,48851.170886,0.003279,50890.898644,0.04637,51729.941085,0.009966
3,4,CC(CN)O,CC(CN)O,0,2.681395,1,-6.187869,1.847653,8.035522,-6794.53586,...,58571.208342,0.009868,60147.530252,0.014954,60319.624771,0.008397,60994.089781,0.004093,62458.026397,0.000847
4,5,C(C(=O)COP(=O)(O)O)N,C(C(=O)COP(=O)(O)O)N,0,8.447997,1,-7.270882,-1.52928,5.741602,-24256.843704,...,50735.499802,0.000874,53071.087855,0.009599,54805.905776,0.025285,54979.815042,0.000488,55740.210361,0.013173


In [5]:
data['total energy'] = data['total energy']/8066
data.loc[:,'TD_ET_00':'TD_OS_09'] = data.loc[:,'TD_ET_00':'TD_OS_09']/8066

data['diff_ET_00_gap'] = data['TD_ET_00'] - data['gap']

In [6]:
os = data[data.columns[pd.Series(data.columns).str.startswith('TD_OS')]].values
et = data[data.columns[pd.Series(data.columns).str.startswith('TD_ET')]].values
print(os)
max_os = np.argmax(os, axis=1)
max_et = np.choose(max_os, et.T)
data['TD_ET_OS_max'] = max_et


[[2.96677411e-07 2.76345152e-07 4.71113315e-09 ... 1.73072155e-07
  3.98214728e-07 2.70270270e-07]
 [4.98388297e-08 5.10786015e-08 8.00892636e-08 ... 2.86139350e-07
  3.14133399e-06 2.16166625e-06]
 [6.62509298e-06 1.57820481e-05 4.41482767e-06 ... 4.06521200e-07
  5.74882222e-06 1.23555666e-06]
 ...
 [7.72529135e-05 4.84366477e-06 4.77312175e-08 ... 5.67815522e-08
  3.82469626e-07 2.15819489e-06]
 [7.27715100e-05 1.99587156e-05 4.82420035e-06 ... 1.06087280e-06
  7.91656335e-06 2.15695512e-06]
 [3.24572279e-07 5.21212497e-06 1.87701463e-06 ... 9.33969750e-06
  5.47446070e-06 4.61567072e-07]]


In [7]:
eVrange = (1.5498, 4.1328)
data['corante_00'] = data['TD_ET_00'].between(eVrange[0], eVrange[1], inclusive='both')
data['corante_max'] = data['TD_ET_OS_max'].between(eVrange[0], eVrange[1], inclusive='both')

In [8]:
data.to_csv('molecules_properties.csv')

In [8]:
def calcular_descritores(df, coluna_smiles):
    smiles_array = df[coluna_smiles].values
    num_mols = len(smiles_array)

    descritores = {
        'MW': np.zeros(num_mols),
        'NumTotalAtoms': np.zeros(num_mols),
        'NumHeavyAtoms': np.zeros(num_mols),
        'NumHeteroAtoms': np.zeros(num_mols),
        'NumOHCount': np.zeros(num_mols),
        'NumRotatableBonds': np.zeros(num_mols),
        'NumRings': np.zeros(num_mols),
        'NumSaturatedRings': np.zeros(num_mols),
        'NumAromaticRings': np.zeros(num_mols),
        'NumAromaticHeterocycles': np.zeros(num_mols),
        'NumAliphaticHeterocycles': np.zeros(num_mols),
        # 'NumSingleBonds': np.zeros(num_mols),
        # 'NumDoubleBonds': np.zeros(num_mols),
        # 'NumTripleBonds': np.zeros(num_mols),
        # 'NumFunctionalGroups': np.zeros(num_mols),
        'NumSP2Carbons': np.zeros(num_mols),
        'NumSP3Carbons': np.zeros(num_mols),
        'NumAliphaticCarbons': np.zeros(num_mols),
        'NumAliphaticRings': np.zeros(num_mols),
        'NumAromaticCarbocycles': np.zeros(num_mols),
        'NumSaturatedCarbocycles': np.zeros(num_mols),
        'NumHDonors': np.zeros(num_mols),
        'NumHAcceptors': np.zeros(num_mols)
    }

    for i, smiles in enumerate(smiles_array):
        try:
            mol = Chem.MolFromSmiles(smiles)
            descritores['MW'][i] = Descriptors.MolWt(mol)
            descritores['NumTotalAtoms'][i] = mol.GetNumAtoms()
            descritores['NumHeavyAtoms'][i] = rdMolDescriptors.CalcNumHeavyAtoms(mol)
            descritores['NumHeteroAtoms'][i] = Lipinski.NumHeteroatoms(mol)
            descritores['NumOHCount'][i] = Lipinski.NHOHCount(mol)
            descritores['NumRotatableBonds'][i] = Lipinski.NumRotatableBonds(mol)
            descritores['NumRings'][i] = Lipinski.RingCount(mol)
            descritores['NumSaturatedRings'][i] = Lipinski.NumSaturatedRings(mol)
            descritores['NumAromaticRings'][i] = Lipinski.NumAromaticRings(mol)
            descritores['NumAromaticHeterocycles'][i] = Lipinski.NumAromaticHeterocycles(mol)
            descritores['NumAliphaticHeterocycles'][i] = Lipinski.NumAliphaticHeterocycles(mol)
            descritores['NumSP2Carbons'][i] = sum((x.GetHybridization() == Chem.HybridizationType.SP2) for x in mol.GetAtoms())
            descritores['NumSP3Carbons'][i] = sum((x.GetHybridization() == Chem.HybridizationType.SP3) for x in mol.GetAtoms())
            descritores['NumAliphaticCarbons'][i] = Lipinski.NumAliphaticCarbocycles(mol)
            descritores['NumAliphaticRings'][i] = Lipinski.NumAliphaticRings(mol)
            descritores['NumAromaticCarbocycles'][i] = Lipinski.NumAromaticCarbocycles(mol)
            descritores['NumSaturatedCarbocycles'][i] = Lipinski.NumSaturatedCarbocycles(mol)
            descritores['NumHDonors'][i] = Lipinski.NumHDonors(mol)
            descritores['NumHAcceptors'][i] = Lipinski.NumHAcceptors(mol)
        except:
            descritores['MW'][i] = np.nan
            descritores['NumTotalAtoms'][i] = np.nan
            descritores['NumHeavyAtoms'][i] = np.nan
            descritores['NumHeteroAtoms'][i] = np.nan
            descritores['NumOHCount'][i] = np.nan
            descritores['NumRotatableBonds'][i] = np.nan
            descritores['NumRings'][i] = np.nan
            descritores['NumSaturatedRings'][i] = np.nan
            descritores['NumAromaticRings'][i] = np.nan
            descritores['NumAromaticHeterocycles'][i] = np.nan
            descritores['NumAliphaticHeterocycles'][i] = np.nan
            # descritores['NumSingleBonds'][i] = np.nan
            # descritores['NumDoubleBonds'][i] = np.nan
            # descritores['NumTripleBonds'][i] = np.nan
            # descritores['NumFunctionalGroups'][i] = np.nan
            descritores['NumSP2Carbons'][i] = np.nan
            descritores['NumSP3Carbons'][i] = np.nan
            descritores['NumAliphaticCarbons'][i] = np.nan
            descritores['NumAliphaticRings'][i] = np.nan
            descritores['NumAromaticCarbocycles'][i] = np.nan
            descritores['NumSaturatedCarbocycles'][i] = np.nan
            descritores['NumHDonors'][i] = np.nan
            descritores['NumHAcceptors'][i] = np.nan
    df_descritores = pd.DataFrame(descritores)
    df_final = pd.concat([df[['cid', 'isomeric smiles']], df_descritores], axis=1)
    return df_final

In [12]:
rdkit_descriptor = calcular_descritores(data, 'isomeric smiles')

[08:23:23] Explicit valence for atom # 1 Br, 5, is greater than permitted
[08:23:23] Explicit valence for atom # 1 Si, 8, is greater than permitted
[08:23:24] Explicit valence for atom # 1 Cl, 5, is greater than permitted
[08:24:23] Explicit valence for atom # 1 Ge, 8, is greater than permitted
[08:24:36] Explicit valence for atom # 0 Br, 2, is greater than permitted
[08:24:41] Explicit valence for atom # 0 Cl, 2, is greater than permitted
[08:26:23] Explicit valence for atom # 9 Cl, 3, is greater than permitted
[08:26:32] Explicit valence for atom # 2 Si, 8, is greater than permitted
[08:26:52] Explicit valence for atom # 5 Cl, 3, is greater than permitted
[08:26:58] Explicit valence for atom # 5 Br, 5, is greater than permitted
[08:27:01] Explicit valence for atom # 0 Si, 8, is greater than permitted
[08:27:07] Explicit valence for atom # 4 Cl, 3, is greater than permitted
[08:27:18] Explicit valence for atom # 3 Br, 3, is greater than permitted
[08:27:21] Explicit valence for atom #

In [13]:
rdkit_descriptor.head()

Unnamed: 0,cid,isomeric smiles,MW,NumTotalAtoms,NumHeavyAtoms,NumHeteroAtoms,NumOHCount,NumRotatableBonds,NumRings,NumSaturatedRings,...,NumAromaticHeterocycles,NumAliphaticHeterocycles,NumSP2Carbons,NumSP3Carbons,NumAliphaticCarbons,NumAliphaticRings,NumAromaticCarbocycles,NumSaturatedCarbocycles,NumHDonors,NumHAcceptors
0,1,CC(=O)OC(CC(=O)[O-])C[N+](C)(C)C,203.238,14.0,14.0,5.0,0.0,5.0,0.0,0.0,...,0.0,0.0,6.0,8.0,0.0,0.0,0.0,0.0,0.0,4.0
1,2,CC(=O)OC(CC(=O)O)C[N+](C)(C)C,204.246,14.0,14.0,5.0,1.0,5.0,0.0,0.0,...,0.0,0.0,6.0,8.0,0.0,0.0,0.0,0.0,1.0,3.0
2,3,C1=CC(C(C(=C1)C(=O)O)O)O,156.137,11.0,11.0,4.0,3.0,1.0,1.0,0.0,...,0.0,0.0,7.0,4.0,1.0,1.0,0.0,0.0,3.0,3.0
3,4,CC(CN)O,75.111,5.0,5.0,2.0,3.0,1.0,0.0,0.0,...,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,2.0,2.0
4,5,C(C(=O)COP(=O)(O)O)N,169.073,10.0,10.0,7.0,4.0,4.0,0.0,0.0,...,0.0,0.0,3.0,7.0,0.0,0.0,0.0,0.0,3.0,4.0


In [1]:
rdkit_descriptor.to_csv("rdkit_descriptors.csv")
data.to_csv('molecules_properties.csv')

NameError: name 'rdkit_descriptor' is not defined

In [9]:
from ipynb.fs.full.Atomic_Descriptors_v3 import create_property_descriptors

{'H': [1, 'Hydrogen', 1.007, 2.2, 1359844.0, 25.0, 120.0, 38, 1.0], 'He': [2, 'Helium', 4.002602, nan, 2458741.0, 31.0, 140.0, 32, 2.0], 'Li': [3, 'Lithium', 6.941, 0.98, 539172.0, 145.0, 182.0, 134, 1.0], 'Be': [4, 'Beryllium', 9.012182, 1.57, 9.3227, 105.0, nan, 90, 2.0], 'B': [5, 'Boron', 10.811, 2.04, 829803.0, 85.0, nan, 82, 3.0], 'C': [6, 'Carbon', 12.0107, 2.55, 11.2603, 70.0, 170.0, 77, 4.0], 'N': [7, 'Nitrogen', 14.0067, 3.04, 1453414.0, 65.0, 155.0, 75, 5.0], 'O': [8, 'Oxygen', 15.9994, 3.44, 1361806.0, 60.0, 152.0, 73, 6.0], 'F': [9, 'Fluorine', 18.9984032, 3.98, 1742282.0, 50.0, 147.0, 71, 7.0], 'Ne': [10, 'Neon', 20.1797, nan, 215646.0, 38.0, 154.0, 69, 8.0], 'Na': [11, 'Sodium', 22.98976928, 0.93, 513908.0, 180.0, 227.0, 154, 1.0], 'Mg': [12, 'Magnesium', 24.305, 1.31, 764624.0, 150.0, 173.0, 130, 2.0], 'Al': [13, 'Aluminium', 26.9815386, 1.61, 598577.0, 125.0, nan, 118, 3.0], 'Si': [14, 'Silicon', 28.0855, 1.9, 815169.0, 110.0, 210.0, 111, 4.0], 'P': [15, 'Phosphorus', 3

In [16]:
elem_data = pd.read_csv('Atomic properties DB.csv')
properties_dict = pd.read_csv('Atomic properties DB.csv', index_col='Symbol').T.to_dict('list')

In [15]:
elem_data.head()

Unnamed: 0,Z,Name,Symbol,Average atomic mass,Electronegativity (Pauling),First Ionization Energy (eV),Atomic Radii (pm),Van der Waals Radii (pm),Covalent Radii (pm),Valence electrons
0,1,Hydrogen,H,1.007,2.2,1359844.0,25.0,120.0,38,1.0
1,2,Helium,He,4.002602,,2458741.0,31.0,140.0,32,2.0
2,3,Lithium,Li,6.941,0.98,539172.0,145.0,182.0,134,1.0
3,4,Beryllium,Be,9.012182,1.57,9.3227,105.0,,90,2.0
4,5,Boron,B,10.811,2.04,829803.0,85.0,,82,3.0


In [None]:
RACs_result = []

for smile, cid in zip(data['isomeric smiles'], data['cid']):
    try:
        mass = create_property_descriptors(smile, 3, properties_dict, 2)
        EN = create_property_descriptors(smile, 3, properties_dict, 3)
        In = create_property_descriptors(smile, 3, properties_dict, 4)
        aRadius = create_property_descriptors(smile, 3, properties_dict, 5)
        VdW = create_property_descriptors(smile, 3, properties_dict, 6)
        covRadius = create_property_descriptors(smile, 3, properties_dict, 7)
        valence = create_property_descriptors(smile, 3, properties_dict, 8)

        dict_RACs = {
            'cid': cid,
            'mass dZero': mass[0],
            'mass dOne': mass[1],
            'mass dTwo': mass[2],
            'mass dThree': mass[3],
            'EN dZero': EN[0],
            'EN dOne': EN[1],
            'EN dTwo': EN[2],
            'EN dThree': EN[3],
            'In dZero': In[0],
            'In dOne': In[1],
            'In dTwo': In[2],
            'In dThree': In[3],
            'aRadius dZero': aRadius[0],
            'aRadius dOne': aRadius[1],
            'aRadius dTwo': aRadius[2],
            'aRadius dThree': aRadius[3],
            'VdW dZero': VdW[0],
            'VdW dOne': VdW[1],
            'VdW dTwo': VdW[2],
            'VdW dThree': VdW[3],
            'covRadius dZero': covRadius[0],
            'covRadius dOne': covRadius[1],
            'covRadius dTwo': covRadius[2],
            'covRadius dThree': covRadius[3],
            'valence dZero': valence[0],
            'valence dOne': valence[1],
            'valence dTwo': valence[2],
            'valence dThree': valence[3]
        }

        RACs_result.append(dict_RACs)

    except:

        dict_RACs = {
            'cid': cid,
            'mass dZero': np.nan,
            'mass dOne': np.nan,
            'mass dTwo': np.nan,
            'mass dThree': np.nan,
            'EN dZero': np.nan,
            'EN dOne': np.nan,
            'EN dTwo': np.nan,
            'EN dThree': np.nan,
            'In dZero': np.nan,
            'In dOne': np.nan,
            'In dTwo': np.nan,
            'In dThree': np.nan,
            'aRadius dZero': np.nan,
            'aRadius dOne': np.nan,
            'aRadius dTwo': np.nan,
            'aRadius dThree': np.nan,
            'VdW dZero': np.nan,
            'VdW dOne': np.nan,
            'VdW dTwo': np.nan,
            'VdW dThree': np.nan,
            'covRadius dZero': np.nan,
            'covRadius dOne': np.nan,
            'covRadius dTwo': np.nan,
            'covRadius dThree': np.nan,
            'valence dZero': np.nan,
            'valence dOne': np.nan,
            'valence dTwo': np.nan,
            'valence dThree': np.nan
        }

        RACs_result.append(dict_RACs)


RACs = pd.DataFrame(RACs_result)

In [None]:
RACs.isnull().sum()

In [None]:
RACs.to