## Библиотеки

In [71]:
import pandas as pd
from tqdm import tqdm
from urllib.request import urlopen
from urllib.parse import quote
from rdkit import Chem
import numpy as np

In [72]:
#если не запустится ячейка не страшно
from sklearn.model_selection import train_test_split
import scipy.stats # При работе со статистикой
import seaborn as sns
import matplotlib.pyplot as plt  # Библиотека для визуализации результатов
%matplotlib inline

#нужны на время для визуализации
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import NeighborhoodComponentsAnalysis

from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.neighbors import KNeighborsClassifier

from sklearn.feature_selection import RFE
from sklearn.svm import SVR

#нужны на время для проверки фичей
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

## Работа с данными

загрузка данных

In [73]:
data = pd.read_csv("Task/train.csv")

In [74]:
data = data.loc[:, 'Smiles':]

In [75]:
data

Unnamed: 0,Smiles
0,COC(=O)Nc1ccc(-c2nc(N3CCOCC3)c3cnn(C4CCN(Cc5cc...
1,O=Cc1cn(Cc2ccc(Cl)cc2)c2ccccc12
2,NC(CO)C(=O)NNCc1ccc(O)c(O)c1O
3,Nc1nc(=O)n([C@H]2C[C@H](O)[C@@H](CO)O2)cc1Cl
4,Nc1ccc(C(=O)Nc2cccc(-c3nc(N4CCOCC4)c4oc5ncccc5...
...,...
1609,NS(=O)(=O)c1cc2c(cc1Cl)NC(C1CC3C=CC1C3)NS2(=O)=O
1610,Cc1cccc(Nc2ccncc2S(=O)(=O)NC(=O)NC(C)C)c1
1611,CCCC(=O)O[C@]1(C(=O)CO)CC[C@H]2[C@@H]3CCC4=CC(...
1612,CN(C)c1cccc(Oc2cnc(Nc3cccc(O)c3)nc2)c1


Генерация фичей

In [76]:
ssr = []
num_atoms = []
num_atoms_with_hs = []
atomic_num = []
binStr = []

for mol in tqdm(data["Smiles"]):
    a = []
    m = Chem.MolFromSmiles(mol)
    ssr.append(Chem.GetSSSR(m))
    num_atoms.append(m.GetNumAtoms())
    m2 = Chem.AddHs(m)
    num_atoms_with_hs.append(m2.GetNumAtoms())
    for atom in m.GetAtoms():
        a.append(atom.GetAtomicNum())
    atomic_num.append(sum(a))
    binStr.append(len(m.ToBinary()))

100%|█████████████████████████████████████| 1614/1614 [00:01<00:00, 1494.49it/s]


In [77]:
from rdkit.Chem.Descriptors import ExactMolWt

mol_wt = []

for mol in tqdm(data["Smiles"]):
    m = Chem.MolFromSmiles(mol)
    mol_wt.append(ExactMolWt(m)) #weight of the molecule

100%|█████████████████████████████████████| 1614/1614 [00:00<00:00, 3141.14it/s]


In [78]:
from rdkit.Chem import Fragments

num_of_carboxylic = []
num_of_hydroxyl = []
num_of_hydroxylOH = []
num_of_nfunctional = []
num_of_aromatics = []
num_of_arom_nitrogens = []
num_of_arom_amines = []
num_of_arom_hydroxyl = []
num_of_hydroxyl_acids = []
num_of_carbonyl = []
num_of_carbonyl_excluding = []
num_of_thiocarbonyl = []
num_of_sml = []
num_of_Imines = []
num_of_Tertiary_amines = []
num_of_Secondary_amines = []
num_of_Primary_amines = []
num_of_hydroxylamine = []
num_of_XCCNR = []
num_of_tertalicyclic = []
num_of_Hpyrrole = []
num_of_thiol = []

for mol in tqdm(data["Smiles"]):
    
    m = Chem.MolFromSmiles(mol)
    num_of_carboxylic.append(Fragments.fr_Al_COO(m)) #Number of aliphatic carboxylic acids
    num_of_hydroxyl.append(Fragments.fr_Al_OH(m)) #Number of aliphatic hydroxyl groups
    num_of_hydroxylOH.append(Fragments.fr_Al_OH_noTert(m)) #Number of aliphatic hydroxyl groups excluding tert-OH
    num_of_nfunctional.append(Fragments.fr_ArN(m)) #Number of N functional groups attached to aromatics
    num_of_aromatics.append(Fragments.fr_Ar_COO(m)) #Number of Aromatic carboxylic acide
    num_of_arom_nitrogens.append(Fragments.fr_Ar_N(m)) #Number of aromatic nitrogens
    num_of_arom_amines.append(Fragments.fr_Ar_NH(m)) #Number of aromatic amines
    num_of_arom_hydroxyl.append(Fragments.fr_Ar_OH(m)) #Number of aromatic hydroxyl groups
    num_of_hydroxyl_acids.append(Fragments.fr_COO(m)) #Number of carboxylic acids
    num_of_carbonyl.append(Fragments.fr_C_O(m)) #Number of carbonyl O
    num_of_carbonyl_excluding.append(Fragments.fr_C_O_noCOO(m)) #Number of carbonyl O, excluding COOH
    num_of_thiocarbonyl.append(Fragments.fr_C_S(m)) #Number of thiocarbonyl
    num_of_sml.append(Fragments.fr_HOCCN(m)) #Number of C(OH)CCN-Ctert-alkyl or C(OH)CCNcyclic
    num_of_Imines.append(Fragments.fr_Imine(m)) #Number of Imines
    num_of_Tertiary_amines.append(Fragments.fr_NH0(m)) #Number of Tertiary amines
    num_of_Secondary_amines.append(Fragments.fr_NH1(m)) #Number of Secondary amines
    num_of_Primary_amines.append(Fragments.fr_NH2(m)) #Number of Primary amines
    num_of_hydroxylamine.append(Fragments.fr_N_O(m)) #Number of hydroxylamine groups
    num_of_XCCNR.append(Fragments.fr_Ndealkylation1(m)) #Number of XCCNR groups
    num_of_tertalicyclic.append(Fragments.fr_Ndealkylation2(m)) #Number of tert-alicyclic amines (no heteroatoms, not quinine-like bridged N)
    num_of_Hpyrrole.append(Fragments.fr_Nhpyrrole(m)) #Number of H-pyrrole nitrogens
    num_of_thiol.append(Fragments.fr_SH(m)) #Number of thiol groups

100%|█████████████████████████████████████| 1614/1614 [00:01<00:00, 1472.28it/s]


In [79]:
########################################
#  Писать тут, и только тут))))))))))  #
########################################

from rdkit.Chem import Fragments

num_of_aldehyde = []
num_of_alkyl_carbamate = []
num_of_alkyl_halide = []
num_of_allylic_oxid = []
num_of_amide = []
num_of_amidine = []
num_of_aniline = []
num_of_aryl_methyl = []
num_of_azide = []
num_of_azo = []
num_of_barbitur = []
num_of_benzene = []
num_of_benzodiazepine = []
num_of_bicyclic = []
num_of_diazo = []
num_of_dihydropyridine = []
num_of_epoxide = []
num_of_ester = []
num_of_ether = []
num_of_furan = []
num_of_guanido = []
num_of_halogen = []
num_of_hdrzine = []
num_of_hdrzone = []
num_of_imidazole = []
num_of_imide = []
num_of_isothiocyan = []
num_of_ketone = []
num_of_ketone_Topliss = []
num_of_lactam = []
num_of_lactone = []
num_of_methoxy = []
num_of_morpholine = []
num_of_nitrile = []
num_of_nitro = []
num_of_nitro_arom = []
num_of_nitro_arom_nonortho = []
num_of_nitroso = []
num_of_oxazole = []
num_of_oxime = []
num_of_para_hydroxylation = []
num_of_phenol = []
num_of_phenol_noOrthoHbond = []
num_of_phos_acid = []
num_of_phos_ester = []
num_of_piperdine = []
num_of_piperzine = []
num_of_priamide = []
num_of_pyridine = []
num_of_quatN = []
num_of_sulfide = []
num_of_sulfonamd = []
num_of_sulfone = []
num_of_term_acetylene = []
num_of_tetrazole = []
num_of_thiazole = []
num_of_thiocyan = []
num_of_thiophene = []
num_of_unbrch_alkane = []
num_of_urea = []

for mol in tqdm(data["Smiles"]):
    
    m = Chem.MolFromSmiles(mol)
    num_of_aldehyde.append(Fragments.fr_aldehyde(m)) # Number of aldehydes
    num_of_alkyl_carbamate.append(Fragments.fr_alkyl_carbamate(m)) # Number of alkyl carbamates (subject to hydrolysis)
    num_of_alkyl_halide.append(Fragments.fr_alkyl_halide(m)) # Number of alkyl halides
    num_of_allylic_oxid.append(Fragments.fr_allylic_oxid(m)) # Number of allylic oxidation sites excluding steroid dienone
    num_of_amide.append(Fragments.fr_amide(m)) # Number of amides
    num_of_amidine.append(Fragments.fr_amidine(m)) # Number of amidine groups
    num_of_aniline.append(Fragments.fr_aniline(m)) # Number of anilines
    num_of_aryl_methyl.append(Fragments.fr_aryl_methyl(m)) # Number of aryl methyl sites for hydroxylation
    num_of_azide.append(Fragments.fr_azide(m)) # Number of azide groups
    num_of_azo.append(Fragments.fr_azo(m)) # Number of azo groups
    num_of_barbitur.append(Fragments.fr_barbitur(m)) # Number of barbiturate groups
    num_of_benzene.append(Fragments.fr_benzene(m)) # Number of benzene rings
    num_of_benzodiazepine.append(Fragments.fr_benzodiazepine(m)) # Number of benzodiazepines with no additional fused rings
    num_of_bicyclic.append(Fragments.fr_bicyclic(m)) # Bicyclic
    num_of_diazo.append(Fragments.fr_diazo(m)) # Number of diazo groups
    num_of_dihydropyridine.append(Fragments.fr_dihydropyridine(m)) # Number of dihydropyridines
    num_of_epoxide.append(Fragments.fr_epoxide(m)) # Number of epoxide rings
    num_of_ester.append(Fragments.fr_ester(m)) # Number of esters
    num_of_ether.append(Fragments.fr_ether(m)) # Number of ether oxygens (including phenoxy)
    num_of_furan.append(Fragments.fr_furan(m)) # Number of furan rings
    num_of_guanido.append(Fragments.fr_guanido(m)) # Number of guanidine groups
    num_of_halogen.append(Fragments.fr_halogen(m)) # Number of halogens
    num_of_hdrzine.append(Fragments.fr_hdrzine(m)) # Number of hydrazine groups
    num_of_hdrzone.append(Fragments.fr_hdrzone(m)) # Number of hydrazone groups
    num_of_imidazole.append(Fragments.fr_imidazole(m)) # Number of imidazole rings
    num_of_imide.append(Fragments.fr_imide(m)) # Number of imide groups
    num_of_isothiocyan.append(Fragments.fr_isothiocyan(m)) # Number of isothiocyanates
    num_of_ketone.append(Fragments.fr_ketone(m)) # Number of ketones
    num_of_ketone_Topliss.append(Fragments.fr_ketone_Topliss(m)) # Number of ketones excluding diaryl, a,b-unsat. dienones, heteroatom on Calpha
    num_of_lactam.append(Fragments.fr_lactam(m)) # Number of beta lactams
    num_of_lactone.append(Fragments.fr_lactone(m)) # Number of cyclic esters (lactones)
    num_of_methoxy.append(Fragments.fr_methoxy(m)) # Number of methoxy groups -OCH3
    num_of_morpholine.append(Fragments.fr_morpholine(m)) # Number of morpholine rings
    num_of_nitrile.append(Fragments.fr_nitrile(m)) # Number of nitriles
    num_of_nitro.append(Fragments.fr_nitro(m)) # Number of nitro groups
    num_of_nitro_arom.append(Fragments.fr_nitro_arom(m)) # Number of nitro benzene ring substituents
    num_of_nitro_arom_nonortho.append(Fragments.fr_nitro_arom_nonortho(m)) # Number of non-ortho nitro benzene ring substituents
    num_of_nitroso.append(Fragments.fr_nitroso(m)) # Number of nitroso groups, excluding NO2
    num_of_oxazole.append(Fragments.fr_oxazole(m)) # Number of oxazole rings
    num_of_oxime.append(Fragments.fr_oxime(m)) # Number of oxime groups
    num_of_para_hydroxylation.append(Fragments.fr_para_hydroxylation(m)) # Number of para-hydroxylation sites
    num_of_phenol.append(Fragments.fr_phenol(m)) # Number of phenols
    num_of_phenol_noOrthoHbond.append(Fragments.fr_phenol_noOrthoHbond(m)) # Number of phenolic OH excluding ortho intramolecular Hbond substituents
    num_of_phos_acid.append(Fragments.fr_phos_acid(m)) # Number of phosphoric acid groups
    num_of_phos_ester.append(Fragments.fr_phos_ester(m)) # Number of phosphoric ester groups
    num_of_piperdine.append(Fragments.fr_piperdine(m)) # Number of piperdine rings
    num_of_piperzine.append(Fragments.fr_piperzine(m)) # Number of piperzine rings
    num_of_priamide.append(Fragments.fr_priamide(m)) # Number of primary amides
    num_of_pyridine.append(Fragments.fr_pyridine(m)) # Number of pyridine rings
    num_of_quatN.append(Fragments.fr_quatN(m)) # Number of quarternary nitrogens
    num_of_sulfide.append(Fragments.fr_sulfide(m)) # Number of thioether
    num_of_sulfonamd.append(Fragments.fr_sulfonamd(m)) # Number of sulfonamides
    num_of_sulfone.append(Fragments.fr_sulfone(m)) # Number of sulfone groups
    num_of_term_acetylene.append(Fragments.fr_term_acetylene(m)) # Number of terminal acetylenes
    num_of_tetrazole.append(Fragments.fr_tetrazole(m)) # Number of tetrazole rings
    num_of_thiazole.append(Fragments.fr_thiazole(m)) # Number of thiazole rings
    num_of_thiocyan.append(Fragments.fr_thiocyan(m)) # Number of thiocyanates
    num_of_thiophene.append(Fragments.fr_thiophene(m)) # Number of thiophene rings
    num_of_unbrch_alkane.append(Fragments.fr_unbrch_alkane(m)) # Number of unbranched alkanes of at least 4 members (excludes halogenated alkanes)
    num_of_urea.append(Fragments.fr_urea(m)) # Number of urea groups

100%|██████████████████████████████████████| 1614/1614 [00:02<00:00, 760.99it/s]


In [80]:
from rdkit.Chem import GraphDescriptors

value_balabanJ = []
complexity_of_mol = []
value_Chi0 = []
value_Chi0n = []
value_Chi0v = []
value_Chi1 = []
value_Chi1n = []
value_Chi1v = []
value_Chi2n = []
value_Chi2v = []
value_Chi3n = []
value_Chi3v = []
value_Chi4n = []
value_Chi4v = []
value_HallKierAlpha = []
value_Ipc = []
value_Kappa1 = []
value_Kappa2 = []
value_Kappa3 = []

for mol in tqdm(data["Smiles"]):
    
    m = Chem.MolFromSmiles(mol)
    value_balabanJ.append(GraphDescriptors.BalabanJ(m)) #Calculate Balaban’s J value for a molecule
    complexity_of_mol.append(GraphDescriptors.BertzCT(m)) #A topological index meant to quantify “complexity” of molecules.
    value_Chi0.append(GraphDescriptors.Chi0(m))
    value_Chi0n.append(GraphDescriptors.Chi0n(m))
    value_Chi0v.append(GraphDescriptors.Chi0v(m))
    value_Chi1.append(GraphDescriptors.Chi1(m))
    value_Chi1n.append(GraphDescriptors.Chi1n(m))
    value_Chi1v.append(GraphDescriptors.Chi1v(m))
    value_Chi2n.append(GraphDescriptors.Chi2n(m))
    value_Chi2v.append(GraphDescriptors.Chi2v(m))
    value_Chi3n.append(GraphDescriptors.Chi3n(m))
    value_Chi3v.append(GraphDescriptors.Chi3v(m))
    value_Chi4n.append(GraphDescriptors.Chi4n(m))
    value_Chi4v.append(GraphDescriptors.Chi4v(m))
    value_HallKierAlpha.append(GraphDescriptors.HallKierAlpha(m))
    value_Ipc.append(GraphDescriptors.Ipc(m))
    value_Kappa1.append(GraphDescriptors.Kappa1(m))
    value_Kappa2.append(GraphDescriptors.Kappa2(m))
    value_Kappa3.append(GraphDescriptors.Kappa3(m))

100%|██████████████████████████████████████| 1614/1614 [00:14<00:00, 113.13it/s]


In [81]:
from rdkit.Chem import Lipinski

parameters_L = []
num_of_heavy_a = []
num_of_NHs_or_ONs = []
num_of_nitrogens_oxygens = []
num_of_aliphaticC = []
num_of_aliphaticH = []
num_of_aliphaticR = []
num_of_aliphaticR = []
num_of_aromatic_carbocycles = []
num_of_aromatic_heterocycles = []
num_of_aromatic_rings = []
num_of_haccept = []
num_of_hdonor = []
num_of_heteroatoms = []
num_of_rotatable = []
num_of_sat_carbocycles = []
num_of_sat_heterocycles = []
num_of_sat_rings = []
count_of_rings = []

for mol in tqdm(data["Smiles"]):
    m = Chem.MolFromSmiles(mol)
    parameters_L.append(Lipinski.FractionCSP3(m)) #Calculation of Lipinski parameters for molecules
    num_of_heavy_a.append(Lipinski.HeavyAtomCount(m)) #Number of heavy atoms a molecule.
    num_of_NHs_or_ONs.append(Lipinski.NHOHCount(m)) #Number of NHs or OHs
    num_of_nitrogens_oxygens.append(Lipinski.NOCount(m)) #Number of Nitrogens and Oxygens
    num_of_aliphaticC.append(Lipinski.NumAliphaticCarbocycles(m)) #number of aliphatic
    num_of_aliphaticH.append(Lipinski.NumAliphaticHeterocycles(m)) #number of aliphatic
    num_of_aliphaticR.append(Lipinski.NumAliphaticRings(m)) #number of aliphatic
    num_of_aromatic_carbocycles.append(Lipinski.NumAromaticCarbocycles(m)) #number of aromatic carbocycles for a molecule
    num_of_aromatic_heterocycles.append(Lipinski.NumAromaticHeterocycles(m)) #number of aromatic heterocycles for a molecule
    num_of_aromatic_rings.append(Lipinski.NumAromaticRings(m)) #number of aromatic rings for a molecule
    num_of_haccept.append(Lipinski.NumHAcceptors(m)) #Number of Hydrogen Bond Acceptors
    num_of_hdonor.append(Lipinski.NumHDonors(m)) #Number of Hydrogen Bond Donors
    num_of_heteroatoms.append(Lipinski.NumHeteroatoms(m)) #Number of Heteroatoms
    num_of_rotatable.append(Lipinski.NumRotatableBonds(m)) #Number of Rotatable Bonds
    num_of_sat_carbocycles.append(Lipinski.NumSaturatedCarbocycles(m)) #number of saturated carbocycles for a molecule
    num_of_sat_heterocycles.append(Lipinski.NumSaturatedHeterocycles(m)) #number of saturated heterocycles for a molecule
    num_of_sat_rings.append(Lipinski.NumSaturatedRings(m)) #number of saturated rings for a molecule
    count_of_rings.append(Lipinski.RingCount(m)) #

100%|█████████████████████████████████████| 1614/1614 [00:01<00:00, 1413.85it/s]


In [82]:
from rdkit.Chem import MolSurf

labute = []
peoe_vsa1 = []
peoe_vsa10 = []
peoe_vsa11 = []
peoe_vsa12 = []
peoe_vsa13 = []
peoe_vsa14 = []
peoe_vsa2 = []
peoe_vsa3 = []
peoe_vsa4 = []
peoe_vsa5 = []
peoe_vsa6 = []
peoe_vsa7 = []
peoe_vsa8 = []
peoe_vsa9 = []
smr_vsa1 = []
smr_vsa10 = []
smr_vsa2 = []
smr_vsa3 = []
smr_vsa4 = []
smr_vsa5 = []
smr_vsa6 = []
smr_vsa7 = []
smr_vsa9 = []
slogp_vsa1 = []
slogp_vsa10 = []
slogp_vsa11 = []
slogp_vsa12 = []
slogp_vsa2 = []
slogp_vsa3 = []
slogp_vsa4 = []
slogp_vsa5 = []
slogp_vsa6 = []
slogp_vsa7 = []
slogp_vsa8 = []
tpsa = []
pyLabuteASA = []

for mol in tqdm(data["Smiles"]):
    m = Chem.MolFromSmiles(mol)
    labute.append(MolSurf.LabuteASA(m))
    peoe_vsa1.append(MolSurf.PEOE_VSA1(m))
    peoe_vsa10.append(MolSurf.PEOE_VSA10(m))
    peoe_vsa11.append(MolSurf.PEOE_VSA11(m))
    peoe_vsa12.append(MolSurf.PEOE_VSA12(m)) 
    peoe_vsa13.append(MolSurf.PEOE_VSA13(m))  
    peoe_vsa14.append(MolSurf.PEOE_VSA14(m))   
    peoe_vsa2.append(MolSurf.PEOE_VSA2(m)) 
    peoe_vsa3.append(MolSurf.PEOE_VSA3(m))
    peoe_vsa4.append(MolSurf.PEOE_VSA4(m))
    peoe_vsa5.append(MolSurf.PEOE_VSA5(m))
    peoe_vsa6.append(MolSurf.PEOE_VSA6(m))
    peoe_vsa7.append(MolSurf.PEOE_VSA7(m))
    peoe_vsa8.append(MolSurf.PEOE_VSA8(m)) 
    peoe_vsa9.append(MolSurf.PEOE_VSA9(m))
    smr_vsa1.append(MolSurf.SMR_VSA1(m))
    smr_vsa10.append(MolSurf.SMR_VSA10(m))
    smr_vsa2.append(MolSurf.SMR_VSA2(m))
    smr_vsa3.append(MolSurf.SMR_VSA3(m))
    smr_vsa4.append(MolSurf.SMR_VSA4(m))
    smr_vsa5.append(MolSurf.SMR_VSA5(m))
    smr_vsa6.append(MolSurf.SMR_VSA6(m))
    smr_vsa7.append(MolSurf.SMR_VSA7(m))  
    smr_vsa9.append(MolSurf.SMR_VSA9(m))
    slogp_vsa1.append(MolSurf.SlogP_VSA1(m))
    slogp_vsa10.append(MolSurf.SlogP_VSA10(m))
    slogp_vsa11.append(MolSurf.SlogP_VSA11(m))
    slogp_vsa12.append(MolSurf.SlogP_VSA12(m))
    slogp_vsa2.append(MolSurf.SlogP_VSA2(m))
    slogp_vsa3.append(MolSurf.SlogP_VSA3(m))
    slogp_vsa4.append(MolSurf.SlogP_VSA4(m))
    slogp_vsa5.append(MolSurf.SlogP_VSA5(m))
    slogp_vsa6.append(MolSurf.SlogP_VSA6(m))
    slogp_vsa7.append(MolSurf.SlogP_VSA7(m))
    slogp_vsa8.append(MolSurf.SlogP_VSA8(m)) 
    tpsa.append(MolSurf.TPSA(m))
    pyLabuteASA.append(MolSurf.pyLabuteASA(m))

100%|██████████████████████████████████████| 1614/1614 [00:02<00:00, 548.73it/s]


In [83]:
data["SSR"] = ssr
data["Num atoms"] = num_atoms
data["Num atoms with Hs"] = num_atoms_with_hs
data["Atomic num"] = atomic_num
data["Binary"] = binStr
data["Weight"] = mol_wt

In [84]:
data["Aliphatic carboxylic acids"] = num_of_carboxylic
data["aliphatic hydroxyl groups"] = num_of_hydroxyl
data["aliphatic hydroxyl groups excluding tert-OH"] = num_of_hydroxylOH
data["N functional groups attached to aromatics"] = num_of_nfunctional
data["Aromatic carboxylic acide"] = num_of_aromatics
data["aromatic nitrogens"] = num_of_arom_nitrogens
data["aromatic amines"] = num_of_arom_amines
data["aromatic hydroxyl groups"] = num_of_arom_hydroxyl
data["carboxylic acids"] = num_of_hydroxyl_acids
data["Number of carbonyl O"] = num_of_carbonyl
data["carbonyl O, excluding COOH"] = num_of_carbonyl_excluding
data["C(OH)CCN-Ctert-alkyl or C(OH)CCNcyclic"] = num_of_thiocarbonyl
data["Number of Imines"] = num_of_Imines
data["Tertiary amines"] = num_of_Tertiary_amines
data["Secondary amines"] = num_of_Secondary_amines
data["Primary amines"] = num_of_Primary_amines
data["hydroxylamine groups"] = num_of_hydroxylamine
data["XCCNR groups"] = num_of_XCCNR
data["tert-alicyclic amines"] = num_of_tertalicyclic
data["H-pyrrole nitrogens"] = num_of_Hpyrrole
data["thiol groups"] = num_of_thiol

In [85]:
data["Number of aldehydes"] = value_balabanJ
data["Balaban’s J value"] = value_balabanJ
data["“complexity” of molecules"] = complexity_of_mol
data["value_Chi0"] = value_Chi0
data["value_Chi0n"] = value_Chi0n
data["value_Chi0v"] = value_Chi0v
data["value_Chi1"] = value_Chi1
data["value_Chi1n"] = value_Chi1n
data["value_Chi1v"] = value_Chi1v
data["value_Chi2n"] = value_Chi2n
data["value_Chi2v"] = value_Chi2v
data["value_Chi3n"] = value_Chi3n
data["value_Chi3v"] = value_Chi3v
data["value_Chi4n"] = value_Chi4n
data["value_Chi4v"] = value_Chi4v
data["value_HallKierAlpha"] = value_HallKierAlpha
data["value_Ipc"] = value_Ipc
data["value_Kappa1"] = value_Kappa1
data["value_Kappa2"] = value_Kappa2
data["value_Kappa3"] = value_Kappa3

In [86]:
data["parameters_L"] = parameters_L
data["num_of_heavy_a"] = num_of_heavy_a
data["num_of_NHs_or_ONs"] = num_of_NHs_or_ONs
data["num_of_nitrogens_oxygens"] = num_of_nitrogens_oxygens 
data["num_of_aliphaticC"] = num_of_aliphaticC
data["num_of_aliphaticH"] = num_of_aliphaticH 
data["num_of_aliphaticR"] = num_of_aliphaticR 
data["num_of_aliphaticR"] = num_of_aliphaticR
data["num_of_aromatic_carbocycles"] = num_of_aromatic_carbocycles
data["num_of_aromatic_heterocycles"] = num_of_aromatic_heterocycles
data["num_of_aromatic_rings"] = num_of_aromatic_rings
data["num_of_haccept"] = num_of_haccept
data["num_of_hdonor"] = num_of_hdonor
data["num_of_heteroatoms"] = num_of_heteroatoms
data["num_of_rotatable"] = num_of_rotatable
data["num_of_sat_carbocycles"] = num_of_sat_carbocycles
data["num_of_sat_heterocycles"] = num_of_sat_heterocycles
data["num_of_sat_rings"] = num_of_sat_rings
data["count_of_rings"] = count_of_rings

In [87]:
data["labute"] = labute
data["peoe_vsa1"] = peoe_vsa1
data["peoe_vsa10"] = peoe_vsa10
data["peoe_vsa11"] = peoe_vsa11
data["peoe_vsa12"] = peoe_vsa12
data["peoe_vsa13"] = peoe_vsa13
data["peoe_vsa14"] = peoe_vsa14
data["peoe_vsa2"] = peoe_vsa2
data["peoe_vsa3"] = peoe_vsa3
data["peoe_vsa4"] = peoe_vsa4
data["peoe_vsa5"] = peoe_vsa5
data["peoe_vsa6"] = peoe_vsa6
data["peoe_vsa7"] = peoe_vsa7
data["peoe_vsa8"] = peoe_vsa8
data["peoe_vsa9"] = peoe_vsa9
data["smr_vsa1"] = smr_vsa1
data["smr_vsa10"] = smr_vsa10
data["smr_vsa2"] = smr_vsa2
data["smr_vsa3"] = smr_vsa3
data["smr_vsa4"] = smr_vsa4
data["smr_vsa5"] = smr_vsa5
data["smr_vsa6"] = smr_vsa6
data["smr_vsa7"] = smr_vsa7
data["smr_vsa9"] = smr_vsa9
data["slogp_vsa1"] = slogp_vsa1
data["slogp_vsa10"] = slogp_vsa10
data["slogp_vsa11"] = slogp_vsa11
data["slogp_vsa12"] = slogp_vsa12
data["slogp_vsa2"] = slogp_vsa2
data["slogp_vsa3"] = slogp_vsa3
data["slogp_vsa4"] = slogp_vsa4
data["slogp_vsa5"] = slogp_vsa5
data["slogp_vsa6"] = slogp_vsa6
data["slogp_vsa7"] = slogp_vsa7
data["slogp_vsa8"] = slogp_vsa8
data["tpsa"] = tpsa
data["pyLabuteASA"] = pyLabuteASA

  data["slogp_vsa8"] = slogp_vsa8
  data["tpsa"] = tpsa
  data["pyLabuteASA"] = pyLabuteASA


In [88]:
data["num_of_aldehyde"] = num_of_aldehyde
data["num_of_amide"] = num_of_amide
data["num_of_amidine"] = num_of_amidine
data["num_of_aniline"] = num_of_aniline
data["num_of_aryl_methyl"] = num_of_aryl_methyl
data["num_of_azide"] = num_of_azide
data["num_of_azo"] = num_of_azo
data["num_of_barbitur"] = num_of_barbitur
data["num_of_benzene"] = num_of_benzene
data["num_of_benzodiazepine"] = num_of_benzodiazepine
data["num_of_bicyclic"] = num_of_bicyclic
data["num_of_diazo"] = num_of_diazo
data["num_of_dihydropyridine"] = num_of_dihydropyridine
data["num_of_epoxide"] = num_of_epoxide
data["num_of_ester"] = num_of_ester
data["num_of_ether"] = num_of_ether
data["num_of_furan"] = num_of_furan
data["num_of_guanido"] = num_of_guanido
data["num_of_halogen"] = num_of_halogen
data["num_of_hdrzine"] = num_of_hdrzine
data["num_of_hdrzone"] = num_of_hdrzone
data["num_of_imidazole"] = num_of_imidazole
data["num_of_imide"] = num_of_imide
data["num_of_isothiocyan"] = num_of_isothiocyan
data["num_of_ketone"] = num_of_ketone
data["num_of_ketone_Topliss"] = num_of_ketone_Topliss
data["num_of_lactam"] = num_of_lactam
data["num_of_lactone"] = num_of_lactone
data["num_of_methoxy"] = num_of_methoxy
data["num_of_morpholine"] = num_of_morpholine
data["num_of_nitrile"] = num_of_nitrile
data["num_of_nitro"] = num_of_nitro
data["num_of_nitro_arom"] = num_of_nitro_arom
data["num_of_nitro_arom_nonortho"] = num_of_nitro_arom_nonortho
data["num_of_nitroso"] = num_of_nitroso
data["num_of_oxazole"] = num_of_oxazole
data["num_of_oxime"] = num_of_oxime
data["num_of_para_hydroxylation"] = num_of_para_hydroxylation
data["num_of_phenol"] = num_of_phenol
data["num_of_phenol_noOrthoHbond"] = num_of_phenol_noOrthoHbond
data["num_of_phos_acid"] = num_of_phos_acid
data["num_of_phos_ester"] = num_of_phos_ester
data["num_of_piperdine"] = num_of_piperdine
data["num_of_piperzine"] = num_of_piperzine
data["num_of_priamide"] = num_of_priamide
data["num_of_pyridine"] = num_of_pyridine
data["num_of_quatN"] = num_of_quatN
data["num_of_sulfide"] = num_of_sulfide
data["num_of_sulfonamd"] = num_of_sulfonamd
data["num_of_sulfone"] = num_of_sulfone
data["num_of_term_acetylene"] = num_of_term_acetylene
data["num_of_tetrazole"] = num_of_tetrazole
data["num_of_thiazole"] = num_of_thiazole
data["num_of_thiocyan"] = num_of_thiocyan
data["num_of_thiophene"] = num_of_thiophene
data["num_of_unbrch_alkane"] = num_of_unbrch_alkane
data["num_of_urea"] = num_of_urea

  data["num_of_aldehyde"] = num_of_aldehyde
  data["num_of_amide"] = num_of_amide
  data["num_of_amidine"] = num_of_amidine
  data["num_of_aniline"] = num_of_aniline
  data["num_of_aryl_methyl"] = num_of_aryl_methyl
  data["num_of_azide"] = num_of_azide
  data["num_of_azo"] = num_of_azo
  data["num_of_barbitur"] = num_of_barbitur
  data["num_of_benzene"] = num_of_benzene
  data["num_of_benzodiazepine"] = num_of_benzodiazepine
  data["num_of_bicyclic"] = num_of_bicyclic
  data["num_of_diazo"] = num_of_diazo
  data["num_of_dihydropyridine"] = num_of_dihydropyridine
  data["num_of_epoxide"] = num_of_epoxide
  data["num_of_ester"] = num_of_ester
  data["num_of_ether"] = num_of_ether
  data["num_of_furan"] = num_of_furan
  data["num_of_guanido"] = num_of_guanido
  data["num_of_halogen"] = num_of_halogen
  data["num_of_hdrzine"] = num_of_hdrzine
  data["num_of_hdrzone"] = num_of_hdrzone
  data["num_of_imidazole"] = num_of_imidazole
  data["num_of_imide"] = num_of_imide
  data["num_of_isothio

  data["num_of_urea"] = num_of_urea


In [90]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1614 entries, 0 to 1613
Columns: 160 entries, Smiles to num_of_urea
dtypes: float64(59), int64(100), object(1)
memory usage: 2.0+ MB


In [91]:
data.loc[:, :"N functional groups attached to aromatics"].describe()

Unnamed: 0,SSR,Num atoms,Num atoms with Hs,Atomic num,Binary,Weight,Aliphatic carboxylic acids,aliphatic hydroxyl groups,aliphatic hydroxyl groups excluding tert-OH,N functional groups attached to aromatics
count,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0
mean,3.080545,26.48575,49.123296,178.920074,384.817224,382.282513,0.164808,0.454151,0.390954,0.105948
std,1.665732,14.449887,29.049079,95.752466,225.753264,205.968318,0.515213,1.242487,1.196789,0.352902
min,0.0,5.0,6.0,35.0,94.0,75.032028,0.0,0.0,0.0,0.0
25%,2.0,19.0,33.0,129.0,284.25,274.348463,0.0,0.0,0.0,0.0
50%,3.0,25.0,46.0,169.0,367.0,360.091873,0.0,0.0,0.0,0.0
75%,4.0,31.0,58.0,212.0,450.0,452.766151,0.0,0.0,0.0,0.0
max,21.0,295.0,577.0,1950.0,5537.0,4184.027307,6.0,21.0,21.0,4.0


In [92]:
data.loc[:, "Aromatic carboxylic acide":"Tertiary amines"].describe()

Unnamed: 0,Aromatic carboxylic acide,aromatic nitrogens,aromatic amines,aromatic hydroxyl groups,carboxylic acids,Number of carbonyl O,"carbonyl O, excluding COOH",C(OH)CCN-Ctert-alkyl or C(OH)CCNcyclic,Number of Imines,Tertiary amines
count,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0
mean,0.050186,1.065675,0.141884,0.197646,0.214994,1.107807,0.905824,0.013631,0.023544,1.595415
std,0.240034,1.487088,0.394089,0.619957,0.558093,1.984947,1.787135,0.148771,0.159636,1.625282
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
75%,0.0,2.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,2.0
max,3.0,7.0,3.0,8.0,6.0,47.0,41.0,4.0,2.0,8.0


In [93]:
data.loc[:, "Secondary amines":"Balaban’s J value"].describe()

Unnamed: 0,Secondary amines,Primary amines,hydroxylamine groups,XCCNR groups,tert-alicyclic amines,H-pyrrole nitrogens,thiol groups,Number of aldehydes,Balaban’s J value
count,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0
mean,0.915118,0.273234,0.019827,0.107187,0.101611,0.141884,0.008055,1.729356,1.729356
std,1.789161,0.636025,0.185269,0.359493,0.344414,0.394089,0.102345,0.904837,0.904837
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-6e-06,-6e-06
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.446822,1.446822
50%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.787375,1.787375
75%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.208945,2.208945
max,38.0,7.0,3.0,4.0,4.0,3.0,2.0,5.603987,5.603987


In [94]:
data.loc[:, "Balaban’s J value":"value_Chi2v"].describe()

Unnamed: 0,Balaban’s J value,“complexity” of molecules,value_Chi0,value_Chi0n,value_Chi0v,value_Chi1,value_Chi1n,value_Chi1v,value_Chi2n,value_Chi2v
count,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0
mean,1.729356,856.459498,19.007053,15.022682,15.719544,12.58322,8.680626,9.229701,6.701638,7.306723
std,0.904837,576.854974,10.572967,8.378189,8.419696,6.872416,4.967464,5.027132,4.020872,4.088197
min,-6e-06,42.854753,4.284457,2.211244,2.855462,2.0,0.804738,1.177731,0.398755,0.66799
25%,1.446822,528.058942,13.543606,10.407611,11.132546,8.897065,5.90223,6.438623,4.401829,5.032502
50%,1.787375,803.185427,17.907567,14.186507,14.975879,12.029372,8.211637,8.726504,6.13869,6.769618
75%,2.208945,1091.982442,22.520133,17.941787,18.703549,15.041469,10.537556,11.119502,8.063161,8.796426
max,5.603987,11276.457411,220.060743,166.478617,167.295113,139.435414,96.728699,98.122546,73.348746,74.334344


In [95]:
data.loc[:, "value_Chi3n":"parameters_L"].describe()

Unnamed: 0,value_Chi3n,value_Chi3v,value_Chi4n,value_Chi4v,value_HallKierAlpha,value_Ipc,value_Kappa1,value_Kappa2,value_Kappa3,parameters_L
count,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0
mean,4.710516,5.181277,3.331197,3.711622,-2.336408,5.543539e+109,19.277386,8.199297,5.237015,0.372583
std,2.964335,3.010615,2.258895,2.314557,1.609315,2.227095e+111,11.798551,5.992332,20.333953,0.250247
min,0.0,0.0,0.0,0.0,-29.28,3.60964,3.401939,0.925121,0.406543,0.0
25%,2.970059,3.429051,1.951389,2.304734,-3.1,19484.13,13.303981,5.199046,2.62761,0.1875
50%,4.220723,4.772936,2.912654,3.349629,-2.27,537862.3,18.062876,7.415765,3.811235,0.345491
75%,5.767288,6.341112,4.060704,4.572803,-1.3825,12134740.0,22.815204,9.829041,5.39403,0.5
max,47.980484,48.624435,31.850107,32.321511,2.52,8.947271e+112,248.533359,127.016899,800.769408,1.0


In [96]:
data.loc[:, "num_of_heavy_a":"num_of_aromatic_rings"].describe()

Unnamed: 0,num_of_heavy_a,num_of_NHs_or_ONs,num_of_nitrogens_oxygens,num_of_aliphaticC,num_of_aliphaticH,num_of_aliphaticR,num_of_aromatic_carbocycles,num_of_aromatic_heterocycles,num_of_aromatic_rings
count,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0
mean,26.483891,2.442379,6.234201,0.375465,0.689591,1.065056,1.267038,0.748451,2.015489
std,14.448984,3.282158,5.135575,0.929795,1.04975,1.400153,0.979614,0.921558,1.323312
min,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,1.0,4.0,0.0,0.0,0.0,1.0,0.0,1.0
50%,25.0,2.0,6.0,0.0,0.0,1.0,1.0,0.0,2.0
75%,31.0,3.0,8.0,0.0,1.0,2.0,2.0,1.0,3.0
max,295.0,65.0,110.0,6.0,21.0,21.0,8.0,4.0,8.0


In [97]:
data.loc[:, "num_of_haccept":"labute"].describe()

Unnamed: 0,num_of_haccept,num_of_hdonor,num_of_heteroatoms,num_of_rotatable,num_of_sat_carbocycles,num_of_sat_heterocycles,num_of_sat_rings,count_of_rings,labute
count,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0
mean,5.106568,2.144981,7.303594,5.225527,0.249071,0.467782,0.716853,3.080545,158.272307
std,3.635234,2.865022,5.332191,6.028857,0.740249,0.920869,1.189852,1.665732,83.738139
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,29.56385
25%,3.0,1.0,4.0,2.0,0.0,0.0,0.0,2.0,114.073019
50%,4.0,2.0,7.0,4.0,0.0,0.0,0.0,3.0,149.81651
75%,6.0,3.0,9.0,7.0,0.0,1.0,1.0,4.0,186.956637
max,60.0,58.0,111.0,134.0,5.0,21.0,21.0,21.0,1699.043459


In [98]:
data.loc[:, "peoe_vsa1":"peoe_vsa9"].describe()

Unnamed: 0,peoe_vsa1,peoe_vsa10,peoe_vsa11,peoe_vsa12,peoe_vsa13,peoe_vsa14,peoe_vsa2,peoe_vsa3,peoe_vsa4,peoe_vsa5,peoe_vsa6,peoe_vsa7,peoe_vsa8,peoe_vsa9
count,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0
mean,16.445709,9.880563,4.550247,3.699185,2.512951,4.671918,7.423256,5.353382,2.389805,2.660514,24.857342,39.536452,20.857238,13.407602
std,17.177514,13.35069,7.006647,9.968877,4.293827,9.054125,10.313679,6.189118,4.980129,6.180591,23.070405,26.063939,16.348623,12.381003
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,9.473726,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.923737,22.41665,10.408858,5.563451
50%,14.37006,5.824404,0.0,0.0,0.0,0.0,4.89991,4.794537,0.0,0.0,19.056471,36.591933,17.802136,11.620667
75%,20.460712,12.356394,5.949142,5.90718,5.879988,5.969305,9.778516,9.499376,4.37354,0.0,36.043523,53.945927,29.303293,18.752689
max,322.175726,199.380747,59.209859,242.194369,40.472507,142.102226,230.752531,36.540978,39.513735,69.605639,250.627296,272.080698,120.905902,135.224112


In [99]:
data.loc[:, "smr_vsa1":"slogp_vsa11"].describe()

Unnamed: 0,smr_vsa1,smr_vsa10,smr_vsa2,smr_vsa3,smr_vsa4,smr_vsa5,smr_vsa6,smr_vsa7,smr_vsa9,slogp_vsa1,slogp_vsa10,slogp_vsa11
count,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0
mean,18.580167,22.958366,0.409921,9.559266,4.570873,32.427049,15.422421,47.964202,6.353898,9.905955,5.360675,3.517112
std,19.4351,18.939359,1.842178,11.463638,8.804604,37.727332,15.755956,28.758005,9.087935,12.908632,7.463758,6.482684
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,9.184952,11.638529,0.0,0.0,0.0,11.819221,5.316789,28.785186,0.0,4.736863,0.0,0.0
50%,14.325937,18.550501,0.0,5.37129,0.0,24.477833,11.923671,47.544254,0.0,5.733667,0.0,0.0
75%,23.473267,31.120273,0.0,14.783798,5.917906,40.769088,23.822937,65.724207,11.387856,11.467335,8.78083,5.749512
max,291.728104,306.634565,21.637134,220.639176,73.976345,564.974885,124.586109,178.750546,68.994142,226.223273,54.790196,68.994142


In [100]:
data.loc[:, "slogp_vsa11":"num_of_aldehyde"].describe()

Unnamed: 0,slogp_vsa11,slogp_vsa12,slogp_vsa2,slogp_vsa3,slogp_vsa4,slogp_vsa5,slogp_vsa6,slogp_vsa7,slogp_vsa8,tpsa,pyLabuteASA,num_of_aldehyde
count,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0
mean,3.517112,6.870052,39.987693,11.241703,6.197179,29.580045,38.23657,0.902252,6.446929,88.7414,158.272307,0.004957
std,6.482684,11.739771,37.619963,13.344837,9.327772,28.22889,24.171565,2.4695,9.410576,81.838884,83.738139,0.07025
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,29.56385,0.0
25%,0.0,0.0,19.935914,4.736863,0.0,11.126903,18.592944,0.0,0.0,50.45,114.073019,0.0
50%,0.0,0.0,32.254083,9.589074,0.0,23.527598,36.398202,0.0,0.0,76.775,149.81651,0.0
75%,5.749512,11.60094,50.45229,15.952222,10.738296,39.189689,54.597304,0.0,11.033401,106.8375,186.956637,0.0
max,68.994142,135.545224,726.116816,251.026534,56.664483,370.380023,145.723286,20.090533,75.407139,1749.75,1699.043459,1.0


In [101]:
data.loc[:, "num_of_aldehyde":"num_of_benzene"].describe()

Unnamed: 0,num_of_aldehyde,num_of_amide,num_of_amidine,num_of_aniline,num_of_aryl_methyl,num_of_azide,num_of_azo,num_of_barbitur,num_of_benzene
count,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0
mean,0.004957,0.682156,0.019207,0.566914,0.295539,0.0,0.001859,0.001859,1.268278
std,0.07025,1.760971,0.150231,0.902663,0.619995,0.0,0.055645,0.043086,0.981805
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0
max,1.0,41.0,2.0,6.0,4.0,0.0,2.0,1.0,8.0


In [102]:
data.loc[:, "num_of_benzodiazepine":"num_of_hdrzine"].describe()

Unnamed: 0,num_of_benzodiazepine,num_of_bicyclic,num_of_diazo,num_of_dihydropyridine,num_of_epoxide,num_of_ester,num_of_ether,num_of_furan,num_of_guanido,num_of_halogen,num_of_hdrzine
count,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0
mean,0.00062,1.007435,0.0,0.005576,0.008055,0.128253,0.76456,0.024783,0.037175,0.697646,0.009294
std,0.024891,1.490601,0.0,0.074489,0.096096,0.43015,1.363898,0.16329,0.279235,1.211756,0.10224
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
max,1.0,14.0,0.0,1.0,2.0,5.0,21.0,2.0,4.0,10.0,2.0


In [103]:
data.loc[:, "num_of_hdrzone":"num_of_methoxy"].describe()

Unnamed: 0,num_of_hdrzone,num_of_imidazole,num_of_imide,num_of_isothiocyan,num_of_ketone,num_of_ketone_Topliss,num_of_lactam,num_of_lactone,num_of_methoxy
count,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0
mean,0.019207,0.095415,0.033457,0.001859,0.123296,0.075589,0.011152,0.019827,0.226146
std,0.173231,0.302199,0.196361,0.055645,0.427267,0.30574,0.105047,0.148072,0.699152
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,4.0,2.0,2.0,2.0,3.0,3.0,1.0,2.0,10.0


In [104]:
data.loc[:, "num_of_morpholine":"num_of_phenol_noOrthoHbond"].describe()

Unnamed: 0,num_of_morpholine,num_of_nitrile,num_of_nitro,num_of_nitro_arom,num_of_nitro_arom_nonortho,num_of_nitroso,num_of_oxazole,num_of_oxime,num_of_para_hydroxylation,num_of_phenol,num_of_phenol_noOrthoHbond
count,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0
mean,0.026642,0.038414,0.032218,0.022305,0.015489,0.001859,0.003717,0.010533,0.210037,0.181537,0.1772
std,0.161084,0.201695,0.193388,0.155887,0.123527,0.043086,0.060877,0.10802,0.518593,0.59644,0.594103
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,2.0,3.0,2.0,1.0,1.0,1.0,2.0,3.0,8.0,8.0


In [105]:
data.loc[:, "num_of_phos_acid":"num_of_sulfonamd"].describe()

Unnamed: 0,num_of_phos_acid,num_of_phos_ester,num_of_piperdine,num_of_piperzine,num_of_priamide,num_of_pyridine,num_of_quatN,num_of_sulfide,num_of_sulfonamd
count,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0
mean,0.016729,0.011772,0.122677,0.057001,0.042131,0.197026,0.01425,0.060099,0.065675
std,0.187237,0.146833,0.422338,0.231917,0.224278,0.466713,0.133326,0.26952,0.273933
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,3.0,3.0,4.0,1.0,3.0,4.0,2.0,4.0,2.0


In [106]:
data.loc[:, "num_of_sulfone":].describe()

Unnamed: 0,num_of_sulfone,num_of_term_acetylene,num_of_tetrazole,num_of_thiazole,num_of_thiocyan,num_of_thiophene,num_of_unbrch_alkane,num_of_urea
count,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0
mean,0.017968,0.006196,0.009913,0.027881,0.0,0.031599,0.289963,0.050805
std,0.132875,0.078493,0.099101,0.172048,0.0,0.188623,1.384149,0.243749
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,2.0,0.0,2.0,20.0,4.0


Будем нормализовать все фичи кроме parameters_L

In [107]:
name_col = data.columns.values
# index = [0, 1, 51]
index = [0, 50]
name_col = np.delete(name_col,index)

In [108]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(data[name_col])
data[name_col] = scaler.transform(data[name_col])

Только эти столбцы нужны для обучения. Еще поменяем True и False на 1 и 0. Разобьем выборку на трейн и тест со стратификацией

In [128]:
name_col = data.columns.values
index = [0]
name_col = np.delete(name_col, index)

In [129]:
X = data[name_col]

In [111]:
# r = []
# for a in data["Active"]:
#     r.append(1 if a else 0)
# y = pd.Series(r, copy=False)

In [112]:
# y

In [113]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)

In [114]:
# X_train

In [115]:
# data.to_csv('train_new.csv')

In [116]:
# from tensorflow import keras

# hid_size = 159
# model = keras.Sequential(
#     [
#         keras.layers.Dense(
#             hid_size, activation="relu", input_shape=(X_train.shape[-1],)
#         ),
#         keras.layers.Dense(300, activation="relu", kernel_regularizer=keras.regularizers.l1_l2(l1=1e-4, l2=1e-4)),
#         keras.layers.Dense(400, activation="relu", kernel_regularizer=keras.regularizers.l1_l2(l1=1e-4, l2=1e-4)),
#         keras.layers.Dense(300, activation="relu", kernel_regularizer=keras.regularizers.l1_l2(l1=1e-4, l2=1e-4)),
#         keras.layers.Dense(1, activation="sigmoid"),
#     ]
# )

In [117]:
# metrics = [
#     keras.metrics.FalseNegatives(name="fn"),
#     keras.metrics.FalsePositives(name="fp"),
#     keras.metrics.TrueNegatives(name="tn"),
#     keras.metrics.TruePositives(name="tp"),
#     keras.metrics.Precision(name="precision"),
#     keras.metrics.Recall(name="recall"),
# ]

In [118]:
# model.compile(
#     optimizer=keras.optimizers.Adam(1e-2), loss="binary_crossentropy", metrics=metrics
# )

In [119]:
# model.fit(
#     X_train,
#     y_train,
#     batch_size=2048,
#     epochs=2000,
#     verbose=2,
# )

In [120]:
# y_pred = model.predict(X_test)

In [121]:
# np.mean(y_pred)

In [122]:
# y_predict = []
# for y in y_pred:
#     int_y = 1 if y > 0.1 else 0
#     y_predict.append(int_y)

In [123]:
# accuracy_score(y_test, y_predict)

In [124]:
# confusion_matrix(y_test, y_predict)

In [130]:
X

Unnamed: 0,SSR,Num atoms,Num atoms with Hs,Atomic num,Binary,Weight,Aliphatic carboxylic acids,aliphatic hydroxyl groups,aliphatic hydroxyl groups excluding tert-OH,N functional groups attached to aromatics,...,num_of_sulfide,num_of_sulfonamd,num_of_sulfone,num_of_term_acetylene,num_of_tetrazole,num_of_thiazole,num_of_thiocyan,num_of_thiophene,num_of_unbrch_alkane,num_of_urea
0,1.753199,0.866313,0.753328,0.721666,0.771802,0.708956,-0.319983,-0.365631,-0.326770,-0.300312,...,-0.223055,-0.239824,-0.135265,-0.078958,-0.100063,-0.162105,0.0,-0.167574,-0.209553,-0.208498
1,-0.048369,-0.518210,-0.624079,-0.531953,-0.358100,-0.549875,-0.319983,-0.365631,-0.326770,-0.300312,...,-0.223055,-0.239824,-0.135265,-0.078958,-0.100063,-0.162105,0.0,-0.167574,-0.209553,-0.208498
2,-1.249415,-0.587436,-0.555208,-0.605081,-0.548633,-0.607958,-0.319983,0.439456,0.509057,-0.300312,...,-0.223055,-0.239824,-0.135265,-0.078958,-0.100063,-0.162105,0.0,-0.167574,-0.209553,-0.208498
3,-0.648892,-0.656662,-0.692949,-0.573741,-0.570788,-0.588772,-0.319983,1.244543,1.344885,2.534213,...,-0.223055,-0.239824,-0.135265,-0.078958,-0.100063,-0.162105,0.0,-0.167574,-0.209553,-0.208498
4,1.753199,0.589409,0.236800,0.460496,0.612286,0.412269,-0.319983,-0.365631,-0.326770,2.534213,...,-0.223055,-0.239824,-0.135265,-0.078958,-0.100063,-0.162105,0.0,-0.167574,-0.209553,-0.208498
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1609,0.552154,-0.172079,-0.314162,0.073963,-0.132120,0.032756,-0.319983,-0.365631,-0.326770,-0.300312,...,-0.223055,7.063493,-0.135265,-0.078958,-0.100063,-0.162105,0.0,-0.167574,-0.209553,-0.208498
1610,-0.648892,-0.172079,-0.176422,-0.155868,-0.163137,-0.165887,-0.319983,-0.365631,-0.326770,-0.300312,...,-0.223055,3.411835,-0.135265,-0.078958,-0.100063,-0.162105,0.0,-0.167574,-0.209553,3.895357
1611,0.552154,0.312504,0.615587,0.199325,0.195773,0.242679,-0.319983,1.244543,1.344885,-0.300312,...,-0.223055,-0.239824,-0.135265,-0.078958,-0.100063,-0.162105,0.0,-0.167574,-0.209553,-0.208498
1612,-0.048369,-0.172079,-0.245292,-0.281229,-0.092241,-0.292075,-0.319983,-0.365631,-0.326770,-0.300312,...,-0.223055,-0.239824,-0.135265,-0.078958,-0.100063,-0.162105,0.0,-0.167574,-0.209553,-0.208498


In [131]:
test_pred = model.predict(X)

In [132]:
y_predict = []
for y in test_pred:
    int_y = 1 if y > 0.1 else 0
    y_predict.append(int_y)

In [138]:
import re

with open('submission.csv', 'w') as dst:
    dst.write('id,label\n')
    for path, score in zip(range(len(X)), y_predict):
        dst.write(f'{path},{score}\n')

In [None]:
# from rdkit.Chem.BRICS import BRICSDecompose #пока не знаю как использовать
# brics = []
# for mol in tqdm(data["Smiles"]):
#     m = Chem.MolFromSmiles(mol)
#     brics.append(BRICSDecompose(m))

In [None]:
# from rdkit.Chem.Descriptors3D import Asphericity
# den_morgan1 = []
# den_morgan2 = []
# den_morgan3 = []
# heavy_mol_vt = []
# for mol in tqdm(data["Smiles"]):
#     m = Chem.MolFromSmiles(mol)
# #     den_morgan1.append(Descriptors.MaxAbsPartialCharge(mol))
# #     den_morgan2.append(Descriptors.MaxPartialCharge(mol))
# #     den_morgan3.append(Descriptors.MinAbsPartialCharge(mol))
#     heavy_mol_vt.append(PropertyFunctor(mol))