## Библиотеки

In [1]:
import pandas as pd
from tqdm import tqdm
from urllib.request import urlopen
from urllib.parse import quote
from rdkit import Chem
import numpy as np

In [2]:
#если не запустится ячейка не страшно
from sklearn.model_selection import train_test_split
import scipy.stats # При работе со статистикой
import seaborn as sns
import matplotlib.pyplot as plt  # Библиотека для визуализации результатов
%matplotlib inline

#нужны на время для визуализации
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import NeighborhoodComponentsAnalysis

from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.neighbors import KNeighborsClassifier

from sklearn.feature_selection import RFE
from sklearn.svm import SVR

#нужны на время для проверки фичей
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

## Работа с данными

загрузка данных

In [3]:
data = pd.read_csv("Task/train.csv")

In [4]:
data = data.loc[:, 'Smiles':]

In [5]:
data

Unnamed: 0,Smiles,Active
0,COc1ccc2[nH]cc(CCN)c2c1,False
1,CCCN1CCC[C@H](c2cccc(O)c2)C1.Cl,False
2,O=C(NO)c1cnc(N2CCN(S(=O)(=O)c3ccc4ccccc4c3)CC2...,False
3,Nc1cccc(CNC(=O)c2ccc(Oc3ccc(OCc4cccc(F)c4)cc3)...,False
4,Fc1ccccc1CNCc1ccc(-c2ccnc3[nH]ccc23)cc1,False
...,...,...
5552,O=C(Oc1ccc([N+](=O)[O-])cc1)N1CCC(C(O)(c2ccc3c...,False
5553,Nc1nonc1/C(=N/O)Nc1ccc(F)c(Br)c1,False
5554,Oc1cccc2cccnc12,False
5555,OC(c1ccc(-c2ccc(CN3CCN(Cc4ccncc4)CC3)cc2)c(F)c...,False


Генерация фичей

In [6]:
ssr = []
num_atoms = []
num_atoms_with_hs = []
atomic_num = []
binStr = []

for mol in tqdm(data["Smiles"]):
    a = []
    m = Chem.MolFromSmiles(mol)
    ssr.append(Chem.GetSSSR(m))
    num_atoms.append(m.GetNumAtoms())
    m2 = Chem.AddHs(m)
    num_atoms_with_hs.append(m2.GetNumAtoms())
    for atom in m.GetAtoms():
        a.append(atom.GetAtomicNum())
    atomic_num.append(sum(a))
    binStr.append(len(m.ToBinary()))

100%|█████████████████████████████████████| 5557/5557 [00:03<00:00, 1679.49it/s]


In [7]:
from rdkit.Chem.Descriptors import ExactMolWt

mol_wt = []

for mol in tqdm(data["Smiles"]):
    m = Chem.MolFromSmiles(mol)
    mol_wt.append(ExactMolWt(m)) #weight of the molecule

100%|█████████████████████████████████████| 5557/5557 [00:01<00:00, 3703.42it/s]


In [8]:
from rdkit.Chem import Fragments

num_of_carboxylic = []
num_of_hydroxyl = []
num_of_hydroxylOH = []
num_of_nfunctional = []
num_of_aromatics = []
num_of_arom_nitrogens = []
num_of_arom_amines = []
num_of_arom_hydroxyl = []
num_of_hydroxyl_acids = []
num_of_carbonyl = []
num_of_carbonyl_excluding = []
num_of_thiocarbonyl = []
num_of_sml = []
num_of_Imines = []
num_of_Tertiary_amines = []
num_of_Secondary_amines = []
num_of_Primary_amines = []
num_of_hydroxylamine = []
num_of_XCCNR = []
num_of_tertalicyclic = []
num_of_Hpyrrole = []
num_of_thiol = []

for mol in tqdm(data["Smiles"]):
    
    m = Chem.MolFromSmiles(mol)
    num_of_carboxylic.append(Fragments.fr_Al_COO(m)) #Number of aliphatic carboxylic acids
    num_of_hydroxyl.append(Fragments.fr_Al_OH(m)) #Number of aliphatic hydroxyl groups
    num_of_hydroxylOH.append(Fragments.fr_Al_OH_noTert(m)) #Number of aliphatic hydroxyl groups excluding tert-OH
    num_of_nfunctional.append(Fragments.fr_ArN(m)) #Number of N functional groups attached to aromatics
    num_of_aromatics.append(Fragments.fr_Ar_COO(m)) #Number of Aromatic carboxylic acide
    num_of_arom_nitrogens.append(Fragments.fr_Ar_N(m)) #Number of aromatic nitrogens
    num_of_arom_amines.append(Fragments.fr_Ar_NH(m)) #Number of aromatic amines
    num_of_arom_hydroxyl.append(Fragments.fr_Ar_OH(m)) #Number of aromatic hydroxyl groups
    num_of_hydroxyl_acids.append(Fragments.fr_COO(m)) #Number of carboxylic acids
    num_of_carbonyl.append(Fragments.fr_C_O(m)) #Number of carbonyl O
    num_of_carbonyl_excluding.append(Fragments.fr_C_O_noCOO(m)) #Number of carbonyl O, excluding COOH
    num_of_thiocarbonyl.append(Fragments.fr_C_S(m)) #Number of thiocarbonyl
    num_of_sml.append(Fragments.fr_HOCCN(m)) #Number of C(OH)CCN-Ctert-alkyl or C(OH)CCNcyclic
    num_of_Imines.append(Fragments.fr_Imine(m)) #Number of Imines
    num_of_Tertiary_amines.append(Fragments.fr_NH0(m)) #Number of Tertiary amines
    num_of_Secondary_amines.append(Fragments.fr_NH1(m)) #Number of Secondary amines
    num_of_Primary_amines.append(Fragments.fr_NH2(m)) #Number of Primary amines
    num_of_hydroxylamine.append(Fragments.fr_N_O(m)) #Number of hydroxylamine groups
    num_of_XCCNR.append(Fragments.fr_Ndealkylation1(m)) #Number of XCCNR groups
    num_of_tertalicyclic.append(Fragments.fr_Ndealkylation2(m)) #Number of tert-alicyclic amines (no heteroatoms, not quinine-like bridged N)
    num_of_Hpyrrole.append(Fragments.fr_Nhpyrrole(m)) #Number of H-pyrrole nitrogens
    num_of_thiol.append(Fragments.fr_SH(m)) #Number of thiol groups

100%|█████████████████████████████████████| 5557/5557 [00:03<00:00, 1655.86it/s]


In [17]:
########################################
#  Писать тут, и только тут))))))))))  #
########################################

from rdkit.Chem import Fragments

num_of_aldehyde = []
num_of_alkyl_carbamate = []
num_of_alkyl_halide = []
num_of_allylic_oxid = []
num_of_amide = []
num_of_amidine = []
num_of_aniline = []
num_of_aryl_methyl = []
num_of_azide = []
num_of_azo = []
num_of_barbitur = []
num_of_benzene = []
num_of_benzodiazepine = []
num_of_bicyclic = []
num_of_diazo = []
num_of_dihydropyridine = []
num_of_epoxide = []
num_of_ester = []
num_of_ether = []
num_of_furan = []
num_of_guanido = []
num_of_halogen = []
num_of_hdrzine = []
num_of_hdrzone = []
num_of_imidazole = []
num_of_imide = []
num_of_isothiocyan = []
num_of_ketone = []
num_of_ketone_Topliss = []
num_of_lactam = []
num_of_lactone = []
num_of_methoxy = []
num_of_morpholine = []
num_of_nitrile = []
num_of_nitro = []
num_of_nitro_arom = []
num_of_nitro_arom_nonortho = []
num_of_nitroso = []
num_of_oxazole = []
num_of_oxime = []
num_of_para_hydroxylation = []
num_of_phenol = []
num_of_phenol_noOrthoHbond = []
num_of_phos_acid = []
num_of_phos_ester = []
num_of_piperdine = []
num_of_piperzine = []
num_of_priamide = []
num_of_pyridine = []
num_of_quatN = []
num_of_sulfide = []
num_of_sulfonamd = []
num_of_sulfone = []
num_of_term_acetylene = []
num_of_tetrazole = []
num_of_thiazole = []
num_of_thiocyan = []
num_of_thiophene = []
num_of_unbrch_alkane = []
num_of_urea = []

for mol in tqdm(data["Smiles"]):
    
    m = Chem.MolFromSmiles(mol)
    num_of_aldehyde.append(Fragments.fr_aldehyde(m)) # Number of aldehydes
    num_of_alkyl_carbamate.append(Fragments.fr_alkyl_carbamate(m)) # Number of alkyl carbamates (subject to hydrolysis)
    num_of_alkyl_halide.append(Fragments.fr_alkyl_halide(m)) # Number of alkyl halides
    num_of_allylic_oxid.append(Fragments.fr_allylic_oxid(m)) # Number of allylic oxidation sites excluding steroid dienone
    num_of_amide.append(Fragments.fr_amide(m)) # Number of amides
    num_of_amidine.append(Fragments.fr_amidine(m)) # Number of amidine groups
    num_of_aniline.append(Fragments.fr_aniline(m)) # Number of anilines
    num_of_aryl_methyl.append(Fragments.fr_aryl_methyl(m)) # Number of aryl methyl sites for hydroxylation
    num_of_azide.append(Fragments.fr_azide(m)) # Number of azide groups
    num_of_azo.append(Fragments.fr_azo(m)) # Number of azo groups
    num_of_barbitur.append(Fragments.fr_barbitur(m)) # Number of barbiturate groups
    num_of_benzene.append(Fragments.fr_benzene(m)) # Number of benzene rings
    num_of_benzodiazepine.append(Fragments.fr_benzodiazepine(m)) # Number of benzodiazepines with no additional fused rings
    num_of_bicyclic.append(Fragments.fr_bicyclic(m)) # Bicyclic
    num_of_diazo.append(Fragments.fr_diazo(m)) # Number of diazo groups
    num_of_dihydropyridine.append(Fragments.fr_dihydropyridine(m)) # Number of dihydropyridines
    num_of_epoxide.append(Fragments.fr_epoxide(m)) # Number of epoxide rings
    num_of_ester.append(Fragments.fr_ester(m)) # Number of esters
    num_of_ether.append(Fragments.fr_ether(m)) # Number of ether oxygens (including phenoxy)
    num_of_furan.append(Fragments.fr_furan(m)) # Number of furan rings
    num_of_guanido.append(Fragments.fr_guanido(m)) # Number of guanidine groups
    num_of_halogen.append(Fragments.fr_halogen(m)) # Number of halogens
    num_of_hdrzine.append(Fragments.fr_hdrzine(m)) # Number of hydrazine groups
    num_of_hdrzone.append(Fragments.fr_hdrzone(m)) # Number of hydrazone groups
    num_of_imidazole.append(Fragments.fr_imidazole(m)) # Number of imidazole rings
    num_of_imide.append(Fragments.fr_imide(m)) # Number of imide groups
    num_of_isothiocyan.append(Fragments.fr_isothiocyan(m)) # Number of isothiocyanates
    num_of_ketone.append(Fragments.fr_ketone(m)) # Number of ketones
    num_of_ketone_Topliss.append(Fragments.fr_ketone_Topliss(m)) # Number of ketones excluding diaryl, a,b-unsat. dienones, heteroatom on Calpha
    num_of_lactam.append(Fragments.fr_lactam(m)) # Number of beta lactams
    num_of_lactone.append(Fragments.fr_lactone(m)) # Number of cyclic esters (lactones)
    num_of_methoxy.append(Fragments.fr_methoxy(m)) # Number of methoxy groups -OCH3
    num_of_morpholine.append(Fragments.fr_morpholine(m)) # Number of morpholine rings
    num_of_nitrile.append(Fragments.fr_nitrile(m)) # Number of nitriles
    num_of_nitro.append(Fragments.fr_nitro(m)) # Number of nitro groups
    num_of_nitro_arom.append(Fragments.fr_nitro_arom(m)) # Number of nitro benzene ring substituents
    num_of_nitro_arom_nonortho.append(Fragments.fr_nitro_arom_nonortho(m)) # Number of non-ortho nitro benzene ring substituents
    num_of_nitroso.append(Fragments.fr_nitroso(m)) # Number of nitroso groups, excluding NO2
    num_of_oxazole.append(Fragments.fr_oxazole(m)) # Number of oxazole rings
    num_of_oxime.append(Fragments.fr_oxime(m)) # Number of oxime groups
    num_of_para_hydroxylation.append(Fragments.fr_para_hydroxylation(m)) # Number of para-hydroxylation sites
    num_of_phenol.append(Fragments.fr_phenol(m)) # Number of phenols
    num_of_phenol_noOrthoHbond.append(Fragments.fr_phenol_noOrthoHbond(m)) # Number of phenolic OH excluding ortho intramolecular Hbond substituents
    num_of_phos_acid.append(Fragments.fr_phos_acid(m)) # Number of phosphoric acid groups
    num_of_phos_ester.append(Fragments.fr_phos_ester(m)) # Number of phosphoric ester groups
    num_of_piperdine.append(Fragments.fr_piperdine(m)) # Number of piperdine rings
    num_of_piperzine.append(Fragments.fr_piperzine(m)) # Number of piperzine rings
    num_of_priamide.append(Fragments.fr_priamide(m)) # Number of primary amides
    num_of_pyridine.append(Fragments.fr_pyridine(m)) # Number of pyridine rings
    num_of_quatN.append(Fragments.fr_quatN(m)) # Number of quarternary nitrogens
    num_of_sulfide.append(Fragments.fr_sulfide(m)) # Number of thioether
    num_of_sulfonamd.append(Fragments.fr_sulfonamd(m)) # Number of sulfonamides
    num_of_sulfone.append(Fragments.fr_sulfone(m)) # Number of sulfone groups
    num_of_term_acetylene.append(Fragments.fr_term_acetylene(m)) # Number of terminal acetylenes
    num_of_tetrazole.append(Fragments.fr_tetrazole(m)) # Number of tetrazole rings
    num_of_thiazole.append(Fragments.fr_thiazole(m)) # Number of thiazole rings
    num_of_thiocyan.append(Fragments.fr_thiocyan(m)) # Number of thiocyanates
    num_of_thiophene.append(Fragments.fr_thiophene(m)) # Number of thiophene rings
    num_of_unbrch_alkane.append(Fragments.fr_unbrch_alkane(m)) # Number of unbranched alkanes of at least 4 members (excludes halogenated alkanes)
    num_of_urea.append(Fragments.fr_urea(m)) # Number of urea groups

100%|██████████████████████████████████████| 5557/5557 [00:06<00:00, 825.09it/s]


In [10]:
from rdkit.Chem import GraphDescriptors

value_balabanJ = []
complexity_of_mol = []
value_Chi0 = []
value_Chi0n = []
value_Chi0v = []
value_Chi1 = []
value_Chi1n = []
value_Chi1v = []
value_Chi2n = []
value_Chi2v = []
value_Chi3n = []
value_Chi3v = []
value_Chi4n = []
value_Chi4v = []
value_HallKierAlpha = []
value_Ipc = []
value_Kappa1 = []
value_Kappa2 = []
value_Kappa3 = []

for mol in tqdm(data["Smiles"]):
    
    m = Chem.MolFromSmiles(mol)
    value_balabanJ.append(GraphDescriptors.BalabanJ(m)) #Calculate Balaban’s J value for a molecule
    complexity_of_mol.append(GraphDescriptors.BertzCT(m)) #A topological index meant to quantify “complexity” of molecules.
    value_Chi0.append(GraphDescriptors.Chi0(m))
    value_Chi0n.append(GraphDescriptors.Chi0n(m))
    value_Chi0v.append(GraphDescriptors.Chi0v(m))
    value_Chi1.append(GraphDescriptors.Chi1(m))
    value_Chi1n.append(GraphDescriptors.Chi1n(m))
    value_Chi1v.append(GraphDescriptors.Chi1v(m))
    value_Chi2n.append(GraphDescriptors.Chi2n(m))
    value_Chi2v.append(GraphDescriptors.Chi2v(m))
    value_Chi3n.append(GraphDescriptors.Chi3n(m))
    value_Chi3v.append(GraphDescriptors.Chi3v(m))
    value_Chi4n.append(GraphDescriptors.Chi4n(m))
    value_Chi4v.append(GraphDescriptors.Chi4v(m))
    value_HallKierAlpha.append(GraphDescriptors.HallKierAlpha(m))
    value_Ipc.append(GraphDescriptors.Ipc(m))
    value_Kappa1.append(GraphDescriptors.Kappa1(m))
    value_Kappa2.append(GraphDescriptors.Kappa2(m))
    value_Kappa3.append(GraphDescriptors.Kappa3(m))

100%|██████████████████████████████████████| 5557/5557 [00:40<00:00, 137.57it/s]


In [11]:
from rdkit.Chem import Lipinski

parameters_L = []
num_of_heavy_a = []
num_of_NHs_or_ONs = []
num_of_nitrogens_oxygens = []
num_of_aliphaticC = []
num_of_aliphaticH = []
num_of_aliphaticR = []
num_of_aliphaticR = []
num_of_aromatic_carbocycles = []
num_of_aromatic_heterocycles = []
num_of_aromatic_rings = []
num_of_haccept = []
num_of_hdonor = []
num_of_heteroatoms = []
num_of_rotatable = []
num_of_sat_carbocycles = []
num_of_sat_heterocycles = []
num_of_sat_rings = []
count_of_rings = []

for mol in tqdm(data["Smiles"]):
    m = Chem.MolFromSmiles(mol)
    parameters_L.append(Lipinski.FractionCSP3(m)) #Calculation of Lipinski parameters for molecules
    num_of_heavy_a.append(Lipinski.HeavyAtomCount(m)) #Number of heavy atoms a molecule.
    num_of_NHs_or_ONs.append(Lipinski.NHOHCount(m)) #Number of NHs or OHs
    num_of_nitrogens_oxygens.append(Lipinski.NOCount(m)) #Number of Nitrogens and Oxygens
    num_of_aliphaticC.append(Lipinski.NumAliphaticCarbocycles(m)) #number of aliphatic
    num_of_aliphaticH.append(Lipinski.NumAliphaticHeterocycles(m)) #number of aliphatic
    num_of_aliphaticR.append(Lipinski.NumAliphaticRings(m)) #number of aliphatic
    num_of_aromatic_carbocycles.append(Lipinski.NumAromaticCarbocycles(m)) #number of aromatic carbocycles for a molecule
    num_of_aromatic_heterocycles.append(Lipinski.NumAromaticHeterocycles(m)) #number of aromatic heterocycles for a molecule
    num_of_aromatic_rings.append(Lipinski.NumAromaticRings(m)) #number of aromatic rings for a molecule
    num_of_haccept.append(Lipinski.NumHAcceptors(m)) #Number of Hydrogen Bond Acceptors
    num_of_hdonor.append(Lipinski.NumHDonors(m)) #Number of Hydrogen Bond Donors
    num_of_heteroatoms.append(Lipinski.NumHeteroatoms(m)) #Number of Heteroatoms
    num_of_rotatable.append(Lipinski.NumRotatableBonds(m)) #Number of Rotatable Bonds
    num_of_sat_carbocycles.append(Lipinski.NumSaturatedCarbocycles(m)) #number of saturated carbocycles for a molecule
    num_of_sat_heterocycles.append(Lipinski.NumSaturatedHeterocycles(m)) #number of saturated heterocycles for a molecule
    num_of_sat_rings.append(Lipinski.NumSaturatedRings(m)) #number of saturated rings for a molecule
    count_of_rings.append(Lipinski.RingCount(m)) #

100%|█████████████████████████████████████| 5557/5557 [00:03<00:00, 1546.92it/s]


In [12]:
from rdkit.Chem import MolSurf

labute = []
peoe_vsa1 = []
peoe_vsa10 = []
peoe_vsa11 = []
peoe_vsa12 = []
peoe_vsa13 = []
peoe_vsa14 = []
peoe_vsa2 = []
peoe_vsa3 = []
peoe_vsa4 = []
peoe_vsa5 = []
peoe_vsa6 = []
peoe_vsa7 = []
peoe_vsa8 = []
peoe_vsa9 = []
smr_vsa1 = []
smr_vsa10 = []
smr_vsa2 = []
smr_vsa3 = []
smr_vsa4 = []
smr_vsa5 = []
smr_vsa6 = []
smr_vsa7 = []
smr_vsa9 = []
slogp_vsa1 = []
slogp_vsa10 = []
slogp_vsa11 = []
slogp_vsa12 = []
slogp_vsa2 = []
slogp_vsa3 = []
slogp_vsa4 = []
slogp_vsa5 = []
slogp_vsa6 = []
slogp_vsa7 = []
slogp_vsa8 = []
tpsa = []
pyLabuteASA = []

for mol in tqdm(data["Smiles"]):
    m = Chem.MolFromSmiles(mol)
    labute.append(MolSurf.LabuteASA(m))
    peoe_vsa1.append(MolSurf.PEOE_VSA1(m))
    peoe_vsa10.append(MolSurf.PEOE_VSA10(m))
    peoe_vsa11.append(MolSurf.PEOE_VSA11(m))
    peoe_vsa12.append(MolSurf.PEOE_VSA12(m)) 
    peoe_vsa13.append(MolSurf.PEOE_VSA13(m))  
    peoe_vsa14.append(MolSurf.PEOE_VSA14(m))   
    peoe_vsa2.append(MolSurf.PEOE_VSA2(m)) 
    peoe_vsa3.append(MolSurf.PEOE_VSA3(m))
    peoe_vsa4.append(MolSurf.PEOE_VSA4(m))
    peoe_vsa5.append(MolSurf.PEOE_VSA5(m))
    peoe_vsa6.append(MolSurf.PEOE_VSA6(m))
    peoe_vsa7.append(MolSurf.PEOE_VSA7(m))
    peoe_vsa8.append(MolSurf.PEOE_VSA8(m)) 
    peoe_vsa9.append(MolSurf.PEOE_VSA9(m))
    smr_vsa1.append(MolSurf.SMR_VSA1(m))
    smr_vsa10.append(MolSurf.SMR_VSA10(m))
    smr_vsa2.append(MolSurf.SMR_VSA2(m))
    smr_vsa3.append(MolSurf.SMR_VSA3(m))
    smr_vsa4.append(MolSurf.SMR_VSA4(m))
    smr_vsa5.append(MolSurf.SMR_VSA5(m))
    smr_vsa6.append(MolSurf.SMR_VSA6(m))
    smr_vsa7.append(MolSurf.SMR_VSA7(m))  
    smr_vsa9.append(MolSurf.SMR_VSA9(m))
    slogp_vsa1.append(MolSurf.SlogP_VSA1(m))
    slogp_vsa10.append(MolSurf.SlogP_VSA10(m))
    slogp_vsa11.append(MolSurf.SlogP_VSA11(m))
    slogp_vsa12.append(MolSurf.SlogP_VSA12(m))
    slogp_vsa2.append(MolSurf.SlogP_VSA2(m))
    slogp_vsa3.append(MolSurf.SlogP_VSA3(m))
    slogp_vsa4.append(MolSurf.SlogP_VSA4(m))
    slogp_vsa5.append(MolSurf.SlogP_VSA5(m))
    slogp_vsa6.append(MolSurf.SlogP_VSA6(m))
    slogp_vsa7.append(MolSurf.SlogP_VSA7(m))
    slogp_vsa8.append(MolSurf.SlogP_VSA8(m)) 
    tpsa.append(MolSurf.TPSA(m))
    pyLabuteASA.append(MolSurf.pyLabuteASA(m))

100%|██████████████████████████████████████| 5557/5557 [00:09<00:00, 583.33it/s]


In [13]:
data["SSR"] = ssr
data["Num atoms"] = num_atoms
data["Num atoms with Hs"] = num_atoms_with_hs
data["Atomic num"] = atomic_num
data["Binary"] = binStr
data["Weight"] = mol_wt

In [14]:
data["Aliphatic carboxylic acids"] = num_of_carboxylic
data["aliphatic hydroxyl groups"] = num_of_hydroxyl
data["aliphatic hydroxyl groups excluding tert-OH"] = num_of_hydroxylOH
data["N functional groups attached to aromatics"] = num_of_nfunctional
data["Aromatic carboxylic acide"] = num_of_aromatics
data["aromatic nitrogens"] = num_of_arom_nitrogens
data["aromatic amines"] = num_of_arom_amines
data["aromatic hydroxyl groups"] = num_of_arom_hydroxyl
data["carboxylic acids"] = num_of_hydroxyl_acids
data["Number of carbonyl O"] = num_of_carbonyl
data["carbonyl O, excluding COOH"] = num_of_carbonyl_excluding
data["C(OH)CCN-Ctert-alkyl or C(OH)CCNcyclic"] = num_of_thiocarbonyl
data["Number of Imines"] = num_of_Imines
data["Tertiary amines"] = num_of_Tertiary_amines
data["Secondary amines"] = num_of_Secondary_amines
data["Primary amines"] = num_of_Primary_amines
data["hydroxylamine groups"] = num_of_hydroxylamine
data["XCCNR groups"] = num_of_XCCNR
data["tert-alicyclic amines"] = num_of_tertalicyclic
data["H-pyrrole nitrogens"] = num_of_Hpyrrole
data["thiol groups"] = num_of_thiol

In [26]:
data["Number of aldehydes"] = value_balabanJ
data["Balaban’s J value"] = value_balabanJ
data["“complexity” of molecules"] = complexity_of_mol
data["value_Chi0"] = value_Chi0
data["value_Chi0n"] = value_Chi0n
data["value_Chi0v"] = value_Chi0v
data["value_Chi1"] = value_Chi1
data["value_Chi1n"] = value_Chi1n
data["value_Chi1v"] = value_Chi1v
data["value_Chi2n"] = value_Chi2n
data["value_Chi2v"] = value_Chi2v
data["value_Chi3n"] = value_Chi3n
data["value_Chi3v"] = value_Chi3v
data["value_Chi4n"] = value_Chi4n
data["value_Chi4v"] = value_Chi4v
data["value_HallKierAlpha"] = value_HallKierAlpha
data["value_Ipc"] = value_Ipc
data["value_Kappa1"] = value_Kappa1
data["value_Kappa2"] = value_Kappa2
data["value_Kappa3"] = value_Kappa3

In [28]:
data["parameters_L"] = parameters_L
data["num_of_heavy_a"] = num_of_heavy_a
data["num_of_NHs_or_ONs"] = num_of_NHs_or_ONs
data["num_of_nitrogens_oxygens"] = num_of_nitrogens_oxygens 
data["num_of_aliphaticC"] = num_of_aliphaticC
data["num_of_aliphaticH"] = num_of_aliphaticH 
data["num_of_aliphaticR"] = num_of_aliphaticR 
data["num_of_aliphaticR"] = num_of_aliphaticR
data["num_of_aromatic_carbocycles"] = num_of_aromatic_carbocycles
data["num_of_aromatic_heterocycles"] = num_of_aromatic_heterocycles
data["num_of_aromatic_rings"] = num_of_aromatic_rings
data["num_of_haccept"] = num_of_haccept
data["num_of_hdonor"] = num_of_hdonor
data["num_of_heteroatoms"] = num_of_heteroatoms
data["num_of_rotatable"] = num_of_rotatable
data["num_of_sat_carbocycles"] = num_of_sat_carbocycles
data["num_of_sat_heterocycles"] = num_of_sat_heterocycles
data["num_of_sat_rings"] = num_of_sat_rings
data["count_of_rings"] = count_of_rings

In [29]:
data["labute"] = labute
data["peoe_vsa1"] = peoe_vsa1
data["peoe_vsa10"] = peoe_vsa10
data["peoe_vsa11"] = peoe_vsa11
data["peoe_vsa12"] = peoe_vsa12
data["peoe_vsa13"] = peoe_vsa13
data["peoe_vsa14"] = peoe_vsa14
data["peoe_vsa2"] = peoe_vsa2
data["peoe_vsa3"] = peoe_vsa3
data["peoe_vsa4"] = peoe_vsa4
data["peoe_vsa5"] = peoe_vsa5
data["peoe_vsa6"] = peoe_vsa6
data["peoe_vsa7"] = peoe_vsa7
data["peoe_vsa8"] = peoe_vsa8
data["peoe_vsa9"] = peoe_vsa9
data["smr_vsa1"] = smr_vsa1
data["smr_vsa10"] = smr_vsa10
data["smr_vsa2"] = smr_vsa2
data["smr_vsa3"] = smr_vsa3
data["smr_vsa4"] = smr_vsa4
data["smr_vsa5"] = smr_vsa5
data["smr_vsa6"] = smr_vsa6
data["smr_vsa7"] = smr_vsa7
data["smr_vsa9"] = smr_vsa9
data["slogp_vsa1"] = slogp_vsa1
data["slogp_vsa10"] = slogp_vsa10
data["slogp_vsa11"] = slogp_vsa11
data["slogp_vsa12"] = slogp_vsa12
data["slogp_vsa2"] = slogp_vsa2
data["slogp_vsa3"] = slogp_vsa3
data["slogp_vsa4"] = slogp_vsa4
data["slogp_vsa5"] = slogp_vsa5
data["slogp_vsa6"] = slogp_vsa6
data["slogp_vsa7"] = slogp_vsa7
data["slogp_vsa8"] = slogp_vsa8
data["tpsa"] = tpsa
data["pyLabuteASA"] = pyLabuteASA

  data["slogp_vsa7"] = slogp_vsa7
  data["slogp_vsa8"] = slogp_vsa8
  data["tpsa"] = tpsa
  data["pyLabuteASA"] = pyLabuteASA


In [31]:
data["num_of_aldehyde"] = num_of_aldehyde
data["num_of_amide"] = num_of_amide
data["num_of_amidine"] = num_of_amidine
data["num_of_aniline"] = num_of_aniline
data["num_of_aryl_methyl"] = num_of_aryl_methyl
data["num_of_azide"] = num_of_azide
data["num_of_azo"] = num_of_azo
data["num_of_barbitur"] = num_of_barbitur
data["num_of_benzene"] = num_of_benzene
data["num_of_benzodiazepine"] = num_of_benzodiazepine
data["num_of_bicyclic"] = num_of_bicyclic
data["num_of_diazo"] = num_of_diazo
data["num_of_dihydropyridine"] = num_of_dihydropyridine
data["num_of_epoxide"] = num_of_epoxide
data["num_of_ester"] = num_of_ester
data["num_of_ether"] = num_of_ether
data["num_of_furan"] = num_of_furan
data["num_of_guanido"] = num_of_guanido
data["num_of_halogen"] = num_of_halogen
data["num_of_hdrzine"] = num_of_hdrzine
data["num_of_hdrzone"] = num_of_hdrzone
data["num_of_imidazole"] = num_of_imidazole
data["num_of_imide"] = num_of_imide
data["num_of_isothiocyan"] = num_of_isothiocyan
data["num_of_ketone"] = num_of_ketone
data["num_of_ketone_Topliss"] = num_of_ketone_Topliss
data["num_of_lactam"] = num_of_lactam
data["num_of_lactone"] = num_of_lactone
data["num_of_methoxy"] = num_of_methoxy
data["num_of_morpholine"] = num_of_morpholine
data["num_of_nitrile"] = num_of_nitrile
data["num_of_nitro"] = num_of_nitro
data["num_of_nitro_arom"] = num_of_nitro_arom
data["num_of_nitro_arom_nonortho"] = num_of_nitro_arom_nonortho
data["num_of_nitroso"] = num_of_nitroso
data["num_of_oxazole"] = num_of_oxazole
data["num_of_oxime"] = num_of_oxime
data["num_of_para_hydroxylation"] = num_of_para_hydroxylation
data["num_of_phenol"] = num_of_phenol
data["num_of_phenol_noOrthoHbond"] = num_of_phenol_noOrthoHbond
data["num_of_phos_acid"] = num_of_phos_acid
data["num_of_phos_ester"] = num_of_phos_ester
data["num_of_piperdine"] = num_of_piperdine
data["num_of_piperzine"] = num_of_piperzine
data["num_of_priamide"] = num_of_priamide
data["num_of_pyridine"] = num_of_pyridine
data["num_of_quatN"] = num_of_quatN
data["num_of_sulfide"] = num_of_sulfide
data["num_of_sulfonamd"] = num_of_sulfonamd
data["num_of_sulfone"] = num_of_sulfone
data["num_of_term_acetylene"] = num_of_term_acetylene
data["num_of_tetrazole"] = num_of_tetrazole
data["num_of_thiazole"] = num_of_thiazole
data["num_of_thiocyan"] = num_of_thiocyan
data["num_of_thiophene"] = num_of_thiophene
data["num_of_unbrch_alkane"] = num_of_unbrch_alkane
data["num_of_urea"] = num_of_urea

  data["num_of_amide"] = num_of_amide
  data["num_of_amidine"] = num_of_amidine
  data["num_of_aniline"] = num_of_aniline
  data["num_of_aryl_methyl"] = num_of_aryl_methyl
  data["num_of_azide"] = num_of_azide
  data["num_of_azo"] = num_of_azo
  data["num_of_barbitur"] = num_of_barbitur
  data["num_of_benzene"] = num_of_benzene
  data["num_of_benzodiazepine"] = num_of_benzodiazepine
  data["num_of_bicyclic"] = num_of_bicyclic
  data["num_of_diazo"] = num_of_diazo
  data["num_of_dihydropyridine"] = num_of_dihydropyridine
  data["num_of_epoxide"] = num_of_epoxide
  data["num_of_ester"] = num_of_ester
  data["num_of_ether"] = num_of_ether
  data["num_of_furan"] = num_of_furan
  data["num_of_guanido"] = num_of_guanido
  data["num_of_halogen"] = num_of_halogen
  data["num_of_hdrzine"] = num_of_hdrzine
  data["num_of_hdrzone"] = num_of_hdrzone
  data["num_of_imidazole"] = num_of_imidazole
  data["num_of_imide"] = num_of_imide
  data["num_of_isothiocyan"] = num_of_isothiocyan
  data["num_of_k

In [32]:
data[data["Active"] == True]

Unnamed: 0,Smiles,Active,SSR,Num atoms,Num atoms with Hs,Atomic num,Binary,Weight,Aliphatic carboxylic acids,aliphatic hydroxyl groups,...,num_of_sulfide,num_of_sulfonamd,num_of_sulfone,num_of_term_acetylene,num_of_tetrazole,num_of_thiazole,num_of_thiocyan,num_of_thiophene,num_of_unbrch_alkane,num_of_urea
17,Nc1ccc(O)c2ncccc12,True,2,12,20,76,210,160.063663,0,0,...,0,0,0,0,0,0,0,0,0,0
38,C[C@H]1COc2c(N3CCN(C)CC3)c(F)cc3c(=O)c(C(=O)O)...,True,4,26,46,170,381,361.143784,0,0,...,0,0,0,0,0,0,0,0,0,0
52,Cl.NCCc1cc(O)c(O)cc1O,True,1,13,25,96,200,205.050571,0,0,...,0,0,0,0,0,0,0,0,0,0
111,Cc1ccc(-n2sc(=O)n(Cc3ccc(F)cc3)c2=O)cc1,True,3,22,35,151,337,316.068177,0,0,...,0,0,0,0,0,0,0,0,0,0
122,Oc1cc2c(cc1C(c1ccc(C(F)(F)F)cc1)N1CCOCC1)OCO2,True,4,27,45,180,389,381.118793,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5354,CC(C)c1c2cc(-c3ccnc(N[C@H]4CC[C@H](NC5CCOCC5)C...,True,5,33,69,206,475,448.295060,0,0,...,0,0,0,0,0,0,0,0,0,0
5492,Sc1nnc(Nc2ccccc2)s1,True,2,13,20,101,219,209.008139,0,0,...,0,0,0,0,0,0,0,0,0,0
5496,C[C@H]1COC2=C1C(=O)C(=O)c1c2ccc2c1CCCC2(C)C,True,4,22,42,138,332,296.141244,0,0,...,0,0,0,0,0,0,0,0,0,0
5520,Nc1c2ccccc2nc2ccccc12,True,3,15,25,92,260,194.084398,0,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5557 entries, 0 to 5556
Columns: 161 entries, Smiles to num_of_urea
dtypes: bool(1), float64(59), int64(100), object(1)
memory usage: 6.8+ MB


In [34]:
data.loc[:, :"N functional groups attached to aromatics"].describe()

Unnamed: 0,SSR,Num atoms,Num atoms with Hs,Atomic num,Binary,Weight,Aliphatic carboxylic acids,aliphatic hydroxyl groups,aliphatic hydroxyl groups excluding tert-OH,N functional groups attached to aromatics
count,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0
mean,3.056505,26.057585,48.178154,175.831744,376.974447,375.482504,0.180673,0.42199,0.362966,0.088177
std,1.631243,12.059634,23.95499,81.143572,153.239216,173.692206,0.509203,1.167831,1.102463,0.316567
min,0.0,2.0,2.0,25.0,58.0,59.037114,0.0,0.0,0.0,0.0
25%,2.0,19.0,33.0,128.0,285.0,272.078851,0.0,0.0,0.0,0.0
50%,3.0,25.0,45.0,168.0,365.0,358.179361,0.0,0.0,0.0,0.0
75%,4.0,31.0,57.0,208.0,443.0,443.184506,0.0,0.0,0.0,0.0
max,21.0,229.0,456.0,1561.0,2811.0,3350.524802,8.0,21.0,21.0,4.0


In [35]:
data.loc[:, "Aromatic carboxylic acide":"Tertiary amines"].describe()

Unnamed: 0,Aromatic carboxylic acide,aromatic nitrogens,aromatic amines,aromatic hydroxyl groups,carboxylic acids,Number of carbonyl O,"carbonyl O, excluding COOH",C(OH)CCN-Ctert-alkyl or C(OH)CCNcyclic,Number of Imines,Tertiary amines
count,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0
mean,0.044448,1.083858,0.130466,0.187331,0.225121,1.104013,0.894727,0.010797,0.034191,1.639014
std,0.218815,1.460724,0.371891,0.685653,0.547159,1.529873,1.42369,0.114901,0.194185,1.616814
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0
75%,0.0,2.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,3.0
max,2.0,9.0,4.0,25.0,8.0,35.0,34.0,2.0,2.0,9.0


In [38]:
data.loc[:, "Secondary amines":"Balaban’s J value"].describe()

Unnamed: 0,Secondary amines,Primary amines,hydroxylamine groups,XCCNR groups,tert-alicyclic amines,H-pyrrole nitrogens,thiol groups,Number of aldehydes,Balaban’s J value
count,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0
mean,0.830124,0.264891,0.019795,0.107252,0.099334,0.130466,0.005938,1.758009,1.758009
std,1.343574,0.624102,0.198909,0.356488,0.324533,0.371891,0.079147,0.892949,0.892949
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-4e-06,-4e-06
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.446321,1.446321
50%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.803971,1.803971
75%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.236542,2.236542
max,35.0,8.0,6.0,7.0,4.0,4.0,2.0,7.51731,7.51731


In [39]:
data.loc[:, "Balaban’s J value":"value_Chi2v"].describe()

Unnamed: 0,Balaban’s J value,“complexity” of molecules,value_Chi0,value_Chi0n,value_Chi0v,value_Chi1,value_Chi1n,value_Chi1v,value_Chi2n,value_Chi2v
count,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0
mean,1.758009,842.800797,18.695352,14.763264,15.432679,12.391671,8.539267,9.070589,6.582307,7.165387
std,0.892949,492.891742,8.762501,6.97262,7.131845,5.73897,4.160898,4.363433,3.459447,3.66074
min,-4e-06,2.0,0.0,1.377964,2.062949,0.0,0.0,0.0,0.0,0.0
25%,1.446321,523.292025,13.56855,10.524929,11.161204,9.009168,5.923961,6.402689,4.397693,4.879933
50%,1.803971,790.93735,17.792529,14.049143,14.747661,11.891589,8.179911,8.725472,6.19144,6.763086
75%,2.236542,1084.76713,22.076621,17.517826,18.212666,14.774713,10.317286,10.8802,8.029874,8.718261
max,7.51731,8224.508052,170.714748,132.095123,137.810599,108.426511,75.809207,83.946621,56.868707,65.555766


In [40]:
data.loc[:, "value_Chi3n":"parameters_L"].describe()

Unnamed: 0,value_Chi3n,value_Chi3v,value_Chi4n,value_Chi4v,value_HallKierAlpha,value_Ipc,value_Kappa1,value_Kappa2,value_Kappa3,parameters_L
count,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0
mean,4.646357,5.106614,3.273983,3.650168,-2.314441,2.252375e+78,18.88332,8.064413,7.407568,0.373697
std,2.669864,2.809381,2.090955,2.238713,1.367951,1.67904e+80,9.688422,5.530441,142.696899,0.247609
min,0.0,0.0,0.0,0.0,-18.96,0.0,3.142429,0.710529,0.173734,0.0
25%,2.942579,3.289648,1.910272,2.202426,-3.07,20136.83,13.157378,5.265942,2.586795,0.190476
50%,4.271862,4.745121,2.93407,3.319033,-2.25,474549.1,17.690087,7.26098,3.798754,0.333333
75%,5.769388,6.345643,4.078679,4.567857,-1.45,10533590.0,22.131511,9.518994,5.353714,0.517241
max,39.092205,46.813106,23.855705,33.915095,5.759481,1.251645e+82,194.8414,209.688022,9507.96,1.0


In [41]:
data.loc[:, "num_of_heavy_a":"num_of_aromatic_rings"].describe()

Unnamed: 0,num_of_heavy_a,num_of_NHs_or_ONs,num_of_nitrogens_oxygens,num_of_aliphaticC,num_of_aliphaticH,num_of_aliphaticR,num_of_aromatic_carbocycles,num_of_aromatic_heterocycles,num_of_aromatic_rings
count,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0
mean,26.056505,2.289545,6.095915,0.371963,0.669066,1.041029,1.253914,0.761562,2.015476
std,12.059012,2.740555,4.211598,0.955654,0.963743,1.344846,0.975097,0.90147,1.319447
min,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,1.0,4.0,0.0,0.0,0.0,1.0,0.0,1.0
50%,25.0,2.0,5.0,0.0,0.0,1.0,1.0,1.0,2.0
75%,31.0,3.0,7.0,0.0,1.0,2.0,2.0,1.0,3.0
max,229.0,47.0,80.0,6.0,21.0,21.0,10.0,6.0,10.0


In [42]:
data.loc[:, "num_of_haccept":"labute"].describe()

Unnamed: 0,num_of_haccept,num_of_hdonor,num_of_heteroatoms,num_of_rotatable,num_of_sat_carbocycles,num_of_sat_heterocycles,num_of_sat_rings,count_of_rings,labute
count,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0
mean,5.058665,2.009358,7.167896,5.095195,0.264351,0.449703,0.714054,3.056505,155.567177
std,3.353186,2.353922,4.636738,4.349653,0.788537,0.795867,1.128101,1.631243,70.447341
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23.580555
25%,3.0,1.0,4.0,2.0,0.0,0.0,0.0,2.0,113.138464
50%,4.0,2.0,6.0,4.0,0.0,0.0,0.0,3.0,149.216786
75%,6.0,3.0,9.0,7.0,0.0,1.0,1.0,4.0,183.329277
max,75.0,42.0,99.0,67.0,5.0,21.0,21.0,21.0,1355.863229


In [43]:
data.loc[:, "peoe_vsa1":"peoe_vsa9"].describe()

Unnamed: 0,peoe_vsa1,peoe_vsa10,peoe_vsa11,peoe_vsa12,peoe_vsa13,peoe_vsa14,peoe_vsa2,peoe_vsa3,peoe_vsa4,peoe_vsa5,peoe_vsa6,peoe_vsa7,peoe_vsa8,peoe_vsa9
count,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0
mean,15.993623,9.707579,4.514113,3.665595,2.32297,4.52976,7.247206,5.357421,2.483141,2.643651,24.91079,38.161347,20.504935,13.470223
std,14.585663,11.731537,7.013956,7.616448,4.12665,9.958939,8.030479,6.20299,5.725134,5.874714,22.372015,23.981231,15.994826,11.766265
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,9.523678,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.420822,21.563326,6.606882,5.601051
50%,14.373636,5.83562,0.0,0.0,0.0,0.0,4.794537,4.794537,0.0,0.0,19.923495,35.914593,17.696186,11.629515
75%,20.056445,12.356394,5.959555,5.90718,5.559267,5.969305,9.778516,9.499376,4.305216,0.0,36.398202,52.682601,29.757643,18.994027
max,218.152827,170.911059,155.945729,177.215392,40.472507,325.583782,174.282726,55.301319,92.198716,69.605639,206.472123,163.511529,105.229573,118.923875


In [44]:
data.loc[:, "smr_vsa1":"slogp_vsa11"].describe()

Unnamed: 0,smr_vsa1,smr_vsa10,smr_vsa2,smr_vsa3,smr_vsa4,smr_vsa5,smr_vsa6,smr_vsa7,smr_vsa9,slogp_vsa1,slogp_vsa10,slogp_vsa11
count,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0
mean,17.955784,22.73047,0.320784,9.321302,4.646684,30.957122,15.72633,47.417255,6.436622,9.368033,5.362029,3.495276
std,17.508919,17.832934,1.410484,9.630037,8.832412,32.536494,15.678411,28.069127,9.138429,11.287284,7.906245,6.573886
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,9.184952,11.594566,0.0,0.0,0.0,10.937324,5.316789,29.288102,0.0,4.736863,0.0,0.0
50%,14.325937,18.500273,0.0,9.384673,0.0,22.987065,12.340549,47.133754,0.0,5.733667,0.0,0.0
75%,23.418758,29.880703,0.0,14.764463,5.733667,41.039508,24.139969,65.482244,11.387856,11.24901,8.78083,5.749512
max,307.222756,325.583782,21.637134,200.398851,81.107899,437.173157,125.530757,180.539237,172.485355,209.244448,92.198716,172.485355


In [45]:
data.loc[:, "slogp_vsa11":"num_of_aldehyde"].describe()

Unnamed: 0,slogp_vsa11,slogp_vsa12,slogp_vsa2,slogp_vsa3,slogp_vsa4,slogp_vsa5,slogp_vsa6,slogp_vsa7,slogp_vsa8,tpsa,pyLabuteASA,num_of_aldehyde
count,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0
mean,3.495276,6.765765,39.550617,10.809562,6.203463,28.538732,37.823203,1.001528,6.594143,86.052973,155.567177,0.006478
std,6.573886,11.307092,31.894554,11.356372,9.311221,25.322125,23.735222,2.732824,9.267508,67.410904,70.447341,0.082447
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23.580555,0.0
25%,0.0,0.0,20.165321,4.736863,0.0,11.126903,21.995169,0.0,0.0,50.19,113.138464,0.0
50%,0.0,0.0,32.287866,9.5314,0.0,22.975654,36.398202,0.0,0.0,74.6,149.216786,0.0
75%,5.749512,11.60094,50.277979,15.748277,10.114318,38.52493,53.86519,0.0,11.033401,103.02,183.329277,0.0
max,172.485355,136.767242,544.266083,180.650445,62.079473,251.589915,162.832976,30.1358,90.878495,1257.69,1355.863229,2.0


In [48]:
data.loc[:, "num_of_aldehyde":"num_of_benzene"].describe()

Unnamed: 0,num_of_aldehyde,num_of_amide,num_of_amidine,num_of_aniline,num_of_aryl_methyl,num_of_azide,num_of_azo,num_of_barbitur,num_of_benzene
count,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0
mean,0.006478,0.642433,0.021594,0.556775,0.292064,0.00108,0.003059,0.00036,1.253914
std,0.082447,1.35895,0.151432,0.915169,0.621631,0.032844,0.06701,0.018969,0.976389
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0
max,2.0,34.0,2.0,6.0,5.0,1.0,2.0,1.0,10.0


In [49]:
data.loc[:, "num_of_benzodiazepine":"num_of_hdrzine"].describe()

Unnamed: 0,num_of_benzodiazepine,num_of_bicyclic,num_of_diazo,num_of_dihydropyridine,num_of_epoxide,num_of_ester,num_of_ether,num_of_furan,num_of_guanido,num_of_halogen,num_of_hdrzine
count,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0
mean,0.00126,0.997841,0.00018,0.004139,0.007918,0.129026,0.752384,0.019615,0.024474,0.709735,0.009538
std,0.035473,1.590444,0.013415,0.064207,0.105339,0.445053,1.236975,0.139977,0.175267,1.277597,0.099036
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
max,1.0,20.0,1.0,1.0,3.0,10.0,14.0,2.0,4.0,21.0,2.0


In [50]:
data.loc[:, "num_of_hdrzone":"num_of_methoxy"].describe()

Unnamed: 0,num_of_hdrzone,num_of_imidazole,num_of_imide,num_of_isothiocyan,num_of_ketone,num_of_ketone_Topliss,num_of_lactam,num_of_lactone,num_of_methoxy
count,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0
mean,0.013496,0.091596,0.022854,0.00072,0.136584,0.087277,0.013496,0.017995,0.222422
std,0.128672,0.298899,0.161045,0.032854,0.452839,0.339575,0.115398,0.138255,0.62367
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,4.0,2.0,2.0,2.0,4.0,4.0,1.0,2.0,7.0


In [51]:
data.loc[:, "num_of_morpholine":"num_of_phenol_noOrthoHbond"].describe()

Unnamed: 0,num_of_morpholine,num_of_nitrile,num_of_nitro,num_of_nitro_arom,num_of_nitro_arom_nonortho,num_of_nitroso,num_of_oxazole,num_of_oxime,num_of_para_hydroxylation,num_of_phenol,num_of_phenol_noOrthoHbond
count,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0
mean,0.029872,0.04031,0.030232,0.022314,0.015116,0.0018,0.004859,0.009538,0.195789,0.168256,0.162678
std,0.178507,0.212535,0.188263,0.167152,0.130576,0.042387,0.072083,0.099036,0.507145,0.664821,0.660374
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,2.0,2.0,3.0,3.0,2.0,1.0,2.0,2.0,4.0,25.0,25.0


In [52]:
data.loc[:, "num_of_phos_acid":"num_of_sulfonamd"].describe()

Unnamed: 0,num_of_phos_acid,num_of_phos_ester,num_of_piperdine,num_of_piperzine,num_of_priamide,num_of_pyridine,num_of_quatN,num_of_sulfide,num_of_sulfonamd
count,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0
mean,0.015296,0.009897,0.113371,0.067662,0.046788,0.199568,0.018175,0.064783,0.062804
std,0.174767,0.11261,0.388962,0.258254,0.242911,0.479545,0.153648,0.288575,0.254223
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,4.0,3.0,4.0,3.0,3.0,6.0,3.0,7.0,2.0


In [53]:
data.loc[:, "num_of_sulfone":].describe()

Unnamed: 0,num_of_sulfone,num_of_term_acetylene,num_of_tetrazole,num_of_thiazole,num_of_thiocyan,num_of_thiophene,num_of_unbrch_alkane,num_of_urea
count,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0,5557.0
mean,0.017276,0.006478,0.007198,0.033471,0.00036,0.035991,0.323736,0.043909
std,0.130308,0.080234,0.084543,0.195234,0.026829,0.198447,1.587081,0.210114
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,4.0,2.0,2.0,35.0,2.0


Будем нормализовать все фичи кроме parameters_L

In [54]:
name_col = data.columns.values
index = [0, 1, 51]
name_col = np.delete(name_col,index)

In [55]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(data[name_col])
data[name_col] = scaler.transform(data[name_col])

Только эти столбцы нужны для обучения. Еще поменяем True и False на 1 и 0. Разобьем выборку на трейн и тест со стратификацией

In [56]:
name_col = data.columns.values
index = [0, 1]
name_col = np.delete(name_col, index)

In [57]:
X = data[name_col]

In [58]:
r = []
for a in data["Active"]:
    r.append(1 if a else 0)
y = pd.Series(r, copy=False)

In [59]:
y

0       0
1       0
2       0
3       0
4       0
       ..
5552    0
5553    0
5554    0
5555    0
5556    0
Length: 5557, dtype: int64

In [60]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)

In [61]:
X_train

Unnamed: 0,SSR,Num atoms,Num atoms with Hs,Atomic num,Binary,Weight,Aliphatic carboxylic acids,aliphatic hydroxyl groups,aliphatic hydroxyl groups excluding tert-OH,N functional groups attached to aromatics,...,num_of_sulfide,num_of_sulfonamd,num_of_sulfone,num_of_term_acetylene,num_of_tetrazole,num_of_thiazole,num_of_thiocyan,num_of_thiophene,num_of_unbrch_alkane,num_of_urea
5102,1.191527,-0.087704,-0.299679,-0.219775,0.098062,-0.249502,-0.354847,-0.361378,-0.329261,-0.278567,...,-0.224514,-0.247064,-0.132587,-0.08075,-0.085149,-0.171457,-0.013416,-0.181378,-0.204,-0.208994
1897,0.578442,-0.668205,-0.842412,-0.786723,-0.463203,-0.814111,-0.354847,-0.361378,-0.329261,-0.278567,...,-0.224514,-0.247064,-0.132587,-0.08075,-0.085149,-0.171457,-0.013416,-0.181378,-0.204,-0.208994
2941,1.191527,0.658654,0.451798,0.581346,0.665853,0.568393,-0.354847,-0.361378,-0.329261,-0.278567,...,-0.224514,-0.247064,-0.132587,-0.08075,-0.085149,-0.171457,-0.013416,-0.181378,-0.204,-0.208994
2097,-0.647727,-0.834063,-0.758915,-0.909972,-0.815625,-0.906150,1.609182,-0.361378,-0.329261,-0.278567,...,-0.224514,-0.247064,-0.132587,-0.08075,-0.085149,-0.171457,-0.013416,-0.181378,-0.204,-0.208994
1911,1.191527,0.907440,1.537265,0.692271,0.861643,0.788061,1.609182,-0.361378,-0.329261,-0.278567,...,-0.224514,-0.247064,-0.132587,-0.08075,-0.085149,-0.171457,-0.013416,-0.181378,-0.204,-0.208994
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3516,-0.034643,-0.751134,-0.758915,-0.872997,-0.672045,-0.877329,-0.354847,-0.361378,-0.329261,-0.278567,...,-0.224514,-0.247064,-0.132587,-0.08075,-0.085149,-0.171457,-0.013416,-0.181378,-0.204,-0.208994
3144,0.578442,-0.253562,-0.007438,-0.429299,-0.149939,-0.393196,-0.354847,-0.361378,-0.329261,-0.278567,...,-0.224514,-0.247064,-0.132587,-0.08075,-0.085149,-0.171457,-0.013416,-0.181378,-0.204,-0.208994
538,-1.260812,-1.580421,-1.468643,-1.624819,-1.553100,-1.620487,-0.354847,-0.361378,-0.329261,-0.278567,...,-0.224514,-0.247064,-0.132587,-0.08075,-0.085149,-0.171457,-0.013416,-0.181378,-0.204,-0.208994
959,2.417696,1.653799,1.286772,1.739891,1.697013,1.685133,-0.354847,1.351353,1.485022,-0.278567,...,-0.224514,-0.247064,-0.132587,-0.08075,-0.085149,-0.171457,-0.013416,-0.181378,-0.204,-0.208994


In [38]:
# data.to_csv('train_new.csv')

In [62]:
from tensorflow import keras

hid_size = 159
model = keras.Sequential(
    [
        keras.layers.Dense(
            hid_size, activation="relu", input_shape=(X_train.shape[-1],)
        ),
        keras.layers.Dense(300, activation="relu", kernel_regularizer=keras.regularizers.l1_l2(l1=1e-4, l2=1e-4)),
        keras.layers.Dense(400, activation="relu", kernel_regularizer=keras.regularizers.l1_l2(l1=1e-4, l2=1e-4)),
        keras.layers.Dense(300, activation="relu", kernel_regularizer=keras.regularizers.l1_l2(l1=1e-4, l2=1e-4)),
        keras.layers.Dense(1, activation="sigmoid"),
    ]
)

2022-02-14 01:18:48.216641: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-02-14 01:18:48.216678: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-02-14 01:18:51.081945: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-02-14 01:18:51.081980: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-02-14 01:18:51.082006: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (fedora): /proc/driver/nvidia/version does not exist


In [63]:
metrics = [
    keras.metrics.FalseNegatives(name="fn"),
    keras.metrics.FalsePositives(name="fp"),
    keras.metrics.TrueNegatives(name="tn"),
    keras.metrics.TruePositives(name="tp"),
    keras.metrics.Precision(name="precision"),
    keras.metrics.Recall(name="recall"),
]

In [64]:
model.compile(
    optimizer=keras.optimizers.Adam(1e-2), loss="binary_crossentropy", metrics=metrics
)

In [None]:
model.fit(
    X_train,
    y_train,
    batch_size=2048,
    epochs=2000,
    verbose=2,
)

Epoch 1/2000
2/2 - 2s - loss: 2.3939 - fn: 122.0000 - fp: 334.0000 - tn: 3411.0000 - tp: 22.0000 - precision: 0.0618 - recall: 0.1528 - 2s/epoch - 1s/step
Epoch 2/2000
2/2 - 0s - loss: 1.6420 - fn: 144.0000 - fp: 0.0000e+00 - tn: 3745.0000 - tp: 0.0000e+00 - precision: 0.0000e+00 - recall: 0.0000e+00 - 271ms/epoch - 135ms/step
Epoch 3/2000
2/2 - 0s - loss: 1.4966 - fn: 144.0000 - fp: 0.0000e+00 - tn: 3745.0000 - tp: 0.0000e+00 - precision: 0.0000e+00 - recall: 0.0000e+00 - 242ms/epoch - 121ms/step
Epoch 4/2000
2/2 - 0s - loss: 1.3373 - fn: 144.0000 - fp: 0.0000e+00 - tn: 3745.0000 - tp: 0.0000e+00 - precision: 0.0000e+00 - recall: 0.0000e+00 - 241ms/epoch - 121ms/step
Epoch 5/2000
2/2 - 0s - loss: 1.2799 - fn: 144.0000 - fp: 0.0000e+00 - tn: 3745.0000 - tp: 0.0000e+00 - precision: 0.0000e+00 - recall: 0.0000e+00 - 250ms/epoch - 125ms/step
Epoch 6/2000
2/2 - 0s - loss: 1.2068 - fn: 144.0000 - fp: 0.0000e+00 - tn: 3745.0000 - tp: 0.0000e+00 - precision: 0.0000e+00 - recall: 0.0000e+00 - 

Epoch 51/2000
2/2 - 0s - loss: 0.0880 - fn: 3.0000 - fp: 9.0000 - tn: 3736.0000 - tp: 141.0000 - precision: 0.9400 - recall: 0.9792 - 241ms/epoch - 121ms/step
Epoch 52/2000
2/2 - 0s - loss: 0.0800 - fn: 2.0000 - fp: 12.0000 - tn: 3733.0000 - tp: 142.0000 - precision: 0.9221 - recall: 0.9861 - 229ms/epoch - 115ms/step
Epoch 53/2000
2/2 - 0s - loss: 0.0787 - fn: 14.0000 - fp: 4.0000 - tn: 3741.0000 - tp: 130.0000 - precision: 0.9701 - recall: 0.9028 - 234ms/epoch - 117ms/step
Epoch 54/2000
2/2 - 0s - loss: 0.0750 - fn: 7.0000 - fp: 3.0000 - tn: 3742.0000 - tp: 137.0000 - precision: 0.9786 - recall: 0.9514 - 258ms/epoch - 129ms/step
Epoch 55/2000
2/2 - 0s - loss: 0.0720 - fn: 4.0000 - fp: 6.0000 - tn: 3739.0000 - tp: 140.0000 - precision: 0.9589 - recall: 0.9722 - 240ms/epoch - 120ms/step
Epoch 56/2000
2/2 - 0s - loss: 0.0688 - fn: 2.0000 - fp: 7.0000 - tn: 3738.0000 - tp: 142.0000 - precision: 0.9530 - recall: 0.9861 - 242ms/epoch - 121ms/step
Epoch 57/2000
2/2 - 0s - loss: 0.0657 - fn: 

Epoch 103/2000
2/2 - 0s - loss: 0.0367 - fn: 3.0000 - fp: 2.0000 - tn: 3743.0000 - tp: 141.0000 - precision: 0.9860 - recall: 0.9792 - 232ms/epoch - 116ms/step
Epoch 104/2000
2/2 - 0s - loss: 0.0371 - fn: 2.0000 - fp: 1.0000 - tn: 3744.0000 - tp: 142.0000 - precision: 0.9930 - recall: 0.9861 - 230ms/epoch - 115ms/step
Epoch 105/2000
2/2 - 0s - loss: 0.0361 - fn: 1.0000 - fp: 1.0000 - tn: 3744.0000 - tp: 143.0000 - precision: 0.9931 - recall: 0.9931 - 230ms/epoch - 115ms/step
Epoch 106/2000
2/2 - 0s - loss: 0.0356 - fn: 0.0000e+00 - fp: 2.0000 - tn: 3743.0000 - tp: 144.0000 - precision: 0.9863 - recall: 1.0000 - 231ms/epoch - 115ms/step
Epoch 107/2000
2/2 - 0s - loss: 0.0362 - fn: 1.0000 - fp: 2.0000 - tn: 3743.0000 - tp: 143.0000 - precision: 0.9862 - recall: 0.9931 - 228ms/epoch - 114ms/step
Epoch 108/2000
2/2 - 0s - loss: 0.0354 - fn: 2.0000 - fp: 1.0000 - tn: 3744.0000 - tp: 142.0000 - precision: 0.9930 - recall: 0.9861 - 232ms/epoch - 116ms/step
Epoch 109/2000
2/2 - 0s - loss: 0.03

Epoch 154/2000
2/2 - 0s - loss: 0.0327 - fn: 1.0000 - fp: 1.0000 - tn: 3744.0000 - tp: 143.0000 - precision: 0.9931 - recall: 0.9931 - 284ms/epoch - 142ms/step
Epoch 155/2000
2/2 - 0s - loss: 0.0324 - fn: 1.0000 - fp: 1.0000 - tn: 3744.0000 - tp: 143.0000 - precision: 0.9931 - recall: 0.9931 - 259ms/epoch - 129ms/step
Epoch 156/2000
2/2 - 0s - loss: 0.0333 - fn: 0.0000e+00 - fp: 3.0000 - tn: 3742.0000 - tp: 144.0000 - precision: 0.9796 - recall: 1.0000 - 242ms/epoch - 121ms/step
Epoch 157/2000
2/2 - 0s - loss: 0.0330 - fn: 3.0000 - fp: 0.0000e+00 - tn: 3745.0000 - tp: 141.0000 - precision: 1.0000 - recall: 0.9792 - 232ms/epoch - 116ms/step
Epoch 158/2000
2/2 - 0s - loss: 0.0334 - fn: 3.0000 - fp: 1.0000 - tn: 3744.0000 - tp: 141.0000 - precision: 0.9930 - recall: 0.9792 - 226ms/epoch - 113ms/step
Epoch 159/2000
2/2 - 0s - loss: 0.0332 - fn: 0.0000e+00 - fp: 3.0000 - tn: 3742.0000 - tp: 144.0000 - precision: 0.9796 - recall: 1.0000 - 244ms/epoch - 122ms/step
Epoch 160/2000
2/2 - 0s - lo

Epoch 205/2000
2/2 - 0s - loss: 0.0317 - fn: 2.0000 - fp: 1.0000 - tn: 3744.0000 - tp: 142.0000 - precision: 0.9930 - recall: 0.9861 - 230ms/epoch - 115ms/step
Epoch 206/2000
2/2 - 0s - loss: 0.0315 - fn: 2.0000 - fp: 1.0000 - tn: 3744.0000 - tp: 142.0000 - precision: 0.9930 - recall: 0.9861 - 221ms/epoch - 111ms/step
Epoch 207/2000
2/2 - 0s - loss: 0.0319 - fn: 1.0000 - fp: 1.0000 - tn: 3744.0000 - tp: 143.0000 - precision: 0.9931 - recall: 0.9931 - 230ms/epoch - 115ms/step
Epoch 208/2000
2/2 - 0s - loss: 0.0319 - fn: 1.0000 - fp: 2.0000 - tn: 3743.0000 - tp: 143.0000 - precision: 0.9862 - recall: 0.9931 - 227ms/epoch - 113ms/step
Epoch 209/2000
2/2 - 0s - loss: 0.0319 - fn: 0.0000e+00 - fp: 3.0000 - tn: 3742.0000 - tp: 144.0000 - precision: 0.9796 - recall: 1.0000 - 226ms/epoch - 113ms/step
Epoch 210/2000
2/2 - 0s - loss: 0.0319 - fn: 2.0000 - fp: 0.0000e+00 - tn: 3745.0000 - tp: 142.0000 - precision: 1.0000 - recall: 0.9861 - 226ms/epoch - 113ms/step
Epoch 211/2000
2/2 - 0s - loss: 

Epoch 256/2000
2/2 - 0s - loss: 0.0353 - fn: 1.0000 - fp: 1.0000 - tn: 3744.0000 - tp: 143.0000 - precision: 0.9931 - recall: 0.9931 - 236ms/epoch - 118ms/step
Epoch 257/2000
2/2 - 0s - loss: 0.0353 - fn: 1.0000 - fp: 1.0000 - tn: 3744.0000 - tp: 143.0000 - precision: 0.9931 - recall: 0.9931 - 239ms/epoch - 119ms/step
Epoch 258/2000
2/2 - 0s - loss: 0.0349 - fn: 1.0000 - fp: 1.0000 - tn: 3744.0000 - tp: 143.0000 - precision: 0.9931 - recall: 0.9931 - 228ms/epoch - 114ms/step
Epoch 259/2000
2/2 - 0s - loss: 0.0348 - fn: 1.0000 - fp: 1.0000 - tn: 3744.0000 - tp: 143.0000 - precision: 0.9931 - recall: 0.9931 - 224ms/epoch - 112ms/step
Epoch 260/2000
2/2 - 0s - loss: 0.0346 - fn: 1.0000 - fp: 1.0000 - tn: 3744.0000 - tp: 143.0000 - precision: 0.9931 - recall: 0.9931 - 226ms/epoch - 113ms/step
Epoch 261/2000
2/2 - 0s - loss: 0.0343 - fn: 1.0000 - fp: 1.0000 - tn: 3744.0000 - tp: 143.0000 - precision: 0.9931 - recall: 0.9931 - 227ms/epoch - 114ms/step
Epoch 262/2000
2/2 - 0s - loss: 0.0341 -

Epoch 307/2000
2/2 - 0s - loss: 0.0325 - fn: 1.0000 - fp: 1.0000 - tn: 3744.0000 - tp: 143.0000 - precision: 0.9931 - recall: 0.9931 - 229ms/epoch - 114ms/step
Epoch 308/2000
2/2 - 0s - loss: 0.0326 - fn: 1.0000 - fp: 2.0000 - tn: 3743.0000 - tp: 143.0000 - precision: 0.9862 - recall: 0.9931 - 224ms/epoch - 112ms/step
Epoch 309/2000
2/2 - 0s - loss: 0.0325 - fn: 0.0000e+00 - fp: 2.0000 - tn: 3743.0000 - tp: 144.0000 - precision: 0.9863 - recall: 1.0000 - 225ms/epoch - 112ms/step
Epoch 310/2000
2/2 - 0s - loss: 0.0325 - fn: 1.0000 - fp: 1.0000 - tn: 3744.0000 - tp: 143.0000 - precision: 0.9931 - recall: 0.9931 - 227ms/epoch - 114ms/step
Epoch 311/2000
2/2 - 0s - loss: 0.0325 - fn: 1.0000 - fp: 1.0000 - tn: 3744.0000 - tp: 143.0000 - precision: 0.9931 - recall: 0.9931 - 226ms/epoch - 113ms/step
Epoch 312/2000
2/2 - 0s - loss: 0.0325 - fn: 2.0000 - fp: 0.0000e+00 - tn: 3745.0000 - tp: 142.0000 - precision: 1.0000 - recall: 0.9861 - 224ms/epoch - 112ms/step
Epoch 313/2000
2/2 - 0s - loss: 

Epoch 358/2000
2/2 - 0s - loss: 0.0325 - fn: 1.0000 - fp: 1.0000 - tn: 3744.0000 - tp: 143.0000 - precision: 0.9931 - recall: 0.9931 - 230ms/epoch - 115ms/step
Epoch 359/2000
2/2 - 0s - loss: 0.0323 - fn: 1.0000 - fp: 1.0000 - tn: 3744.0000 - tp: 143.0000 - precision: 0.9931 - recall: 0.9931 - 225ms/epoch - 112ms/step
Epoch 360/2000
2/2 - 0s - loss: 0.0326 - fn: 1.0000 - fp: 1.0000 - tn: 3744.0000 - tp: 143.0000 - precision: 0.9931 - recall: 0.9931 - 224ms/epoch - 112ms/step
Epoch 361/2000
2/2 - 0s - loss: 0.0324 - fn: 1.0000 - fp: 1.0000 - tn: 3744.0000 - tp: 143.0000 - precision: 0.9931 - recall: 0.9931 - 225ms/epoch - 113ms/step
Epoch 362/2000
2/2 - 0s - loss: 0.0323 - fn: 1.0000 - fp: 1.0000 - tn: 3744.0000 - tp: 143.0000 - precision: 0.9931 - recall: 0.9931 - 227ms/epoch - 113ms/step
Epoch 363/2000
2/2 - 0s - loss: 0.0326 - fn: 1.0000 - fp: 1.0000 - tn: 3744.0000 - tp: 143.0000 - precision: 0.9931 - recall: 0.9931 - 231ms/epoch - 115ms/step
Epoch 364/2000
2/2 - 0s - loss: 0.0325 -

Epoch 409/2000
2/2 - 0s - loss: 0.0328 - fn: 2.0000 - fp: 1.0000 - tn: 3744.0000 - tp: 142.0000 - precision: 0.9930 - recall: 0.9861 - 225ms/epoch - 112ms/step
Epoch 410/2000
2/2 - 0s - loss: 0.0327 - fn: 2.0000 - fp: 0.0000e+00 - tn: 3745.0000 - tp: 142.0000 - precision: 1.0000 - recall: 0.9861 - 225ms/epoch - 113ms/step
Epoch 411/2000
2/2 - 0s - loss: 0.0328 - fn: 1.0000 - fp: 1.0000 - tn: 3744.0000 - tp: 143.0000 - precision: 0.9931 - recall: 0.9931 - 225ms/epoch - 112ms/step
Epoch 412/2000
2/2 - 0s - loss: 0.0327 - fn: 1.0000 - fp: 1.0000 - tn: 3744.0000 - tp: 143.0000 - precision: 0.9931 - recall: 0.9931 - 239ms/epoch - 120ms/step
Epoch 413/2000
2/2 - 0s - loss: 0.0325 - fn: 0.0000e+00 - fp: 2.0000 - tn: 3743.0000 - tp: 144.0000 - precision: 0.9863 - recall: 1.0000 - 228ms/epoch - 114ms/step
Epoch 414/2000
2/2 - 0s - loss: 0.0326 - fn: 1.0000 - fp: 1.0000 - tn: 3744.0000 - tp: 143.0000 - precision: 0.9931 - recall: 0.9931 - 230ms/epoch - 115ms/step
Epoch 415/2000
2/2 - 0s - loss: 

Epoch 460/2000
2/2 - 0s - loss: 0.0327 - fn: 1.0000 - fp: 1.0000 - tn: 3744.0000 - tp: 143.0000 - precision: 0.9931 - recall: 0.9931 - 237ms/epoch - 118ms/step
Epoch 461/2000
2/2 - 0s - loss: 0.0327 - fn: 2.0000 - fp: 0.0000e+00 - tn: 3745.0000 - tp: 142.0000 - precision: 1.0000 - recall: 0.9861 - 226ms/epoch - 113ms/step
Epoch 462/2000
2/2 - 0s - loss: 0.0329 - fn: 1.0000 - fp: 1.0000 - tn: 3744.0000 - tp: 143.0000 - precision: 0.9931 - recall: 0.9931 - 229ms/epoch - 114ms/step
Epoch 463/2000
2/2 - 0s - loss: 0.0330 - fn: 1.0000 - fp: 1.0000 - tn: 3744.0000 - tp: 143.0000 - precision: 0.9931 - recall: 0.9931 - 230ms/epoch - 115ms/step
Epoch 464/2000
2/2 - 0s - loss: 0.0330 - fn: 1.0000 - fp: 1.0000 - tn: 3744.0000 - tp: 143.0000 - precision: 0.9931 - recall: 0.9931 - 234ms/epoch - 117ms/step
Epoch 465/2000
2/2 - 0s - loss: 0.0330 - fn: 1.0000 - fp: 2.0000 - tn: 3743.0000 - tp: 143.0000 - precision: 0.9862 - recall: 0.9931 - 227ms/epoch - 113ms/step
Epoch 466/2000
2/2 - 0s - loss: 0.03

Epoch 511/2000
2/2 - 0s - loss: 0.0330 - fn: 2.0000 - fp: 1.0000 - tn: 3744.0000 - tp: 142.0000 - precision: 0.9930 - recall: 0.9861 - 231ms/epoch - 116ms/step
Epoch 512/2000
2/2 - 0s - loss: 0.0329 - fn: 2.0000 - fp: 1.0000 - tn: 3744.0000 - tp: 142.0000 - precision: 0.9930 - recall: 0.9861 - 227ms/epoch - 114ms/step
Epoch 513/2000
2/2 - 0s - loss: 0.0330 - fn: 0.0000e+00 - fp: 2.0000 - tn: 3743.0000 - tp: 144.0000 - precision: 0.9863 - recall: 1.0000 - 240ms/epoch - 120ms/step
Epoch 514/2000
2/2 - 0s - loss: 0.0328 - fn: 2.0000 - fp: 0.0000e+00 - tn: 3745.0000 - tp: 142.0000 - precision: 1.0000 - recall: 0.9861 - 256ms/epoch - 128ms/step
Epoch 515/2000
2/2 - 0s - loss: 0.0329 - fn: 2.0000 - fp: 0.0000e+00 - tn: 3745.0000 - tp: 142.0000 - precision: 1.0000 - recall: 0.9861 - 258ms/epoch - 129ms/step
Epoch 516/2000
2/2 - 0s - loss: 0.0329 - fn: 2.0000 - fp: 0.0000e+00 - tn: 3745.0000 - tp: 142.0000 - precision: 1.0000 - recall: 0.9861 - 273ms/epoch - 136ms/step
Epoch 517/2000
2/2 - 0s 

In [None]:
y_pred = model.predict(X_test)

In [None]:
np.mean(y_pred)

In [None]:
y_predict = []
for y in y_pred:
    int_y = 1 if y > 0.1 else 0
    y_predict.append(int_y)

In [None]:
accuracy_score(y_test, y_predict)

In [None]:
confusion_matrix(y_test, y_predict)

In [None]:
X

In [None]:
test_pred = model.predict(X)

In [None]:
import re

with open('submission.csv', 'w') as dst:
    dst.write('id,label\n')
    for path, score in zip(range(len(test_files)), test_pred):
        dst.write('%s,%f\n' % (re.search('(\d+).jpg$', path).group(1), score))

In [None]:
# from rdkit.Chem.BRICS import BRICSDecompose #пока не знаю как использовать
# brics = []
# for mol in tqdm(data["Smiles"]):
#     m = Chem.MolFromSmiles(mol)
#     brics.append(BRICSDecompose(m))

In [None]:
# from rdkit.Chem.Descriptors3D import Asphericity
# den_morgan1 = []
# den_morgan2 = []
# den_morgan3 = []
# heavy_mol_vt = []
# for mol in tqdm(data["Smiles"]):
#     m = Chem.MolFromSmiles(mol)
# #     den_morgan1.append(Descriptors.MaxAbsPartialCharge(mol))
# #     den_morgan2.append(Descriptors.MaxPartialCharge(mol))
# #     den_morgan3.append(Descriptors.MinAbsPartialCharge(mol))
#     heavy_mol_vt.append(PropertyFunctor(mol))