## Библиотеки

In [1]:
import pandas as pd
from tqdm import tqdm
from urllib.request import urlopen
from urllib.parse import quote
from rdkit import Chem
import numpy as np

In [2]:
from mol_descriptors.topology import TopologyDescriptors
from mol_descriptors.constitution import ConstitutionDescriptors
from mol_descriptors.burden import BurdenDescriptors
from mol_descriptors.basak import BasakDescriptors
from mol_descriptors.cats2d import Cats2dDescriptors
from mol_descriptors.charge import ChargeDescriptors
from mol_descriptors.connectivity import ConnectivityDescriptors
from mol_descriptors.estate import EstateDescriptors
from mol_descriptors.geary import GearyDescriptors
from mol_descriptors.kappa import KappaDescriptors
from mol_descriptors.moe import MoeDescriptors
from mol_descriptors.moran import MoranDescriptors
from mol_descriptors.moreaubroto import MoreaubrotoDescriptors

## Работа с данными

Генерация данных для обучающей выборки

In [3]:
# Relative path of the training CSV that the next cell loads.
name_df = "Task/train.csv"

In [4]:
# Read the training table from the path stored in name_df.
df = pd.read_csv(filepath_or_buffer=name_df)

In [5]:
# Keep only the columns from 'Smiles' through the last one; anything
# positioned to the left of 'Smiles' is discarded.
df = df.loc[:, slice('Smiles', None)]

Генерация фичей

In [None]:
# Run each descriptor family over the training frame `df`.
# Each result is later joined column-wise with pd.concat, so the helpers
# presumably return one row per molecule, aligned on df's index — TODO confirm.
# NOTE(review): MoeDescriptors is imported above but no MOE features are
# generated here — confirm this omission is intentional.
df_topology = TopologyDescriptors().getTopology(df)
df_constution = ConstitutionDescriptors().getConstitutional(df)
df_burden = BurdenDescriptors().getBurden(df)
df_basak = BasakDescriptors().getBasak(df)
df_cats2d = Cats2dDescriptors().getCATS2D(df)
df_charge = ChargeDescriptors().getCharge(df)
df_connectivity = ConnectivityDescriptors().getConnectivity(df)
df_estate = EstateDescriptors().getEstate(df)
df_geary = GearyDescriptors().getGearyAuto(df)
df_kappa = KappaDescriptors().getKappa(df)
df_moran = MoranDescriptors().getMoranAuto(df)
df_moreaubroto = MoreaubrotoDescriptors().getMoreauBrotoAuto(df)

In [40]:
# Join all descriptor tables column-wise in one pass instead of building
# ten throwaway intermediate frames (each of which is a full copy).
# The list order is deliberate: it yields the column layout
# geary, kappa, moran, moreaubroto, topology, constitution, burden, basak,
# cats2d, charge, connectivity, estate. Later cells select columns by
# position, so this order must not change.
df11 = pd.concat(
    [
        df_geary, df_kappa, df_moran, df_moreaubroto,
        df_topology, df_constution, df_burden, df_basak,
        df_cats2d, df_charge, df_connectivity, df_estate,
    ],
    axis=1,
)

In [41]:
# Put the source columns (df) first, followed by all generated descriptors.
data = pd.concat([df, df11], axis="columns")

In [42]:
# Snapshot of the current column labels as a NumPy array, used later for
# positional look-ups of column names.
name_col = data.columns.to_numpy()

In [48]:
# Reduce the per-column "contains a missing value" flags to their distinct
# values; a lone False means no column has NaNs.
data.isna().any().unique()

array([False])

In [44]:
# Replace +inf with a large finite value (only positive infinity is matched).
data = data.replace(to_replace=np.inf, value=10000)

In [45]:
# Remove the GMTI descriptor column from the feature table.
data = data.drop(columns="GMTI")

In [46]:
# Summary statistics for a window of descriptor columns. The window bounds
# are looked up by position in name_col, i.e. in the column order that was
# captured when name_col was created.
data.loc[:, slice(name_col[106], name_col[120])].describe()

Unnamed: 0,J,Xu,Pol,DZ,Ipc,BertzCT,Thara,Tsch,ZM1,ZM2,MZM1,MZM2,Qindex,Platt
count,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0,1614.0
mean,1.729338,35.472768,41.649318,58.317706,6.012962,2.855469,103.24288,4708569000.0,137.195787,161.048947,9.326083,1.391794,18.626394,80.491945
std,0.904836,29.998353,25.665181,32.635087,4.344087,0.277389,104.846013,24590070000.0,75.078897,89.105311,5.770168,0.806503,9.954506,44.377945
min,0.0,3.972,0.0,11.0,0.557,1.632,6.667,64.0,16.0,14.0,2.111,0.25,-2.0,8.0
25%,1.4465,19.51375,27.0,41.0,4.2895,2.7225,59.034,3368.75,94.0,109.0,6.5,0.976,12.0,54.0
50%,1.787,25.7915,39.0,55.0,5.7305,2.905,89.841,8169.5,130.0,154.0,8.389,1.3085,18.0,76.0
75%,2.209,33.0505,51.0,69.5,7.08375,3.038,124.80625,18894.25,166.0,197.0,10.94075,1.64225,24.0,98.0
max,5.604,207.994,452.0,675.0,112.952,4.052,2514.592,371200000000.0,1426.0,1614.0,122.25,17.405,126.0,820.0


In [34]:
# Render the assembled feature table for a visual check.
data

Unnamed: 0,Smiles,Active,GATSm1,GATSv1,GATSe1,GATSp1,GATSm2,GATSv2,GATSe2,GATSp2,...,S76,Smax76,Smin76,S77,Smax77,Smin77,S78,Smax78,Smin78,S79
0,COc1ccc2[nH]cc(CCN)c2c1,False,0.588967,0.512278,0.883084,0.219970,0.236793,0.271524,0.316371,0.273058,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,CCCN1CCC[C@H](c2cccc(O)c2)C1.Cl,False,-2.228087,-0.098027,-1.250118,-0.503274,-2.458211,0.829169,-0.564047,0.146023,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,O=C(NO)c1cnc(N2CCN(S(=O)(=O)c3ccc4ccccc4c3)CC2...,False,1.770597,0.053467,-0.920195,1.516293,1.849606,-1.028073,-0.839486,-0.471630,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Nc1cccc(CNC(=O)c2ccc(Oc3ccc(OCc4cccc(F)c4)cc3)...,False,-0.135924,-0.089370,-0.011777,-0.175362,0.557959,0.796089,0.931188,0.706730,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Fc1ccccc1CNCc1ccc(-c2ccnc3[nH]ccc23)cc1,False,-0.601503,-0.543852,-0.748454,-0.448111,-0.087864,-0.097089,-0.086949,-0.073003,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5552,O=C(Oc1ccc([N+](=O)[O-])cc1)N1CCC(C(O)(c2ccc3c...,False,0.191161,-0.002802,0.268432,-0.111006,-0.105319,-0.253040,-0.155809,-0.230701,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5553,Nc1nonc1/C(=N/O)Nc1ccc(F)c(Br)c1,False,-0.840187,-1.509084,-0.721337,-1.272487,-0.032009,0.091944,1.123011,-0.169374,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5554,Oc1cccc2cccnc12,False,-0.539623,-0.885795,-0.870481,-0.674891,0.303121,0.484186,0.395067,0.535890,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5555,OC(c1ccc(-c2ccc(CN3CCN(Cc4ccncc4)CC3)cc2)c(F)c...,False,-0.427648,-0.543852,-0.576713,-0.460370,-0.782561,-0.753976,-0.947693,-0.673134,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [47]:
# Fill any remaining NaNs with zero.
data = data.fillna(value=0)

In [51]:
# Persist the finished training features; the row index is written as well
# because no index option is passed.
data.to_csv(path_or_buf='train_from_md.csv')

Генерация данных для тестовой выборки

In [None]:
# Relative path of the test-set CSV.
name_dt = "Task/test.csv"

In [None]:
# Load the test set and keep the columns from 'Smiles' onward, mirroring the
# training-set preparation. Every following cell reads `dt`, so it must be
# created here (the cell previously re-sliced the training frame `df` and
# never read name_dt).
dt = pd.read_csv(name_dt)
dt = dt.loc[:, 'Smiles':]

In [None]:
# Run the same descriptor families as for the training set, this time over
# the test frame `dt`.
# Each result is later joined column-wise with pd.concat, so the helpers
# presumably return one row per molecule, aligned on dt's index — TODO confirm.
dt_topology = TopologyDescriptors().getTopology(dt)
dt_constution = ConstitutionDescriptors().getConstitutional(dt)
dt_burden = BurdenDescriptors().getBurden(dt)
dt_basak = BasakDescriptors().getBasak(dt)
dt_cats2d = Cats2dDescriptors().getCATS2D(dt)
dt_charge = ChargeDescriptors().getCharge(dt)
dt_connectivity = ConnectivityDescriptors().getConnectivity(dt)
dt_estate = EstateDescriptors().getEstate(dt)
dt_geary = GearyDescriptors().getGearyAuto(dt)
dt_kappa = KappaDescriptors().getKappa(dt)
dt_moran = MoranDescriptors().getMoranAuto(dt)
dt_moreaubroto = MoreaubrotoDescriptors().getMoreauBrotoAuto(dt)

In [None]:
# Join all test-set descriptor tables column-wise in one pass instead of
# building ten throwaway intermediate frames (each of which is a full copy).
# The list order is deliberate: it yields the column layout
# geary, kappa, moran, moreaubroto, topology, constitution, burden, basak,
# cats2d, charge, connectivity, estate, matching the training features.
dt11 = pd.concat(
    [
        dt_geary, dt_kappa, dt_moran, dt_moreaubroto,
        dt_topology, dt_constution, dt_burden, dt_basak,
        dt_cats2d, dt_charge, dt_connectivity, dt_estate,
    ],
    axis=1,
)
# Source columns first, generated descriptors after.
test = pd.concat([dt, dt11], axis=1)

In [None]:
# Reduce the per-column "contains a missing value" flags to their distinct
# values; a lone False means no column has NaNs.
test.isna().any().unique()

In [None]:
# Replace +inf with a large finite value (only positive infinity is matched).
test = test.replace(to_replace=np.inf, value=10000)

In [None]:
# Remove the GMTI descriptor column, as is done for the training features.
test = test.drop(columns="GMTI")

In [None]:
# Fill any remaining NaNs with zero.
test = test.fillna(value=0)

In [None]:
# Persist the finished test features; the row index is written as well
# because no index option is passed.
test.to_csv(path_or_buf='test_from_md.csv')