## Библиотеки

In [1]:
import pandas as pd
from tqdm import tqdm
from urllib.request import urlopen
from urllib.parse import quote
from rdkit import Chem
import numpy as np

In [2]:
from mol_descriptors.topology import TopologyDescriptors
from mol_descriptors.constitution import ConstitutionDescriptors
from mol_descriptors.burden import BurdenDescriptors
from mol_descriptors.basak import BasakDescriptors
from mol_descriptors.cats2d import Cats2dDescriptors
from mol_descriptors.charge import ChargeDescriptors
from mol_descriptors.connectivity import ConnectivityDescriptors
from mol_descriptors.estate import EstateDescriptors
from mol_descriptors.geary import GearyDescriptors
from mol_descriptors.kappa import KappaDescriptors
from mol_descriptors.moe import MoeDescriptors
from mol_descriptors.moran import MoranDescriptors
from mol_descriptors.moreaubroto import MoreaubrotoDescriptors

## Работа с данными

Генерация данных для обучающей выборки

In [3]:
# Path to the training CSV; the next cell passes it to pd.read_csv.
name_df = "Task/train.csv"

In [4]:
# Load the training table from the path stored in name_df.
df = pd.read_csv(name_df)

In [5]:
# Keep every row, but only the columns from 'Smiles' through the last one
# (label-based .loc slices include the start label); any columns to the left
# of 'Smiles' are discarded.
df = df.loc[:, 'Smiles':]

Генерация признаков

In [6]:
# Compute one descriptor table per descriptor family for the training frame.
# Every calculator is a project-local class from the mol_descriptors package;
# presumably each one reads the 'Smiles' column and returns a DataFrame aligned
# row-by-row with df -- TODO confirm against the package.
# NOTE(review): MoeDescriptors is imported at the top of the notebook but is
# not used here -- confirm that the MOE family is left out on purpose.
df_topology = TopologyDescriptors().getTopology(df)
# NOTE(review): "constution" is a misspelling of "constitution"; the name is
# referenced by the concat cell below, so rename both places together.
df_constution = ConstitutionDescriptors().getConstitutional(df)
df_burden = BurdenDescriptors().getBurden(df)
df_basak = BasakDescriptors().getBasak(df)
df_cats2d = Cats2dDescriptors().getCATS2D(df)
df_charge = ChargeDescriptors().getCharge(df)
df_connectivity = ConnectivityDescriptors().getConnectivity(df)
df_estate = EstateDescriptors().getEstate(df)
df_geary = GearyDescriptors().getGearyAuto(df)
df_kappa = KappaDescriptors().getKappa(df)
df_moran = MoranDescriptors().getMoranAuto(df)
df_moreaubroto = MoreaubrotoDescriptors().getMoreauBrotoAuto(df)

In [7]:
# Join all training descriptor tables column-wise with a single concat instead
# of eleven pairwise ones (each pairwise concat copied the data again and left
# behind opaque intermediates df1..df10, which are no longer created).
# The list order reproduces exactly the column order the pairwise chain
# produced, so df11 has the same layout as before:
#   geary, kappa, moran, moreau-broto,
#   topology, constitution, burden, basak,
#   cats2d, charge, connectivity, estate
df11 = pd.concat(
    [
        df_geary,
        df_kappa,
        df_moran,
        df_moreaubroto,
        df_topology,
        df_constution,
        df_burden,
        df_basak,
        df_cats2d,
        df_charge,
        df_connectivity,
        df_estate,
    ],
    axis=1,
)

In [8]:
# Combined training table: the original columns (from 'Smiles' onward) first,
# followed by all generated descriptor columns.
data = pd.concat([df, df11], axis=1)

In [9]:
# Column labels of the combined frame as a NumPy array. The snapshot is taken
# before the "GMTI" column is dropped further down, so it still contains it.
# NOTE(review): name_col is not used anywhere else in the visible notebook.
name_col = data.columns.values

In [14]:
# Sanity check: unique values of the per-column "contains NaN" flags;
# array([False]) means that no column has missing values.
# NOTE(review): this cell's execution count (14) is later than that of the
# fillna cell (13) below, so the saved output reflects the already-filled
# frame; on a fresh top-to-bottom run the result may differ.
data.isnull().any().unique()

array([False])

In [11]:
# Replace infinities with large finite sentinel values.
# Negative infinity is handled as well (mapped to -10000): previously only
# +inf was replaced, so any -inf stayed in the frame and was written to the
# CSV unchanged.
data = data.replace([np.inf, -np.inf], [10000, -10000])

In [12]:
# Remove the "GMTI" descriptor column.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps
# the cell re-runnable: the in-place version raised KeyError when executed a
# second time because the column was already gone.
data = data.drop(columns="GMTI", errors="ignore")

In [13]:
data = data.fillna(0) # fill NaNs with 0

In [15]:
# Save the training feature table. index=False is not passed, so the row index
# is written to the CSV as an extra leading column.
data.to_csv('Task/train_from_md.csv')

Генерация данных для тестовой выборки

In [19]:
# Load the test table.
dt = pd.read_csv("Task/test.csv")

In [20]:
# Keep every row, but only the columns from 'Smiles' through the last one
# (label-based .loc slices include the start label); any columns to the left
# of 'Smiles' are discarded.
dt = dt.loc[:, 'Smiles':]

In [21]:
# Compute the same twelve descriptor families for the test frame, using the
# same calculators and methods as for the training frame.
# Presumably each calculator reads the 'Smiles' column and returns a DataFrame
# aligned row-by-row with dt -- TODO confirm against the mol_descriptors package.
# NOTE(review): MoeDescriptors is imported at the top of the notebook but is
# not used here either -- confirm that the MOE family is left out on purpose.
dt_topology = TopologyDescriptors().getTopology(dt)
# NOTE(review): "constution" is a misspelling of "constitution"; the name is
# referenced by the concat cell below, so rename both places together.
dt_constution = ConstitutionDescriptors().getConstitutional(dt)
dt_burden = BurdenDescriptors().getBurden(dt)
dt_basak = BasakDescriptors().getBasak(dt)
dt_cats2d = Cats2dDescriptors().getCATS2D(dt)
dt_charge = ChargeDescriptors().getCharge(dt)
dt_connectivity = ConnectivityDescriptors().getConnectivity(dt)
dt_estate = EstateDescriptors().getEstate(dt)
dt_geary = GearyDescriptors().getGearyAuto(dt)
dt_kappa = KappaDescriptors().getKappa(dt)
dt_moran = MoranDescriptors().getMoranAuto(dt)
dt_moreaubroto = MoreaubrotoDescriptors().getMoreauBrotoAuto(dt)

In [22]:
# Join all test descriptor tables column-wise with a single concat instead of
# eleven pairwise ones (each pairwise concat copied the data again and left
# behind opaque intermediates dt1..dt10, which are no longer created).
# The list order reproduces exactly the column order the pairwise chain
# produced, so dt11 has the same layout as before:
#   geary, kappa, moran, moreau-broto,
#   topology, constitution, burden, basak,
#   cats2d, charge, connectivity, estate
dt11 = pd.concat(
    [
        dt_geary,
        dt_kappa,
        dt_moran,
        dt_moreaubroto,
        dt_topology,
        dt_constution,
        dt_burden,
        dt_basak,
        dt_cats2d,
        dt_charge,
        dt_connectivity,
        dt_estate,
    ],
    axis=1,
)

# Combined test table: the original columns (from 'Smiles' onward) first,
# followed by all generated descriptor columns.
test = pd.concat([dt, dt11], axis=1)

In [27]:
# Sanity check: unique values of the per-column "contains NaN" flags;
# array([False]) means that no column has missing values.
# NOTE(review): this cell's execution count (27) is later than that of the
# fillna cell (26) below, so the saved output reflects the already-filled
# frame; on a fresh top-to-bottom run the result may differ.
test.isnull().any().unique()

array([False])

In [24]:
# Replace infinities with large finite sentinel values.
# Negative infinity is handled as well (mapped to -10000): previously only
# +inf was replaced, so any -inf stayed in the frame and was written to the
# CSV unchanged.
test = test.replace([np.inf, -np.inf], [10000, -10000])

In [25]:
# Remove the "GMTI" descriptor column.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps
# the cell re-runnable: the in-place version raised KeyError when executed a
# second time because the column was already gone.
test = test.drop(columns="GMTI", errors="ignore")

In [26]:
test = test.fillna(0) # fill NaNs with 0

In [28]:
# Save the test feature table. index=False is not passed, so the row index is
# written to the CSV as an extra leading column.
test.to_csv('Task/test_from_md.csv')