In [1]:
!pip install rdkit datamol molfeat xgboost catboost joblib

Collecting rdkit
  Downloading rdkit-2024.3.5-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.9 kB)
Collecting datamol
  Downloading datamol-0.12.5-py3-none-any.whl.metadata (8.0 kB)
Collecting molfeat
  Downloading molfeat-0.10.1-py3-none-any.whl.metadata (10 kB)
Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Collecting loguru (from datamol)
  Downloading loguru-0.7.2-py3-none-any.whl.metadata (23 kB)
Collecting selfies (from datamol)
  Downloading selfies-2.1.2-py3-none-any.whl.metadata (14 kB)
Collecting s3fs>=2021.9 (from molfeat)
  Downloading s3fs-2024.9.0-py3-none-any.whl.metadata (1.6 kB)
Collecting pmapper (from molfeat)
  Downloading pmapper-1.1.1-py3-none-any.whl.metadata (12 kB)
Collecting mordredcommunity (from molfeat)
  Downloading mordredcommunity-2.0.6-py3-none-any.whl.metadata (6.2 kB)
Collecting python-dotenv (from molfeat)
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Collecting aioboto

In [2]:
!pip install -U lightautoml

Collecting lightautoml
  Downloading lightautoml-0.3.8.1-py3-none-any.whl.metadata (16 kB)
Collecting autowoe>=1.2 (from lightautoml)
  Downloading AutoWoE-1.3.2-py3-none-any.whl.metadata (2.8 kB)
Collecting cmaes (from lightautoml)
  Downloading cmaes-0.11.1-py3-none-any.whl.metadata (18 kB)
Collecting joblib<1.3.0 (from lightautoml)
  Downloading joblib-1.2.0-py3-none-any.whl.metadata (5.3 kB)
Collecting json2html (from lightautoml)
  Downloading json2html-1.3.0.tar.gz (7.0 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting lightgbm<=3.2.1,>=2.3 (from lightautoml)
  Downloading lightgbm-3.2.1-py3-none-manylinux1_x86_64.whl.metadata (14 kB)
Collecting optuna (from lightautoml)
  Downloading optuna-4.0.0-py3-none-any.whl.metadata (16 kB)
Collecting pandas<2.0.0 (from lightautoml)
  Downloading pandas-1.5.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting poetry-core<2.0.0,>=1.0.0 (from lightautoml)
  Downloading poetry_core-1.9.0-p

In [2]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import AllChem, Draw, Descriptors
from rdkit.Chem.Draw import IPythonConsole
from sklearn.preprocessing import FunctionTransformer

import matplotlib.pyplot as plt
import seaborn as sns

def mol_dsc_calc(mols):
    """Compute the descriptors listed in the module-level ``descriptors`` dict.

    Args:
        mols: pandas Series of SMILES strings (read through ``mols.values``).

    Returns:
        DataFrame with one row per SMILES, in input order, and one column per
        key of ``descriptors``. The frame has a default RangeIndex; the index
        of ``mols`` is not carried over.

    Raises:
        ValueError: if RDKit cannot parse one of the SMILES strings.
    """
    rows = []
    for smi in mols.values:
        # Parse each molecule once instead of once per descriptor.
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            raise ValueError(f"Cannot parse SMILES: {smi!r}")
        rows.append({name: func(mol) for name, func in descriptors.items()})
    # Passing the column list keeps the schema stable even for an empty input.
    return pd.DataFrame(rows, columns=list(descriptors))

# Constitutional and physico-chemical descriptors from the RDKit library.
# Maps output column name -> RDKit function that takes a Mol object.
# NOTE(review): "MW" and "Molecular Weight" both point at Descriptors.MolWt, so the
# two resulting columns are identical. A later cell references both names, so both
# are kept here.
descriptors = {"HeavyAtomCount": Descriptors.HeavyAtomCount,
               "NHOHCount": Descriptors.NHOHCount,
               "NOCount": Descriptors.NOCount,
               "NumHAcceptors": Descriptors.NumHAcceptors,
               "NumHDonors": Descriptors.NumHDonors,
               "NumHeteroatoms": Descriptors.NumHeteroatoms,
               "NumRotatableBonds": Descriptors.NumRotatableBonds,
               "NumValenceElectrons": Descriptors.NumValenceElectrons,
               "NumAromaticRings": Descriptors.NumAromaticRings,
               "NumAliphaticHeterocycles": Descriptors.NumAliphaticHeterocycles,
               "RingCount": Descriptors.RingCount,
               "MW": Descriptors.MolWt,
               "LogP": Descriptors.MolLogP,
               "MR": Descriptors.MolMR,
               "TPSA": Descriptors.TPSA,
               "Molecular Weight": Descriptors.MolWt}  # same function as "MW"

def rdkit_fp(smiles_column: pd.Series, radius=3, nBits=2048, useChirality=False):
    """Build Morgan fingerprint bit columns for a series of SMILES strings.

    Args:
        smiles_column: SMILES strings, one molecule per entry.
        radius: Morgan radius passed to the fingerprint generator.
        nBits: length of the bit vector (number of output columns).
        useChirality: whether chirality is included in the fingerprint.

    Returns:
        DataFrame with one row per SMILES (input order, default RangeIndex) and
        ``nBits`` int16 columns named ``bit_id_0`` ... ``bit_id_{nBits-1}``.

    Raises:
        ValueError: if RDKit cannot parse one of the SMILES strings.
    """
    # Local import: this module is not part of the notebook's import cell.
    # The generator API replaces the deprecated AllChem.GetMorganFingerprintAsBitVect.
    from rdkit.Chem import rdFingerprintGenerator

    # One generator for the whole column instead of re-configuring per molecule.
    generator = rdFingerprintGenerator.GetMorganGenerator(
        radius=radius, fpSize=nBits, includeChirality=useChirality)

    def desc_gen(smi):
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            raise ValueError(f"Cannot parse SMILES: {smi!r}")
        return generator.GetFingerprintAsNumPy(mol)

    columns = [f'bit_id_{i}' for i in range(nBits)]
    rows = [desc_gen(smi) for smi in smiles_column]
    if rows:
        # Stack into one 2-D array; int16 is the dtype the original buffer was created with.
        bits = np.vstack(rows).astype(np.int16)
    else:
        # np.vstack rejects an empty list, so build the empty matrix explicitly.
        bits = np.zeros((0, nBits), dtype=np.int16)
    return pd.DataFrame(bits, columns=columns)


def rdkit_2d(smiles_column: pd.Series):
    """Compute every descriptor in ``Descriptors._descList`` for each SMILES.

    Args:
        smiles_column: SMILES strings, one molecule per entry.

    Returns:
        DataFrame with one row per SMILES (input order, default RangeIndex) and
        one column per descriptor name in ``Descriptors._descList``.

    Raises:
        ValueError: if RDKit cannot parse one of the SMILES strings.
    """
    # Named differently from the module-level ``descriptors`` dict to avoid shadowing it.
    desc_funcs = {entry[0]: entry[1] for entry in Descriptors._descList}

    rows = []
    for smi in smiles_column:
        # Parse once per molecule; the original re-parsed for every descriptor.
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            raise ValueError(f"Cannot parse SMILES: {smi!r}")
        rows.append({name: func(mol) for name, func in desc_funcs.items()})
    return pd.DataFrame(rows, columns=list(desc_funcs))

In [11]:
def extract_smiles(raw_data: pd.DataFrame, smiles: pd.Series, add_bit_vec: bool=True, add_2d_rdkit: bool=False) -> pd.DataFrame:
    """Append molecular features computed from ``smiles`` to a copy of ``raw_data``.

    Rows are matched by position: the i-th SMILES describes the i-th row of
    ``raw_data``. The index of ``raw_data`` is kept on the result.

    Args:
        raw_data: frame to extend; it is not modified.
        smiles: SMILES strings, same length and order as ``raw_data``.
        add_bit_vec: also append the ``rdkit_fp`` fingerprint columns.
        add_2d_rdkit: also append the ``rdkit_2d`` descriptor columns.

    Returns:
        New DataFrame: the columns of ``raw_data`` followed by the
        ``mol_dsc_calc`` columns and the optional feature sets. A feature column
        whose name is already present is skipped (the first occurrence wins).

    Raises:
        ValueError: if ``raw_data`` and ``smiles`` differ in length.
    """
    if len(raw_data) != len(smiles):
        raise ValueError(
            f"raw_data has {len(raw_data)} rows but smiles has {len(smiles)} entries")

    feature_blocks = [mol_dsc_calc(smiles)]
    if add_bit_vec:
        feature_blocks.append(rdkit_fp(smiles))
    if add_2d_rdkit:
        feature_blocks.append(rdkit_2d(smiles))

    # Concatenate positionally: put every piece on a fresh RangeIndex so a filtered
    # or re-ordered raw_data index cannot misalign rows (join() matches on labels).
    parts = [raw_data.reset_index(drop=True)]
    seen = set(raw_data.columns)
    for block in feature_blocks:
        # rdkit_2d repeats names produced by mol_dsc_calc (e.g. "TPSA"); joining the
        # duplicates would raise on overlapping columns, so keep only new names.
        fresh = [col for col in block.columns if col not in seen]
        seen.update(fresh)
        parts.append(block[fresh].reset_index(drop=True))

    data = pd.concat(parts, axis=1)
    data.index = raw_data.index  # restore the caller's row labels
    return data

In [4]:
def draw_hist(data, features, width=5):
    """Draw a histogram and a scatter plot side by side for each feature.

    Args:
        data: DataFrame holding the columns to plot.
        features: column names; one subplot row is drawn per name.
        width: figure width; the figure height is ``width * len(features)``.

    Returns:
        None. Nothing is drawn when ``features`` is empty.
    """
    features = list(features)
    if not features:
        return

    # squeeze=False keeps `axes` two-dimensional even for a single feature, so the
    # axes[i, 0] / axes[i, 1] indexing below works for any number of rows.
    figure, axes = plt.subplots(
        ncols=2,
        nrows=len(features),
        figsize=(width, width * len(features)),
        squeeze=False,
    )
    for i, name in enumerate(features):
        axes[i, 0].set_title(name)
        sns.histplot(data=data[name], ax=axes[i, 0])
        axes[i, 1].set_title(name)
        # Pass the Series explicitly as `data`; what a bare positional argument
        # means has differed between seaborn releases.
        sns.scatterplot(data=data[name], ax=axes[i, 1])

In [5]:
def cut_quantiles(raw_data: pd.DataFrame, cols_to_cut: list=None, q_min: float=0.25, q_max: float=0.75) -> pd.DataFrame:
    """Keep only the rows whose values lie inside a quantile band for every listed column.

    The quantile bounds are computed once on the unfiltered data; a row is kept
    when, for each column in ``cols_to_cut``, its value is between the ``q_min``
    and ``q_max`` quantiles of that column (both bounds inclusive).

    Args:
        raw_data: input frame; it is not modified.
        cols_to_cut: column names to filter on. ``None`` or an empty list means
            no filtering. A single column name given as a string is accepted.
        q_min: lower quantile.
        q_max: upper quantile.

    Returns:
        Filtered DataFrame that keeps the original index labels of the surviving rows.

    Raises:
        ValueError: if ``q_min`` is greater than ``q_max``.
    """
    data = raw_data.copy()

    if cols_to_cut is None:
        return data
    if isinstance(cols_to_cut, str):
        cols_to_cut = [cols_to_cut]
    # De-duplicate while preserving order; repeated names would yield duplicate labels.
    cols = list(dict.fromkeys(cols_to_cut))
    if not cols:
        return data
    if q_min > q_max:
        raise ValueError(f"q_min ({q_min}) must not exceed q_max ({q_max})")

    subset = data[cols]
    lower = subset.quantile(q_min)
    upper = subset.quantile(q_max)

    # Filter only on the columns quantile() returned bounds for.
    bounded = subset[lower.index]
    # One combined mask instead of looking bounds up by float label, which breaks
    # when q_min == q_max (both bound columns would share the same name).
    keep = (bounded.ge(lower, axis="columns") & bounded.le(upper, axis="columns")).all(axis=1)
    return data[keep]

In [31]:
# Read the IC50 table and keep the first 100 rows.
# NOTE(review): the recorded AutoML log further down reports a much larger training
# frame, so this head(100) looks like a debugging leftover - confirm before a full run.
data = pd.read_csv("/content/ic50_df1.csv").head(100)

# Keep the SMILES strings aside for feature extraction, then drop the text columns
# that are not model inputs.
smiles = data['Smiles']
data = data.drop(columns=["Smiles", "DOI"])

In [32]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder


cat_cols = ["Strain", "Cell"]
encoder = OneHotEncoder()

# Fit on the categorical columns and convert the encoded result to a dense array.
encoded_data = encoder.fit_transform(data[cat_cols]).toarray()

# Give the encoded frame the same index as `data`: concat(axis=1) aligns on index
# labels, so a default RangeIndex here would misalign rows (and introduce NaNs)
# as soon as `data` carries any other index.
encoded_df = pd.DataFrame(
    encoded_data,
    columns=encoder.get_feature_names_out(),
    index=data.index,
)

# Replace the original categorical columns with their one-hot encoding.
df_encoded = pd.concat([data.drop(columns=cat_cols), encoded_df], axis=1)

In [33]:
import joblib
# Persist the fitted one-hot encoder to disk.
joblib.dump(encoder, 'onehot_encoder_ic50.joblib')

['onehot_encoder_ic50.joblib']

In [None]:
# Append the descriptor and fingerprint columns (extract_smiles defaults) to the encoded table.
data_extract = extract_smiles(df_encoded, smiles)

In [35]:
# Keep only rows inside the 3 %-97 % quantile band of each listed descriptor.
data_quant = cut_quantiles(
    data_extract,
    cols_to_cut=["MW", "LogP", "MR", "TPSA", "Molecular Weight"],
    q_min=0.03,
    q_max=0.97,
)

In [37]:
from sklearn.preprocessing import MinMaxScaler
# Feature columns: every column of the filtered frame except the regression target.
cols = list(data_quant.columns)
cols.remove("Standard Value")

# Separate scaler instances for the features and for the target.
min_max = MinMaxScaler()
min_max_target = MinMaxScaler()

# NOTE(review): X and y are reassigned by the train/test split cell that follows,
# and both scalers are fitted on all rows before any split.
X = pd.DataFrame(min_max.fit_transform(data_quant[cols]), columns=cols)
y = pd.DataFrame(
    min_max_target.fit_transform(pd.DataFrame(data_quant["Standard Value"])),
    columns=["Standard Value"],
)

In [38]:
from sklearn.model_selection import train_test_split
# Split the quantile-filtered frame (data_quant), the same data the AutoML cell is
# trained on. Splitting data_extract here would silently bypass the outlier cut.
y = data_quant["Standard Value"]
X = data_quant.drop(columns=["Standard Value"])

# Hold out 15 % for testing, then 15 % of the remainder for validation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.15, random_state=42)

In [None]:
from xgboost import XGBRegressor
reg_xgboost = XGBRegressor(
    n_estimators=1500,
    max_depth=100,
    learning_rate=0.05,
    early_stopping_rounds=5,
)

# Both the training and the validation split are evaluated during fitting.
eval_set = [(X_train, y_train), (X_val, y_val)]
reg_xgboost.fit(X_train, y_train, eval_set=eval_set, verbose=False)

# Score on the held-out test split; shown as the cell output.
reg_xgboost.score(X_test, y_test)

In [None]:
import joblib
# Persist the fitted XGBoost regressor to disk.
joblib.dump(reg_xgboost, 'reg_ic50_xgboost_1.joblib')

In [None]:
from catboost import CatBoostRegressor

reg_catboost = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.02,
    early_stopping_rounds=5,
)

# Fit with the validation split as eval set; training output is silenced and the
# plot options are passed through to CatBoost.
reg_catboost.fit(
    X_train,
    y_train,
    eval_set=(X_val, y_val),
    logging_level="Silent",
    plot=True,
    plot_file="graph.txt",
)

# Score on the held-out test split; shown as the cell output.
reg_catboost.score(X_test, y_test)

In [None]:
import joblib
# Persist the fitted CatBoost regressor to disk.
joblib.dump(reg_catboost, 'reg_ic50_catboost_1.joblib')

In [5]:
import pandas as pd
from sklearn.metrics import f1_score

from lightautoml.automl.presets.tabular_presets import TabularAutoML
from lightautoml.tasks import Task
import torch

In [13]:
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

cols = list(data_quant.columns)

X = data_quant[cols]
df_train, df_test = train_test_split(X, random_state=42)

# Keep the ground truth aside, then actually remove it from the frame handed to
# predict(): DataFrame.drop returns a new frame, so the result has to be re-bound
# (the bare call discarded it and left the target column in df_test).
y_true = df_test["Standard Value"]
df_test = df_test.drop(columns=["Standard Value"])

automl = TabularAutoML(
    task=Task(
        name='reg',
        metric=r2_score),
    timeout=1000  # seconds, as reported in the preset's start-up log
)

# Fit on the training part; the target column is identified through `roles`.
oof_pred = automl.fit_predict(
    df_train,
    roles={'target': "Standard Value"}
)

# Save the fitted object before predicting on the test part.
torch.save(automl, "model.pt")
test_pred = automl.predict(df_test)

INFO:lightautoml.automl.presets.base:Stdout logging level is ERROR.
INFO:lightautoml.automl.presets.base:Task: reg

INFO:lightautoml.automl.presets.base:Start automl preset with listed constraints:
INFO:lightautoml.automl.presets.base:- time: 1000.00 seconds
INFO:lightautoml.automl.presets.base:- CPU: 4 cores
INFO:lightautoml.automl.presets.base:- memory: 16 GB

INFO:lightautoml.reader.base:[1mTrain data shape: (21837, 2462)[0m

INFO3:lightautoml.reader.base:Feats was rejected during automatic roles guess: []
INFO:lightautoml.automl.base:Layer [1m1[0m train process start. Time left 905.38 secs
INFO:lightautoml.ml_algo.base:Start fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m ...
DEBUG:lightautoml.ml_algo.base:Training params: {'tol': 1e-06, 'max_iter': 100, 'cs': [1e-05, 5e-05, 0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100, 500, 1000, 5000, 10000, 50000, 100000], 'early_stopping': 2, 'categorical_idx': [2164, 2165, 2166, 2167, 2168, 2169, 2170, 2171, 2172, 2173

In [16]:
import joblib
# Persist the fitted AutoML object to disk.
# NOTE(review): every other artifact in this notebook is named "ic50"; the "cc50" in
# this file name may be a copy-paste leftover - confirm before anything loads it by name.
joblib.dump(automl, 'automl_cc50.joblib')

['automl_cc50.joblib']

In [None]:
from sklearn.metrics import mean_absolute_error, r2_score, mean_absolute_percentage_error

# The `.data` attribute of the prediction object is what gets scored against the
# held-out target values.
test_pred_1 = test_pred.data
y_true_1 = y_true.to_numpy()

# Label / metric pairs, printed in the same order and with the same labels as before.
for label, metric in (
    ("r2_score: ", r2_score),
    ("mean_absolute_error:", mean_absolute_error),
    ("mean_absolute_percentage_error:", mean_absolute_percentage_error),
):
    print(label, metric(y_true_1, test_pred_1))