In [26]:
from rdkit import Chem
from rdkit.Chem import AllChem
import numpy as np
from rdkit.Chem import rdFingerprintGenerator
import hyp

def smiles_to_fingerprint(smiles, radius=2, n_bits=2048):
    mfpgen = rdFingerprintGenerator.GetMorganGenerator(radius=hyp.fingerprint_radius,fpSize=hyp.fingerprint_length)
    if smiles is None:
        return np.zeros((hyp.fingerprint_length,))
    molecule = Chem.MolFromSmiles(smiles)
    if molecule is None:
        return np.zeros((hyp.fingerprint_length,))
    fingerprint = mfpgen.GetFingerprint(molecule)
    return np.array(fingerprint)

In [356]:
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error

data_perm = pd.read_csv('corneal.csv')

X_perm = np.array([smiles_to_fingerprint(smiles) for smiles in data_perm['SMILES']])
y_perm = data_perm['logPerm'].values

In [295]:
def load_data():
    from datasets import load_dataset
    dataset = load_dataset("scbirlab/hudson-2023-dosedo")
    return dataset['train'].to_pandas()

dataset = load_data()


In [374]:
df = dataset.head(5000)
X = np.array([smiles_to_fingerprint(s) for s in df['SMILES']])
y = df['clogp'].values  # Предполагаем, что целевая переменная - logPerm

X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=42
)

model = CatBoostRegressor(iterations=5000, learning_rate=0.005, depth= 4, l2_leaf_reg=1)
model.fit(X_train, y_train, eval_set=(X_test, y_test))

y_pred = model.predict(X_test)
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_pred)):.3f}")
print(f"R²: {r2_score(y_test, y_pred):.3f}")

0:	learn: 1.2334958	test: 1.2678477	best: 1.2678477 (0)	total: 9.76ms	remaining: 48.8s
1:	learn: 1.2312157	test: 1.2654815	best: 1.2654815 (1)	total: 19.8ms	remaining: 49.4s
2:	learn: 1.2288327	test: 1.2630435	best: 1.2630435 (2)	total: 29.8ms	remaining: 49.6s
3:	learn: 1.2267080	test: 1.2608956	best: 1.2608956 (3)	total: 40.5ms	remaining: 50.6s
4:	learn: 1.2245890	test: 1.2587234	best: 1.2587234 (4)	total: 51.8ms	remaining: 51.7s
5:	learn: 1.2224946	test: 1.2566428	best: 1.2566428 (5)	total: 64.4ms	remaining: 53.6s
6:	learn: 1.2203457	test: 1.2545503	best: 1.2545503 (6)	total: 76ms	remaining: 54.2s
7:	learn: 1.2180925	test: 1.2522631	best: 1.2522631 (7)	total: 86.9ms	remaining: 54.2s
8:	learn: 1.2157607	test: 1.2498900	best: 1.2498900 (8)	total: 97ms	remaining: 53.8s
9:	learn: 1.2135954	test: 1.2476454	best: 1.2476454 (9)	total: 107ms	remaining: 53.5s
10:	learn: 1.2114658	test: 1.2455478	best: 1.2455478 (10)	total: 118ms	remaining: 53.6s
11:	learn: 1.2094485	test: 1.2435247	best: 1.24

In [380]:
mean_squared_error(model.predict(X_perm), y_perm)

2.814897104007728

In [385]:
model.predict(X_perm) - y_perm

array([-1.51026918, -0.39206471, -0.74913469, -0.12695005, -1.16626114,
       -1.39198676, -1.84829749,  1.75268876,  0.04744857, -1.80552606,
        0.50828229, -1.29282405,  0.65645087, -1.18704032, -2.32680281,
       -0.15794967,  1.32375909,  0.27130154, -1.52142355, -0.12442571,
        1.42317199, -3.32585375, -0.78556063, -0.27035372, -2.40370647,
       -0.19137044, -1.6317153 , -1.37399847, -1.47228192, -2.61753684,
       -2.08626116,  1.37896302, -0.21868061,  0.49542155,  1.06211408,
       -0.23431857, -2.62696076, -3.03465384, -2.7058976 ,  1.07293991,
       -1.3245784 , -2.25559252, -1.930926  , -1.13950189, -1.28401201,
        0.85709026,  1.76289008,  0.47047283,  1.93692314,  1.39195901,
       -0.08410343, -1.50369484, -1.54660251,  0.028185  , -2.12249695,
        0.82279605, -1.41256928, -1.62787945, -1.66908734, -1.71235048,
       -1.76142539, -1.59899949, -1.74994616, -2.10412222,  1.27489587,
        2.46822829, -2.2318644 , -1.95216264,  1.29481928, -2.11

In [224]:
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem import Crippen


def calculate_logP(smiles):
    mol = Chem.MolFromSmiles(smiles)
    logP_clogp = Crippen.MolLogP(mol)
    logP_mlogp = rdMolDescriptors.CalcCrippenDescriptors(mol)[0]
    return logP_clogp
    
y_lib = np.array([calculate_logP(smiles) for smiles in data_perm['SMILES']])

print(f"RMSE: {np.sqrt(mean_squared_error(y_perm, y_lib)):.3f}")

RMSE: 8.520


In [392]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from joblib import dump, load

data_irrit = pd.read_csv('irritation.csv')

X_irrit = np.array([smiles_to_fingerprint(smiles) for smiles in data_irrit['SMILES']])
y_irrit = data_irrit['Class'].values

X_train, X_test, y_train, y_test = train_test_split(X_irrit, y_irrit, test_size=0.2)

model_irrit = LogisticRegression(max_iter=1000, class_weight='balanced')
model_irrit.fit(X_train, y_train)

dump(model_irrit, 'model_irrit.joblib') 

y_pred = model_irrit.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.3f}")
print(f"AUC-ROC: {roc_auc_score(y_test, model_irrit.predict_proba(X_test)[:, 1]):.3f}")

Accuracy: 0.943
AUC-ROC: 0.967


In [393]:
from sklearn.linear_model import LogisticRegression

data_melanin = pd.read_csv('melanin.csv')

X_melanin = np.array([smiles_to_fingerprint(smiles) for smiles in data_melanin['SMILES']])
y_melanin = data_melanin['Class'].values

X_train, X_test, y_train, y_test = train_test_split(X_melanin, y_melanin, test_size=0.2)

param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga'],
    'class_weight': [None, 'balanced']
}

model = GridSearchCV(
    LogisticRegression(max_iter=2000, random_state=42),
    param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1
)

model.fit(X_train, y_train)


print(f"Best params: {model.best_params_}")
best_model = model.best_estimator_
dump(best_model, 'model_melanin.joblib') 

y_pred = best_model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.3f}")
print(f"AUC-ROC: {roc_auc_score(y_test, model_melanin.predict_proba(X_test)[:, 1]):.3f}")

Best params: {'C': 0.1, 'class_weight': None, 'penalty': 'l2', 'solver': 'saga'}
Accuracy: 0.821
AUC-ROC: 0.963
