In [1]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
import pandas as pd
import numpy as np

In [2]:
data = pd.read_excel("data/data.xlsx")

Подготовка признаков и целевой переменной

In [3]:
target = "SI"
X = data.drop(
    columns=[
        target,
        "IC50, mM",
        "CC50, mM",
        "SI",
        "log10_IC50, mM",
        "log10_SI",
        "log10_CC50, mM",
    ],
    errors="ignore",
)
y = data[target]

# Логарифмирование целевой переменной
y_log = np.log10(y + 1e-9)  # чтобы избежать log(0)

# Разделение на train/test
X_train, X_test, y_train, y_test = train_test_split(
    X, y_log, test_size=0.2, random_state=42
)

Определение моделей и сеток гиперпараметров

In [4]:
models_params = {
    "Ridge": {
        "model": Pipeline([("scaler", StandardScaler()), ("ridge", Ridge())]),
        "params": {"ridge__alpha": [0.01, 0.1, 1, 10, 100]},
    },
    "Lasso": {
        "model": Pipeline(
            [("scaler", StandardScaler()), ("lasso", Lasso(max_iter=5000))]
        ),
        "params": {"lasso__alpha": [0.001, 0.01, 0.1, 1]},
    },
    "RandomForest": {
        "model": RandomForestRegressor(random_state=42),
        "params": {
            "n_estimators": [100, 200],
            "max_depth": [None, 5, 10],
            "min_samples_split": [2, 5],
        },
    },
    "XGBoost": {
        "model": xgb.XGBRegressor(objective="reg:squarederror", random_state=42),
        "params": {
            "n_estimators": [100, 200],
            "max_depth": [3, 5, 7],
            "learning_rate": [0.01, 0.1, 0.2],
        },
    },
}

Подбор гиперпараметров и оценка

In [5]:
results = []
best_models = {}

for name, mp in models_params.items():
    print(f"Обучение модели: {name}")
    grid = GridSearchCV(mp["model"], mp["params"], cv=5, scoring="r2", n_jobs=-1)
    grid.fit(X_train, y_train)

    y_pred_log = grid.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred_log)
    r2 = r2_score(y_test, y_pred_log)
    mse = mean_squared_error(y_test, y_pred_log)
    rmse = np.sqrt(mse)

    results.append(
        {
            "Model": name,
            "Best_Params": grid.best_params_,
            "MAE": mae,
            "MSE": mse,
            "RMSE": rmse,
            "R2": r2,
        }
    )

    best_models[name] = grid

results_df = pd.DataFrame(results).sort_values(by="R2", ascending=False)
display(results_df)

Обучение модели: Ridge
Обучение модели: Lasso
Обучение модели: RandomForest
Обучение модели: XGBoost


Unnamed: 0,Model,Best_Params,MAE,MSE,RMSE,R2
3,XGBoost,"{'learning_rate': 0.01, 'max_depth': 5, 'n_est...",0.514141,0.42373,0.650945,0.234066
2,RandomForest,"{'max_depth': 10, 'min_samples_split': 2, 'n_e...",0.507325,0.444152,0.666447,0.197151
0,Ridge,{'ridge__alpha': 100},0.550539,0.498331,0.705925,0.099218
1,Lasso,{'lasso__alpha': 0.01},0.557933,0.498677,0.706171,0.098591


In [6]:
# Автоматический выбор лучшей модели по R2
best_idx = results_df["R2"].idxmax()
best_model_name = results_df.loc[best_idx, "Model"]
best_model = best_models[best_model_name]

print(f"Лучшая модель: {best_model_name}")

Лучшая модель: XGBoost
