# Загрузка необходимых библиотек

In [25]:
# стандартные
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# виджеты из-за нехватки памяти
import ipywidgets as widgets
from IPython.display import display

# предобработка
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer
from sklearn.decomposition import PCA

# создание sklearn-трансформеров и пайплайна для обработки выборок
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from scipy.stats import pearsonr
import joblib

# GridSearch
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from numpy import sqrt
import time
import os
import joblib

# Импорт моделей
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor

### 3. Упаковка обработки в классы для дальнейшего распространения на прочие части выборок

In [26]:
import sys
sys.path.append("../src")

from preprocessing import ProcessingPipeline

### 4. Обучение пайплайна на train выборке и преобразование train и test

In [27]:
df = pd.read_excel('../data/raw/course_task_data.xlsx')

In [28]:
X_train = pd.read_csv('../data/raw/X_train_raw.csv')
X_test = pd.read_csv('../data/raw/X_test_raw.csv')

In [29]:
group_config = {
    'Mass': ['MolWt', 'HeavyAtomMolWt', 'ExactMolWt', 'NumValenceElectrons', 'HeavyAtomCount'],
    'MorganDensity': ['FpDensityMorgan1', 'FpDensityMorgan2', 'FpDensityMorgan3'],
    'EState': ['MaxAbsEStateIndex', 'MinAbsEStateIndex', 'MaxEStateIndex', 'MinEStateIndex'],
    'PartialCharge': ['MaxPartialCharge', 'MinPartialCharge', 'MaxAbsPartialCharge', 'MinAbsPartialCharge'],
    'Chi': [c for c in df.columns if 'Chi' in c],
    'Kappa': ['Kappa1', 'Kappa2', 'Kappa3'],
    'PEOE_VSA': [c for c in df.columns if 'PEOE_VSA' in c],
    'SMR_VSA': [c for c in df.columns if 'SMR_VSA' in c],
    'EState_VSA': [c for c in df.columns if 'EState_VSA' in c],
    'VSA_EState': [c for c in df.columns if 'VSA_EState' in c],
    'SlogP_VSA': [c for c in df.columns if 'SlogP_VSA' in c],
    'BCUT': [c for c in df.columns if 'BCUT' in c],
    'Complexity': ['BertzCT', 'HallKierAlpha', 'Ipc']
}

In [30]:
pipeline = ProcessingPipeline(group_config, corr_threshold=0.9)

In [31]:
pipeline.fit(X_train)

<preprocessing.ProcessingPipeline at 0x2512dbc9070>

In [32]:
X_train_processed = pipeline.transform(X_train)

In [33]:
X_train_processed

Unnamed: 0,qed,AvgIpc,BalabanJ,FractionCSP3,NumAliphaticCarbocycles,NumAliphaticHeterocycles,NumAliphaticRings,NumAromaticCarbocycles,NumAromaticHeterocycles,NumAromaticRings,...,fr_ketone,fr_ketone_Topliss,fr_methoxy,fr_morpholine,fr_para_hydroxylation,fr_phenol,fr_phenol_noOrthoHbond,fr_piperdine,fr_thiazole,fr_thiophene
0,0.179325,-0.161316,0.706106,-0.337818,-1.064960,-0.964537,-1.758636,0.353076,1.062616,0.588096,...,0,0,0,0,0,0,0,0,0,0
1,0.082564,0.578365,-0.812717,-0.460603,-1.064960,1.518637,0.707198,0.353076,-0.843062,-0.108380,...,0,0,0,0,1,0,0,0,0,0
2,0.141045,0.276774,0.404474,-1.425331,-1.064960,-0.964537,-1.758636,1.077439,1.062616,1.093316,...,0,0,0,0,1,0,0,0,0,0
3,-1.695872,0.954808,0.245364,-1.378111,-1.064960,-0.964537,-1.758636,1.547534,1.062616,1.491644,...,0,0,0,0,0,0,0,0,0,0
4,0.137814,1.284755,-0.666360,-0.990549,0.269724,1.188580,0.707198,1.077439,1.062616,1.093316,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,-0.375925,-0.603975,0.318494,1.256113,0.872444,-0.964537,0.018597,-1.073965,-0.843062,-1.257996,...,0,0,0,0,0,0,0,0,0,0
796,1.597438,-0.153040,-0.600898,1.267570,0.872444,0.587098,0.707198,-1.073965,-0.843062,-1.257996,...,0,0,0,0,0,0,0,1,0,0
797,-1.251174,0.282477,-0.804315,-0.756301,-1.064960,1.188580,0.018597,1.547534,-0.843062,1.093316,...,1,1,1,0,0,1,1,0,0,0
798,-1.572410,2.267454,-1.343605,-0.822768,-1.064960,1.518637,0.707198,1.077439,1.735720,1.821345,...,0,0,1,0,0,0,0,0,1,1


In [34]:
X_train_processed.describe()

Unnamed: 0,qed,AvgIpc,BalabanJ,FractionCSP3,NumAliphaticCarbocycles,NumAliphaticHeterocycles,NumAliphaticRings,NumAromaticCarbocycles,NumAromaticHeterocycles,NumAromaticRings,...,fr_ketone,fr_ketone_Topliss,fr_methoxy,fr_morpholine,fr_para_hydroxylation,fr_phenol,fr_phenol_noOrthoHbond,fr_piperdine,fr_thiazole,fr_thiophene
count,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,...,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0
mean,-1.7763570000000002e-17,-6.661338e-18,2.220446e-18,5.5511150000000004e-17,2.2204460000000003e-17,-3.885781e-18,-1.1102230000000002e-17,1.021405e-16,-3.3306690000000003e-17,7.327472e-17,...,0.14875,0.08125,0.14625,0.05875,0.13875,0.0775,0.0775,0.05625,0.05125,0.07125
std,1.000626,1.000626,1.000626,1.000626,1.000626,1.000626,1.000626,1.000626,1.000626,1.000626,...,0.356064,0.27339,0.353578,0.235303,0.345902,0.26755,0.26755,0.230548,0.220645,0.257403
min,-1.890044,-2.580071,-5.911687,-1.806572,-1.06496,-0.9645366,-1.758636,-1.073965,-0.843062,-1.257996,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-0.7879827,-0.7398925,-0.6943732,-0.8380886,-1.06496,-0.9645366,-0.7737796,-1.073965,-0.843062,-1.257996,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.1242068,0.0326814,0.04724989,0.04549732,0.2697243,0.5870978,0.01859708,0.3530757,-0.843062,-0.1083803,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.7733533,0.6445732,0.5689851,1.004812,0.8724443,1.18858,0.7071978,1.077439,1.062616,0.5880961,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,2.123499,2.467007,3.589303,1.45043,1.93567,1.730612,2.934218,1.889,1.903328,2.349453,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [35]:
X_test_processed = pipeline.transform(X_test)

In [36]:
X_test_processed.describe()

Unnamed: 0,qed,AvgIpc,BalabanJ,FractionCSP3,NumAliphaticCarbocycles,NumAliphaticHeterocycles,NumAliphaticRings,NumAromaticCarbocycles,NumAromaticHeterocycles,NumAromaticRings,...,fr_ketone,fr_ketone_Topliss,fr_methoxy,fr_morpholine,fr_para_hydroxylation,fr_phenol,fr_phenol_noOrthoHbond,fr_piperdine,fr_thiazole,fr_thiophene
count,201.0,201.0,201.0,201.0,201.0,201.0,201.0,201.0,201.0,201.0,...,201.0,201.0,201.0,201.0,201.0,201.0,201.0,201.0,201.0,201.0
mean,0.034026,-0.035295,0.144546,-0.064029,-0.02288,0.00236,-0.05967,0.00344,-0.123619,-0.042431,...,0.174129,0.094527,0.18408,0.024876,0.179104,0.089552,0.089552,0.069652,0.054726,0.064677
std,0.98557,1.01323,1.030607,1.031499,0.960607,1.00409,1.025231,0.981796,0.98555,0.983155,...,0.380168,0.293291,0.388517,0.156135,0.384397,0.286252,0.286252,0.255195,0.228013,0.246568
min,-1.909517,-2.580071,-2.243516,-1.806572,-1.06496,-0.964537,-1.758636,-1.073965,-0.843062,-1.257996,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-0.75665,-0.802589,-0.623478,-0.957123,-1.06496,-0.964537,-0.77378,-1.073965,-0.843062,-1.257996,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.174784,-0.156829,0.194091,0.067899,0.269724,0.587098,0.018597,0.353076,-0.843062,-0.10838,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.72359,0.617314,0.760956,0.882354,0.872444,1.18858,0.707198,1.077439,1.062616,0.588096,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,2.280801,2.422547,2.714762,1.45043,1.817859,1.991121,2.934218,1.547534,1.903328,2.349453,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [37]:
print("Пересечение признаков train и test:", len(set(X_train_processed.columns) & set(X_test_processed.columns)))
print("Что есть в train и нет в test:", set(X_train_processed.columns) - set(X_test_processed.columns))

Пересечение признаков train и test: 97
Что есть в train и нет в test: set()


In [38]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression
from sklearn.svm import SVR, SVC
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import (
    RandomForestRegressor, GradientBoostingRegressor,
    RandomForestClassifier, GradientBoostingClassifier
)
from xgboost import XGBRegressor, XGBClassifier
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

In [39]:
reg_models = {
    # Регрессия
    'LinReg': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'SVR': SVR(),
    'KNN-Reg': KNeighborsRegressor(),
    'DecisionTree-Reg': DecisionTreeRegressor(random_state=42),
    'RandomForest-Reg': RandomForestRegressor(random_state=42),
    'XGB-Reg': XGBRegressor(random_state=42),
    'GradBoost-Reg': GradientBoostingRegressor(random_state=42),
}

In [40]:
y_train = pd.read_csv('../data/raw/y_train_raw.csv')
y_test = pd.read_csv('../data/raw/y_test_raw.csv')

In [41]:
from sklearn.model_selection import cross_validate

y_cols = ['lg_IC50', 'lg_CC50', 'lg_SI']
results = []

for col in y_cols:
    for name, model in reg_models.items():
        # Расчет метрик
        scoring = ['neg_mean_squared_error', 'r2']
        scores = cross_validate(model,
                               X_train_processed,
                               y_train[col],
                               cv=5,
                               scoring=scoring,
                               n_jobs=-1)

        # Вычисление средних значений метрик
        rmse = np.sqrt(-np.mean(scores['test_neg_mean_squared_error']))
        r2 = np.mean(scores['test_r2'])

        # Сохраняем результаты в список
        results.append({
            'Target': col.replace('lg_', ''),  # Убираем префикс 'lg_'
            'Model': name,
            'RMSE': rmse,
            'R²': r2
        })

# Создаем DataFrame и сортируем
results_df = pd.DataFrame(results)
sorted_results = results_df.sort_values(by=['Target', 'R²'], ascending=[True, False])

# Показываем результаты
sorted_results

Unnamed: 0,Target,Model,RMSE,R²
15,CC50,RandomForest-Reg,1.242078,0.398244
17,CC50,GradBoost-Reg,1.246862,0.393849
12,CC50,SVR,1.288037,0.353111
13,CC50,KNN-Reg,1.294672,0.345258
16,CC50,XGB-Reg,1.297393,0.344662
10,CC50,Ridge,1.343132,0.292626
9,CC50,LinReg,1.358774,0.275148
11,CC50,Lasso,1.606115,-0.007729
14,CC50,DecisionTree-Reg,1.670222,-0.088148
3,IC50,SVR,1.419844,0.393407


In [42]:
param_grids = {
    'SVR': {
        'C': [0.1, 1, 10, 100],
        'gamma': [0.001, 0.01, 0.1, 1],
        'kernel': ['rbf', 'linear', 'poly'],
        'gamma': ['scale', 'auto']
    },
    'RandomForest-Reg': {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    'GradBoost-Reg': {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 1.0]
    }
}

In [43]:
GS_results = pd.DataFrame()

In [44]:
target = y_cols[1] # lg_CC50
model = 'SVR'

In [45]:
model = 'SVR'
grid_search = GridSearchCV(
    estimator=reg_models[model],
    param_grid=param_grids[model],
    cv=5,
    scoring='r2',
    n_jobs=-1,
    verbose=1
)

# Запуск поиска
start_time = time.time()
print(f"Starting GridSearch for {model} on {target}...")
grid_search.fit(X_train_processed, y_train[target])
search_time = time.time() - start_time

# Результаты
print(f"\nGridSearch completed in {search_time:.2f} seconds")
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best score ({'neg_mean_squared_error'}): {grid_search.best_score_:.4f}")

# Оценка на тестовых данных
best_estimators = grid_search.best_estimator_
y_pred = grid_search.predict(X_test_processed)
r2 = r2_score(y_test[target], y_pred)
rmse = sqrt(mean_squared_error(y_test[target], y_pred))
mae = mean_absolute_error(y_test[target], y_pred)

print(f"\nTest results for {model} on {target}:")
print(f"R²: {r2:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")

# Сохранение результатов в DataFrame
current_results = pd.DataFrame({
    'Target': [target],
    'Model': [model],
    'Best Params': [grid_search.best_params_],
    'CV R²': [grid_search.best_score_],
    'Test R²': r2,
    'Test RMSE': rmse,
    'Test MAE': mae,
    'Search time': [search_time]
})
GS_results = pd.concat([GS_results, current_results], ignore_index=True)

model_filename = f"{target}_{model}.pkl"
model_path = os.path.join("../models/regression", model_filename)
joblib.dump(best_estimators, model_path)

Starting GridSearch for SVR on lg_CC50...
Fitting 5 folds for each of 24 candidates, totalling 120 fits

GridSearch completed in 49.13 seconds
Best parameters: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
Best score (neg_mean_squared_error): 0.3596

Test results for SVR on lg_CC50:
R²: 0.3470
RMSE: 1.2187
MAE: 0.8424


['../models/regression\\lg_CC50_SVR.pkl']

In [46]:
model = 'RandomForest-Reg'
grid_search = GridSearchCV(
    estimator=reg_models[model],
    param_grid=param_grids[model],
    cv=5,
    scoring='r2',
    n_jobs=-1,
    verbose=1
)

# Запуск поиска
start_time = time.time()
print(f"Starting GridSearch for {model} on {target}...")
grid_search.fit(X_train_processed, y_train[target])
search_time = time.time() - start_time

# Результаты
print(f"\nGridSearch completed in {search_time:.2f} seconds")
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best score ({'neg_mean_squared_error'}): {grid_search.best_score_:.4f}")

# Оценка на тестовых данных
best_estimators = grid_search.best_estimator_
y_pred = grid_search.predict(X_test_processed)
r2 = r2_score(y_test[target], y_pred)
rmse = sqrt(mean_squared_error(y_test[target], y_pred))
mae = mean_absolute_error(y_test[target], y_pred)

print(f"\nTest results for {model} on {target}:")
print(f"R²: {r2:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")

# Сохранение результатов в DataFrame
current_results = pd.DataFrame({
    'Target': [target],
    'Model': [model],
    'Best Params': [grid_search.best_params_],
    'CV R²': [grid_search.best_score_],
    'Test R²': r2,
    'Test RMSE': rmse,
    'Test MAE': mae,
    'Search time': [search_time]
})
GS_results = pd.concat([GS_results, current_results], ignore_index=True)

model_filename = f"{target}_{model}.pkl"
model_path = os.path.join("../models/regression", model_filename)
joblib.dump(best_estimators, model_path)

Starting GridSearch for RandomForest-Reg on lg_CC50...
Fitting 5 folds for each of 108 candidates, totalling 540 fits

GridSearch completed in 72.46 seconds
Best parameters: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best score (neg_mean_squared_error): 0.4013

Test results for RandomForest-Reg on lg_CC50:
R²: 0.4526
RMSE: 1.1158
MAE: 0.8105


['../models/regression\\lg_CC50_RandomForest-Reg.pkl']

In [47]:
model = 'GradBoost-Reg'
grid_search = GridSearchCV(
    estimator=reg_models[model],
    param_grid=param_grids[model],
    cv=5,
    scoring='r2',
    n_jobs=-1,
    verbose=1
)

# Запуск поиска
start_time = time.time()
print(f"Starting GridSearch for {model} on {target}...")
grid_search.fit(X_train_processed, y_train[target])
search_time = time.time() - start_time

# Результаты
print(f"\nGridSearch completed in {search_time:.2f} seconds")
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best score ({'neg_mean_squared_error'}): {grid_search.best_score_:.4f}")

# Оценка на тестовых данных
best_estimators = grid_search.best_estimator_
y_pred = grid_search.predict(X_test_processed)
r2 = r2_score(y_test[target], y_pred)
rmse = sqrt(mean_squared_error(y_test[target], y_pred))
mae = mean_absolute_error(y_test[target], y_pred)

print(f"\nTest results for {model} on {target}:")
print(f"R²: {r2:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")

# Сохранение результатов в DataFrame
current_results = pd.DataFrame({
    'Target': [target],
    'Model': [model],
    'Best Params': [grid_search.best_params_],
    'CV R²': [grid_search.best_score_],
    'Test R²': r2,
    'Test RMSE': rmse,
    'Test MAE': mae,
    'Search time': [search_time]
})
GS_results = pd.concat([GS_results, current_results], ignore_index=True)

model_filename = f"{target}_{model}.pkl"
model_path = os.path.join("../models/regression", model_filename)
joblib.dump(best_estimators, model_path)

Starting GridSearch for GradBoost-Reg on lg_CC50...
Fitting 5 folds for each of 54 candidates, totalling 270 fits

GridSearch completed in 32.11 seconds
Best parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 1.0}
Best score (neg_mean_squared_error): 0.3938

Test results for GradBoost-Reg on lg_CC50:
R²: 0.4321
RMSE: 1.1365
MAE: 0.8473


['../models/regression\\lg_CC50_GradBoost-Reg.pkl']

In [48]:
GS_results

Unnamed: 0,Target,Model,Best Params,CV R²,Test R²,Test RMSE,Test MAE,Search time
0,lg_CC50,SVR,"{'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}",0.359587,0.346998,1.218658,0.842355,49.133932
1,lg_CC50,RandomForest-Reg,"{'max_depth': 20, 'min_samples_leaf': 1, 'min_...",0.401274,0.452587,1.115789,0.810468,72.457929
2,lg_CC50,GradBoost-Reg,"{'learning_rate': 0.1, 'max_depth': 3, 'n_esti...",0.393849,0.432075,1.136502,0.847272,32.114968
