In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.feature_selection import SelectFromModel
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Only a few rows contain missing values and there is no sensible way to
# impute them, so every row with at least one NaN is discarded.
df = pd.read_excel('data/data.xlsx', index_col='Unnamed: 0').dropna(how='any')

In [None]:
# EDA showed heavy outliers in SI, so keep only the rows that lie between the
# 1st and 99th percentiles (bounds inclusive).
# .copy() makes df_filtered an independent frame: without it the column
# assignments below write into a slice of df and pandas emits
# SettingWithCopyWarning.
q_low, q_high = df["SI"].quantile([0.01, 0.99])
df_filtered = df[(df["SI"] >= q_low) & (df["SI"] <= q_high)].copy()

# Log-transform SI (EDA showed that the values are strongly skewed).
df_filtered['SI_log'] = np.log1p(df_filtered['SI'])

# Binary target: 1 when SI_log is at or above the median, 0 otherwise.
# The median is computed once, and the result is kept as float so the class
# labels stay 0.0 / 1.0.
si_log_median = df_filtered['SI_log'].median()
df_filtered['Class_3'] = (df_filtered['SI_log'] >= si_log_median).astype(float)

# Features: everything except the raw activity columns and the columns the
# target was derived from (keeping them would leak the label).
X = df_filtered.drop(columns=['IC50, mM', 'CC50, mM', 'SI', 'SI_log', 'Class_3'])
y = df_filtered['Class_3']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['SI_log'] = np.log1p(df_filtered['SI'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['Class_3'] = [np.nan]*len(df_filtered)


In [None]:
# Selection of important features.
#
# The selector is fitted on the training rows only: fitting it on all of X
# would let the test rows influence which features are kept (data leakage).
# The split arguments mirror the train_test_split call used for modelling
# (test_size=0.2, random_state=42), so the same rows form the training part.
X_fs_train, _, y_fs_train, _ = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# random_state makes the importance ranking, and therefore the selected
# feature list, reproducible between runs.
selector_forest = SelectFromModel(
    RandomForestClassifier(n_estimators=100, random_state=42),
    threshold="median"
)
selector_forest.fit(X_fs_train, y_fs_train)

# Hand-picked descriptors that are always kept, regardless of the selector.
manual_features = [
    'MolLogP', 'TPSA', 'NumHDonors', 'NumHAcceptors',
    'fr_halogen', 'qed', 'FractionCSP3', 'SPS'
]
model_features = list(X.columns[selector_forest.get_support()])

# dict.fromkeys removes duplicates while keeping a stable order.
# list(set(...)) yields a different column order on every kernel restart
# (string hashing is randomized), which changes the downstream models even
# though they are seeded.
selected_features = list(dict.fromkeys(manual_features + model_features))

print(f"Всего отобрано признаков: {len(selected_features)}")

Всего отобрано признаков: 107


In [None]:
# Train / test split on the selected features (80 % / 20 %, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X[selected_features],
    y,
    test_size=0.2,
    random_state=42,
)

# Scaling followed by a random forest, wrapped in a pipeline so the scaler is
# re-fitted inside every cross-validation fold.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('clf', RandomForestClassifier(random_state=42)),
])

# Search space for the 'clf' step: 3 * 4 * 3 * 3 * 3 = 324 combinations.
param_grid = dict(
    clf__n_estimators=[100, 200, 300],
    clf__max_depth=[5, 10, 15, None],
    clf__min_samples_split=[2, 5, 10],
    clf__min_samples_leaf=[1, 2, 4],
    clf__max_features=['sqrt', 'log2', None],
)

# Exhaustive 5-fold search scored by F1, using all available cores.
grid_search = GridSearchCV(pipeline, param_grid, scoring='f1', cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Evaluate the refitted best pipeline on the held-out test set.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

rf_report = classification_report(y_test, y_pred)
rf_confusion = confusion_matrix(y_test, y_pred)

print("\nМодель классификации случайного леса:")
print("Лучшие параметры:", grid_search.best_params_)
print(f"Основные метрики:\n {rf_report}")
print(f"Confusion Matrix:\n {rf_confusion}")

  _data = np.array(data, dtype=dtype, copy=copy,



Модель классификации случайного леса:
Лучшие параметры: {'clf__max_depth': 5, 'clf__max_features': None, 'clf__min_samples_leaf': 2, 'clf__min_samples_split': 5, 'clf__n_estimators': 100}
Основные метрики:
               precision    recall  f1-score   support

         0.0       0.74      0.85      0.79        92
         1.0       0.85      0.74      0.79       104

    accuracy                           0.79       196
   macro avg       0.79      0.79      0.79       196
weighted avg       0.80      0.79      0.79       196

Confusion Matrix:
 [[78 14]
 [27 77]]


In [None]:
# Hyper-parameter search for XGBoost on the same train / test split.
# NOTE: this cell rebinds `param_grid` and `grid_search` from the random
# forest cell; after it runs they refer to the XGBoost search.
# Search space: 3 * 2 * 3 * 2 * 2 * 2 * 2 = 288 combinations.
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [5, 7],
    'learning_rate': [0.05, 0.1, 0.15],
    'gamma': [0, 0.1],
    'reg_alpha': [0, 0.1],
    'subsample': [0.8, 0.9],
    'colsample_bytree': [0.8, 0.9]
}

xgb_classifier = XGBClassifier(
    objective='binary:logistic',  # the target has only 2 classes
    random_state=42
)

# Exhaustive 5-fold search scored by F1, using all available cores.
grid_search = GridSearchCV(
    estimator=xgb_classifier,
    param_grid=param_grid,
    scoring='f1',
    cv=5,
    n_jobs=-1,
)

# Run the search.
grid_search.fit(X_train, y_train)

# GridSearchCV.predict delegates to the estimator refitted with the best
# parameters.
y_pred = grid_search.predict(X_test)

print("\nМодель классификации XGB:")
print("Лучшие параметры:", grid_search.best_params_)
# Start the multi-line report and matrix on their own line (same layout as the
# random forest cell); otherwise the first row is shifted by the label and the
# table columns no longer line up.
print(f"Основные метрики:\n {classification_report(y_test, y_pred)}")
print(f"Confusion Matrix:\n {confusion_matrix(y_test, y_pred)}")


Модель классификации XGB:
Лучшие параметры: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 50, 'reg_alpha': 0.1, 'subsample': 0.8}
Основные метрики:               precision    recall  f1-score   support

         0.0       0.72      0.83      0.77        92
         1.0       0.82      0.71      0.76       104

    accuracy                           0.77       196
   macro avg       0.77      0.77      0.77       196
weighted avg       0.77      0.77      0.77       196

Confusion Matrix: [[76 16]
 [30 74]]


  _data = np.array(data, dtype=dtype, copy=copy,


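В данной задаче на тестовой выборке (196 объектов) случайный лес показал немного лучший результат, чем XGBoost: accuracy 0.79 против 0.77, macro F1 0.79 против 0.77. Разница невелика, но при прочих равных для этой задачи стоит использовать случайный лес.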