In [1]:
import os
current_directory = os.getcwd()
folder_name = os.path.basename(current_directory)
number = folder_name

In [2]:
CONFIG = {
    'data_main': 'C:/Users/Николай/PycharmProjects/CIBMTR/D.Data/main/',
    'data_train_process': 'C:/Users/Николай/PycharmProjects/CIBMTR/D.Data/train_process/',
    'data_train_split': 'C:/Users/Николай/PycharmProjects/CIBMTR/D.Data/train_split/',
    'train_path': 'train.csv',
    'folds_path': 'v1.csv', 

    'DEVICE' : 'cuda',
    'SEED' : 42,
}

In [3]:
import optuna
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
import numpy as np
import joblib 
from metric import score_
from lifelines import KaplanMeierFitter

In [4]:
def transform_survival_probability(df, time_col='efs_time', event_col='efs'):
    kmf = KaplanMeierFitter()
    kmf.fit(df[time_col], df[event_col])
    y = kmf.survival_function_at_times(df[time_col]).values
    return y

In [5]:
# Load and prepare data
train = pd.read_csv(f"{CONFIG['data_main']}{CONFIG['train_path']}")
train = train.fillna('-1')

for col in ['donor_age', 'age_at_hct']:
    train[col] = train[col].astype(int)
    
train["y"] = transform_survival_probability(train, time_col='efs_time', event_col='efs')

cat_columns = [col for col in train.columns if col not in ['efs', 'efs_time', 'y', 'ID']]
train[cat_columns] = train[cat_columns].astype(str)
train_one_hot = pd.get_dummies(train[cat_columns], drop_first=True)
train_one_hot.columns = train_one_hot.columns.str.replace(r'[^\w\s]', '', regex=True)

In [6]:
# Определение признаков и целевой переменной
X = train_one_hot
y = train['y']

In [7]:
# Установка параметров кросс-валидации
n_splits = 2
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=CONFIG['SEED'])

In [8]:
# Фиксированные параметры
fixed_params = {
    'objective': 'reg:squarederror',
    'verbosity': 0,
    'tree_method': 'hist',
    'n_jobs': -1,
    'random_state': CONFIG['SEED'],
    'missing': -1.0,
    'device': 'cuda',
    'eval_metric': 'rmse'
}

In [9]:
# Функция для оптимизации XGBoost
def optimize_xgb(trial):
    params = {
        'max_depth': trial.suggest_int('max_depth', 3, 100),
        'max_leaves': trial.suggest_int('max_leaves', 3, 100),
        'max_bin': trial.suggest_int('max_bin', 3, 100), 
        'grow_policy': trial.suggest_categorical('grow_policy', ['depthwise', 'lossguide']), 
        'learning_rate': trial.suggest_float('learning_rate', 0.0001, 0.01, log=True), #
        'n_estimators': trial.suggest_int('n_estimators', 3, 100),
        'gamma': trial.suggest_float('gamma', 0.01, 1, log=True), 
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 100),
        'max_delta_step': trial.suggest_int('max_delta_step', 1, 100), 
        'subsample': trial.suggest_float('subsample', 0.3, 1),
        'sampling_method': trial.suggest_categorical('sampling_method', ['uniform', 'gradient_based']),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 1),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.3, 1),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-7, 1e-2, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-7, 1e-2, log=True),
        'importance_type': trial.suggest_categorical('importance_type', ['weight', 'gain', 'cover']),
        'early_stopping_rounds': 300

    }
    # Объединение оптимизированных и фиксированных параметров
    final_params = {**fixed_params, **params}
    
    scores = []
    
    for train_idx, valid_idx in skf.split(X, train['race_group']):
        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]
        
        dtrain = xgb.DMatrix(X_train.to_numpy(), label=y_train, missing=np.nan)
        dvalid = xgb.DMatrix(X_valid.to_numpy(), label=y_valid, missing=np.nan)
        
        model = xgb.train(final_params, dtrain, evals=[(dvalid, 'eval')], verbose_eval=False)
    
        preds = model.predict(dvalid)
        fold_score = score_(pd.DataFrame({'efs': train.iloc[valid_idx]['efs'].to_list(), 
                                          'efs_time' : train.iloc[valid_idx]['efs_time'].to_list(),
                                          'race_group': train.iloc[valid_idx]['race_group'].to_list()}),
                            pd.DataFrame(preds, columns=['prediction']))
        scores.append(fold_score)
    
    return np.mean(scores)

In [10]:
# Оптимизация гиперпараметров с помощью Optuna
study_xgb = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=42))
study_xgb.optimize(optimize_xgb, n_trials=25)

[I 2024-12-29 15:33:43,369] A new study created in memory with name: no-name-b77b0f4a-eb35-4c00-8fca-e3ecc4a524dd
[I 2024-12-29 15:33:45,157] Trial 0 finished with value: 0.6265185595971899 and parameters: {'max_depth': 39, 'max_leaves': 96, 'max_bin': 74, 'grow_policy': 'depthwise', 'learning_rate': 0.00020511104188433984, 'n_estimators': 8, 'gamma': 0.5399484409787431, 'min_child_weight': 61, 'max_delta_step': 71, 'subsample': 0.3144091460070617, 'sampling_method': 'uniform', 'colsample_bytree': 0.44863737747479326, 'colsample_bylevel': 0.42727747704497043, 'reg_alpha': 8.260808399079598e-07, 'reg_lambda': 3.3205591037519573e-06, 'importance_type': 'weight'}. Best is trial 0 with value: 0.6265185595971899.
[I 2024-12-29 15:33:46,183] Trial 1 finished with value: 0.6238514369939092 and parameters: {'max_depth': 62, 'max_leaves': 16, 'max_bin': 31, 'grow_policy': 'lossguide', 'learning_rate': 0.0037183641805732083, 'n_estimators': 22, 'gamma': 0.10677482709481352, 'min_child_weight': 6

In [None]:
best_params_xgb = study_xgb.best_params

In [None]:
# Объединение лучших параметров с фиксированными
final_params = {**fixed_params, **best_params_xgb}

In [None]:
# Финальное обучение на всей выборке и сохранение моделей
final_xgb = xgb.XGBRegressor(**final_params)
final_xgb.fit(X, y)
joblib.dump(final_xgb, 'final_xgb_model.pkl')

In [26]:
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=CONFIG['SEED'])

# Фиксированные параметры для XGBoost
fixed_params = {
    'objective': 'reg:squarederror',
    'verbosity': 0,
    'tree_method': 'hist',
    'n_jobs': -1,
    'random_state': CONFIG['SEED'],
    'missing': -1.0,
    'device': 'cuda',
    'eval_metric': 'rmse'
}

# Фиксируем один фолд для обучения
train_idx, valid_idx = list(skf.split(X, train['race_group']))[0]  # Берем первый фолд

# Разделение на тренировочные и валидационные данные
X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

# Преобразуем в DMatrix для XGBoost
dtrain = xgb.DMatrix(X_train.to_numpy(), label=y_train, missing=np.nan)
dvalid = xgb.DMatrix(X_valid.to_numpy(), label=y_valid, missing=np.nan)

# Обучение модели
model = xgb.train(fixed_params, dtrain, evals=[(dvalid, 'eval')], verbose_eval=True, num_boost_round=75)

# Предсказания
preds = model.predict(dvalid)

# Вычисление скора
fold_score = score_(pd.DataFrame({'efs': train.iloc[valid_idx]['efs'].to_list(), 
                                  'efs_time' : train.iloc[valid_idx]['efs_time'].to_list(),
                                  'race_group': train.iloc[valid_idx]['race_group'].to_list()}),
                    pd.DataFrame(preds, columns=['prediction']))

# Вывод результата
print(f'Fold score: {fold_score}')

[0]	eval-rmse:0.17302
[1]	eval-rmse:0.17062
[2]	eval-rmse:0.16904
[3]	eval-rmse:0.16791
[4]	eval-rmse:0.16722
[5]	eval-rmse:0.16678
[6]	eval-rmse:0.16635
[7]	eval-rmse:0.16583
[8]	eval-rmse:0.16542
[9]	eval-rmse:0.16502
[10]	eval-rmse:0.16451
[11]	eval-rmse:0.16416
[12]	eval-rmse:0.16410
[13]	eval-rmse:0.16409
[14]	eval-rmse:0.16377
[15]	eval-rmse:0.16376
[16]	eval-rmse:0.16360
[17]	eval-rmse:0.16345
[18]	eval-rmse:0.16344
[19]	eval-rmse:0.16316
[20]	eval-rmse:0.16300
[21]	eval-rmse:0.16287
[22]	eval-rmse:0.16286
[23]	eval-rmse:0.16264
[24]	eval-rmse:0.16263
[25]	eval-rmse:0.16261
[26]	eval-rmse:0.16251
[27]	eval-rmse:0.16263
[28]	eval-rmse:0.16260
[29]	eval-rmse:0.16259
[30]	eval-rmse:0.16262
[31]	eval-rmse:0.16257
[32]	eval-rmse:0.16258
[33]	eval-rmse:0.16265
[34]	eval-rmse:0.16275
[35]	eval-rmse:0.16281
[36]	eval-rmse:0.16282
[37]	eval-rmse:0.16277
[38]	eval-rmse:0.16283
[39]	eval-rmse:0.16283
[40]	eval-rmse:0.16279
[41]	eval-rmse:0.16272
[42]	eval-rmse:0.16264
[43]	eval-rmse:0.1626