In [40]:
import os
# The name of the folder the notebook runs from doubles as the experiment
# identifier; it is later used as the prefix of the saved model file.
current_directory = os.getcwd()
number = folder_name = os.path.split(current_directory)[1]

In [41]:
# Root folder that holds the data directories. It defaults to the original
# local path; set the CIBMTR_DATA_ROOT environment variable to run the notebook
# on another machine without editing this cell.
_DEFAULT_DATA_ROOT = 'C:/Users/Николай/PycharmProjects/CIBMTR/D.Data'
DATA_ROOT = (os.environ.get('CIBMTR_DATA_ROOT') or _DEFAULT_DATA_ROOT).rstrip('/\\') + '/'

CONFIG = {
    # Directories: each value ends with '/' because file paths are built by
    # plain string concatenation (directory + file name).
    'data_main': f'{DATA_ROOT}main/',
    'data_train_process': f'{DATA_ROOT}train_process/',
    'data_train_split': f'{DATA_ROOT}train_split/',
    # File names, appended to one of the directories above.
    'train_path': 'train.csv',
    'folds_path': 'v1.csv',

    'DEVICE': 'cuda',
    # Shared random seed (used for the cross-validation splitter).
    'SEED': 42,
}

In [42]:
import optuna
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import numpy as np
import joblib
import optuna
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
import numpy as np
import joblib 
from metric import score_
from lifelines import KaplanMeierFitter

In [43]:
def transform_survival_probability(df, time_col='efs_time', event_col='efs'):
    """Turn (time, event) pairs into a Kaplan-Meier survival estimate per row.

    A ``KaplanMeierFitter`` is fitted on the whole frame, and the fitted
    survival function is then evaluated at each row's own time value.

    Args:
        df: Frame containing the time and event columns.
        time_col: Name of the duration column.
        event_col: Name of the event-indicator column (used for fitting only).

    Returns:
        Array of survival-function values, one per row of ``df``.
    """
    durations = df[time_col]
    observed = df[event_col]

    estimator = KaplanMeierFitter()
    estimator.fit(durations, observed)

    survival_at_own_time = estimator.survival_function_at_times(durations)
    return survival_at_own_time.values

In [44]:
# Load the training table; every missing value becomes the string '-1'.
train_csv = f"{CONFIG['data_main']}{CONFIG['train_path']}"
train = pd.read_csv(train_csv).fillna('-1')

# Cast the two age columns to int (the '-1' placeholder becomes -1).
train = train.astype({'donor_age': int, 'age_at_hct': int})

# Regression target: Kaplan-Meier survival estimate at each row's own time.
train["y"] = transform_survival_probability(train, time_col='efs_time', event_col='efs')

# Everything except the outcome columns, the target and the ID is treated as a
# categorical feature and stored as strings.
non_feature_columns = ('efs', 'efs_time', 'y', 'ID')
cat_columns = [name for name in train.columns if name not in non_feature_columns]
train[cat_columns] = train[cat_columns].astype(str)

# NOTE(review): train_one_hot is not used by any cell visible in this notebook.
train_one_hot = pd.get_dummies(train[cat_columns], drop_first=True)

In [45]:
# Define the features and the target variable.
# Every feature column is cast to str, because all of them are passed to
# CatBoost as categorical features further down.
X = train.drop(columns=['y', 'efs', 'efs_time', 'ID', 'race_group']).astype(str)

y = train['y']

In [46]:
# Cross-validation setup: shuffled stratified folds, seeded from CONFIG.
n_splits = 5
skf = StratifiedKFold(
    n_splits=n_splits,
    shuffle=True,
    random_state=CONFIG['SEED'],
)

In [47]:
# Fixed CatBoost parameters: shared by every Optuna trial and by the final
# model; the tuned parameters are overlaid on top of these.
fixed_params = dict(
    # Overfitting detector.
    od_wait=50,
    od_type='IncToDec',
    leaf_estimation_method='Newton',
    # Logging / output.
    verbose=False,
    eval_metric='RMSE',
    allow_writing_files=False,
    # Train on GPU device 0.
    task_type='GPU',
    devices='0',
    # Tree construction.
    score_function='NewtonL2',
    leaf_estimation_backtracking='Armijo',
    boost_from_average=False,
    grow_policy='Lossguide',
)


In [48]:
def optimize_cat(trial):
    """Optuna objective for CatBoost: mean cross-validated ``score_``.

    Draws one hyper-parameter set from ``trial``, overlays it on
    ``fixed_params``, trains a fresh ``CatBoostRegressor`` on every fold of
    ``skf`` (folds are stratified by ``train['race_group']``) and scores the
    predictions for the held-out rows with ``score_``.

    Reads the notebook-level ``X``, ``y``, ``train``, ``skf`` and
    ``fixed_params``.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        Mean of the per-fold scores.
    """
    sampled = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
        'depth': trial.suggest_int('depth', 4, 16),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-5, 100, log=True),
        'model_size_reg': trial.suggest_int('model_size_reg', 0, 25),
        'border_count': trial.suggest_int('border_count', 128, 255),
        'feature_border_type': trial.suggest_categorical(
            'feature_border_type',
            ['Median', 'Uniform', 'UniformAndQuantiles', 'GreedyLogSum', 'MaxLogSum', 'MinEntropy'],
        ),
        'fold_permutation_block': trial.suggest_int('fold_permutation_block', 128, 256),
        'od_pval': trial.suggest_float('od_pval', 0.001, 0.1),
        'leaf_estimation_iterations': trial.suggest_int('leaf_estimation_iterations', 1, 16),
        'best_model_min_trees': trial.suggest_int('best_model_min_trees', 10, 100),
        'random_strength': trial.suggest_float('random_strength', 0.1, 1),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.1, 1.0),
        'penalties_coefficient': trial.suggest_float('penalties_coefficient', 0.1, 10),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 31, 256),
    }
    # Merge tuned and fixed parameters; a sampled value wins on a key clash.
    model_params = {**fixed_params, **sampled}

    feature_names = list(X.columns)  # every feature is declared categorical
    metric_columns = ('efs', 'efs_time', 'race_group')

    fold_scores = []
    for fit_rows, holdout_rows in skf.split(X, train['race_group']):
        X_fit, y_fit = X.iloc[fit_rows], y.iloc[fit_rows]
        X_holdout, y_holdout = X.iloc[holdout_rows], y.iloc[holdout_rows]

        # NOTE(review): the held-out fold is also passed as eval_set, so
        # anything CatBoost decides from the eval set (e.g. the overfitting
        # detector configured in fixed_params) sees the rows scored below.
        model = CatBoostRegressor(cat_features=feature_names, **model_params)
        model.fit(X_fit, y_fit, eval_set=[(X_holdout, y_holdout)], verbose=False)

        # score_ receives two positionally aligned frames: the outcome and
        # group columns of the held-out rows, and the model predictions.
        holdout_frame = train.iloc[holdout_rows]
        truth_frame = pd.DataFrame(
            {name: holdout_frame[name].to_list() for name in metric_columns}
        )
        prediction_frame = pd.DataFrame(model.predict(X_holdout), columns=['prediction'])

        fold_scores.append(score_(truth_frame, prediction_frame))

    return np.mean(fold_scores)

In [49]:
# Number of Optuna trials to run; raise it for a more thorough search.
N_TRIALS = 10

# The objective returns a score to maximise. The sampler is seeded from
# CONFIG['SEED'] (instead of a second hard-coded 42) so the search and the
# cross-validation splitter share a single configurable seed.
study_cat = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=CONFIG['SEED']),
)
study_cat.optimize(optimize_cat, n_trials=N_TRIALS)

[I 2024-12-29 16:25:06,184] A new study created in memory with name: no-name-d9c16c42-ce5b-4bb9-8161-600b5ed82383
[I 2024-12-29 16:27:43,104] Trial 0 finished with value: 0.6617160003925173 and parameters: {'iterations': 437, 'learning_rate': 0.07969454818643935, 'depth': 13, 'l2_leaf_reg': 0.1550991398759431, 'model_size_reg': 4, 'border_count': 147, 'feature_border_type': 'MinEntropy', 'fold_permutation_block': 235, 'od_pval': 0.022021571957149343, 'leaf_estimation_iterations': 3, 'best_model_min_trees': 26, 'random_strength': 0.373818018663584, 'bagging_temperature': 0.5722807884690141, 'penalties_coefficient': 4.376255684556946, 'min_data_in_leaf': 3, 'max_leaves': 169}. Best is trial 0 with value: 0.6617160003925173.
[I 2024-12-29 16:29:14,987] Trial 1 finished with value: 0.6468409428225212 and parameters: {'iterations': 225, 'learning_rate': 0.00383962929980417, 'depth': 8, 'l2_leaf_reg': 0.015577217702693031, 'model_size_reg': 20, 'border_count': 153, 'feature_border_type': 'Gr

In [50]:
# Best hyper-parameters found by the study.
best_params_cat = study_cat.best_params

# Start from the fixed parameters and overlay the tuned ones
# (a tuned value wins if a key appears in both).
final_params = dict(fixed_params)
final_params.update(best_params_cat)

In [51]:
# Refit on the full training set with the merged parameters; every column of
# X is declared categorical.
cat_feature_names = list(X.columns)
final_cat = CatBoostRegressor(**final_params, cat_features=cat_feature_names)
final_cat.fit(X, y)

<catboost.core.CatBoostRegressor at 0x1a8b9c91ff0>

In [52]:
# Persist the fitted model; the file name is prefixed with the folder-derived
# experiment identifier.
model_path = f"{number}_model.pkl"
joblib.dump(final_cat, model_path)

['2.1.1_model.pkl']

In [53]:
# Feature-importance scores of the fitted model (CatBoost's default importance type).
# NOTE(review): assumed to be ordered like X.columns; the table built from
# this array pairs the two positionally. Confirm.
cat_importance = final_cat.get_feature_importance()

In [54]:
# CatBoost: pair each feature name with its importance, most important first.
importance_table = pd.DataFrame({'Feature': X.columns, 'Importance': cat_importance})
cat_importance_df = importance_table.sort_values(by='Importance', ascending=False)

print("CatBoost Feature Importances:\n", cat_importance_df)

CatBoost Feature Importances:
                    Feature  Importance
0                dri_score   18.702584
26  conditioning_intensity   10.750125
45       comorbidity_score    8.981928
28                year_hct    6.490198
46         karnofsky_score    6.330510
25       cyto_score_detail    4.972361
13        prim_disease_hct    4.424496
2               cyto_score    3.086496
43               sex_match    2.508860
41              gvhd_proph    2.341421
35               donor_age    2.312849
30                 mrd_hct    2.270646
15              cmv_status    1.716342
12             pulm_severe    1.703385
39              age_at_hct    1.572447
6               tbi_status    1.220389
49           donor_related    1.196831
7               arrhythmia    1.188559
52                 cardiac    1.140521
36             prior_tumor    1.098597
33        hla_match_a_high    0.974048
31             in_vivo_tcd    0.944731
19              hla_nmdp_6    0.901585
3                 diabetes    0.8

In [55]:
# Display the importance table (sorted by importance, descending).
cat_importance_df

Unnamed: 0,Feature,Importance
0,dri_score,18.702584
26,conditioning_intensity,10.750125
45,comorbidity_score,8.981928
28,year_hct,6.490198
46,karnofsky_score,6.33051
25,cyto_score_detail,4.972361
13,prim_disease_hct,4.424496
2,cyto_score,3.086496
43,sex_match,2.50886
41,gvhd_proph,2.341421


In [56]:
# Number of rows per distinct year_hct value, most frequent first.
train['year_hct'].value_counts()

year_hct
2018    7336
2016    5049
2017    4830
2008    2544
2015    2243
2013    1871
2012    1571
2014    1098
2019     774
2011     599
2009     503
2010     378
2020       4
Name: count, dtype: int64