In [31]:
import os

# The experiment tag is the name of the directory the notebook is run from;
# it is later used as the prefix of the saved model file.
current_directory = os.getcwd()
folder_name = os.path.split(current_directory)[1]
number = folder_name

In [32]:
# Root folder that holds the competition data. It defaults to the original
# author's local checkout; set the CIBMTR_DATA_ROOT environment variable to run
# the notebook on another machine without editing this cell.
_DEFAULT_DATA_ROOT = 'C:/Users/Николай/PycharmProjects/CIBMTR/D.Data'
_DATA_ROOT = (os.environ.get('CIBMTR_DATA_ROOT') or _DEFAULT_DATA_ROOT).rstrip('/\\')

CONFIG = {
    # Directories. Each one ends with '/' because the training file is read as
    # data_main + train_path, with no separator added in between.
    'data_main': f'{_DATA_ROOT}/main/',
    'data_train_process': f'{_DATA_ROOT}/train_process/',
    'data_train_split': f'{_DATA_ROOT}/train_split/',
    # File names.
    'train_path': 'train.csv',
    'folds_path': 'v1.csv',

    'DEVICE': 'cuda',
    # Seed shared by the cross-validation splitter and the Optuna sampler.
    'SEED': 42,
}

In [33]:
from catboost import CatBoostRegressor
import optuna
import pandas as pd
from sklearn.model_selection import StratifiedKFold
import numpy as np
import joblib 
from metric import score_
from lifelines import KaplanMeierFitter

In [34]:
def transform_survival_probability(df, time_col='efs_time', event_col='efs'):
    """Map every row's observed time to a Kaplan-Meier survival estimate.

    A single `KaplanMeierFitter` is fitted on the whole frame, using
    `df[time_col]` as durations and `df[event_col]` as the event indicator,
    and its survival function is then evaluated at each row's own time.

    Args:
        df: Frame containing the duration and event columns.
        time_col: Name of the duration column.
        event_col: Name of the event-indicator column.

    Returns:
        The `.values` array of the survival function evaluated at
        `df[time_col]`, one entry per row of `df`.
    """
    durations = df[time_col]
    fitter = KaplanMeierFitter()
    fitter.fit(durations, df[event_col])
    return fitter.survival_function_at_times(durations).values

In [35]:
# Read the training table and replace every missing value with the string '-1'.
train = pd.read_csv(CONFIG['data_main'] + CONFIG['train_path']).fillna('-1')

# Regression target: the Kaplan-Meier survival estimate at each row's efs_time.
train["y"] = transform_survival_probability(train, time_col='efs_time', event_col='efs')

# Every column that is not a target, the row ID, one of the two age columns or
# race_group is handled as a categorical feature.
non_categorical = {'efs', 'efs_time', 'y', 'ID', 'donor_age', 'age_at_hct', 'race_group'}
cat_columns = [name for name in train.columns if name not in non_categorical]

# NOTE(review): train_one_hot is not referenced by any other cell shown here.
train_one_hot = pd.get_dummies(train[cat_columns], drop_first=True)

In [36]:
# Feature matrix: drop the targets, the row ID and race_group, cast every
# remaining column to str (cat_columns are later handed to CatBoost by name as
# categorical features), then cast the two age columns back to float.
X = (
    train
    .drop(columns=['y', 'efs', 'efs_time', 'ID', 'race_group'])
    .astype(str)
    .astype({'donor_age': float, 'age_at_hct': float})
)

# Target vector.
y = train['y']

In [37]:
# Cross-validation setup: shuffled, seeded stratified folds.
n_splits = 5
skf = StratifiedKFold(
    random_state=CONFIG['SEED'],
    shuffle=True,
    n_splits=n_splits,
)

In [38]:
# CatBoost settings that are not tuned: they are shared by every Optuna trial
# and by the final model.
fixed_params = dict(
    # Overfitting-detector type and patience.
    od_wait=50,
    od_type='IncToDec',
    leaf_estimation_method='Newton',
    verbose=False,
    eval_metric='RMSE',
    allow_writing_files=False,
    # Train on GPU device 0.
    task_type='GPU',
    devices='0',
    score_function='NewtonL2',
    leaf_estimation_backtracking='Armijo',
    boost_from_average=False,
    grow_policy='Lossguide',
)


In [39]:
# Optuna objective for tuning CatBoost
def optimize_cat(trial):
    """Return the mean cross-validated `score_` for one sampled parameter set.

    The hyper-parameters drawn from `trial` are layered over the module-level
    `fixed_params`. For every split produced by `skf` (stratified on
    `train['race_group']`) a `CatBoostRegressor` is fitted on the training rows
    with the held-out rows passed as `eval_set`; the held-out predictions are
    then scored with `score_` against the rows' `efs`, `efs_time` and
    `race_group`.

    Args:
        trial: Optuna trial used to draw the hyper-parameters.

    Returns:
        The mean of the per-fold scores.

    NOTE(review): the fold that is scored is also the `eval_set`, so any
    early stopping or best-iteration selection CatBoost performs on it would
    make the reported score optimistic - confirm this is intended.
    """
    sampled = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
        'depth': trial.suggest_int('depth', 4, 16),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-5, 100, log=True),
        'model_size_reg': trial.suggest_int('model_size_reg', 0, 25),
        'border_count': trial.suggest_int('border_count', 128, 255),
        'feature_border_type': trial.suggest_categorical(
            'feature_border_type',
            ['Median', 'Uniform', 'UniformAndQuantiles', 'GreedyLogSum', 'MaxLogSum', 'MinEntropy'],
        ),
        'fold_permutation_block': trial.suggest_int('fold_permutation_block', 128, 256),
        'od_pval': trial.suggest_float('od_pval', 0.001, 0.1),
        'leaf_estimation_iterations': trial.suggest_int('leaf_estimation_iterations', 1, 16),
        'best_model_min_trees': trial.suggest_int('best_model_min_trees', 10, 100),
        'random_strength': trial.suggest_float('random_strength', 0.1, 1),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.1, 1.0),
        'penalties_coefficient': trial.suggest_float('penalties_coefficient', 0.1, 10),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 10),
        'max_leaves': trial.suggest_int('max_leaves', 31, 256),
    }
    # Sampled values are applied on top of the fixed settings.
    model_kwargs = {**fixed_params, **sampled}

    metric_columns = ('efs', 'efs_time', 'race_group')
    fold_scores = []

    for fit_rows, holdout_rows in skf.split(X, train['race_group']):
        X_fit, y_fit = X.iloc[fit_rows], y.iloc[fit_rows]
        X_holdout, y_holdout = X.iloc[holdout_rows], y.iloc[holdout_rows]

        regressor = CatBoostRegressor(cat_features=cat_columns, **model_kwargs)
        regressor.fit(X_fit, y_fit, eval_set=[(X_holdout, y_holdout)], verbose=False)

        # Ground truth for the metric, rebuilt from plain lists so that it has
        # a fresh 0..n-1 index matching the prediction frame.
        holdout = train.iloc[holdout_rows]
        solution = pd.DataFrame({name: holdout[name].to_list() for name in metric_columns})
        submission = pd.DataFrame({'prediction': regressor.predict(X_holdout)})

        fold_scores.append(score_(solution, submission))

    return np.mean(fold_scores)

In [40]:
# Maximise the cross-validated score with a seeded TPE sampler over 10 trials.
tpe_sampler = optuna.samplers.TPESampler(seed=CONFIG['SEED'])
study_cat = optuna.create_study(sampler=tpe_sampler, direction='maximize')
study_cat.optimize(optimize_cat, n_trials=10)

[I 2024-12-29 19:30:59,106] A new study created in memory with name: no-name-531467fa-a7e2-476a-9b6e-303af4e98dbf
[I 2024-12-29 19:33:02,537] Trial 0 finished with value: 0.6624631553801115 and parameters: {'iterations': 437, 'learning_rate': 0.07969454818643935, 'depth': 13, 'l2_leaf_reg': 0.1550991398759431, 'model_size_reg': 4, 'border_count': 147, 'feature_border_type': 'MinEntropy', 'fold_permutation_block': 235, 'od_pval': 0.022021571957149343, 'leaf_estimation_iterations': 3, 'best_model_min_trees': 26, 'random_strength': 0.373818018663584, 'bagging_temperature': 0.5722807884690141, 'penalties_coefficient': 4.376255684556946, 'min_data_in_leaf': 3, 'max_leaves': 169}. Best is trial 0 with value: 0.6624631553801115.
[I 2024-12-29 19:34:15,035] Trial 1 finished with value: 0.6473148718129715 and parameters: {'iterations': 225, 'learning_rate': 0.00383962929980417, 'depth': 8, 'l2_leaf_reg': 0.015577217702693031, 'model_size_reg': 20, 'border_count': 153, 'feature_border_type': 'Gr

In [41]:
# Best hyper-parameters found by the study.
best_params_cat = study_cat.best_params

# Start from the fixed settings and let the tuned values override them.
final_params = dict(fixed_params)
final_params.update(best_params_cat)

In [44]:
# Refit a single model on the full training set with the merged parameters.
final_cat = CatBoostRegressor(**final_params, cat_features=cat_columns)
final_cat.fit(X, y=y)

<catboost.core.CatBoostRegressor at 0x201bc4c0bb0>

In [45]:
# Persist the fitted model as "<number>_model.pkl" (relative to the working directory).
joblib.dump(final_cat, "{}_model.pkl".format(number))

['2.1.2_model.pkl']

In [46]:
# Per-feature importance scores of the final model, using the default
# importance type of get_feature_importance().
cat_importance = final_cat.get_feature_importance()

In [47]:
# Pair every feature name with its CatBoost importance and rank the features
# from most to least important.
cat_importance_df = (
    pd.DataFrame({'Feature': X.columns, 'Importance': cat_importance})
    .sort_values('Importance', ascending=False)
)

print("CatBoost Feature Importances:\n", cat_importance_df)

CatBoost Feature Importances:
                    Feature  Importance
0                dri_score   17.876582
26  conditioning_intensity   10.323241
45       comorbidity_score    8.476335
28                year_hct    6.233181
46         karnofsky_score    5.863658
25       cyto_score_detail    4.725139
35               donor_age    4.149737
13        prim_disease_hct    4.051642
39              age_at_hct    3.415143
2               cyto_score    3.139226
43               sex_match    2.424936
30                 mrd_hct    2.299390
41              gvhd_proph    2.239761
12             pulm_severe    1.697423
15              cmv_status    1.678009
49           donor_related    1.232743
6               tbi_status    1.159096
7               arrhythmia    1.139254
36             prior_tumor    1.049504
52                 cardiac    1.022376
31             in_vivo_tcd    0.989153
33        hla_match_a_high    0.986772
3                 diabetes    0.919529
19              hla_nmdp_6    0.8

In [48]:
# Show the sorted importance table as the cell's rich output.
cat_importance_df

Unnamed: 0,Feature,Importance
0,dri_score,17.876582
26,conditioning_intensity,10.323241
45,comorbidity_score,8.476335
28,year_hct,6.233181
46,karnofsky_score,5.863658
25,cyto_score_detail,4.725139
35,donor_age,4.149737
13,prim_disease_hct,4.051642
39,age_at_hct,3.415143
2,cyto_score,3.139226
