In [1]:
import os
# The name of the working directory doubles as the experiment identifier:
# `number` is later attached to the fitted model and used in the pickle name.
current_directory = os.getcwd()
number = folder_name = os.path.split(current_directory)[1]

In [2]:
# Notebook-wide settings: where the training CSV lives, plus device and seed.
CONFIG = dict(
    # Directory prefix (with trailing slash) and file name; they are
    # concatenated as-is when the CSV is read.
    data_main='C:/Users/Николай/PycharmProjects/CIBMTR/D.Data/main/',
    train_path='train.csv',

    DEVICE='cuda',
    # Shared seed for the CV splitter, the Optuna sampler and CatBoost.
    SEED=42,
)

In [3]:
from catboost import CatBoostRegressor
import optuna
import pandas as pd
from sklearn.model_selection import StratifiedKFold
import numpy as np
import joblib 
from metric import score_
from lifelines import KaplanMeierFitter

In [4]:
def transform_survival_probability(df, time_col='efs_time', event_col='efs'):
    """Return the Kaplan-Meier survival estimate at each row's own time.

    A single ``KaplanMeierFitter`` is fitted on every row of ``df`` and then
    evaluated at the same durations it was fitted on.

    Parameters
    ----------
    df : DataFrame holding the duration and event columns.
    time_col : name of the duration column passed to the fitter.
    event_col : name of the event column passed as the fitter's second argument.

    Returns
    -------
    The ``.values`` of ``survival_function_at_times`` evaluated at
    ``df[time_col]``, i.e. one estimate per row in the order of ``df``.
    """
    durations = df[time_col]
    events = df[event_col]

    estimator = KaplanMeierFitter()
    estimator.fit(durations, events)

    return estimator.survival_function_at_times(durations).values

In [5]:
# Load the training table; the path is the directory prefix and the file name
# glued together without any separator handling.
train = pd.read_csv(CONFIG['data_main'] + CONFIG['train_path'])

# Regression target: Kaplan-Meier survival estimate at each row's efs_time.
train["y"] = transform_survival_probability(train, time_col='efs_time', event_col='efs')

# Split the columns (including the freshly added 'y') by dtype in one pass.
numeric_flags = {col: pd.api.types.is_numeric_dtype(train[col]) for col in train.columns}
num_columns = [col for col, is_num in numeric_flags.items() if is_num]
cat_columns = [col for col, is_num in numeric_flags.items() if not is_num]

In [6]:
# Quick sanity check: (number of numeric columns, number of categorical columns).
tuple(len(cols) for cols in (num_columns, cat_columns))

(26, 35)

In [7]:
# Feature matrix: everything except the target, the raw survival labels and the row ID.
X = train.drop(columns=['y', 'efs', 'efs_time', 'ID'])

for col in X.columns:
    if col in cat_columns:
        # Fill missing values BEFORE casting to str. Casting first turns NaN
        # into the literal string 'nan', after which fillna('-1') has nothing
        # left to fill and the '-1' placeholder is never applied.
        # NOTE(review): any inference-time preprocessing must encode missing
        # categories the same way ('-1') to match the trained model.
        X[col] = X[col].fillna('-1').astype(str)
    elif col in num_columns:
        # Numeric gaps are imputed with the most frequent value of the column
        # (CatBoost is configured with nan_mode='Forbidden', so none may remain).
        X[col] = X[col].fillna(X[col].mode()[0])

# Target: the Kaplan-Meier based value computed when the data was loaded.
y = train['y']

In [8]:
# Shuffled, seeded stratified K-fold splitter reused by every Optuna trial.
n_splits = 5
skf = StratifiedKFold(
    random_state=CONFIG['SEED'],
    shuffle=True,
    n_splits=n_splits,
)

In [9]:
# Fixed (non-tuned) CatBoost parameters; merged with the tuned ones for every
# Optuna trial and for the final fit.
fixed_params = {
    # --- reproducibility / runtime ---
    'random_seed': CONFIG['SEED'],
    'task_type': 'GPU',
    'logging_level': 'Silent',  # CatBoost levels: 'Silent', 'Verbose', 'Info', 'Debug'
    'allow_writing_files': False,

    # --- objective / evaluation ---
    'loss_function': 'RMSE',  # alias: 'objective'
    'eval_metric': 'RMSE',
    'boost_from_average': True,
    'use_best_model': False,  # keep all built trees instead of truncating at the best eval iteration
    'early_stopping_rounds': 250,

    # --- tree structure ---
    'grow_policy': 'Lossguide',  # one of 'SymmetricTree', 'Depthwise', 'Lossguide'
    'score_function': 'NewtonL2',  # one of 'Cosine', 'L2', 'NewtonCosine', 'NewtonL2'
    'rsm': 1,  # alias: 'colsample_bylevel'; 1 means every feature is available at each split

    # --- leaf value estimation ---
    'leaf_estimation_method': 'Newton',  # one of 'Newton', 'Gradient', 'Exact'
    'leaf_estimation_backtracking': 'AnyImprovement',

    # --- regularisation / randomness ---
    'model_size_reg': 0,
    'random_strength': 0,
    'bootstrap_type': 'Bayesian',  # the tuned 'bagging_temperature' applies to this bootstrap type

    # --- feature handling ---
    'feature_border_type': 'GreedyLogSum',
    'nan_mode': 'Forbidden',  # one of 'Forbidden', 'Min', 'Max'; numeric NaNs are imputed beforehand
    'max_ctr_complexity': 15,
    'one_hot_max_size': 50,
}


In [10]:
def optimize_cat(trial):
    """Optuna objective: mean cross-validated ``score_`` of a CatBoost regressor.

    Samples the tunable hyper-parameters from ``trial``, overlays them on the
    module-level ``fixed_params`` and fits one model per fold of ``skf``
    (stratified on ``train['race_group']``). Each fold is scored with
    ``score_`` on the held-out rows.

    Parameters
    ----------
    trial : optuna trial used to draw the hyper-parameter values.

    Returns
    -------
    The mean of the per-fold scores.
    """
    # The order of the suggest_* calls is kept stable so that a seeded sampler
    # draws the same values for the same parameters.
    tuned = {
        'iterations': trial.suggest_int('iterations', 100, 2000),  # aliases: num_boost_round, n_estimators, num_trees
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.01),  # alias: eta
        'depth': trial.suggest_int('depth', 6, 16),  # alias: max_depth
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-5, 10),  # alias: reg_lambda
        'border_count': trial.suggest_int('border_count', 256, 1024),  # alias: max_bin
        'leaf_estimation_iterations': trial.suggest_int('leaf_estimation_iterations', 1, 6),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.01, 1),
        'max_leaves': trial.suggest_int('max_leaves', 31, 42),  # alias: num_leaves
    }
    # Tuned values take precedence over the fixed ones on any key clash.
    candidate = {**fixed_params, **tuned}

    fold_scores = []
    for fit_rows, holdout_rows in skf.split(X, train['race_group']):
        X_fit, X_holdout = X.iloc[fit_rows], X.iloc[holdout_rows]
        y_fit, y_holdout = y.iloc[fit_rows], y.iloc[holdout_rows]

        booster = CatBoostRegressor(cat_features=cat_columns, **candidate)
        booster.fit(X_fit, y_fit, eval_set=[(X_holdout, y_holdout)], verbose=False)

        # Both frames get a fresh 0..n-1 index, so the labels and the
        # predictions line up row by row.
        holdout = train.iloc[holdout_rows]
        solution = pd.DataFrame({
            'efs': holdout['efs'].to_list(),
            'efs_time': holdout['efs_time'].to_list(),
            'race_group': holdout['race_group'].to_list(),
        })
        submission = pd.DataFrame(booster.predict(X_holdout), columns=['prediction'])

        fold_scores.append(score_(solution, submission))

    return np.mean(fold_scores)

In [11]:
# Maximise the cross-validated score with a seeded TPE sampler, for 43 trials.
study_cat = optuna.create_study(
    sampler=optuna.samplers.TPESampler(seed=CONFIG['SEED']),
    direction='maximize',
)
study_cat.optimize(optimize_cat, n_trials=43)

[I 2024-12-30 19:19:18,218] A new study created in memory with name: no-name-eecaabcf-35e3-4291-91ee-8cf0043803df
[I 2024-12-30 19:20:16,844] Trial 0 finished with value: 0.6669471372775718 and parameters: {'iterations': 812, 'learning_rate': 0.009556428757689247, 'depth': 14, 'l2_leaf_reg': 5.9865888553855235, 'border_count': 375, 'leaf_estimation_iterations': 1, 'bagging_temperature': 0.06750277604651747, 'max_leaves': 41}. Best is trial 0 with value: 0.6669471372775718.
[I 2024-12-30 19:21:26,751] Trial 1 finished with value: 0.6660540539677214 and parameters: {'iterations': 1242, 'learning_rate': 0.00737265320016441, 'depth': 6, 'l2_leaf_reg': 9.699098822521421, 'border_count': 896, 'leaf_estimation_iterations': 2, 'bagging_temperature': 0.19000671753502962, 'max_leaves': 33}. Best is trial 0 with value: 0.6669471372775718.
[I 2024-12-30 19:22:15,891] Trial 2 finished with value: 0.6623923146879054 and parameters: {'iterations': 678, 'learning_rate': 0.005722807884690141, 'depth': 

In [12]:
# Overlay the best trial's tuned values on top of the fixed parameters.
best_params_cat = study_cat.best_params
final_params = dict(fixed_params)
final_params.update(best_params_cat)

In [13]:
# Refit on the full training set (no eval_set is passed here) with the best parameters.
final_cat = CatBoostRegressor(
    **final_params,
    cat_features=cat_columns,
)
final_cat.fit(X, y)

<catboost.core.CatBoostRegressor at 0x2de6b633cd0>

In [14]:
# Tag the fitted model with the folder-derived identifier before it is pickled.
setattr(final_cat, 'model_number', number)

In [15]:
# Persist the fitted model in the working directory as "<number>_model.pkl".
joblib.dump(final_cat, "{}_model.pkl".format(number))

['2.1.5_model.pkl']

In [16]:
# Feature importances of the final model, using get_feature_importance()'s
# default arguments; paired with X.columns in the next cell.
cat_importance = final_cat.get_feature_importance()

In [17]:
# Pair each feature name with its importance, then rank from most to least important.
cat_importance_df = pd.DataFrame(
    {'Feature': X.columns, 'Importance': cat_importance}
)
cat_importance_df = cat_importance_df.sort_values(by='Importance', ascending=False)

print("CatBoost Feature Importances:\n", cat_importance_df)

CatBoost Feature Importances:
                    Feature  Importance
0                dri_score   15.890126
26  conditioning_intensity    9.393092
46       comorbidity_score    7.658931
28                year_hct    6.191124
35               donor_age    6.049884
39              age_at_hct    5.701769
47         karnofsky_score    4.503757
25       cyto_score_detail    3.973986
43               sex_match    2.904849
13        prim_disease_hct    2.833176
2               cyto_score    2.831998
30                 mrd_hct    2.374350
41              gvhd_proph    2.309265
45              race_group    1.884209
15              cmv_status    1.877941
12             pulm_severe    1.746401
50           donor_related    1.322459
53                 cardiac    1.293587
36             prior_tumor    1.191347
7               arrhythmia    1.148370
6               tbi_status    1.116255
31             in_vivo_tcd    1.023788
19              hla_nmdp_6    0.989357
3                 diabetes    0.9

In [None]:
# auto_params = ['per_float_feature_quantization', 'input_borders', 'output_borders', 'fold_permutation_block', 'counter_calc_method', 'thread_count', 'best_model_min_trees', 'verbose', 'silent', 'metric_period', 'ctr_leaf_count_limit', 'store_all_simple_ctr', 'has_time', 'allow_const_label', 'target_border', 'random_score_type', 'name', 'ignored_features', 'train_dir', 'custom_metric', 'save_snapshot', 'snapshot_file', 'snapshot_interval', 'fold_len_multiplier', 'used_ram_limit', 'gpu_ram_part', 'pinned_memory_size', 'final_ctr_computation_mode', 'simple_ctr', 'combinations_ctr', 'per_feature_ctr', 'ctr_description', 'ctr_target_border_count', 'device_config', 'devices', 'subsample', 'mvs_reg', 'sampling_frequency', 'sampling_unit', 'subsampling_factor', 'dev_score_calc_obj_block_size', 'dev_efb_max_buckets', 'sparse_features_conflict_fraction', 'max_depth', 'n_estimators', 'num_boost_round', 'num_trees', 'colsample_bylevel', 'random_state', 'reg_lambda', 'reg_lambda', 'eta', 'max_bin', 'gpu_cat_features_storage', 'data_partition', 'metadata', 'min_data_in_leaf', 'min_child_samples', 'num_leaves', 'score_function', 'ctr_history_unit', 'monotone_constraints', 'feature_weights', 'penalties_coefficient', 'first_feature_use_penalties', 'per_object_feature_penalties', 'model_shrink_rate', 'model_shrink_mode', 'langevin', 'diffusion_temperature', 'posterior_sampling', 'text_features', 'tokenizers', 'dictionaries', 'feature_calcers', 'text_processing', 'embedding_features', 'eval_fraction', 'fixed_binary_splits', 'od_type', 'od_pval', 'od_wait', 'approx_on_full_history', 'boosting_type']