In [1]:
import os
# The basename of the working directory serves as the experiment identifier:
# later cells attach it to the model and use it as the saved file's prefix.
current_directory = os.getcwd()
number = folder_name = os.path.split(current_directory)[-1]

In [2]:
# Notebook-wide settings.
CONFIG = {
    # Directory containing the input data. It can be overridden with the
    # CIBMTR_DATA_DIR environment variable so the notebook also runs on machines
    # other than the original author's; the previous hard-coded location stays
    # the default. os.path.join(..., '') guarantees a trailing separator because
    # the loader builds the file path by plain string concatenation.
    'data_main': os.path.join(
        os.environ.get(
            'CIBMTR_DATA_DIR',
            'C:/Users/Николай/PycharmProjects/CIBMTR/D.Data/main/',
        ),
        '',
    ),
    # File name of the training table, relative to 'data_main'.
    'train_path': 'train.csv',

    'DEVICE' : 'cuda',
    # Seed shared by the fold splitter and the model.
    'SEED' : 42,
}

In [3]:
from catboost import CatBoostRegressor
import optuna
import pandas as pd
from sklearn.model_selection import StratifiedKFold
import numpy as np
import joblib 
from metric import score__
from lifelines import KaplanMeierFitter

In [4]:
def transform_survival_probability(df, time_col='efs_time', event_col='efs'):
    """Return the Kaplan-Meier survival estimate at every row's own time.

    A single ``KaplanMeierFitter`` is fitted on the whole of ``df`` and then
    evaluated at each value of ``df[time_col]``.

    Parameters
    ----------
    df : table indexable by column name
        Must provide ``time_col`` and ``event_col``.
    time_col : str
        Column passed to the fitter as the durations.
    event_col : str
        Column passed to the fitter as the event-observed indicator.

    Returns
    -------
    The ``.values`` of the fitter's ``survival_function_at_times`` result,
    one entry per row of ``df``.
    """
    durations = df[time_col]
    observed = df[event_col]

    estimator = KaplanMeierFitter()
    estimator.fit(durations, observed)

    survival_at_times = estimator.survival_function_at_times(durations)
    return survival_at_times.values

In [5]:
# Read the training table from the configured directory + file name.
train_csv_path = "{}{}".format(CONFIG['data_main'], CONFIG['train_path'])
train = pd.read_csv(train_csv_path)

# Regression target: the Kaplan-Meier survival estimate at each row's efs_time.
train["y"] = transform_survival_probability(train, time_col='efs_time', event_col='efs')

# Columns that are NOT treated as categorical (target, labels, ID and the two
# age features); every remaining column of `train` goes into cat_columns.
num_columns = ['y', 'efs', 'efs_time', 'ID', 'age_at_hct', 'donor_age']
cat_columns = train.columns[~train.columns.isin(num_columns)].tolist()

In [6]:
# Feature matrix: every column except the target, the raw survival labels and the row ID.
X = train.drop(columns=['y', 'efs', 'efs_time', 'ID'])

for col in X.columns:
    if col in cat_columns:
        # Record the missing cells BEFORE casting to str. Casting first can turn NaN
        # into the literal string 'nan', after which fillna('-1') matches nothing and
        # the intended '-1' marker is never written.
        # NOTE(review): any inference code feeding the saved model must encode
        # missing categorical values with the same '-1' marker.
        missing = X[col].isna()
        X[col] = X[col].astype(str).where(~missing, '-1')
    elif col in num_columns:
        # Numeric features: impute with the most frequent value of the column.
        X[col] = X[col].fillna(X[col].mode()[0])

# Target produced by transform_survival_probability.
y = train['y']

In [7]:
# Shuffled, seeded stratified folds; the stratification labels are supplied
# when skf.split(...) is called.
n_splits = 5
skf = StratifiedKFold(
    n_splits=n_splits,
    random_state=CONFIG['SEED'],
    shuffle=True,
)

In [8]:
# Parameters shared by every CatBoostRegressor built in this notebook.
fixed_params = {
    'loss_function': 'RMSE',
    'random_seed': CONFIG['SEED'],
    # logging_level is a string level, not a boolean:
    # one of 'Silent', 'Verbose', 'Info', 'Debug'.
    'logging_level': 'Silent',
    'eval_metric': 'RMSE',
    'allow_writing_files': False,
    # Follow CONFIG['DEVICE'] instead of hard-coding the GPU; with the current
    # 'cuda' setting (or when the key is absent) this is still 'GPU'.
    'task_type': (
        'GPU'
        if str(CONFIG.get('DEVICE', 'cuda')).lower().startswith(('cuda', 'gpu'))
        else 'CPU'
    ),
}

In [9]:
# Merge the fixed model parameters with the tuned hyperparameters
# (no tuned hyperparameters are added here, so this is a shallow copy).
final_params = dict(fixed_params)

scores = []

# Folds are stratified by race_group.
# NOTE(review): the validation fold is also passed as eval_set. CatBoost's
# documented default enables use_best_model when an eval_set is given, so the
# tree count may be selected on the very rows being scored, which would make
# the CV score optimistic — confirm this is intended.
for train_idx, valid_idx in skf.split(X, train['race_group']):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_valid, y_valid = X.iloc[valid_idx], y.iloc[valid_idx]

    model = CatBoostRegressor(cat_features=cat_columns, **final_params)
    model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False)
    preds = model.predict(X_valid)

    # Assemble the two frames handed to the project metric: the ground-truth
    # columns of the validation rows and the fold predictions.
    valid_rows = train.iloc[valid_idx]
    solution = pd.DataFrame({
        'efs': valid_rows['efs'].to_list(),
        'efs_time': valid_rows['efs_time'].to_list(),
        'race_group': valid_rows['race_group'].to_list(),
    })
    submission = pd.DataFrame(preds, columns=['prediction'])

    fold_score = score__(solution, submission)
    scores.append(fold_score)

# Mean score of the model across folds.
mean_score = np.mean(scores)
print(f"Средний результат модели: {mean_score}")

Средний результат модели: 0.6774861684100353


In [10]:
# Refit on all rows with the shared parameters; no eval_set is used here.
final_cat = CatBoostRegressor(**fixed_params, cat_features=cat_columns)
final_cat.fit(X, y)

<catboost.core.CatBoostRegressor at 0x1b980023a00>

In [11]:
# Tag the fitted model with the experiment identifier taken from the working-directory name.
# NOTE(review): this is a plain instance attribute set just before joblib.dump; CatBoost
# models customise their pickling, so verify the attribute is still present after
# joblib.load — TODO confirm.
final_cat.model_number = number

In [12]:
# Persist the fitted model next to the notebook as "<experiment id>_model.pkl".
model_path = f"{number}_model.pkl"
joblib.dump(final_cat, model_path)

['2.1.11_model.pkl']

In [13]:
# Per-feature importance scores of the model fitted on the full data set
# (no `type` argument is passed, so CatBoost's default importance type applies).
cat_importance = final_cat.get_feature_importance()

In [14]:
# Pair each feature name with its importance and list the most important first.
importance_table = pd.DataFrame({'Feature': X.columns, 'Importance': cat_importance})
cat_importance_df = importance_table.sort_values('Importance', ascending=False)

print("CatBoost Feature Importances:\n", cat_importance_df)

CatBoost Feature Importances:
                    Feature  Importance
0                dri_score   12.150488
28                year_hct    8.264307
26  conditioning_intensity    8.146054
46       comorbidity_score    7.050975
47         karnofsky_score    5.613688
35               donor_age    4.475686
13        prim_disease_hct    4.400802
25       cyto_score_detail    4.315873
43               sex_match    3.359689
30                 mrd_hct    3.270023
2               cyto_score    2.922675
50           donor_related    2.517539
12             pulm_severe    2.159714
39              age_at_hct    2.140378
15              cmv_status    2.072842
41              gvhd_proph    1.829948
31             in_vivo_tcd    1.681116
53                 cardiac    1.592186
36             prior_tumor    1.578811
45              race_group    1.483207
54     hla_match_drb1_high    1.331656
33        hla_match_a_high    1.260799
19              hla_nmdp_6    1.135713
7               arrhythmia    1.0

In [15]:
# auto_params = ['per_float_feature_quantization', 'input_borders', 'output_borders', 'fold_permutation_block', 'counter_calc_method', 'thread_count', 'best_model_min_trees', 'verbose', 'silent', 'metric_period', 'ctr_leaf_count_limit', 'store_all_simple_ctr', 'has_time', 'allow_const_label', 'target_border', 'random_score_type', 'name', 'ignored_features', 'train_dir', 'custom_metric', 'save_snapshot', 'snapshot_file', 'snapshot_interval', 'fold_len_multiplier', 'used_ram_limit', 'gpu_ram_part', 'pinned_memory_size', 'final_ctr_computation_mode', 'simple_ctr', 'combinations_ctr', 'per_feature_ctr', 'ctr_description', 'ctr_target_border_count', 'device_config', 'devices', 'subsample', 'mvs_reg', 'sampling_frequency', 'sampling_unit', 'subsampling_factor', 'dev_score_calc_obj_block_size', 'dev_efb_max_buckets', 'sparse_features_conflict_fraction', 'max_depth', 'n_estimators', 'num_boost_round', 'num_trees', 'colsample_bylevel', 'random_state', 'reg_lambda', 'reg_lambda', 'eta', 'max_bin', 'gpu_cat_features_storage', 'data_partition', 'metadata', 'min_data_in_leaf', 'min_child_samples', 'num_leaves', 'score_function', 'ctr_history_unit', 'monotone_constraints', 'feature_weights', 'penalties_coefficient', 'first_feature_use_penalties', 'per_object_feature_penalties', 'model_shrink_rate', 'model_shrink_mode', 'langevin', 'diffusion_temperature', 'posterior_sampling', 'text_features', 'tokenizers', 'dictionaries', 'feature_calcers', 'text_processing', 'embedding_features', 'eval_fraction', 'fixed_binary_splits', 'od_type', 'od_pval', 'od_wait', 'approx_on_full_history', 'boosting_type']