In [1]:
import os
# The experiment identifier is the name of the directory the notebook is run
# from; it is used further down as the prefix of the saved model file.
current_directory = os.getcwd()
folder_name = os.path.split(current_directory)[1]
number = folder_name

In [2]:
# Notebook-wide settings.
# NOTE(review): 'data_main' is an absolute path on one specific machine; anyone
# else running this notebook has to edit it by hand.
CONFIG = dict(
    # Directory holding the input data (must end with a separator: the file
    # name is appended to it by plain string concatenation).
    data_main='C:/Users/Николай/PycharmProjects/CIBMTR/D.Data/main/',
    # File name of the training table inside 'data_main'.
    train_path='train.csv',

    DEVICE='cuda',
    # Seed shared by the CV splitter and the model.
    SEED=42,
)

In [3]:
from catboost import CatBoostRegressor
import pandas as pd
from sklearn.model_selection import StratifiedKFold
import numpy as np
import joblib 
from metric import score__
from lifelines import KaplanMeierFitter

In [4]:
# The CSV location is the data directory and the file name joined as-is
# (no separator is inserted between them).
train_csv_path = CONFIG['data_main'] + CONFIG['train_path']
train = pd.read_csv(train_csv_path)
def transform_survival_probability(df, time_col='efs_time', event_col='efs'):
    """Evaluate a Kaplan-Meier survival curve at every row's own time.

    A ``KaplanMeierFitter`` is fitted on ``df[time_col]`` with
    ``df[event_col]`` passed as ``event_observed``; the fitted survival
    function is then queried at the values of ``df[time_col]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``time_col`` and ``event_col``.
    time_col : str
        Name of the duration column.
    event_col : str
        Name of the event-indicator column.

    Returns
    -------
    numpy.ndarray
        Flattened values of the queried survival function. The caller in this
        file assigns the array positionally to the rows of ``df``.
    """
    durations = df[time_col]
    observed = df[event_col]

    fitter = KaplanMeierFitter()
    fitter.fit(durations, event_observed=observed)

    survival_at_row_times = fitter.survival_function_at_times(durations)
    return survival_at_row_times.values.flatten()

def update_target_with_survival_probabilities(df, time_col='efs_time', event_col='efs',
                                              group_col='race_group', censored_penalty=0.15):
    """Write a ``target`` column of per-group Kaplan-Meier survival probabilities.

    For every distinct value of ``group_col`` (processed in sorted order) a
    separate survival curve is fitted on that group's rows through
    ``transform_survival_probability``, and the resulting probabilities are
    stored in ``target`` for exactly those rows. Afterwards every row whose
    ``event_col`` equals 0 has ``censored_penalty`` subtracted from its target.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``time_col``, ``event_col`` and ``group_col``. The frame
        is modified in place (the ``target`` column is created/overwritten).
    time_col : str
        Name of the duration column.
    event_col : str
        Name of the event-indicator column.
    group_col : str
        Column whose values define the groups fitted separately. Defaults to
        ``'race_group'``, the column that used to be hard-coded.
    censored_penalty : float
        Amount subtracted from ``target`` for rows with ``event_col == 0``.
        Defaults to ``0.15``, the value that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    for group in sorted(df[group_col].unique()):
        in_group = df[group_col] == group
        # The helper returns one value per row of the subset, so the array can
        # be assigned positionally to the rows selected by the same mask.
        df.loc[in_group, 'target'] = transform_survival_probability(df[in_group], time_col, event_col)

    # Shift rows with event indicator 0 downwards relative to rows with an event.
    df.loc[df[event_col] == 0, 'target'] -= censored_penalty

    return df

# Build the regression target, expose it under the name 'y' and discard the
# row identifier, which is not a feature.
train = (
    update_target_with_survival_probabilities(train, time_col='efs_time', event_col='efs')
    .rename(columns={'target': 'y'})
    .drop(columns=['ID'])
)
    

# Target-related and numeric columns; every other column is handled as categorical.
num_columns = ['y', 'efs', 'efs_time', 'age_at_hct', 'donor_age']
cat_columns = [col for col in train.columns if col not in num_columns]

# Missing values in the two numeric features are replaced by the column's
# most frequent value.
for col in ['age_at_hct', 'donor_age']:
    train[col] = train[col].fillna(train[col].mode()[0])

# Categorical values are converted to strings (missing values become the
# literal 'nan') and stripped of the characters below. These values end up
# inside the one-hot column names created by get_dummies.
j_ch = ',[]{}:"\\<'  # set of characters to remove
strip_table = str.maketrans('', '', j_ch)
for col in cat_columns:
    # One translate pass removes all listed characters at once, instead of one
    # Python-level apply per character.
    train[col] = train[col].astype(str).str.translate(strip_table)

train_one_hot = pd.get_dummies(train[cat_columns])
# From here on cat_columns holds the one-hot column names, not the raw ones.
cat_columns = list(train_one_hot.columns)
train_one_hot = pd.concat([train_one_hot, train[['age_at_hct', 'donor_age']]], axis=1)

In [5]:
# Feature matrix (one-hot columns plus the two numeric features) and target.
X, y = train_one_hot, train['y']

In [6]:
# Shuffled, seeded stratified splitter used for cross-validation.
n_splits = 5
skf = StratifiedKFold(
    n_splits=n_splits,
    random_state=CONFIG['SEED'],
    shuffle=True,
)

In [7]:
# CatBoost parameters that stay the same for every fit in this notebook.
# NOTE(review): task_type is hard-coded to 'GPU' and is not derived from
# CONFIG['DEVICE'] — confirm that this is intended.
fixed_params = dict(
    loss_function='RMSE',
    random_seed=CONFIG['SEED'],
    logging_level='Silent',
    eval_metric='RMSE',
    allow_writing_files=False,
    task_type='GPU',
)

In [8]:
# Merge the fixed model parameters with the hyperparameters
# (there are no extra hyperparameters here, so this is a plain copy).
final_params = dict(fixed_params)

scores = []

# Folds are stratified by race group, not by the regression target.
for train_idx, valid_idx in skf.split(X, train['race_group']):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_valid, y_valid = X.iloc[valid_idx], y.iloc[valid_idx]

    model = CatBoostRegressor(cat_features=cat_columns, **final_params)
    model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False)

    preds = model.predict(X_valid)

    # The metric receives two freshly indexed frames: the ground-truth
    # columns of the validation rows and the predictions for them.
    valid_rows = train.iloc[valid_idx]
    solution = pd.DataFrame(
        {name: valid_rows[name].to_list() for name in ('efs', 'efs_time', 'race_group')}
    )
    submission = pd.DataFrame(preds, columns=['prediction'])

    fold_score = score__(solution, submission)
    scores.append(fold_score)

# Mean score of the model across folds
mean_score = np.mean(scores)
print(f"Средний результат модели: {mean_score}")

Средний результат модели: 0.6832094177753787


In [9]:
# Final model: same fixed parameters as in cross-validation, fitted on the
# whole training set without an evaluation set.
final_cat = CatBoostRegressor(**fixed_params, cat_features=cat_columns)
final_cat.fit(X, y)

<catboost.core.CatBoostRegressor at 0x25f11c84670>

In [10]:
# Persist the fitted model next to the notebook; the file name is prefixed
# with the experiment identifier taken from the folder name.
model_path = f"{number}_model.pkl"
joblib.dump(final_cat, model_path)

['2.1.14_model.pkl']

In [11]:
# Per-feature importances of the final model. No arguments are passed, so
# CatBoost's default importance type is used.
cat_importance = final_cat.get_feature_importance()

In [12]:
# Pair every feature name with its importance and list the most important first.
cat_importance_df = (
    pd.DataFrame({'Feature': X.columns, 'Importance': cat_importance})
    .sort_values(by='Importance', ascending=False)
)

print("CatBoost Feature Importances:\n", cat_importance_df)

CatBoost Feature Importances:
                         Feature  Importance
157  conditioning_intensity_nan    6.322368
316                   donor_age    4.854169
0                dri_score_High    4.773905
315                  age_at_hct    4.192219
148      cyto_score_detail_Poor    2.204659
..                          ...         ...
38           hla_high_res_8_8.0    0.000000
39           hla_high_res_8_nan    0.000000
141      hla_match_dqb1_low_2.0    0.000000
209            peptic_ulcer_nan    0.000000
79         prim_disease_hct_IMD    0.000000

[317 rows x 2 columns]


In [13]:
# auto_params = ['per_float_feature_quantization', 'input_borders', 'output_borders', 'fold_permutation_block', 'counter_calc_method', 'thread_count', 'best_model_min_trees', 'verbose', 'silent', 'metric_period', 'ctr_leaf_count_limit', 'store_all_simple_ctr', 'has_time', 'allow_const_label', 'target_border', 'random_score_type', 'name', 'ignored_features', 'train_dir', 'custom_metric', 'save_snapshot', 'snapshot_file', 'snapshot_interval', 'fold_len_multiplier', 'used_ram_limit', 'gpu_ram_part', 'pinned_memory_size', 'final_ctr_computation_mode', 'simple_ctr', 'combinations_ctr', 'per_feature_ctr', 'ctr_description', 'ctr_target_border_count', 'device_config', 'devices', 'subsample', 'mvs_reg', 'sampling_frequency', 'sampling_unit', 'subsampling_factor', 'dev_score_calc_obj_block_size', 'dev_efb_max_buckets', 'sparse_features_conflict_fraction', 'max_depth', 'n_estimators', 'num_boost_round', 'num_trees', 'colsample_bylevel', 'random_state', 'reg_lambda', 'reg_lambda', 'eta', 'max_bin', 'gpu_cat_features_storage', 'data_partition', 'metadata', 'min_data_in_leaf', 'min_child_samples', 'num_leaves', 'score_function', 'ctr_history_unit', 'monotone_constraints', 'feature_weights', 'penalties_coefficient', 'first_feature_use_penalties', 'per_object_feature_penalties', 'model_shrink_rate', 'model_shrink_mode', 'langevin', 'diffusion_temperature', 'posterior_sampling', 'text_features', 'tokenizers', 'dictionaries', 'feature_calcers', 'text_processing', 'embedding_features', 'eval_fraction', 'fixed_binary_splits', 'od_type', 'od_pval', 'od_wait', 'approx_on_full_history', 'boosting_type']