In [1]:
import sys
sys.path.append(r"../../utils")

import pandas as pd
from sklearn.model_selection import train_test_split

from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, precision_recall_curve, roc_curve, auc, average_precision_score

from questionnaires_aggregation import c_ssrs_aggregation
from utils import impute_from_column, simple_eda
from questions_columns import c_ssrs, sci_af_ca, mfq, sci_af_ca, sci_mother, scs_clin, siq, sdq, c_ssrs_intake, scared, ATHENS, SAS, c_ssrs_clin, demographics_m
from xgboost import XGBClassifier, XGBRegressor

In [2]:
# Splitting the data into train, validation, and test sets
train_ratio = 0.6
val_ratio = 0.2
test_ratio = 0.2
c_ssrs = c_ssrs[:-3]
target_col = 'suicidal_ideation_time2'

sci_cols = ['is_sci_af_ca_missing', 'sci_af_ca_Factor1', 'sci_af_ca_Factor2',
             'sci_af_ca_Factor3', 'sci_af_ca_Factor4', 
             'sci_af_ca_Factor5', 'gender', 'age_child_pre'] + sci_af_ca + mfq + sci_mother + scs_clin+ siq + sdq + c_ssrs_intake + scared + ATHENS + SAS + c_ssrs_clin# + demographics_m

In [3]:
df = pd.read_csv(r"../create_dataset/data_for_prediction_task - measurement time axis.csv")
df_intake = df[df.measurement == 'time1']
df_target = df[df.measurement == 'time2']

In [4]:
df_target = df_target[[target_col, 'id']]
df_intake = df_intake.dropna(subset=sci_af_ca, thresh=20)
df_intake = df_intake[sci_cols + ['id']]
data_for_prediction = pd.merge(df_intake, df_target, on='id', how='inner')


In [5]:
label = data_for_prediction[target_col]
features = data_for_prediction[sci_cols]

In [6]:

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=test_ratio, random_state=42, stratify=label)

# Splitting the training set into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=val_ratio/(train_ratio+val_ratio), random_state=42, stratify=y_train)

In [7]:
# Create a CatBoostClassifier object with early stopping based on the validation set
#eval_set  = [(X_val, y_val)]
model = CatBoostClassifier(random_state=420, reg_lambda=10, n_estimators = 20, max_depth= 2)

# Fit the model on the training set
model.fit(X_train, y_train, early_stopping_rounds = 5, verbose=0)#, use_best_model=True)

# Evaluate the model on the training and validation sets

# Make predictions on the training and validation sets
y_train_pred = model.predict(X_train)
y_train_prob = model.predict_proba(X_train)[:, 1]
y_val_pred = model.predict(X_val)
y_val_prob = model.predict_proba(X_val)[:, 1]


y_test_pred = model.predict(X_test)
y_test_prob = model.predict_proba(X_test)[:, 1]


In [8]:
# Function to calculate and print various performance metrics

def print_performance_metrics(y_true, y_pred, y_prob):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    roc_auc = roc_auc_score(y_true, y_prob)
    average_precision = average_precision_score(y_true, y_prob)
    
    print(f'Accuracy: {accuracy:.4f}')
    print(f'Precision: {precision:.4f}')
    print(f'Recall: {recall:.4f}')
    print(f'ROC AUC: {roc_auc:.4f}')
    print(f'PR AUC: {average_precision:.4f}')


In [9]:
# Print the performance metrics for the training set
print('Training set performance:')
print_performance_metrics(y_train, y_train_pred, y_train_prob)



Training set performance:
Accuracy: 0.8036
Precision: 0.7848
Recall: 0.9254
ROC AUC: 0.8726
PR AUC: 0.9142


In [10]:
# Print the performance metrics for the validation set
print('Validation set performance:')
print_performance_metrics(y_val, y_val_pred, y_val_prob)


Validation set performance:
Accuracy: 0.7105
Precision: 0.7000
Recall: 0.9130
ROC AUC: 0.7304
PR AUC: 0.8142


In [11]:
# Print the performance metrics for the validation set
print('Test set performance:')
print_performance_metrics(y_test, y_test_pred, y_test_prob)


Test set performance:
Accuracy: 0.7368
Precision: 0.7407
Recall: 0.8696
ROC AUC: 0.7449
PR AUC: 0.7811


In [32]:

# Get feature importance scores
feature_importance = model.get_feature_importance()
feature_names = X_train.columns.tolist()
# Print feature importance scores
print('Feature importance scores:')
for feature_name, score in sorted(zip(feature_names, feature_importance), reverse=1, key = lambda x: x[1]):
    print(f'{feature_name}: {score:.4f}')

Feature importance scores:
c_ssrs_7_intake: 5.3776
scared_21: 4.4480
scared_6: 4.0433
scared_28: 3.7818
siq_4: 3.6380
sci_p_8_m: 3.6259
c_ssrs_6_life: 3.3797
c_ssrs_t_14_2weeks_clin: 3.2001
scs_2_2_clin: 2.5458
sdq_2: 2.5186
mfq_28: 2.4626
c_ssrs_3_life: 2.4613
mfq_14: 2.3781
c_ssrs_2_2weeks: 2.3475
sci_af_ca_40: 2.1486
sci_af_ca_20: 2.0486
c_ssrs_t_time_clin: 1.9112
sdq_20: 1.8456
scared_20: 1.8220
c_ssrs_5_life: 1.7474
sci_af_ca_19: 1.7008
sdq_1: 1.6759
sas_14: 1.6629
sas_9: 1.6145
siq_2: 1.5590
sci_af_ca_13: 1.5134
mfq_27: 1.4924
sci_c_4_2_clin: 1.4670
sci_af_ca_9: 1.4627
c_ssrs_t_13_2weeks_clin: 1.4435
sci_af_ca_5: 1.3773
scs_2_4_clin: 1.3515
sci_af_ca_31: 1.3474
sdq_17: 1.3062
mfq_8: 1.2631
sdq_6: 1.2403
sci_af_ca_2: 1.1415
sas_2: 1.1193
sci_af_ca_39: 1.0984
scared_8: 1.0956
sci_c_5_1_clin: 1.0767
scared_18: 1.0509
c_ssrs_t_life_3_clin: 1.0019
c_ssrs_t_10_clin: 0.9897
scared_22: 0.9796
c_ssrs_t_2weeks_5_clin: 0.9193
siq_1: 0.8066
sci_p_1_m: 0.7907
sas_13: 0.7854
c_ssrs_8_intake: 0

cool, 
use the train-validation-test variables that you created in the last snippet, 
and train CatBoostClassifier, with early_stopping parameter based on the validation set.

after that look at the performance scores on the training & validation sets.
and the feature-importance scores

Since it's a complicated task, we need to use the following metrics: accuracy, precision, recall, precision-recall-auc, roc_auc