https://www.ahajournals.org/doi/10.1161/CIRCOUTCOMES.120.006556

https://www.frontiersin.org/articles/10.3389/fpsyg.2019.02970/full

https://compass.onlinelibrary.wiley.com/doi/full/10.1111/spc3.12579

file:///C:/Users/nogur/Downloads/pone_0224365_pdf.pdf

https://arxiv.org/pdf/2209.04830.pdf

In [14]:
import sys
sys.path.append(r"../../")

import pandas as pd
from sklearn.model_selection import train_test_split

from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, precision_recall_curve, roc_curve, auc, average_precision_score

from utils.questionnaires_scores.scores_computations import calculate_c_ssrs_scores
from utils.utils import impute_from_column, simple_eda
from utils.consts.questions_columns import c_ssrs, sci_af_ca, maris_sci_sf, sci_mother, scs_clin, SAS, c_ssrs_intake
from xgboost import XGBClassifier, XGBRegressor


from sklearn.decomposition import PCA, KernelPCA, NMF
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from umap import UMAP
import seaborn as sns
import dtale


In [24]:
# Splitting the data into train, validation, and test sets
train_ratio = 0.6
val_ratio = 0.1
test_ratio = 0.2
c_ssrs = c_ssrs[:-3]
target_col = 'suicidal_ideation_time2'

sci_cols = ['sci_af_ca_is_new_questions_missing', 'sci_af_ca_Factor1', 'sci_af_ca_Factor2',
             'sci_af_ca_Factor3', 'sci_af_ca_Factor4', 
             'sci_af_ca_Factor5'] + sci_af_ca +['gender', 'age_child_pre'] + sci_mother +scs_clin + SAS +c_ssrs_intake

In [25]:
import os

current_directory = os.getcwd()
print("Current directory:", current_directory)

Current directory: C:\Users\nogur\Documents\DeppClinic\research\Prediction_suicidal_behivior_one_month


In [26]:
df = pd.read_csv(r"../../DeppClinic_prediction_task.csv")
df_intake = df[df.measurement == 'time1']
df_target = df[df.measurement == 'time2']

In [27]:
df_target = df_target[[target_col, 'id']]
df_intake = df_intake.dropna(subset=sci_af_ca, thresh=20)
df_intake = df_intake[sci_cols + ['id']]
data_for_prediction = pd.merge(df_intake, df_target, on='id', how='inner')


In [28]:
label = data_for_prediction[target_col]
features = data_for_prediction[sci_cols]

In [29]:

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=test_ratio, random_state=24, stratify=label)

# Splitting the training set into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=val_ratio/(train_ratio+val_ratio), random_state=24, stratify=y_train)

In [30]:
# Create a CatBoostClassifier object with early stopping based on the validation set
eval_set  = [(X_val, y_val)]
model = CatBoostClassifier(random_state=42, eval_metric='AUC', reg_lambda = 5)

# Fit the model on the training set
model.fit(X_train, y_train, eval_set=eval_set, early_stopping_rounds = 3, verbose=0)#, use_best_model=True)

# Evaluate the model on the training and validation sets

# Make predictions on the training and validation sets
y_train_pred = model.predict(X_train)
y_train_prob = model.predict_proba(X_train)[:, 1]
y_val_pred = model.predict(X_val)
y_val_prob = model.predict_proba(X_val)[:, 1]


y_test_pred = model.predict(X_test)
y_test_prob = model.predict_proba(X_test)[:, 1]


In [31]:
# Function to calculate and print various performance metrics

def print_performance_metrics(y_true, y_pred, y_prob):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    roc_auc = roc_auc_score(y_true, y_prob)
    average_precision = average_precision_score(y_true, y_prob)
    
    print(f'Accuracy: {accuracy:.4f}')
    print(f'Precision: {precision:.4f}')
    print(f'Recall: {recall:.4f}')
    print(f'ROC AUC: {roc_auc:.4f}')
    print(f'PR AUC: {average_precision:.4f}')


In [32]:
# Print the performance metrics for the training set
print('Training set performance:')
print_performance_metrics(y_train, y_train_pred, y_train_prob)



Training set performance:
Accuracy: 0.8906
Precision: 0.8539
Recall: 0.9870
ROC AUC: 0.9481
PR AUC: 0.9633


In [33]:
# Print the performance metrics for the validation set
print('Validation set performance:')
print_performance_metrics(y_val, y_val_pred, y_val_prob)


Validation set performance:
Accuracy: 0.6818
Precision: 0.6875
Recall: 0.8462
ROC AUC: 0.7778
PR AUC: 0.8717


In [34]:
# Print the performance metrics for the validation set
print('Test set performance:')
print_performance_metrics(y_test, y_test_pred, y_test_prob)


Test set performance:
Accuracy: 0.7105
Precision: 0.7143
Recall: 0.8696
ROC AUC: 0.7420
PR AUC: 0.7581


In [35]:

# Get feature importance scores
feature_importance = model.get_feature_importance()
feature_names = X_train.columns.tolist()
# Print feature importance scores
print('Feature importance scores:')
for feature_name, score in sorted(zip(feature_names, feature_importance), reverse=1, key = lambda x: x[1]):
    print(f'{feature_name}: {score:.4f}')

Feature importance scores:
c_ssrs_5_2weeks: 10.3816
c_ssrs_3_2weeks: 9.3421
sci_af_ca_Factor2: 8.3281
sci_af_ca_34: 7.9700
sci_c_4_4_clin: 5.7761
sci_af_ca_2: 5.6526
sas_1: 5.4431
sas_23: 5.3015
sas_3: 5.2036
sci_p_7_m: 3.8158
sci_c_3_3_clin: 3.6631
sci_af_ca_13: 3.5281
sci_af_ca_4: 3.0287
c_ssrs_1_life: 2.9452
sci_af_ca_6: 2.8848
sci_af_ca_Factor1: 2.8684
sci_af_ca_30: 2.6064
sci_af_ca_17: 2.4035
sci_af_ca_Factor5: 2.0290
sci_p_9_m: 1.9604
sci_af_ca_24: 1.8467
sci_af_ca_22: 1.4910
sci_af_ca_27: 0.9869
sas_11: 0.5433
sci_af_ca_is_new_questions_missing: 0.0000
sci_af_ca_Factor3: 0.0000
sci_af_ca_Factor4: 0.0000
sci_af_ca_1: 0.0000
sci_af_ca_3: 0.0000
sci_af_ca_5: 0.0000
sci_af_ca_7: 0.0000
sci_af_ca_8: 0.0000
sci_af_ca_9: 0.0000
sci_af_ca_10: 0.0000
sci_af_ca_11: 0.0000
sci_af_ca_12: 0.0000
sci_af_ca_14: 0.0000
sci_af_ca_15: 0.0000
sci_af_ca_16: 0.0000
sci_af_ca_18: 0.0000
sci_af_ca_19: 0.0000
sci_af_ca_20: 0.0000
sci_af_ca_21: 0.0000
sci_af_ca_23: 0.0000
sci_af_ca_25: 0.0000
sci_af_ca_

cool, 
use the train-validation-test variables that you created in the last snippet, 
and train CatBoostClassifier, with early_stopping parameter based on the validation set.

after that look at the performance scores on the training & validation sets.
and the feature-importance scores

Since it's a complicated task, we need to use the following metrics: accuracy, precision, recall, precision-recall-auc, roc_auc