https://www.ahajournals.org/doi/10.1161/CIRCOUTCOMES.120.006556

https://www.frontiersin.org/articles/10.3389/fpsyg.2019.02970/full

https://compass.onlinelibrary.wiley.com/doi/full/10.1111/spc3.12579

file:///C:/Users/nogur/Downloads/pone_0224365_pdf.pdf

https://arxiv.org/pdf/2209.04830.pdf

In [None]:
import sys
sys.path.append(r"../../")

import pandas as pd
from sklearn.model_selection import train_test_split

from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, precision_recall_curve, roc_curve, auc, average_precision_score

from utils.questionnaires_scores.scores_computations import calculate_c_ssrs_scores
from utils.utils import impute_from_column, simple_eda
from utils.consts.questions_columns import c_ssrs, sci_af_ca
from xgboost import XGBClassifier, XGBRegressor


from sklearn.decomposition import PCA, KernelPCA, NMF
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from umap import UMAP
import seaborn as sns
import dtale


In [None]:
# Splitting the data into train, validation, and test sets
train_ratio = 0.6
val_ratio = 0.1
test_ratio = 0.2
c_ssrs = c_ssrs[:-3]
target_col = 'suicidal_ideation_time2'

sci_cols = ['sci_af_ca_is_new_questions_missing', 'sci_af_ca_Factor1', 'sci_af_ca_Factor2',
             'sci_af_ca_Factor3', 'sci_af_ca_Factor4', 
             'sci_af_ca_Factor5'] + sci_af_ca

In [None]:
import os

current_directory = os.getcwd()
print("Current directory:", current_directory)

In [2]:
df = pd.read_csv(r"../../DeppClinic_prediction_task.csv")
df_intake = df[df.measurement == 'time1']
df_target = df[df.measurement == 'time2']

In [4]:
df.gender

0     NaN
1     NaN
2     NaN
3     NaN
4     NaN
       ..
732   NaN
733   NaN
734   NaN
735   NaN
736   NaN
Name: gender, Length: 737, dtype: float64

In [20]:
df_target = df_target[[target_col, 'id']]
df_intake = df_intake.dropna(subset=sci_af_ca, thresh=20)
df_intake = df_intake[sci_cols + ['id']]
data_for_prediction = pd.merge(df_intake, df_target, on='id', how='inner')


In [21]:
label = data_for_prediction[target_col]
features = data_for_prediction[sci_cols]

In [22]:

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=test_ratio, random_state=24, stratify=label)

# Splitting the training set into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=val_ratio/(train_ratio+val_ratio), random_state=24, stratify=y_train)

In [27]:
dim_reduction = TSNE
#PCA, KernelPCA, NMF, UMAP
sns.set(rc={'figure.figsize':(11,8)})
sns.set(font_scale=1.2)
plt.clf()
features = X_train
features = features.dropna(how= 'any', axis=0)

# Instantiate PCA and fit_transform your features
model = dim_reduction(n_components=2)
model_features = model.fit_transform(features)


# Concatenate your PCA features with your target variable
model_df = pd.DataFrame(model_features, columns=['var1', 'var2'])

# Plot the output using Seaborn

#sns.scatterplot(x='var1', y='var2', data=model_df).figure.savefig(fr"../cluster_vis/{name}_vis.png") 
sns.scatterplot(x='var1', y='var2', hue=y_train, palette='viridis', data=model_df).figure.savefig(fr"vis.png") 


In [9]:
# Create a CatBoostClassifier object with early stopping based on the validation set
eval_set  = [(X_val, y_val)]
model = CatBoostClassifier(random_state=42, eval_metric='AUC', reg_lambda = 5)

# Fit the model on the training set
model.fit(X_train, y_train, eval_set=eval_set, early_stopping_rounds = 3, verbose=0)#, use_best_model=True)

# Evaluate the model on the training and validation sets

# Make predictions on the training and validation sets
y_train_pred = model.predict(X_train)
y_train_prob = model.predict_proba(X_train)[:, 1]
y_val_pred = model.predict(X_val)
y_val_prob = model.predict_proba(X_val)[:, 1]


y_test_pred = model.predict(X_test)
y_test_prob = model.predict_proba(X_test)[:, 1]


In [10]:
# Function to calculate and print various performance metrics

def print_performance_metrics(y_true, y_pred, y_prob):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    roc_auc = roc_auc_score(y_true, y_prob)
    average_precision = average_precision_score(y_true, y_prob)
    
    print(f'Accuracy: {accuracy:.4f}')
    print(f'Precision: {precision:.4f}')
    print(f'Recall: {recall:.4f}')
    print(f'ROC AUC: {roc_auc:.4f}')
    print(f'PR AUC: {average_precision:.4f}')


In [11]:
# Print the performance metrics for the training set
print('Training set performance:')
print_performance_metrics(y_train, y_train_pred, y_train_prob)



Training set performance:
Accuracy: 0.8516
Precision: 0.8372
Recall: 0.9351
ROC AUC: 0.9121
PR AUC: 0.9407


In [12]:
# Print the performance metrics for the validation set
print('Validation set performance:')
print_performance_metrics(y_val, y_val_pred, y_val_prob)


Validation set performance:
Accuracy: 0.6364
Precision: 0.6667
Recall: 0.7692
ROC AUC: 0.5726
PR AUC: 0.6942


In [13]:
# Print the performance metrics for the validation set
print('Test set performance:')
print_performance_metrics(y_test, y_test_pred, y_test_prob)


Test set performance:
Accuracy: 0.6842
Precision: 0.7200
Recall: 0.7826
ROC AUC: 0.6754
PR AUC: 0.7247


In [14]:

# Get feature importance scores
feature_importance = model.get_feature_importance()
feature_names = X_train.columns.tolist()
# Print feature importance scores
print('Feature importance scores:')
for feature_name, score in sorted(zip(feature_names, feature_importance), reverse=1, key = lambda x: x[1]):
    print(f'{feature_name}: {score:.4f}')

Feature importance scores:
sci_af_ca_1: 18.6573
sci_af_ca_17: 9.7753
sci_af_ca_4: 9.5410
sci_af_ca_2: 8.6917
sci_af_ca_10: 6.0910
sci_af_ca_7: 5.9026
sci_af_ca_24: 5.5984
sci_af_ca_27: 4.9414
sci_af_ca_40: 4.8637
sci_af_ca_12: 4.5636
sci_af_ca_11: 4.1882
sci_af_ca_6: 4.1055
sci_af_ca_Factor5: 3.9799
sci_af_ca_20: 3.6559
sci_af_ca_Factor1: 2.3918
sci_af_ca_Factor2: 1.5394
sci_af_ca_39: 1.5130
sci_af_ca_is_new_questions_missing: 0.0000
sci_af_ca_Factor3: 0.0000
sci_af_ca_Factor4: 0.0000
sci_af_ca_3: 0.0000
sci_af_ca_5: 0.0000
sci_af_ca_8: 0.0000
sci_af_ca_9: 0.0000
sci_af_ca_13: 0.0000
sci_af_ca_14: 0.0000
sci_af_ca_15: 0.0000
sci_af_ca_16: 0.0000
sci_af_ca_18: 0.0000
sci_af_ca_19: 0.0000
sci_af_ca_21: 0.0000
sci_af_ca_22: 0.0000
sci_af_ca_23: 0.0000
sci_af_ca_25: 0.0000
sci_af_ca_26: 0.0000
sci_af_ca_28: 0.0000
sci_af_ca_29: 0.0000
sci_af_ca_30: 0.0000
sci_af_ca_31: 0.0000
sci_af_ca_32: 0.0000
sci_af_ca_33: 0.0000
sci_af_ca_34: 0.0000
sci_af_ca_35: 0.0000
sci_af_ca_36: 0.0000
sci_af_ca_

cool, 
use the train-validation-test variables that you created in the last snippet, 
and train CatBoostClassifier, with early_stopping parameter based on the validation set.

after that look at the performance scores on the training & validation sets.
and the feature-importance scores

Since it's a complicated task, we need to use the following metrics: accuracy, precision, recall, precision-recall-auc, roc_auc