In [1]:
import sys
sys.path.append(r"../../utils")
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, average_precision_score, precision_recall_curve, roc_curve, auc
from sklearn.neural_network import MLPRegressor
from questionnaires_aggregation import c_ssrs_aggregation, sci_af_ac_aggregation
from utils import impute_from_column, simple_eda
from questions_columns import sci_af_ca, suicidal_behavior
from xgboost import XGBClassifier, XGBRegressor
from sklearn.metrics import r2_score, max_error

In [113]:
# Fractions of the full dataset assigned to the train, validation, and test sets.
train_ratio = 0.5
val_ratio = 0.3
test_ratio = 0.2

# The split is done in two stages (test set first, then validation out of the
# remainder using val_ratio / (train_ratio + val_ratio)); that arithmetic only
# yields the fractions above when the three ratios cover the whole dataset.
assert abs(train_ratio + val_ratio + test_ratio - 1.0) < 1e-9, "split ratios must sum to 1"

In [114]:
# Load the research dataset and split it by measurement wave.
raw_path = r"../create_dataset/data_for_prediction_research.csv"
df = pd.read_csv(raw_path)
#df = df[df.age_child_pre > 11]

# 'time1' rows supply the predictors (intake); 'time2' rows supply the outcome (target).
is_time1 = df['measurement'] == 'time1'
is_time2 = df['measurement'] == 'time2'
df_intake = df[is_time1]
df_target = df[is_time2]

In [115]:
# Build every intermediate frame first, then report the cohort sizes in one place.

# Participants present in both waves, before any missing-data filtering.
merged_times = df_intake.merge(df_target, on='id', how='inner')

# Target rows must have at least one non-missing suicidal_behavior column.
target_dropna = df_target.dropna(subset=suicidal_behavior, how='all')
# Intake rows must have at least 20 non-missing sci_af_ca columns.
intake_dropna = df_intake.dropna(subset=sci_af_ca, thresh=20)
# Participants that survive both filters.
merged_times_dropna = intake_dropna.merge(target_dropna, on='id', how='inner')

# Unique-participant counts at each stage (counted on the 'id' column).
print(f"# data in the original df = {df['id'].nunique()}\n")
print(f"# data in the df_target = {df_target['id'].nunique()}")
print(f"# data in the df_intake = {df_intake['id'].nunique()}")
print(f"# data in the merged_times = {merged_times['id'].nunique()}\n")

print(f"# data in the df_target after removing missing data = {target_dropna['id'].nunique()}")
print(f"# data in the df_intake after removing missing data = {intake_dropna['id'].nunique()}")
print(f"# data in the merged_times after removing missing data = {merged_times_dropna['id'].nunique()}\n")

# data in the original df = 419

# data in the df_target = 329
# data in the df_intake = 408
# data in the merged_times = 318

# data in the df_target after removing missing data = 302
# data in the df_intake after removing missing data = 237
# data in the merged_times after removing missing data = 174



In [116]:
# --- Outcome table (time2) ---
# Keep rows with at least one answered suicidal_behavior item. `.copy()` detaches
# the result from the boolean-mask slice of `df`, so the column assignment below
# writes to an independent frame and does not raise SettingWithCopyWarning.
df_target = df_target.dropna(subset=suicidal_behavior, how='all').copy()
# Binary label: 1 when the row-wise sum of the items is positive
# (missing items are skipped by DataFrame.sum's default skipna=True).
df_target['label'] = (df_target[suicidal_behavior].sum(axis=1) > 0).astype(int)
df_target = df_target[['label', 'id'] + suicidal_behavior]

# --- Predictor table (time1) ---
# Require at least 20 non-missing sci_af_ca items; copy for the same reason as
# above, in case the aggregation helper adds columns to the frame it receives.
df_intake = df_intake.dropna(subset=sci_af_ca, thresh=20).copy()
# The helper returns the frame plus a second value that is concatenated with
# column-name lists below — presumably the names of the aggregated factor
# columns; verify against questionnaires_aggregation.
df_intake, sci_af_ac_factors = sci_af_ac_aggregation(df_intake)
df_intake = df_intake[sci_af_ca + sci_af_ac_factors + ['id']]

# One modelling table: intake predictors joined to the time2 label on participant id.
data_for_prediction = pd.merge(df_intake, df_target, on='id', how='inner')
print(f"{data_for_prediction['id'].nunique() = }")

data_for_prediction['id'].nunique() = 174


In [117]:
# Optional exploratory report over every column of the modelling table;
# switch the flag to True to run it.
eda = False
if eda:
    simple_eda(
        data_for_prediction,
        columns=list(data_for_prediction.columns),
        title='suicidal behivior prediction based on sci_af_ac',
        display_additional_columns=False,
    )

In [118]:
# Model inputs: the sci_af_ca item columns followed by the aggregated factor columns.
feature_columns = sci_af_ca + sci_af_ac_factors
features = data_for_prediction[feature_columns]
# Binary outcome built from the time2 suicidal_behavior items.
label = data_for_prediction['label']

# Shape of the design matrix, number of labelled rows, number of positive rows.
print(f"N features = {features.shape}\n{label.count() = }\n{label.sum() = }")
# NOTE: despite the "pos ratio" caption, the printed value is
# (number of negatives) / (number of positives).
print(f"pos ratio  = {(label.count() - label.sum())/ label.sum() } ")

N features = (174, 45)
label.count() = 174
label.sum() = 34
pos ratio  = 4.117647058823529 


In [119]:

# Stage 1: hold out the test set, stratified on the label.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    features,
    label,
    test_size=test_ratio,
    random_state=42,
    stratify=label,
)

# Stage 2: carve the validation set out of the remainder. The fraction is
# expressed relative to the train+validation pool, not the full dataset.
val_fraction = val_ratio / (train_ratio + val_ratio)
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval,
    y_trainval,
    test_size=val_fraction,
    random_state=42,
    stratify=y_trainval,
)

In [120]:
# Validation data monitored during fitting (drives early stopping).
eval_set = [(X_val, y_val)]

# Hyperparameters collected in one place so they are easy to scan and tweak.
catboost_params = dict(
    random_state=42,
    depth=6,
    l2_leaf_reg=10,
    auto_class_weights='Balanced',
    langevin=True,
    score_function='L2',
    model_size_reg=1.0,
    boosting_type='Ordered',
    eval_metric='Precision',
    approx_on_full_history=True,
    eta=0.005,
    random_strength=2,
    n_estimators=100,
)
model = CatBoostClassifier(**catboost_params)

# Fit on the training set with a 5-round early-stopping patience.
# NOTE(review): the training log recorded with this cell stops after iteration 5
# with bestIteration = 0 and shrinks the model to its first iteration — confirm
# that Precision with this patience is the intended stopping criterion.
model.fit(X_train, y_train, eval_set=eval_set, early_stopping_rounds=5)

# Hard class predictions and positive-class probabilities (column 1 of
# predict_proba) for the training set...
y_train_pred = model.predict(X_train)
y_train_prob = model.predict_proba(X_train)[:, 1]

# ...and for the validation set.
y_val_pred = model.predict(X_val)
y_val_prob = model.predict_proba(X_val)[:, 1]


0:	learn: 0.7242366	test: 0.7172557	best: 0.7172557 (0)	total: 3.37ms	remaining: 334ms
1:	learn: 0.8147139	test: 0.5750000	best: 0.7172557 (0)	total: 7.19ms	remaining: 352ms
2:	learn: 0.8023256	test: 0.4480520	best: 0.7172557 (0)	total: 11.2ms	remaining: 362ms
3:	learn: 0.8469722	test: 0.4035088	best: 0.7172557 (0)	total: 15.4ms	remaining: 370ms
4:	learn: 0.8469722	test: 0.5036496	best: 0.7172557 (0)	total: 19.6ms	remaining: 372ms
5:	learn: 0.8240446	test: 0.5750000	best: 0.7172557 (0)	total: 22ms	remaining: 344ms
Stopped by overfitting detector  (5 iterations wait)

bestTest = 0.7172557201
bestIteration = 0

Shrink model to first 1 iterations.


In [121]:
# Function to calculate and print various performance metrics

def print_performance_metrics(y_true, y_pred, y_prob):
    """Print five classification metrics, each formatted to four decimals.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted class labels; used for accuracy, precision and recall.
        y_prob: Predicted scores; used for ROC AUC and for average precision
            (reported under the caption "PR AUC").

    Returns:
        None. The report is written to stdout, one metric per line.
    """
    # Every metric is computed before anything is printed, so an exception in
    # any of them leaves no partial report behind.
    report = [
        ('Accuracy', accuracy_score(y_true, y_pred)),
        ('Precision', precision_score(y_true, y_pred)),
        ('Recall', recall_score(y_true, y_pred)),
        ('ROC AUC', roc_auc_score(y_true, y_prob)),
        ('PR AUC', average_precision_score(y_true, y_prob)),
    ]

    for metric_name, value in report:
        print(f'{metric_name}: {value:.4f}')


In [122]:
# Report the metrics on the data the model was fitted on.
print('Training set performance:')
print_performance_metrics(y_true=y_train, y_pred=y_train_pred, y_prob=y_train_prob)



Training set performance:
Accuracy: 0.7326
Precision: 0.3929
Recall: 0.6471
ROC AUC: 0.7123
PR AUC: 0.3324


In [123]:
# Report the metrics on the validation data used for early stopping.
print('Validation set performance:')
print_performance_metrics(y_true=y_val, y_pred=y_val_pred, y_prob=y_val_prob)


Validation set performance:
Accuracy: 0.7547
Precision: 0.3846
Recall: 0.5000
ROC AUC: 0.7395
PR AUC: 0.3312


In [20]:

# Importance score per training column, in the column order of X_train.
feature_importance = model.get_feature_importance()
feature_names = X_train.columns.tolist()

print('Feature importance scores:')
# Highest score first; sorted() is stable, so tied features keep their column order.
ranked = sorted(
    zip(feature_names, feature_importance),
    key=lambda pair: pair[1],
    reverse=True,
)
for feature_name, score in ranked:
    print(f'{feature_name}: {score:.4f}')

Feature importance scores:
sci_af_ca_1: 11.3371
sci_af_ca_3: 8.5838
sci_af_ca_Factor5: 6.3685
sci_af_ca_16: 5.9821
sci_af_ca_6: 4.5367
sci_af_ca_9: 4.3141
sci_af_ca_36: 4.2144
sci_af_ca_38: 3.9709
sci_af_ca_33: 3.7489
sci_af_ca_19: 3.7038
sci_af_ca_30: 3.5628
sci_af_ca_2: 3.1152
sci_af_ca_15: 2.7919
sci_af_ca_25: 2.5365
sci_af_ca_17: 2.2384
sci_af_ca_34: 2.2146
sci_af_ca_24: 2.2025
sci_af_ca_18: 1.9357
sci_af_ca_37: 1.8807
sci_af_ca_35: 1.6737
sci_af_ca_28: 1.5743
sci_af_ca_31: 1.5722
sci_af_ca_40: 1.5220
sci_af_ca_Factor2: 1.4561
sci_af_ca_5: 1.3726
sci_af_ca_12: 1.2302
sci_af_ca_32: 1.1144
sci_af_ca_4: 1.0886
sci_af_ca_14: 1.0703
sci_af_ca_11: 1.0427
sci_af_ca_13: 0.8586
sci_af_ca_7: 0.6547
sci_af_ca_Factor1: 0.6475
sci_af_ca_26: 0.5803
sci_af_ca_39: 0.5531
sci_af_ca_29: 0.5473
sci_af_ca_10: 0.5205
sci_af_ca_27: 0.4811
sci_af_ca_20: 0.4168
sci_af_ca_Factor3: 0.3821
sci_af_ca_21: 0.2338
sci_af_ca_23: 0.1439
sci_af_ca_Factor4: 0.0247
sci_af_ca_8: 0.0000
sci_af_ca_22: 0.0000


Cool.
Use the train/validation/test variables that you created in the last snippet
to train a CatBoostClassifier, with early stopping based on the validation set.

After that, look at the performance scores on the training and validation sets,
and at the feature-importance scores.

Since this is a complicated task, we need to use the following metrics: accuracy, precision, recall, precision-recall AUC, and ROC AUC.

#### Create a CatBoostClassifier object with early stopping based on the validation set
eval_set = [(X_val, y_val)]
# Earlier alternative configuration, kept for reference:
#model = CatBoostClassifier(random_state=42, depth = 5, l2_leaf_reg = 7, loss_function='CrossEntropy', eval_metric='AUC')
# Shallow trees with explicit per-class weights [1, 8].
model = CatBoostClassifier(
    random_state=42,
    depth=2,
    l2_leaf_reg=10,
    class_weights=[1, 8],
    eval_metric='Precision',
)

##### Fit the model on the training set
# The validation set is monitored with a 5-round early-stopping patience.
model.fit(
    X_train,
    y_train,
    eval_set=eval_set,
    early_stopping_rounds=5,
)


#### Make predictions on the training and validation sets
# Hard class predictions plus positive-class probabilities (column 1 of predict_proba).
y_train_pred = model.predict(X_train)
y_train_prob = model.predict_proba(X_train)[:, 1]

y_val_pred = model.predict(X_val)
y_val_prob = model.predict_proba(X_val)[:, 1]


# Validation data monitored by CatBoost's overfitting detector during fitting.
eval_set = [(X_val, y_val)]

# Same family of settings as the earlier CatBoost run, but with a different
# seed, CrossEntropy as the eval metric, and the overfitting detector
# configured through od_pval / od_wait instead of early_stopping_rounds.
catboost_params = dict(
    random_state=4212,
    depth=6,
    l2_leaf_reg=10,
    auto_class_weights='Balanced',
    langevin=True,
    score_function='L2',
    model_size_reg=1.0,
    boosting_type='Ordered',
    eval_metric='CrossEntropy',
    od_pval=10**-2,
    od_wait=5,
    approx_on_full_history=True,
    random_strength=2,
    leaf_estimation_iterations=2,
)
model = CatBoostClassifier(**catboost_params)

# Fit the model on the training set
model.fit(X_train, y_train, eval_set=eval_set)


# Hard class predictions plus positive-class probabilities (column 1 of
# predict_proba) for the training and validation sets.
y_train_pred = model.predict(X_train)
y_train_prob = model.predict_proba(X_train)[:, 1]

y_val_pred = model.predict(X_val)
y_val_prob = model.predict_proba(X_val)[:, 1]
