In [8]:
import sys
sys.path.append(r"../../utils")
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, precision_recall_curve, roc_curve, auc
from sklearn.neural_network import MLPRegressor
from questionnaires_aggregation import c_ssrs_aggregation, sci_af_ac_aggregation
from utils import impute_from_column, simple_eda
from questions_columns import c_ssrs, sci_af_ca, suicidal_behavior
from xgboost import XGBClassifier, XGBRegressor
from sklearn.metrics import r2_score, max_error

In [9]:
# Fractions of the dataset used for the train / validation / test splits
# (the actual splitting happens further down, where these are consumed).
train_ratio = 0.6
val_ratio = 0.2
test_ratio = 0.2
# The three fractions must cover the whole dataset; the tolerance absorbs
# floating-point rounding in the sum.
assert abs(train_ratio + val_ratio + test_ratio - 1.0) < 1e-9, "split ratios must sum to 1"

# Drop the last three entries of the imported `c_ssrs` column list.
# The list is rebuilt from a fresh import so that re-running this cell always
# yields the same result; `c_ssrs = c_ssrs[:-3]` trimmed three more entries on
# every re-execution.
from questions_columns import c_ssrs as _c_ssrs_all_columns
c_ssrs = _c_ssrs_all_columns[:-3]

In [10]:
# Load the research table and split it by the value of its `measurement` column.
df = pd.read_csv("df_for_research.csv")

# Boolean-mask selections are copied explicitly: later cells assign into
# df_target, and an explicit copy makes those writes unambiguous (no
# SettingWithCopyWarning, no question of whether `df` is affected).
df_intake = df[df.measurement == 'time1'].copy()
df_target = df[df.measurement == 'time2'].copy()

In [11]:
# Fill `c_ssrs_6` from `c_ssrs_last_visit_6` via the project helper
# (exact imputation rule lives in utils.impute_from_column).
df_target = impute_from_column(
    df_target,
    impute_to='c_ssrs_6',
    impute_from='c_ssrs_last_visit_6',
)

# Rows where both c_ssrs_1 and c_ssrs_2 equal 0 get every column listed in
# `c_ssrs` set to 0.
both_zero_index = df_target.query('c_ssrs_1 == 0 & c_ssrs_2 == 0').index
df_target.loc[both_zero_index, c_ssrs] = 0

In [12]:
# Keep only the rows that have at least three non-missing values among the
# `c_ssrs` columns; rows with fewer are dropped.
df_target = df_target.dropna(
    subset=c_ssrs,
    thresh=3,
)

In [13]:
# Aggregate the c_ssrs items into a single 'stb' severity score per row
# (aggregation rule is defined in questionnaires_aggregation.c_ssrs_aggregation).
stb_score = c_ssrs_aggregation(df_target, severity='stb')

# Build the reduced target frame with `assign` instead of writing a column
# into df_target: df_target is the result of earlier row filtering, and
# in-place column assignment on such a frame can raise SettingWithCopyWarning.
# NOTE(review): this cell overwrites df_target with a two-column frame, so it
# presumably cannot be re-run without first re-running the cells above.
df_target = df_target.assign(c_ssrs_stb=stb_score)[['c_ssrs_stb', 'id']]


In [14]:
# Keep intake rows with at least 20 non-missing values among the `sci_af_ca`
# columns, then keep only those columns plus the join key.
df_intake = df_intake.dropna(subset=sci_af_ca, thresh=20)
df_intake = df_intake[sci_af_ca + ['id']]

# Inner join on `id`: only ids present in both the intake and the target
# frames survive.
# NOTE(review): assumes at most one row per id on each side; duplicates would
# multiply rows silently - confirm against the data.
data_for_prediction = pd.merge(df_intake, df_target, on='id', how='inner')

# Regression target: log(1 + score). log1p is the dedicated, numerically
# accurate form of np.log(x + 1).
data_for_prediction['label'] = np.log1p(data_for_prediction['c_ssrs_stb'])

# The helper returns the augmented frame together with the names of the
# factor columns it produced (those names are used as features later on).
data_for_prediction, sci_af_ac_factors = sci_af_ac_aggregation(data_for_prediction)

# Optional exploratory report; off by default. Flip the flag to run it.
RUN_EDA = False
if RUN_EDA:
    simple_eda(data_for_prediction, columns = list(data_for_prediction.columns), title = 'all', display_additional_columns=False)

In [15]:
# Model inputs: the raw sci_af_ca columns plus the aggregated factor columns.
feature_columns = sci_af_ca + sci_af_ac_factors
features = data_for_prediction.loc[:, feature_columns]

# Model target: the log-transformed score stored in the 'label' column.
label = data_for_prediction.loc[:, 'label']

In [16]:

# Stage 1: hold out the test set (`test_ratio` of all rows), stratified on the label.
X_rest, X_test, y_rest, y_test = train_test_split(
    features,
    label,
    test_size=test_ratio,
    random_state=42,
    stratify=label,
)

# Stage 2: carve the validation set out of the remaining rows. The share is
# expressed relative to the remainder so that the validation set ends up being
# `val_ratio` of the full dataset.
val_share_of_rest = val_ratio/(train_ratio+val_ratio)
X_train, X_val, y_train, y_val = train_test_split(
    X_rest,
    y_rest,
    test_size=val_share_of_rest,
    random_state=42,
    stratify=y_rest,
)

In [17]:
# CatBoostRegressor optimised with RMSE loss; R2 is the metric reported (and
# monitored) on the evaluation set.
model = CatBoostRegressor(
    loss_function='RMSE',
    eval_metric='R2',
    depth=3,
    l2_leaf_reg=7,
    random_state=42,
)

# Validation data watched during training for early stopping.
eval_set = [(X_val, y_val)]

# Train on the training set; stop once the validation metric has failed to
# improve for 3 consecutive iterations.
model.fit(
    X_train,
    y_train,
    eval_set=eval_set,
    early_stopping_rounds=3,
)

# Predictions on the training and validation sets, used for scoring below.
y_train_pred = model.predict(X_train)
y_val_pred = model.predict(X_val)


0:	learn: 0.0121895	test: 0.0056970	best: 0.0056970 (0)	total: 148ms	remaining: 2m 27s
1:	learn: 0.0217628	test: 0.0166623	best: 0.0166623 (1)	total: 150ms	remaining: 1m 14s
2:	learn: 0.0330819	test: 0.0271034	best: 0.0271034 (2)	total: 153ms	remaining: 50.7s
3:	learn: 0.0438372	test: 0.0377243	best: 0.0377243 (3)	total: 155ms	remaining: 38.5s
4:	learn: 0.0498669	test: 0.0389830	best: 0.0389830 (4)	total: 156ms	remaining: 31.1s
5:	learn: 0.0575625	test: 0.0438509	best: 0.0438509 (5)	total: 158ms	remaining: 26.1s
6:	learn: 0.0677874	test: 0.0497498	best: 0.0497498 (6)	total: 160ms	remaining: 22.6s
7:	learn: 0.0768500	test: 0.0617428	best: 0.0617428 (7)	total: 161ms	remaining: 20s
8:	learn: 0.0848020	test: 0.0675218	best: 0.0675218 (8)	total: 163ms	remaining: 18s
9:	learn: 0.0935060	test: 0.0743589	best: 0.0743589 (9)	total: 165ms	remaining: 16.3s
10:	learn: 0.0997361	test: 0.0804132	best: 0.0804132 (10)	total: 167ms	remaining: 15s
11:	learn: 0.1069207	test: 0.0899726	best: 0.0899726 (11

103:	learn: 0.4468843	test: 0.3085612	best: 0.3085612 (103)	total: 308ms	remaining: 2.65s
104:	learn: 0.4481526	test: 0.3078307	best: 0.3085612 (103)	total: 310ms	remaining: 2.64s
105:	learn: 0.4486734	test: 0.3083526	best: 0.3085612 (103)	total: 311ms	remaining: 2.63s
106:	learn: 0.4499798	test: 0.3092812	best: 0.3092812 (106)	total: 313ms	remaining: 2.61s
107:	learn: 0.4526874	test: 0.3089533	best: 0.3092812 (106)	total: 314ms	remaining: 2.59s
108:	learn: 0.4543097	test: 0.3083323	best: 0.3092812 (106)	total: 315ms	remaining: 2.58s
109:	learn: 0.4575837	test: 0.3091227	best: 0.3092812 (106)	total: 317ms	remaining: 2.56s
Stopped by overfitting detector  (3 iterations wait)

bestTest = 0.3092811841
bestIteration = 106

Shrink model to first 107 iterations.


In [18]:
# Validation-set scores: largest absolute residual, then R2.
print(f"{max_error(y_val, y_val_pred) = }")
print(f"{r2_score(y_val, y_val_pred) = }")

max_error(y_val, y_val_pred) = 1.3779847539077354
r2_score(y_val, y_val_pred) = 0.309281180711818


In [19]:
# Training-set scores: largest absolute residual, then R2.
print(f"{max_error(y_train, y_train_pred) = }")
print(f"{r2_score(y_train, y_train_pred) = }")

max_error(y_train, y_train_pred) = 1.3833627747768082
r2_score(y_train, y_train_pred) = 0.4499797709787827


In [20]:

# Importance score per feature, in the column order of the training frame.
feature_names = X_train.columns.tolist()
feature_importance = model.get_feature_importance()

# Rank features from most to least important (ties keep their column order).
ranked_features = sorted(
    zip(feature_names, feature_importance),
    key=lambda pair: pair[1],
    reverse=True,
)

print('Feature importance scores:')
for feature_name, score in ranked_features:
    print(f'{feature_name}: {score:.4f}')

Feature importance scores:
sci_af_ca_24: 14.2575
sci_af_ca_11: 8.6792
sci_af_ca_Factor1: 6.8044
sci_af_ca_19: 6.6013
sci_af_ca_14: 4.2316
sci_af_ca_40: 4.2029
sci_af_ca_8: 3.9182
sci_af_ca_Factor2: 3.1601
sci_af_ca_7: 2.9430
sci_af_ca_21: 2.7999
sci_af_ca_16: 2.5196
sci_af_ca_Factor4: 2.5039
sci_af_ca_10: 2.3854
sci_af_ca_29: 2.3246
sci_af_ca_1: 2.3245
sci_af_ca_26: 2.2489
sci_af_ca_30: 2.0993
sci_af_ca_Factor5: 2.0399
sci_af_ca_27: 1.8154
sci_af_ca_Factor3: 1.7674
sci_af_ca_36: 1.7312
sci_af_ca_2: 1.6462
sci_af_ca_4: 1.5998
sci_af_ca_5: 1.5886
sci_af_ca_37: 1.1563
sci_af_ca_35: 1.0902
sci_af_ca_39: 1.0726
sci_af_ca_28: 1.0048
sci_af_ca_15: 0.9416
sci_af_ca_31: 0.9095
sci_af_ca_33: 0.9022
sci_af_ca_20: 0.8863
sci_af_ca_13: 0.8630
sci_af_ca_22: 0.8061
sci_af_ca_9: 0.7727
sci_af_ca_25: 0.7520
sci_af_ca_12: 0.5363
sci_af_ca_6: 0.5052
sci_af_ca_38: 0.4890
sci_af_ca_34: 0.3694
sci_af_ca_18: 0.2922
sci_af_ca_23: 0.1830
sci_af_ca_3: 0.1432
sci_af_ca_17: 0.0985
sci_af_ca_32: 0.0331


Cool.
Use the train/validation/test variables that you created in the last snippet
and train a CatBoostClassifier, with early stopping (the `early_stopping_rounds` parameter) based on the validation set.

After that, look at the performance scores on the training and validation sets,
and at the feature-importance scores.

Since it's a complicated task, we need to use the following metrics: accuracy, precision, recall, precision-recall AUC, and ROC AUC.