In [1]:
import sys
sys.path.append(r"../../utils")

import pandas as pd
from sklearn.model_selection import train_test_split

from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, precision_recall_curve, roc_curve, auc

from questionnaires_aggregation import c_ssrs_aggregation
from utils import impute_from_column, simple_eda
from questions_columns import c_ssrs, sci_af_ca
from xgboost import XGBClassifier

In [2]:
# Splitting the data into train, validation, and test sets
train_ratio = 0.6
val_ratio = 0.2
test_ratio = 0.2
c_ssrs = c_ssrs[:-3]

In [3]:
df = pd.read_csv("df_for_research.csv")
df_intake = df[df.measurement == 'time1']
df_target = df[df.measurement == 'time2']

In [4]:
df_target = impute_from_column(df_target, impute_to = 'c_ssrs_6', impute_from = 'c_ssrs_last_visit_6')
df_target.loc[df_target.query('c_ssrs_1 == 0 & c_ssrs_2 == 0').index, c_ssrs] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[impute_to] = np.where(df[impute_to].isnull(), df[impute_from], df[impute_to])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [5]:
df_target = df_target.dropna(subset=c_ssrs, thresh=3)

In [7]:
df_target['c_ssrs_stb'] = c_ssrs_aggregation(df_target, severity = 'stb')
df_target = df_target[['c_ssrs_stb', 'id']]


In [8]:
df_target.shape

(262, 2)

In [9]:
df_intake.shape

(408, 258)

In [10]:
df_intake = df_intake.dropna(subset=sci_af_ca, thresh=20)

In [11]:
df_intake = df_intake[sci_af_ca + ['id']]
df_intake.shape

(237, 41)

In [12]:
df_intake['id'].isin(df_target['id']).sum()

154

In [13]:
data_for_prediction = pd.merge(df_intake, df_target, on='id', how='inner')
data_for_prediction['label'] = (data_for_prediction['c_ssrs_stb'] > 3).astype(int)
if True:
    simple_eda(data_for_prediction, columns = list(data_for_prediction.columns), title = 'all', display_additional_columns=False)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [14]:
features = data_for_prediction[sci_af_ca]
label = data_for_prediction['label']

In [15]:

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=test_ratio, random_state=42, stratify=label)

# Splitting the training set into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=val_ratio/(train_ratio+val_ratio), random_state=42, stratify=y_train)

In [21]:
# Create a CatBoostClassifier object with early stopping based on the validation set
model = CatBoostClassifier(early_stopping_rounds=2, random_state=42, verbose=False, l2_leaf_reg=6, class_weights=[1, 4], eval_metric='AUC')

# Fit the model on the training set
model.fit(X_train, y_train, eval_set=(X_val, y_val))

# Evaluate the model on the training and validation sets

# Make predictions on the training and validation sets
y_train_pred = model.predict(X_train)
y_train_prob = model.predict_proba(X_train)[:, 1]
y_val_pred = model.predict(X_val)
y_val_prob = model.predict_proba(X_val)[:, 1]


In [22]:
# Function to calculate and print various performance metrics
def print_performance_metrics(y_true, y_pred, y_prob):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    roc_auc = roc_auc_score(y_true, y_prob)
#     precision_recall_auc = auc(*precision_recall_curve(y_true, y_prob)[:2])
#     fpr, tpr, _ = roc_curve(y_true, y_prob)

    print(f'Accuracy: {accuracy:.4f}')
    print(f'Precision: {precision:.4f}')
    print(f'Recall: {recall:.4f}')
    print(f'ROC AUC: {roc_auc:.4f}')
#     print(f'Precision-Recall AUC: {precision_recall_auc:.4f}')
#     print(f'FPR: {fpr}')
#     print(f'TPR: {tpr}')
#     print(f'Thresholds: {_}\n')

In [23]:
# Print the performance metrics for the training set
print('Training set performance:')
print_performance_metrics(y_train, y_train_pred, y_train_prob)



Training set performance:
Accuracy: 0.7717
Precision: 0.5116
Recall: 1.0000
ROC AUC: 0.9458


In [24]:
# Print the performance metrics for the validation set
print('Validation set performance:')
print_performance_metrics(y_val, y_val_pred, y_val_prob)


Validation set performance:
Accuracy: 0.6129
Precision: 0.3333
Recall: 0.5000
ROC AUC: 0.7174


In [25]:

# Get feature importance scores
feature_importance = model.get_feature_importance()
feature_names = X_train.columns.tolist()
# Print feature importance scores
print('Feature importance scores:')
for feature_name, score in sorted(zip(feature_names, feature_importance), reverse=1, key = lambda x: x[1]):
    print(f'{feature_name}: {score:.4f}')

Feature importance scores:
sci_af_ca_21: 18.5610
sci_af_ca_8: 18.0660
sci_af_ca_18: 11.9367
sci_af_ca_24: 7.7212
sci_af_ca_4: 7.3499
sci_af_ca_9: 6.0046
sci_af_ca_25: 5.7582
sci_af_ca_28: 5.4893
sci_af_ca_38: 4.8078
sci_af_ca_13: 3.7556
sci_af_ca_26: 2.0497
sci_af_ca_20: 1.9450
sci_af_ca_6: 1.5260
sci_af_ca_10: 1.4394
sci_af_ca_27: 1.1243
sci_af_ca_31: 0.8914
sci_af_ca_1: 0.7024
sci_af_ca_34: 0.5053
sci_af_ca_14: 0.3662
sci_af_ca_2: 0.0000
sci_af_ca_3: 0.0000
sci_af_ca_5: 0.0000
sci_af_ca_7: 0.0000
sci_af_ca_11: 0.0000
sci_af_ca_12: 0.0000
sci_af_ca_15: 0.0000
sci_af_ca_16: 0.0000
sci_af_ca_17: 0.0000
sci_af_ca_19: 0.0000
sci_af_ca_22: 0.0000
sci_af_ca_23: 0.0000
sci_af_ca_29: 0.0000
sci_af_ca_30: 0.0000
sci_af_ca_32: 0.0000
sci_af_ca_33: 0.0000
sci_af_ca_35: 0.0000
sci_af_ca_36: 0.0000
sci_af_ca_37: 0.0000
sci_af_ca_39: 0.0000
sci_af_ca_40: 0.0000


cool, 
use the train-validation-test variables that you created in the last snippet, 
and train CatBoostClassifier, with early_stopping parameter based on the validation set.

after that look at the performance scores on the training & validation sets.
and the feature-importance scores

Since it's a complicated task, we need to use the following metrics: accuracy, precision, recall, precision-recall-auc, roc_auc