In [160]:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import precision_score, roc_auc_score, f1_score, recall_score
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
import os

# Single seed reused by the CatBoost classifier and by train_test_split in the
# cells below, so repeated runs use the same randomization.
random_state = 3602

In [161]:
from fancyimpute import IterativeImputer


In [162]:
# Directory holding the study's data files. Set the PTSD_DATA_DIR environment
# variable to point somewhere else; the original author's local folder is kept
# as the fallback so existing setups keep working unchanged.
data_path = os.environ.get(
    "PTSD_DATA_DIR",
    r'C:\Users\nogag\Documents\birocracy\PTSDClassifier\PTSD\Data',
)
# 2009 cohort table (joined with the questionnaire on its "ID" column in a later cell).
df_2009 = pd.read_excel(os.path.join(data_path, "PTSD.xlsx"))


In [163]:
# 2016 cohort (wide-format CSV), read from the shared data directory instead of
# repeating the absolute path.
df_2016_raw = pd.read_csv(os.path.join(data_path, "IDF_ABM_16.2.15_wide.csv"))
# Keep only the rows whose Group is 'control'. The explicit .copy() makes
# df_2016 an independent frame, so the column assignments in later cells write
# to it directly instead of raising SettingWithCopyWarning on a filtered slice.
df_2016 = df_2016_raw[df_2016_raw['Group'] == 'control'].copy()


In [164]:
# PCL questionnaire table: one "q6.*" column per item plus an "ID" column used
# for merging. (Plain string literal: the former f-string had no placeholders.)
df_questionnaire = pd.read_csv(os.path.join(data_path, "questionnaire_PCL.csv"))


In [165]:
# Questionnaire item columns grouped into three sub-scales.
intrusion_features = ["q6.1_INTRU", "q6.2_DREAM", "q6.3_FLASH", "q6.4_UPSET", "q6.5_PHYS"]
avoidance = ["q6.6_AVTHT", "q6.7_AVSIT", "q6.8_AMNES", "q6.9_DISINT", "q6.10_DTACH",  "q6.11_NUMB", "q6.12_FUTRE"]
# NOTE(review): "hypertension" presumably means the hyperarousal item cluster;
# the name is kept because later cells refer to "hypertension_score".
hypertension = ["q6.13_SLEEP", "q6.14_ANGER", "q6.15_CONC", "q6.16_HYPER", "q6.17_STRTL"]

# Each sub-score is the row-wise sum of its items. pandas' sum skips NaN by
# default, so a respondent with missing items gets a partial total (0 if all
# items are missing) rather than NaN.
for score_name, item_columns in (
    ('intrusion_score', intrusion_features),
    ('avoidance_score', avoidance),
    ('hypertension_score', hypertension),
):
    df_questionnaire[score_name] = df_questionnaire[item_columns].sum(axis=1)

In [166]:
# Attach the three questionnaire sub-scores to the 2009 table, matching rows on "ID".
# NOTE(review): how='outer' also keeps IDs present in only one of the two tables;
# questionnaire-only IDs would carry NaN in every other 2009 column - confirm
# that this is intended (a left join would drop them).
questionnaire_scores = df_questionnaire[["intrusion_score", "avoidance_score", "hypertension_score", "ID"]]
df_2009 = pd.merge(df_2009, questionnaire_scores, how='outer', on="ID")


In [167]:
# Maps 2016 column names (keys) to the matching 2009 column names (values).
# Only the features currently used are listed; insertion order is significant
# because X_features is derived from the values in this order.
#
# Mappings currently switched off:
#   dyslexia -> dyslexia, Accuracy_threat_T1 -> T1Acc1t, Accuracy_NT_T1 -> T1Acc1n,
#   PHQ_T1 -> phq1, Trait_T1 -> trait1, State_T1 -> state1,
#   PCL_T1 -> PCL1, PCL_T4 -> PCL_Strict3
trans_2016_2009_features = dict(
    bagrut='highschool_diploma',
    ADHD='ADHD',
    Threat_Bias_T1='T1bias',
    Intrusion_T1='intrusion_score',
    Avoidance_T1='avoidance_score',
    Hyper_T1='hypertension_score',
)


In [168]:
# Column used to decide which rows are kept (rows missing it are dropped later)
# and excluded from the predictors.
target_feature = 'intrusion_score'
# Predictors: every mapped 2009-style column name except the target, in mapping order.
X_features = [
    column_name
    for column_name in trans_2016_2009_features.values()
    if column_name != target_feature
]


In [169]:
# Binarize PCL_T4 in place: True when the score is strictly above 49.
# Missing scores compare as False. The cell is not idempotent: running it a
# second time compares booleans with 49 and turns every value into False.
df_2016['PCL_T4'] = df_2016['PCL_T4'].gt(49)

In [170]:
# Turn the yes/no answer columns into booleans. Anything other than the exact
# string 'yes' (including missing values) becomes False, so re-running this
# cell on already-converted columns would set every value to False.
for yes_no_column in ('bagrut', 'dyslexia', 'ADHD'):
    df_2016[yes_no_column] = df_2016[yes_no_column].eq('yes')

In [171]:
# Rename the 2016 columns to their 2009 names so both cohorts share one schema.
df_2016 = df_2016.rename(columns=trans_2016_2009_features)

In [172]:
# Keep only the rows of each cohort that have a value for the target column.
has_target_2009 = df_2009[target_feature].notna()
has_target_2016 = df_2016[target_feature].notna()
df_2009 = df_2009[has_target_2009]
df_2016 = df_2016[has_target_2016]

## CV of 2009 data

In [173]:
# Stratified folds with scikit-learn's default settings.
cv = StratifiedKFold()

# One-step pipeline wrapping a silent, seeded CatBoost classifier; the step
# name 'classifier' is the prefix used by the grid keys below.
catboost_step = ('classifier', CatBoostClassifier(verbose=0, random_state=random_state))
pipe = Pipeline(steps=[catboost_step])

# Hyper-parameter grid: weight of class 1 relative to class 0 (5, 7.5, 2.5),
# L2 leaf regularization strength, and a single tree depth.
class_weight_options = [[1, class_1_weight] for class_1_weight in (5, 5*1.5, 5*0.5)]
grid_params = [
    {
        'classifier__class_weights': class_weight_options,
        'classifier__l2_leaf_reg': [150, 50, 3, 250],
        'classifier__depth': [4],
    }
]

## 2016 vs. 2009: can a classifier tell the two cohorts apart?

In [174]:
# Predictor columns for each cohort.
x_2016 = df_2016[X_features]
x_2009 = df_2009[X_features]

# Cohort-membership labels: 1 for every 2016 row, 0 for every 2009 row.
# They are built as explicit constant integer Series (same index and name as
# the target column) instead of `column * 0 + 1`, which inherited the column's
# dtype and would turn a NaN/inf feature value into a NaN label.
y_2016 = pd.Series(1, index=df_2016.index, name=target_feature)
y_2009 = pd.Series(0, index=df_2009.index, name=target_feature)

In [175]:
# Stack both cohorts, 2016 rows first and 2009 rows second, in the same order
# for the predictors and for the labels so the two stay aligned.
X = pd.concat((x_2016, x_2009))
Y = np.concatenate([np.asarray(y_2016), np.asarray(y_2009)])

In [176]:
# Sanity check: number of labels, i.e. total rows across both cohorts.
Y.shape

(1040,)

In [177]:
# Hold out 20% of the rows for validation, keeping the cohort ratio equal in
# both parts (stratified on Y) and seeding the shuffle for repeatability.
x_train, x_val, y_train, y_val = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=random_state,
)

In [178]:
# Extra keyword forwarded to the pipeline's 'classifier' step when fitting.
# NOTE(review): no eval_set is passed along with it - confirm that CatBoost's
# early stopping actually takes effect without validation data.
fit_params = {'classifier__early_stopping_rounds': 15}

# Search the grid with the stratified folds, ranking candidates by ROC AUC.
clf = GridSearchCV(estimator=pipe, param_grid=grid_params, scoring='roc_auc', cv=cv)
clf.fit(x_train, y_train, **fit_params)
print(f"roc_auc = {clf.best_score_}, params = {clf.best_params_}")


roc_auc = 0.6120337853635724, params = {'classifier__class_weights': [1, 5], 'classifier__depth': 4, 'classifier__l2_leaf_reg': 3}


In [179]:
# Score the held-out rows with the best refitted pipeline; column 1 of
# predict_proba is the probability of the second class (label 1).
val_probabilities = clf.best_estimator_.predict_proba(x_val)
y_pred_target = val_probabilities[:, 1]
val_roc_auc = roc_auc_score(y_val, y_pred_target)
print(f"roc_auc = {val_roc_auc}")

roc_auc = 0.5623224431818181


In [180]:
# Print each training column next to the importance CatBoost assigned to it.
best_classifier = clf.best_estimator_.named_steps['classifier']
importances = best_classifier.get_feature_importance()
for feature_name, importance in zip(x_train.columns, importances):
    print(feature_name, importance)

highschool_diploma 2.9450102890701615
ADHD 3.6054452946470192
T1bias 56.353059850101985
avoidance_score 21.190511405953075
hypertension_score 15.905973160227754
