In [1]:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import precision_score, roc_auc_score, f1_score, recall_score
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFE

import os

# Single seed reused below for the train/test splits, the CV folds and CatBoost.
random_state = 3601

In [2]:
# Folder holding the 2009 cohort files. The original machine-specific path stays
# the default; set the PTSD_DATA_2009 environment variable to run elsewhere.
data_path_2009 = os.environ.get(
    "PTSD_DATA_2009",
    r'C:\Users\nogag\Documents\birocracy\PTSDClassifier\PTSD\Data\2009',
)
df_2009 = pd.read_excel(os.path.join(data_path_2009, "PTSD.xlsx"))


In [3]:
# Folder holding the 2016 cohort files. The original machine-specific path stays
# the default; set the PTSD_DATA_2016 environment variable to run elsewhere.
data_path_2016 = os.environ.get(
    "PTSD_DATA_2016",
    r'C:\Users\nogag\Documents\birocracy\PTSDClassifier\PTSD\Data\2016',
)
df_2016 = pd.read_csv(os.path.join(data_path_2016, "IDF_ABM_16.2.15_wide.csv"))

# Keep only the rows whose 'Wave' column equals 'august12'.
df_2016 = df_2016[df_2016['Wave'] == 'august12']


## features in the original data

In [17]:
# Column-name translation table: 2016 column name -> matching 2009 column name.
# Insertion order is meaningful: the predictor list is read from the values.
# Pairs that are currently switched off (2016 -> 2009):
#   Accuracy_threat_T1 -> T1Acc1t, Accuracy_NT_T1 -> T1Acc1n,
#   Threat_Bias_T1 -> T1bias, Trait_T1 -> trait1, State_T1 -> state1,
#   PCL_T1 -> PCL1
trans_2016_2009_features = dict(
    bagrut='highschool_diploma',
    ADHD='ADHD',
    PHQ_T1='phq1',
    PCL_T4='PCL3',
)


## append PCL intrusion features

In [None]:
# Item-level PCL questionnaire files, one per cohort.
PCL_2009 = pd.read_csv(os.path.join(data_path_2009, "questionnaire_PCL1.csv"))
PCL_2016 = pd.read_csv(os.path.join(data_path_2016, "questionnaire5_PCL.csv"))

# Five intrusion items per cohort, listed in matching order (2009 name <-> 2016 name).
intrusion_features_2009 = ["q6.1_INTRU", "q6.2_DREAM", "q6.3_FLASH", "q6.4_UPSET", "q6.5_PHYS"]
intrusion_features_2016 = ["q5.1", "q5.2", "q5.3", "q5.4", "q5.5"]

# Left join: only enrich the participants already present in the cohort frames.
# An outer join would also append questionnaire-only IDs as new rows -- they have
# no target value and, for 2016, may belong to waves other than the selected one.
# NOTE(review): re-running this cell merges the same columns again (pandas then
# adds _x/_y suffixes); run it once per fresh load of df_2009 / df_2016.
df_2009 = df_2009.merge(PCL_2009[intrusion_features_2009 + ["ID"]], on="ID", how='left')
df_2016 = df_2016.merge(PCL_2016[intrusion_features_2016 + ["ID"]], on="ID", how='left')

# Register the new 2016 -> 2009 column names in the translation table.
trans_2016_2009_features.update(zip(intrusion_features_2016, intrusion_features_2009))

## append PHQ9 features

In [None]:
# Item-level PHQ-9 questionnaire files, one per cohort.
PHQ9_2009 = pd.read_csv(os.path.join(data_path_2009, "questionnaire5_PHQ9.csv"))
PHQ9_2016 = pd.read_csv(os.path.join(data_path_2016, "questionnaire4_PHQ9.csv"))

# Nine items per cohort, listed in matching order (2009 name <-> 2016 name).
PHQ9_features_2009 = ["T1q5.1", "T1q5.2", "T1q5.3", "T1q5.4", "T1q5.5", "T1q5.6", "T1q5.7", "T1q5.8", "T1q5.9"]
# NOTE(review): "q49" breaks the "q4.N" pattern of its siblings -- confirm it
# really matches the header in questionnaire4_PHQ9.csv.
PHQ9_features_2016 = ["q4.1", "q4.2", "q4.3", "q4.4", "q4.5", "q4.6", "q4.7", "q4.8", "q49"]

# Left join: only enrich the participants already present in the cohort frames.
# An outer join would also append questionnaire-only IDs as new rows -- they have
# no target value and, for 2016, may belong to waves other than the selected one.
# NOTE(review): re-running this cell merges the same columns again (pandas then
# adds _x/_y suffixes); run it once per fresh load of df_2009 / df_2016.
df_2009 = df_2009.merge(PHQ9_2009[PHQ9_features_2009 + ["ID"]], on="ID", how='left')
df_2016 = df_2016.merge(PHQ9_2016[PHQ9_features_2016 + ["ID"]], on="ID", how='left')

# Register the new 2016 -> 2009 column names in the translation table.
trans_2016_2009_features.update(zip(PHQ9_features_2016, PHQ9_features_2009))

## target feature

In [5]:
# Outcome column (2009 naming); every other translated column is a predictor.
target_feature = 'PCL3'
X_features = [
    column_name
    for column_name in trans_2016_2009_features.values()
    if column_name != target_feature
]


In [6]:
# Binarise the PCL score in both cohorts: 1 when the score is above 39, else 0.
#
# Rows with a missing score are removed first. `NaN > 39` evaluates to False, so
# without this step a participant with no score would silently receive label 0,
# and the later NaN filter on the target column could never remove them.
df_2016 = df_2016[df_2016['PCL_T4'].notna()].copy()
df_2009 = df_2009[df_2009['PCL3'].notna()].copy()

df_2016['PCL_T4'] = (df_2016['PCL_T4'] > 39).astype(int)
df_2009['PCL3'] = (df_2009['PCL3'] > 39).astype(int)

## adjust features from 2016

In [7]:
# Recode the 2016 answer columns as booleans: True only where the cell is exactly 'yes'.
for yes_no_column in ('bagrut', 'dyslexia', 'ADHD'):
    df_2016[yes_no_column] = df_2016[yes_no_column] == 'yes'

In [8]:
# Give the 2016 columns their 2009 names so both frames share one schema.
df_2016 = df_2016.rename(columns=trans_2016_2009_features)

In [9]:
# Keep only the rows whose target value is present, in each cohort.
df_2009 = df_2009[df_2009[target_feature].notna()]
df_2016 = df_2016[df_2016[target_feature].notna()]

## 2009 data: outer train/test hold-out split

In [10]:
# Stratified 75/25 hold-out split of the 2009 cohort (seeded for reproducibility).
x, x_test, y, y_test = train_test_split(
    df_2009[X_features],
    df_2009[target_feature],
    test_size=0.25,
    stratify=df_2009[target_feature],
    random_state=random_state,
)

## parameters init

In [18]:
# Six shuffled, stratified folds shared by the grid search.
cv = StratifiedKFold(n_splits=6, shuffle=True, random_state=random_state)

# Ratio of all 2009 training labels to the positive ones. In the visible cells it
# is only referenced by the disabled 'classifier__class_weights' grid entry.
pos_sample = y.sum()
all_samples = y.count()
class_weights = all_samples / pos_sample

# Single-step pipeline. Disabled step, kept for reference:
#   ('RFE', RFE(CatBoostClassifier(verbose=0, random_state=random_state)))
pipe = Pipeline([
    ('classifier', CatBoostClassifier(verbose=0, random_state=random_state)),
])

# Hyper-parameter grid. Disabled entries, kept for reference:
#   'RFE__n_features_to_select': [15, 5, 3, 8]
#   'classifier__class_weights': [[1, class_weights]]
#   'classifier__l2_leaf_reg': [100]
grid_params = [
    {'classifier__depth': [4]},
]




## 2016 vs. 2009: classifier predicting which cohort a sample belongs to

In [19]:
# Cohort-membership labels: 1 for every 2016 row, 0 for every 2009 row. They are
# derived from the target column (value*0+1 and value*0), so they keep its index
# and name.
x_2016 = df_2016[X_features]
y_2016 = df_2016[target_feature].mul(0).add(1)
x_2009 = df_2009[X_features]
y_2009 = df_2009[target_feature].mul(0)

# Stack the cohorts, 2016 first and 2009 second, for features and labels alike.
X = pd.concat((x_2016, x_2009))
Y = np.hstack((y_2016, y_2009))

In [20]:
# Stratified 80/20 split of the pooled cohorts into training and validation parts.
x_train, x_val, y_train, y_val = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=random_state,
)

In [21]:
# Grid-search the pipeline on the training part, scored by ROC AUC over `cv`.
clf = GridSearchCV(estimator=pipe, param_grid=grid_params, scoring='roc_auc', cv=cv)

# The 'classifier__' prefix routes the argument to the CatBoost step's fit().
# NOTE(review): no eval_set is passed, so CatBoost's early stopping presumably has
# no validation data to monitor -- confirm this argument has any effect.
classifier_fit_kwargs = {'classifier__early_stopping_rounds': 15}
clf.fit(x_train, y_train.astype(int), **classifier_fit_kwargs)

print(f"roc_auc = {clf.best_score_}, params = {clf.best_params_}")

roc_auc = 0.6952368068439497, params = {'classifier__depth': 4}


In [22]:
# Score the refitted best pipeline on the validation part.
# Column 1 of predict_proba is the probability of label 1.
y_pred_target = clf.best_estimator_.predict_proba(x_val)[:, 1]
validation_auc = roc_auc_score(y_val.astype(int), y_pred_target)
print(f"roc_auc = {validation_auc}")

roc_auc = 0.6707673453996983


In [23]:
# Print each predictor next to the importance reported by the fitted classifier.
# Names and importances are paired positionally, in X_features order.
fitted_classifier = clf.best_estimator_['classifier']
importances = fitted_classifier.get_feature_importance()
for feature_name, importance in zip(X_features, importances):
    print(feature_name, importance)

highschool_diploma 12.381828138410995
ADHD 7.471420175524952
phq1 80.14675168606405


In [24]:
# Display the 2016 cohort labels as integers (built from target * 0 + 1).
y_2016.astype(int)

0      1
1      1
2      1
3      1
4      1
      ..
236    1
237    1
238    1
239    1
240    1
Name: PCL3, Length: 241, dtype: int32