In [2]:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import precision_score, roc_auc_score, f1_score, recall_score
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

import os

random_state = 3601

In [4]:
data_path_2009 = r'C:\Users\nogag\Documents\birocracy\PTSDClassifier\PTSD\Data\2009'
df_2009 = pd.read_excel(os.path.join(data_path_2009, "PTSD.xlsx"))

print(df_2009.shape)

df_2009 =df_2009.drop_duplicates(subset="ID")
#df_2016 = df_2016[~(df_2016['Wave']=='nov12')]
print(df_2009.shape)

df_2009['control'] = np.ones_like(df_2009.ID)
df_2009['placebo'] = np.zeros_like(df_2009.ID)
df_2009['train_4'] = np.zeros_like(df_2009.ID)

(1103, 179)
(1103, 179)


In [5]:
data_path_2016 = r'C:\Users\nogag\Documents\birocracy\PTSDClassifier\PTSD\Data\2016'
df_2016 = pd.read_csv(os.path.join(data_path_2016, "IDF_ABM_16.2.15_wide.csv"))


df_2016 = df_2016[(df_2016['Wave']=='august12')]


df_2016 =df_2016.drop_duplicates(subset="ID")
#df_2016 = df_2016[~(df_2016['Wave']=='nov12')]
print(df_2016.shape)
df_2016 = pd.concat((df_2016,pd.get_dummies(df_2016.Group)),1)

(241, 105)


## features in the original data

In [6]:
trans_2016_2009_features = {
    'bagrut': 'highschool_diploma',
 'ADHD': 'ADHD',
 'Accuracy_threat_T1': 'T1Acc1t',
 'Accuracy_NT_T1': 'T1Acc1n',
 'Threat_Bias_T1': 'T1bias',
 'PHQ_T1': 'phq1',
 'Trait_T1': 'trait1',
 'State_T1': 'state1',
 'PCL_T1': 'PCL1',
'control':'control',
    'placebo': 'placebo',
    'train_4': 'train_4'
}


## append PCL intrusion features

In [5]:
PCL_2009 = pd.read_excel(os.path.join(data_path_2009, "questionnaire6PCL3.xlsx"))

intrusion_features_2009 = ["q6.1_INTRU", "q6.2_DREAM", "q6.3_FLASH", "q6.4_UPSET", "q6.5_PHYS"]
df_2009_4 = df_2009.merge(PCL_2009[intrusion_features_2009 + ["ID"]], on="ID", how='outer')



In [6]:
PCL_2009 = pd.read_csv(os.path.join(data_path_2009, "questionnaire_PCL1.csv"))
PCL_2016 = pd.read_csv(os.path.join(data_path_2016, "questionnaire5_PCL.csv"))                      

intrusion_features_2009 = ["q6.1_INTRU", "q6.2_DREAM", "q6.3_FLASH", "q6.4_UPSET", "q6.5_PHYS"]
intrusion_features_2016 = ["q5.1", "q5.2", "q5.3", "q5.4", "q5.5"]
df_2009 = df_2009.merge(PCL_2009[intrusion_features_2009 + ["ID"]], on="ID", how='outer')
df_2016 = df_2016.merge(PCL_2016[intrusion_features_2016 + ["ID"]], on="ID", how='outer')

for i, j in zip(intrusion_features_2009, intrusion_features_2016):
    trans_2016_2009_features[j] = i

In [7]:
PHQ9_2009 = pd.read_csv(os.path.join(data_path_2009, "questionnaire5_PHQ9.csv"))
PHQ9_2016 = pd.read_csv(os.path.join(data_path_2016, "questionnaire4_PHQ9.csv"))                      

PHQ9_features_2009 = ["T1q5.1", "T1q5.2", "T1q5.3", "T1q5.4", "T1q5.5", "T1q5.6", "T1q5.7", "T1q5.8", "T1q5.9"]
PHQ9_features_2016 = ["q4.1", "q4.2","q4.3", "q4.4", "q4.5", "q4.6", "q4.7", "q4.8", "q49"]
df_2009 = df_2009.merge(PHQ9_2009[PHQ9_features_2009 + ["ID"]], on="ID", how='outer')
df_2016 = df_2016.merge(PHQ9_2016[PHQ9_features_2016 + ["ID"]], on="ID", how='outer')

for i, j in zip(PHQ9_features_2009, PHQ9_features_2016):
    trans_2016_2009_features[j] = i

In [8]:
target_feature = 'target_feature'
X_features = [i for i in trans_2016_2009_features.values() if not i == target_feature]


In [9]:
df_2016['target_feature'] = (df_2016['Intrusion_T4'] > 7).astype(int)
df_2009['Intrusion_T4'] = df_2009_4[intrusion_features_2009].sum(axis=1)
df_2009['target_feature'] = (df_2009['Intrusion_T4'] > 7).astype(int)


## adjust features from 2016

In [10]:
df_2016['bagrut'] = df_2016['bagrut'] == 'yes'
df_2016['dyslexia'] = df_2016['dyslexia'] == 'yes'
df_2016['ADHD'] = df_2016['ADHD'] == 'yes'

In [11]:
df_2016 = df_2016.rename(trans_2016_2009_features, axis=1)

In [12]:
df_2009 = df_2009[~df_2009[target_feature].isna()]
df_2016 = df_2016[~df_2016[target_feature].isna()]

## 2009 data outer CV

## parameters init

In [13]:
cv = StratifiedKFold(6, random_state=random_state, shuffle=True)

# class weight
pos_sample = df_2009[target_feature].sum() 
all_samples = df_2009[target_feature].count()
class_weights = all_samples/ pos_sample


# pipeline

pipe = Pipeline(steps=[
   # ('RFE', RFE(CatBoostClassifier(verbose=0, random_state=random_state))),
    ('classifier', CatBoostClassifier(verbose=0, random_state=random_state))])
grid_params = [{
 #       'RFE__n_features_to_select': [15],
        'classifier__class_weights':[[1, class_weights], [1, class_weights*2], [1, class_weights*0.5]],
        'classifier__l2_leaf_reg': [3, 10, 75],
        'classifier__depth': [5, 8]#, 9]
        }]




## 2016

In [14]:
x_2009, y_2009 = df_2009[X_features], df_2009[target_feature]
clf = GridSearchCV(pipe, grid_params, cv=cv, scoring='roc_auc')


In [15]:
x_2012, y_2012 = df_2016[df_2016['Wave']=='august12'][X_features], df_2016[df_2016['Wave']=='august12'][target_feature]

clf.fit(x_2009, y_2009.astype(int), classifier__early_stopping_rounds = 15)
print(f"roc_auc = {clf.best_score_}, params = {clf.best_params_}")

roc_auc = 0.6529222198612538, params = {'classifier__class_weights': [1, 2.746268656716418], 'classifier__depth': 8, 'classifier__l2_leaf_reg': 75}


In [16]:
y_pred_target = clf.best_estimator_.predict_proba(x_2012)[:, 1]
print(f"roc_auc = {roc_auc_score(y_2012.astype(int), y_pred_target)}")

roc_auc = 0.5421455938697318


In [17]:
for i, j in zip(X_features, clf.best_estimator_['classifier'].get_feature_importance()):
    print(i, j)

highschool_diploma 2.08712797052411
ADHD 1.6178089744773274
T1Acc1t 6.209533425501387
T1Acc1n 4.527845244997182
T1bias 12.639133443747355
phq1 5.505947622248064
trait1 7.764646379319579
state1 7.7190131368085035
PCL1 9.530020761056988
q6.1_INTRU 3.970541860780802
q6.2_DREAM 3.2577518658039053
q6.3_FLASH 2.7154708999807617
q6.4_UPSET 1.934639832870112
q6.5_PHYS 1.7180874917854627
T1q5.1 3.4033575843532367
T1q5.2 3.1145805610731037
T1q5.3 3.474897929002598
T1q5.4 3.8083747289800725
T1q5.5 5.084889948477361
T1q5.6 1.515869843855303
T1q5.7 1.8780815208446482
T1q5.8 3.363062366372994
T1q5.9 3.1593166071391603
