In [1]:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import precision_score, roc_auc_score, f1_score, recall_score
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFE
import os
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import StandardScaler
from fancyimpute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor

random_state = 854

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
data_path_2009 = r'C:\Users\nogag\Documents\birocracy\PTSDClassifier\PTSD\Data\2009'
df_2009 = pd.read_excel(os.path.join(data_path_2009, "PTSD.xlsx"))

print(df_2009.shape)

df_2009 = df_2009.drop_duplicates(subset="ID")
print(df_2009.shape)

df_2009['control'] = np.ones_like(df_2009.ID)
df_2009['placebo'] = np.zeros_like(df_2009.ID)
df_2009['train_4'] = np.zeros_like(df_2009.ID)

(1103, 179)
(1103, 179)


In [3]:
data_path_2016 = r'C:\Users\nogag\Documents\birocracy\PTSDClassifier\PTSD\Data\2016'
df_2016 = pd.read_csv(os.path.join(data_path_2016, "IDF_ABM_16.2.15_wide.csv"))


#df_2016 = df_2016[(df_2016['Wave']=='august12')]


df_2016 =df_2016.drop_duplicates(subset="ID")
#df_2016 = df_2016[~(df_2016['Wave']=='nov12')]
print(df_2016.shape)
df_2016 = pd.concat((df_2016,pd.get_dummies(df_2016.Group)),1)

(724, 105)


## features in the original data

In [27]:
trans_2016_2009_features = {
    'bagrut': 'highschool_diploma',
 'ADHD': 'ADHD',
  'Accuracy_threat_T1': 'T1Acc1t',
  'Accuracy_NT_T1': 'T1Acc1n',
  'Threat_Bias_T1': 'T1bias',
 'PHQ_T1': 'phq1',
 'Trait_T1': 'trait1',
 'State_T1': 'state1',
 'PCL_T1': 'PCL1',
 'Intrusion_T1':'Intrusion_T1',
    'Avoidance_T1': 'Avoidance_T1',
    'Hyper_T1': 'Hyper_T1',
    'control': 'control',
    'placebo': 'placebo',
    'train_4': 'train_4'
}


## append PCL intrusion features

In [28]:
PCL_2009_1 = pd.read_csv(os.path.join(data_path_2009, "questionnaire_PCL1.csv"))

intrusion_features_2009 = ["q6.1_INTRU", "q6.2_DREAM", "q6.3_FLASH", "q6.4_UPSET", "q6.5_PHYS"]
avoidance_features_2009 = ["q6.6_AVTHT", "q6.7_AVSIT", "q6.8_AMNES", "q6.9_DISINT", "q6.10_DTACH",
             "q6.11_NUMB", "q6.12_FUTRE"]
hyper_features_2009 = ["q6.13_SLEEP", "q6.14_ANGER", "q6.15_CONC", "q6.16_HYPER", "q6.17_STRTL"]


df_2009_1 = df_2009.merge(PCL_2009_1[intrusion_features_2009 +avoidance_features_2009+hyper_features_2009+ ["ID"]], on="ID", how='outer')
df_2009['Intrusion_T1'] = df_2009_1[intrusion_features_2009].sum(axis=1)
df_2009['Avoidance_T1'] = df_2009_1[avoidance_features_2009].sum(axis=1)
df_2009['Hyper_T1'] = df_2009_1[hyper_features_2009].sum(axis=1)

trans_2016_2009_features['Intrusion_T1']= 'Intrusion_T1'
trans_2016_2009_features['Avoidance_T1']= 'Avoidance_T1'
trans_2016_2009_features['Hyper_T1']= 'Hyper_T1'

In [29]:
PCL_2009_3 = pd.read_excel(os.path.join(data_path_2009, "questionnaire6PCL3.xlsx"))



PCL_2009_3 = df_2009.merge(PCL_2009_3[intrusion_features_2009 +avoidance_features_2009+hyper_features_2009+ ["ID"]], on="ID", how='outer')
df_2009['Intrusion_T4'] = PCL_2009_3[intrusion_features_2009].sum(axis=1)
df_2009['Avoidance_T4'] = PCL_2009_3[avoidance_features_2009].sum(axis=1)
df_2009['Hyper_T4'] = PCL_2009_3[hyper_features_2009].sum(axis=1)


In [30]:
target_feature = 'target_feature'
secondary_targets = ['Intrusion_T4', 'Avoidance_T4', 'Hyper_T4']
X_features = [i for i in trans_2016_2009_features.values() if not i == target_feature]

df_2016['target_feature'] = (df_2016['PCL_T4'] > 39).astype(int)
df_2009['target_feature'] = (df_2009['PCL3'] > 39).astype(int)

## adjust features from 2016

In [31]:
df_2016['bagrut'] = df_2016['bagrut'] == 'yes'
df_2016['dyslexia'] = df_2016['dyslexia'] == 'yes'
df_2016['ADHD'] = df_2016['ADHD'] == 'yes'

In [32]:
df_2016 = df_2016.rename(trans_2016_2009_features, axis=1)
print(df_2016.shape)
print(df_2009.shape)

(724, 110)
(1103, 189)


In [33]:

df_2016 = df_2016[~df_2016['Intrusion_T4'].isna()]
df_2016 = df_2016[~df_2016['Avoidance_T4'].isna()]
df_2016 = df_2016[~df_2016['Hyper_T4'].isna()]
df_2016 = df_2016[~df_2016['PCL_T4'].isna()]

In [34]:
df_2009 = df_2009[~df_2009['Intrusion_T4'].isna()]
df_2009 = df_2009[~df_2009['Avoidance_T4'].isna()]
df_2009 = df_2009[~df_2009['Hyper_T4'].isna()]
df_2009 = df_2009[~df_2009['PCL3'].isna()]

In [35]:
print(df_2016.shape)
print(df_2009.shape)

(589, 110)
(725, 189)


In [36]:
cv = StratifiedKFold(5, random_state=random_state, shuffle=True)

In [37]:
x_2012, y_2012 = df_2016[(df_2016['Wave']=='august12')][X_features+ secondary_targets], df_2016[(df_2016['Wave']=='august12')][target_feature]

x_2009, y_2009 = df_2009[X_features+ secondary_targets], df_2009[target_feature]

In [38]:
x_2013, y_2013 = df_2016[df_2016['Wave']=='august13'][X_features+ secondary_targets], df_2016[df_2016['Wave']=='august13'][target_feature]


In [39]:
X_2009_2012 = pd.DataFrame(np.vstack([x_2009, x_2012]), columns=X_features+ secondary_targets)
Y_2009_2012 = pd.DataFrame(np.hstack([y_2009, y_2012]), columns = [target_feature])

In [40]:
for train, test in cv.split(X_2009_2012, Y_2009_2012):
    x_train, y_train = X_2009_2012.iloc[train], Y_2009_2012.iloc[train]
    x_test, y_test = X_2009_2012.iloc[test], Y_2009_2012.iloc[test]
    train_targets = x_train[secondary_targets]
    test_targets = x_test[secondary_targets]
    
    x_train = x_train[X_features]
    x_test = x_test[X_features]
    
    lr_intrusion = CatBoostRegressor(verbose=0, random_state=random_state)
    lr_avoidnce = CatBoostRegressor(verbose=0, random_state=random_state)
    lr_hyper = CatBoostRegressor(verbose=0, random_state=random_state)
    

    
    mice = IterativeImputer(max_iter=1000)
    x_train = mice.fit_transform(x_train)
    x_test = mice.transform(x_test)

    lr_intrusion.fit(x_train, train_targets['Intrusion_T4'].astype(float))
    lr_avoidnce.fit(x_train, train_targets['Avoidance_T4'].astype(float))
    lr_hyper.fit(x_train, train_targets['Hyper_T4'].astype(float))

    
    intrusion_train = lr_intrusion.predict(x_train)
    avoidance_train = lr_avoidnce.predict(x_train)
    hyper_train = lr_hyper.predict(x_train)

    intrusion = lr_intrusion.predict(x_test)
    avoidance = lr_avoidnce.predict(x_test)
    hyper = lr_hyper.predict(x_test)

    
    y_pred_target = (intrusion + avoidance + hyper ) / 70
    print(f"roc_auc = {roc_auc_score(y_test.astype(int), y_pred_target)}")

roc_auc = 0.6801829268292683
roc_auc = 0.8129573170731708
roc_auc = 0.650834403080873
roc_auc = 0.8260590500641849
roc_auc = 0.7164634146341463


In [41]:
x_train, y_train = X_2009_2012, Y_2009_2012
x_test, y_test = x_2013, y_2013

train_targets = x_train[secondary_targets]
test_targets = x_test[secondary_targets]
    
x_train = x_train[X_features]
x_test = x_test[X_features]
 
    
lr_intrusion = CatBoostRegressor(verbose=0, random_state=random_state)
lr_avoidnce = CatBoostRegressor(verbose=0, random_state=random_state)
lr_hyper = CatBoostRegressor(verbose=0, random_state=random_state)
    

mice = IterativeImputer(max_iter=1000)
x_train = mice.fit_transform(x_train)
x_test = mice.transform(x_test)


lr_intrusion.fit(x_train, train_targets['Intrusion_T4'].astype(float))
lr_avoidnce.fit(x_train, train_targets['Avoidance_T4'].astype(float))
lr_hyper.fit(x_train, train_targets['Hyper_T4'].astype(float))

    
intrusion_train = lr_intrusion.predict(x_train)
avoidance_train = lr_avoidnce.predict(x_train)
hyper_train = lr_hyper.predict(x_train)
    
    
intrusion = lr_intrusion.predict(x_test)
avoidance = lr_avoidnce.predict(x_test)
hyper = lr_hyper.predict(x_test)

y_pred_target = (intrusion + avoidance + hyper) / 70
print(f"roc_auc = {roc_auc_score(y_test.astype(int), y_pred_target)}")

roc_auc = 0.6722077922077923


## Classification

In [42]:
for train, test in cv.split(X_2009_2012, Y_2009_2012):
    x_train, y_train = X_2009_2012.iloc[train], Y_2009_2012.iloc[train]
    x_test, y_test = X_2009_2012.iloc[test], Y_2009_2012.iloc[test]
    
    x_train = x_train[X_features]
    x_test = x_test[X_features]
    
    lr_intrusion = CatBoostClassifier(verbose=0, random_state=random_state)
    
    mice = IterativeImputer(max_iter=1000)
    x_train = mice.fit_transform(x_train)
    x_test = mice.transform(x_test)

    lr_intrusion.fit(x_train, y_train)

    
    y_pred_target = lr_intrusion.predict_proba(x_test)
    print(f"roc_auc = {roc_auc_score(y_test.astype(int), y_pred_target[:, 1])}")

roc_auc = 0.6972560975609756
roc_auc = 0.8382621951219513
roc_auc = 0.6774711168164314
roc_auc = 0.8193196405648268
roc_auc = 0.6577342747111682


In [43]:
x_train, y_train = X_2009_2012, Y_2009_2012
x_test, y_test = x_2013, y_2013

x_train = x_train[X_features]
x_test = x_test[X_features]
 
    
lr_intrusion = CatBoostClassifier(verbose=0, random_state=random_state)

mice = IterativeImputer(max_iter=1000)
x_train = mice.fit_transform(x_train)
x_test = mice.transform(x_test)


lr_intrusion.fit(x_train, y_train)

    

y_pred_target = lr_intrusion.predict_proba(x_test)
print(f"roc_auc = {roc_auc_score(y_test.astype(int), y_pred_target[:, 1])}")

roc_auc = 0.5948051948051948
