In [2]:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import precision_score, roc_auc_score, f1_score, recall_score
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFE
from sklearn.preprocessing import OneHotEncoder
import os
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import StandardScaler
from fancyimpute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

# Single seed reused by the StratifiedKFold splitter and every CatBoost model below.
random_state = 12

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
# Location of the 2016 data. The default is the original machine-specific
# folder; set the PTSD_DATA_DIR_2016 environment variable to point elsewhere
# without editing the notebook.
data_path_2016 = os.environ.get(
    "PTSD_DATA_DIR_2016",
    r'C:\Users\nogag\Documents\birocracy\PTSDClassifier\PTSD\Data\2016',
)
df_2016 = pd.read_csv(os.path.join(data_path_2016, "IDF_ABM_16.2.15_wide.csv"))
print(df_2016.shape)

# Keep only the first row for each ID.
df_2016 = df_2016.drop_duplicates(subset="ID")
#df_2016 = df_2016[~(df_2016['Wave']=='nov12')]
print(df_2016.shape)

# Append one indicator column per distinct Group value.
# `axis` must be passed by keyword: pandas 2.x made every pd.concat argument
# after `objs` keyword-only, so the positional form `pd.concat(objs, 1)` fails.
df_2016 = pd.concat((df_2016, pd.get_dummies(df_2016.Group)), axis=1)

(724, 105)
(724, 105)


## features in the original data

In [4]:
# Columns fed to every model as predictors. The order is significant because
# the imputer turns the frame into a positional array before fitting.
X_features = [
    'bagrut',
    'ADHD',
    'Accuracy_threat_T1',
    'Accuracy_NT_T1',
    'Threat_Bias_T1',
    # presumably the Group indicator columns added by get_dummies — verify
    'control',
    'placebo',
    'train_4',
    'Accuracy_all_T1',
    'Accuracy_neutral_T1',
    'RT_all_T1',
    'RT_neutral_NT_T1',
    'RT_threat_NT_T1',
    'RT_NT_T1',
    'ABV_T1',
    'PHQ_T1',
    'Trait_T1',
    'PCL_T1',
    'Intrusion_T1',
    'Avoidance_T1',
    'Hyper_T1',
]


## Define the binary PCL_T4 target and the secondary targets (Intrusion, Avoidance, Hyper at T4)

In [5]:
# Name of the binary outcome column, and the three T4 scores that the
# regression models further down are trained to predict.
target_feature = 'target_feature'
secondary_targets = ['Intrusion_T4', 'Avoidance_T4', 'Hyper_T4']

# 1 when PCL_T4 is strictly above 39, otherwise 0. A missing PCL_T4 compares
# as False and therefore becomes 0 here; those rows are removed in a later cell.
df_2016[target_feature] = df_2016['PCL_T4'].gt(39).astype(int)


## adjust features from 2016

In [6]:
# Recode the yes/no text columns as 1 ('yes') / 0 (anything else, including NaN).
# NOTE: not idempotent — running this cell a second time compares the already
# recoded integers with 'yes' and zeroes all three columns.
yes_no_columns = ['bagrut', 'dyslexia', 'ADHD']
df_2016[yes_no_columns] = df_2016[yes_no_columns].eq('yes').astype(int)

In [7]:
# Sanity check: shape after adding the Group indicators and the target column.
print(df_2016.shape)


(724, 110)


In [8]:
# Discard rows with no PCL_T4 value, since the target is derived from it.
df_2016 = df_2016.dropna(subset=['PCL_T4'])
print(df_2016.shape)

(589, 110)


In [9]:
# Discard rows where any of the three T4 regression targets is missing.
df_2016 = df_2016.dropna(subset=['Intrusion_T4', 'Avoidance_T4', 'Hyper_T4'])


In [10]:
# Shape after removing rows with missing T4 values.
df_2016.shape

(589, 110)

## 2016 data: predict the target from regressed secondary targets

In [53]:
def _split_wave(wave_name):
    """Return (predictors + secondary targets, binary target) for one Wave value."""
    wave_rows = df_2016[df_2016['Wave'] == wave_name]
    return wave_rows[X_features + secondary_targets], wave_rows[target_feature]


# The secondary targets stay in the x_* frames on purpose: the modelling cells
# peel them off as regression targets before narrowing to X_features.
x_2013, y_2013 = _split_wave('august13')
x_2012b, y_2012b = _split_wave('nov12')

x_2012, y_2012 = _split_wave('august12')


In [54]:
# Shuffled, seeded 5-fold splitter that keeps the class ratio in each fold;
# shared by both cross-validation loops below.
cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=random_state,
)

In [55]:
# Cross-validate the indirect approach on the august12 wave: regress each of
# the three T4 scores from the predictors, then use the scaled sum of the
# predictions as the score for the binary target.
for train_idx, test_idx in cv.split(x_2012, y_2012):
    x_train, x_test = x_2012.iloc[train_idx], x_2012.iloc[test_idx]
    y_train, y_test = y_2012.iloc[train_idx], y_2012.iloc[test_idx]

    # Peel off the regression targets before narrowing to the predictors.
    train_targets, test_targets = x_train[secondary_targets], x_test[secondary_targets]
    x_train, x_test = x_train[X_features], x_test[X_features]

    # The imputer is fitted on the training fold only and then applied to the
    # held-out fold, so no test information leaks into the imputation.
    mice = IterativeImputer(max_iter=1000)
    x_train = mice.fit_transform(x_train)
    x_test = mice.transform(x_test)

    # One independent regressor per secondary target.
    lr_intrusion = CatBoostRegressor(verbose=0, random_state=random_state)
    lr_intrusion.fit(x_train, train_targets['Intrusion_T4'])
    intrusion_train = lr_intrusion.predict(x_train)
    intrusion = lr_intrusion.predict(x_test)

    lr_avoidnce = CatBoostRegressor(verbose=0, random_state=random_state)
    lr_avoidnce.fit(x_train, train_targets['Avoidance_T4'])
    avoidance_train = lr_avoidnce.predict(x_train)
    avoidance = lr_avoidnce.predict(x_test)

    lr_hyper = CatBoostRegressor(verbose=0, random_state=random_state)
    lr_hyper.fit(x_train, train_targets['Hyper_T4'])
    hyper_train = lr_hyper.predict(x_train)
    hyper = lr_hyper.predict(x_test)

    # NOTE(review): dividing by 70 only rescales the score; ROC AUC is
    # rank-based, so the constant does not affect the printed metric.
    y_pred_target = (intrusion + avoidance + hyper ) / 70
    print(f"roc_auc = {roc_auc_score(y_test.astype(int), y_pred_target)}")

roc_auc = 1.0
roc_auc = 0.6216216216216216
roc_auc = 0.7083333333333333
roc_auc = 1.0
roc_auc = 0.2222222222222222


In [56]:
# Indirect approach evaluated across waves: fit on august12, score on august13.
y_train, y_test = y_2012, y_2013

# Peel off the regression targets, then keep only the predictors.
train_targets, test_targets = x_2012[secondary_targets], x_2013[secondary_targets]
x_train, x_test = x_2012[X_features], x_2013[X_features]

# NOTE: a StandardScaler fit/transform step used to sit here; it is disabled.

# Imputer is fitted on the training wave only and re-applied to the test wave.
mice = IterativeImputer(max_iter=1000)
x_train = mice.fit_transform(x_train)
x_test = mice.transform(x_test)

# One independent regressor per secondary target.
lr_intrusion = CatBoostRegressor(verbose=0, random_state=random_state)
lr_intrusion.fit(x_train, train_targets['Intrusion_T4'])
intrusion_train = lr_intrusion.predict(x_train)
intrusion = lr_intrusion.predict(x_test)

lr_avoidnce = CatBoostRegressor(verbose=0, random_state=random_state)
lr_avoidnce.fit(x_train, train_targets['Avoidance_T4'])
avoidance_train = lr_avoidnce.predict(x_train)
avoidance = lr_avoidnce.predict(x_test)

lr_hyper = CatBoostRegressor(verbose=0, random_state=random_state)
lr_hyper.fit(x_train, train_targets['Hyper_T4'])
hyper_train = lr_hyper.predict(x_train)
hyper = lr_hyper.predict(x_test)

# NOTE(review): dividing by 70 only rescales the score; ROC AUC is rank-based,
# so the constant does not affect the printed metric.
y_pred_target = (intrusion + avoidance + hyper) / 70
print(f"roc_auc = {roc_auc_score(y_test.astype(int), y_pred_target)}")

roc_auc = 0.6742857142857143


## Baseline: direct classification of the target

In [57]:
# Cross-validate the direct approach on the august12 wave: a single classifier
# trained on the binary target, using the same folds as the regression variant.
for train_idx, test_idx in cv.split(x_2012, y_2012):
    y_train, y_test = y_2012.iloc[train_idx], y_2012.iloc[test_idx]
    x_train = x_2012.iloc[train_idx][X_features]
    x_test = x_2012.iloc[test_idx][X_features]

    # Imputer is fitted on the training fold only, then applied to the held-out fold.
    mice = IterativeImputer(max_iter=1000)
    x_train = mice.fit_transform(x_train)
    x_test = mice.transform(x_test)

    lr_pcl = CatBoostClassifier(verbose=0, random_state=random_state)
    lr_pcl.fit(x_train, y_train)

    # Column 1 of predict_proba is the positive-class probability.
    y_pred_target = lr_pcl.predict_proba(x_test)

    print(f"roc_auc = {roc_auc_score(y_test.astype(int), y_pred_target[:, 1])}")

roc_auc = 0.7837837837837838
roc_auc = 0.6216216216216216
roc_auc = 0.4444444444444445
roc_auc = 0.9305555555555556
roc_auc = 0.7083333333333334


In [58]:
# Direct approach evaluated across waves: fit on august12, score on august13.
y_train, y_test = y_2012, y_2013
x_train, x_test = x_2012[X_features], x_2013[X_features]

# Imputer is fitted on the training wave only and re-applied to the test wave.
mice = IterativeImputer(max_iter=1000)
x_train = mice.fit_transform(x_train)
x_test = mice.transform(x_test)

lr_pcl = CatBoostClassifier(verbose=0, random_state=random_state)
lr_pcl.fit(x_train, y_train)

# Column 1 of predict_proba is the positive-class probability.
y_pred_target = lr_pcl.predict_proba(x_test)

print(f"roc_auc = {roc_auc_score(y_test.astype(int), y_pred_target[:, 1])}")

roc_auc = 0.5953246753246753


In [59]:
# Number of rows in the august13 wave used as the test set above.
y_2013.shape

(186,)