In [18]:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import precision_score, roc_auc_score, f1_score, recall_score
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFE
import os

random_state = 3601

In [19]:
data_path_2009 = r'C:\Users\nogag\Documents\birocracy\PTSDClassifier\PTSD\Data\2009'
df_2009 = pd.read_excel(os.path.join(data_path_2009, "PTSD.xlsx"))


In [20]:
data_path_2016 = r'C:\Users\nogag\Documents\birocracy\PTSDClassifier\PTSD\Data\2016'
df_2016 = pd.read_csv(os.path.join(data_path_2016, "IDF_ABM_16.2.15_wide.csv"))


df_2016 = df_2016[~(df_2016['Wave']=='nov12')]


## features in the original data

In [21]:
trans_2016_2009_features = {
    'bagrut': 'highschool_diploma',
 'ADHD': 'ADHD',
 'Accuracy_threat_T1': 'T1Acc1t',
 'Accuracy_NT_T1': 'T1Acc1n',
 'Threat_Bias_T1': 'T1bias',
    
 'Accuracy_all_T1': 'Accuracy_all_T1',
 'Accuracy_neutral_T1': 'Accuracy_neutral_T1',
 'RT_all_T1': 'RT_all_T1',
 'RT_neutral_NT_T1': 'RT_neutral_NT_T1',
 'RT_threat_NT_T1': 'RT_threat_NT_T1',
 'RT_NT_T1': 'RT_NT_T1',
 'ABV_T1': 'ABV_T1',
 'PHQ_T1': 'phq1',
 'Trait_T1': 'trait1',
 'State_T1': 'state1',
 'PCL_T1': 'PCL1',
 'Intrusion_T1':'Intrusion_T1',
    'Avoidance_T1': 'Avoidance_T1',
    'Hyper_T1': 'Hyper_T1'
}


## append PCL intrusion features

In [22]:
target_feature = 'target_feature'
X_features = [i for i in trans_2016_2009_features.values() if not i == target_feature]


In [23]:
df_2016['target_feature'] = (df_2016['Intrusion_T4'] > 9).astype(int)
#df_2009['PCL3'] = (df_2009['PCL3'] > 39).astype(int)

## adjust features from 2016

In [24]:
df_2016['bagrut'] = df_2016['bagrut'] == 'yes'
df_2016['dyslexia'] = df_2016['dyslexia'] == 'yes'
df_2016['ADHD'] = df_2016['ADHD'] == 'yes'

In [25]:
df_2016 = df_2016.rename(trans_2016_2009_features, axis=1)

In [26]:
#df_2009 = df_2009[~df_2009[target_feature].isna()]
df_2016 = df_2016[~df_2016[target_feature].isna()]

## 2009 data outer CV

## parameters init

In [27]:
cv = StratifiedKFold(6, random_state=random_state, shuffle=True)

# class weight
pos_sample = df_2016[target_feature].sum() 
all_samples = df_2016[target_feature].count()
class_weights = all_samples/ pos_sample


# pipeline

pipe = Pipeline(steps=[
    ('RFE', RFE(CatBoostClassifier(verbose=0, random_state=random_state))),
    ('classifier', CatBoostClassifier(verbose=0, random_state=random_state))])
grid_params = [{
        'RFE__n_features_to_select': [15, 8],
        'classifier__class_weights':[[1, class_weights*0.5], [1, class_weights*0.75], [1, class_weights*0.25]],
        'classifier__l2_leaf_reg': [130, 10, 75],
        'classifier__depth': [6, 4]#, 9]
        }]




In [28]:
class_weights

10.311111111111112

## 2016

In [29]:
x_2016, y_2016 = df_2016[df_2016['Wave']=='august13'][X_features], df_2016[df_2016['Wave']=='august13'][target_feature]
clf = GridSearchCV(pipe, grid_params, cv=cv, scoring='roc_auc')


In [30]:
x_2012, y_2012 = df_2016[df_2016['Wave']=='august12'][X_features], df_2016[df_2016['Wave']=='august12'][target_feature]

clf.fit(x_2012, y_2012.astype(int), classifier__early_stopping_rounds = 15)
print(f"roc_auc = {clf.best_score_}, params = {clf.best_params_}")

roc_auc = 0.5733364943891259, params = {'RFE__n_features_to_select': 15, 'classifier__class_weights': [1, 2.577777777777778], 'classifier__depth': 6, 'classifier__l2_leaf_reg': 75}


In [31]:
y_pred_target = clf.best_estimator_.predict_proba(x_2016)[:, 1]
print(f"roc_auc = {roc_auc_score(y_2016.astype(int), y_pred_target)}")

roc_auc = 0.5831443688586546


In [32]:
for i, j in zip(X_features, clf.best_estimator_['classifier'].get_feature_importance()):
    print(i, j)

highschool_diploma 3.5194353233390405
ADHD 7.940976066251967
T1Acc1t 4.8539354674261705
T1Acc1n 6.72199099466475
T1bias 6.061670490574297
Accuracy_all_T1 4.293581256908611
Accuracy_neutral_T1 6.557280351243258
RT_all_T1 4.318111359689945
RT_neutral_NT_T1 12.554369759381377
RT_threat_NT_T1 4.342748616319024
RT_NT_T1 7.085795430493763
ABV_T1 4.877672435482536
phq1 10.38601933725567
trait1 8.333141642584952
state1 8.153271468384638
