In [2]:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import precision_score, roc_auc_score, f1_score, recall_score
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFE

import os

# Single seed reused by the CV splitter, the CatBoost model and the train/validation split
# so that repeated runs of the notebook produce the same partitions and fits.
random_state: int = 3601

In [3]:
# Folder that holds the 2009 cohort workbook.
# The location can be overridden with the PTSD_DATA_2009 environment variable so the
# notebook is runnable on machines other than the author's; when the variable is not
# set, the original hard-coded location is used unchanged.
data_path_2009 = os.environ.get(
    'PTSD_DATA_2009',
    r'C:\Users\nogag\Documents\birocracy\PTSDClassifier\PTSD\Data\2009',
)
df_2009 = pd.read_excel(os.path.join(data_path_2009, "PTSD.xlsx"))


In [4]:
# Folder that holds the 2016 cohort CSV.
# The location can be overridden with the PTSD_DATA_2016 environment variable so the
# notebook is runnable on machines other than the author's; when the variable is not
# set, the original hard-coded location is used unchanged.
data_path_2016 = os.environ.get(
    'PTSD_DATA_2016',
    r'C:\Users\nogag\Documents\birocracy\PTSDClassifier\PTSD\Data\2016',
)
df_2016 = pd.read_csv(os.path.join(data_path_2016, "IDF_ABM_16.2.15_wide.csv"))

# Exclude every row recorded for the 'nov12' wave; all other waves are kept.
df_2016 = df_2016[df_2016['Wave'] != 'nov12']


## Features in the original data (2016 column name → 2009 column name)

In [5]:
# Column-name translation table: keys are the column names used in the 2016 file,
# values are the names the same columns get after renaming (also used to index the
# 2009 frame). Insertion order matters: the feature list is derived from the values
# in this order.
trans_2016_2009_features = {
    "bagrut": "highschool_diploma",
    "ADHD": "ADHD",
    "Accuracy_threat_T1": "T1Acc1t",
    "Accuracy_NT_T1": "T1Acc1n",
    "Threat_Bias_T1": "T1bias",
    "PHQ_T1": "phq1",
    "Trait_T1": "trait1",
    "State_T1": "state1",
    "PCL_T1": "PCL1",
    "PCL_T4": "PCL3",  # becomes the prediction target after renaming
}


In [6]:
# Display the distinct wave labels still present in the 2016 frame
# (the 'nov12' rows were filtered out when the file was loaded).
df_2016['Wave'].unique()

array(['august12', 'august13'], dtype=object)

## target feature

In [7]:
# Name of the column to predict (post-rename naming).
target_feature = 'PCL3'

# Model inputs: every translated column name except the target, in mapping order.
X_features = [
    name
    for name in trans_2016_2009_features.values()
    if name != target_feature
]


In [8]:
# Score above which a subject is labelled positive (1); at or below it the label is 0.
# NOTE(review): presumably the clinical PCL cut-off — confirm against the study protocol.
PCL_CUTOFF = 39

# Binarize the follow-up PCL score in both cohorts.
# A bare `(score > cutoff).astype(int)` turns missing scores into 0 (NaN > 39 is False),
# which silently labels subjects without a follow-up measurement as negatives and makes
# the later "drop rows with a missing target" step a no-op. `.where(notna)` keeps those
# entries as NaN so they can actually be dropped. When nothing is missing the result is
# the same integer 0/1 column as before.
# NOTE: this cell is not idempotent — re-running it on already-binarized columns
# yields all zeros; reload the data first.
df_2016['PCL_T4'] = (df_2016['PCL_T4'] > PCL_CUTOFF).astype(int).where(df_2016['PCL_T4'].notna())
df_2009['PCL3'] = (df_2009['PCL3'] > PCL_CUTOFF).astype(int).where(df_2009['PCL3'].notna())

## adjust features from 2016

In [9]:
# Turn the 'yes'/other answer columns of the 2016 frame into booleans:
# True exactly where the cell equals the string 'yes', False for anything else
# (including missing values).
df_2016 = df_2016.assign(**{
    flag: df_2016[flag].eq('yes')
    for flag in ('bagrut', 'dyslexia', 'ADHD')
})

In [10]:
# Give the 2016 columns the names listed as values in the translation table,
# so both cohorts can be indexed with the same feature names.
df_2016 = df_2016.rename(columns=trans_2016_2009_features)

In [11]:
# Keep only the rows whose target value is present, in both cohorts.
df_2009 = df_2009.dropna(subset=[target_feature])
df_2016 = df_2016.dropna(subset=[target_feature])

## Parameter initialization (CV splitter, class weight, pipeline and search grid)

In [12]:
# Cross-validation splitter: 6 stratified folds, shuffled with the notebook-wide seed.
cv = StratifiedKFold(n_splits=6, shuffle=True, random_state=random_state)

# Weight applied to the positive class: number of non-missing 2016 targets divided by
# the number of positives among them. Despite the plural name this is a single scalar.
labels_2016 = df_2016[target_feature]
pos_sample = labels_2016.sum()
all_samples = labels_2016.count()
class_weights = all_samples / pos_sample

# Single-step pipeline around a silent, seeded CatBoost classifier.
# (An RFE feature-selection step in front of the classifier is currently disabled.)
base_classifier = CatBoostClassifier(verbose=0, random_state=random_state)
pipe = Pipeline(steps=[('classifier', base_classifier)])

# Hyper-parameter grid explored by the grid search:
#   - positive-class weight at 1x, 2x and 0.5x the computed ratio (negative class fixed at 1)
#   - L2 leaf regularization strengths
#   - tree depths (depth 9 is currently left out)
grid_params = [
    {
        'classifier__class_weights': [
            [1, class_weights],
            [1, class_weights * 2],
            [1, class_weights * 0.5],
        ],
        'classifier__l2_leaf_reg': [100, 3, 50, 10, 1, 6],
        'classifier__depth': [4, 7, 5],
    }
]




## 2016 data: split by wave, train and evaluate

In [13]:
# Inputs for the per-wave splits: the shared feature list plus 'Intrusion_T1'.
wave_features = X_features + ['Intrusion_T1']

# Row selectors for the two waves of the 2016 frame.
is_august13 = df_2016['Wave'] == 'august13'
is_august12 = df_2016['Wave'] == 'august12'

# 'august13' wave -> x_2016 / y_2016
x_2016 = df_2016.loc[is_august13, wave_features]
y_2016 = df_2016.loc[is_august13, target_feature]

# 'august12' wave -> x_2012 / y_2012
x_2012 = df_2016.loc[is_august12, wave_features]
y_2012 = df_2016.loc[is_august12, target_feature]


In [14]:
# Cohort-membership data: features of every 2016 row paired with a constant 1, and
# features of every 2009 row paired with a constant 0 (`target * 0 + 1` / `target * 0`
# keep the index and length of the target column).
# NOTE(review): presumably intended for a 2016-vs-2009 cohort comparison — confirm.
#
# The 2016 side is stored under its own names. Previously this cell reassigned
# `x_2016` / `y_2016`, overwriting the 'august13' split created in the cell above;
# the X / Y assembled in the next cell then contained all 2016 rows (with constant
# labels of 1) plus the 'august12' rows a second time, instead of the two wave splits.
x_2016_domain, y_2016_domain = df_2016[X_features], df_2016[target_feature] * 0 + 1
x_2009, y_2009 = df_2009[X_features], df_2009[target_feature] * 0

In [15]:
# Stack the x_2016 and x_2012 feature frames row-wise (original indices are kept)
# and join the matching targets, in the same order, into one flat NumPy array.
feature_parts = [x_2016, x_2012]
target_parts = [y_2016, y_2012]

X = pd.concat(feature_parts)
Y = np.concatenate(target_parts)

In [16]:
# Hold out 20% of the rows for validation; the split is seeded and stratified on Y
# so both parts keep the same class proportions.
x_train, x_val, y_train, y_val = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=random_state,
)

In [None]:
# Exhaustive search over `grid_params`, scored by ROC AUC on the stratified folds.
clf = GridSearchCV(estimator=pipe, param_grid=grid_params, scoring='roc_auc', cv=cv)

# Extra keyword forwarded to the pipeline's 'classifier' step when fitting.
# NOTE(review): no eval_set is passed along, so CatBoost's early stopping may have
# nothing to monitor — confirm this setting has the intended effect.
fit_kwargs = {'classifier__early_stopping_rounds': 15}
clf.fit(x_train, y_train.astype(int), **fit_kwargs)

# Best mean cross-validated score and the parameter combination that achieved it.
print(f"roc_auc = {clf.best_score_}, params = {clf.best_params_}")

In [None]:
# Score the refit best estimator on the held-out validation rows:
# take the second probability column (class 1) and report its ROC AUC.
val_proba = clf.best_estimator_.predict_proba(x_val)
y_pred_target = val_proba[:, 1]
val_auc = roc_auc_score(y_val.astype(int), y_pred_target)
print(f"roc_auc = {val_auc}")