In [6]:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import precision_score, roc_auc_score, f1_score, recall_score
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFE
import os

random_state = 3601

In [7]:
data_path_2009 = r'C:\Users\nogag\Documents\birocracy\PTSDClassifier\PTSD\Data\2009'
df_2009 = pd.read_excel(os.path.join(data_path_2009, "PTSD.xlsx"))


In [8]:
data_path_2016 = r'C:\Users\nogag\Documents\birocracy\PTSDClassifier\PTSD\Data\2016'
df_2016 = pd.read_csv(os.path.join(data_path_2016, "IDF_ABM_16.2.15_wide.csv"))


df_2016 = df_2016[~(df_2016['Wave']=='nov12')]


## features in the original data

In [9]:
trans_2016_2009_features = {
    'bagrut': 'highschool_diploma',
 'ADHD': 'ADHD',
 'Accuracy_threat_T1': 'T1Acc1t',
 'Accuracy_NT_T1': 'T1Acc1n',
 'Threat_Bias_T1': 'T1bias',
    

 'PHQ_T1': 'phq1',
 'Trait_T1': 'trait1',
 'PCL_T1': 'PCL1',
    'PCL_T4': 'PCL3'
}


## append PCL intrusion features

In [10]:
PCL_2009 = pd.read_csv(os.path.join(data_path_2009, "questionnaire_PCL1.csv"))

intrusion_features_2009 = ["q6.1_INTRU", "q6.2_DREAM", "q6.3_FLASH", "q6.4_UPSET", "q6.5_PHYS"]
df_2009 = df_2009.merge(PCL_2009[intrusion_features_2009 + ["ID"]], on="ID", how='outer')
df_2009['Intrusion_T1'] = df_2009[intrusion_features_2009].sum(axis=1)

trans_2016_2009_features['Intrusion_T1']= 'Intrusion_T1'

In [11]:
target_feature = 'PCL3'
X_features = [i for i in trans_2016_2009_features.values() if not i == target_feature]


In [12]:
#df_2016['target_feature'] = (df_2016['Intrusion_T4'] > 9).astype(int)
df_2016['PCL_T4'] = (df_2009['PCL3'] > 39).astype(int)
df_2009['PCL3'] = (df_2009['PCL3'] > 39).astype(int)

## adjust features from 2016

In [13]:
df_2016['bagrut'] = df_2016['bagrut'] == 'yes'
df_2016['dyslexia'] = df_2016['dyslexia'] == 'yes'
df_2016['ADHD'] = df_2016['ADHD'] == 'yes'

In [14]:
df_2016 = df_2016.rename(trans_2016_2009_features, axis=1)

In [15]:
df_2009 = df_2009[~df_2009[target_feature].isna()]
df_2016 = df_2016[~df_2016[target_feature].isna()]

## 2009 data outer CV

## parameters init

In [16]:
cv = StratifiedKFold(6, random_state=random_state, shuffle=True)

# class weight
pos_sample = df_2009[target_feature].sum() 
all_samples = df_2009[target_feature].count()
class_weights = all_samples/ pos_sample


# pipeline

pipe = Pipeline(steps=[
    ('RFE', RFE(CatBoostClassifier(verbose=0, random_state=random_state))),
    ('classifier', CatBoostClassifier(verbose=0, random_state=random_state))])
grid_params = [{
        'RFE__n_features_to_select': [15, 8],
        'classifier__class_weights':[[1, class_weights*0.5], [1, class_weights*0.75], [1, class_weights*0.25]],
        'classifier__l2_leaf_reg': [130, 10, 75],
        'classifier__depth': [6, 4]#, 9]
        }]




## 2016

In [25]:
x_2016, y_2016 = df_2016[df_2016['Wave']=='august13'][X_features], df_2016[df_2016['Wave']=='august13'][target_feature]
clf = GridSearchCV(pipe, grid_params, cv=cv, scoring='roc_auc')


In [26]:
x_2009, y_2009 = df_2009[X_features], df_2009[target_feature]
x_2012, y_2012 = df_2016[df_2016['Wave']=='august12'][X_features], df_2016[df_2016['Wave']=='august12'][target_feature]


In [36]:
x_2009['time'] = 0
x_2012['time'] = 5
x_2016['time'] = 6

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [37]:
np.hstack([y_2009, y_2012]).shape

(1344,)

In [38]:
np.vstack([x_2009, x_2012]).shape

(1344, 10)

In [39]:
X, Y = np.vstack([x_2009, x_2012]), np.hstack([y_2009, y_2012])

In [40]:

clf.fit(X, Y, classifier__early_stopping_rounds = 15)
print(f"roc_auc = {clf.best_score_}, params = {clf.best_params_}")

roc_auc = 0.6884737936729967, params = {'RFE__n_features_to_select': 15, 'classifier__class_weights': [1, 3.1695402298850577], 'classifier__depth': 4, 'classifier__l2_leaf_reg': 130}


In [41]:
y_pred_target = clf.best_estimator_.predict_proba(x_2016)[:, 1]
print(f"roc_auc = {roc_auc_score(y_2016.astype(int), y_pred_target)}")

roc_auc = 0.48672086720867214


In [None]:
for i, j in zip(X_features, clf.best_estimator_['classifier'].get_feature_importance()):
    print(i, j)