In [None]:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import precision_score, roc_auc_score, f1_score, recall_score
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
import os

# Fixed seed value for the notebook. It is not referenced in the cells shown here;
# presumably later cells pass it to the splitters/models imported above -- TODO confirm.
random_state = 3603

In [None]:
# Directory that holds the input data files. It can be overridden with the
# PTSD_DATA_DIR environment variable so the notebook is not tied to a single
# machine; when the variable is unset the original local path is used unchanged.
data_path = os.environ.get(
    "PTSD_DATA_DIR",
    r'C:\Users\nogag\Documents\birocracy\PTSDClassifier\PTSD\Data',
)
# Load the Excel workbook that the rest of the notebook refers to as the 2009 data.
df_2009 = pd.read_excel(os.path.join(data_path, "PTSD.xlsx"))


In [None]:
# Load the wide-format CSV that the rest of the notebook refers to as the 2016 data.
# The file lives in the same directory as the other inputs, so the path is built
# from data_path instead of repeating the absolute directory.
df_2016 = pd.read_csv(os.path.join(data_path, "IDF_ABM_16.2.15_wide.csv"))
# Disabled filter, kept for reference: restrict the frame to rows whose Group is 'control'.
#df_2016 = df_2016[df_2016['Group']=='control']


In [None]:
# Load the questionnaire CSV from the shared data directory.
# (Plain string literal: the file name has no placeholders, so no f-string is needed.)
df_questionnaire = pd.read_csv(os.path.join(data_path, "questionnaire_PCL.csv"))


In [None]:
# Questionnaire item columns, grouped into the three clusters that get a summed score.
intrusion_features = [
    "q6.1_INTRU",
    "q6.2_DREAM",
    "q6.3_FLASH",
    "q6.4_UPSET",
    "q6.5_PHYS",
]
# NOTE(review): these items look like a hyperarousal cluster rather than "hypertension";
# the name is kept because 'hypertension_score' is referenced by later cells.
hypertension = [
    "q6.13_SLEEP",
    "q6.14_ANGER",
    "q6.15_CONC",
    "q6.16_HYPER",
    "q6.17_STRTL",
]
avoidance = [
    "q6.6_AVTHT",
    "q6.7_AVSIT",
    "q6.8_AMNES",
    "q6.9_DISINT",
    "q6.10_DTACH",
    "q6.11_NUMB",
    "q6.12_FUTRE",
]

# Add one row-wise sum column per cluster (same column order as before:
# intrusion, avoidance, hypertension).
# NOTE(review): sum(axis=1) skips missing values, so a row with every item missing
# gets a score of 0 rather than NaN -- confirm that is intended.
for score_name, item_columns in {
    'intrusion_score': intrusion_features,
    'avoidance_score': avoidance,
    'hypertension_score': hypertension,
}.items():
    df_questionnaire[score_name] = df_questionnaire[item_columns].sum(axis=1)

In [None]:
# Attach the three questionnaire cluster scores to the 2009 frame, matching rows on ID.
# The outer join keeps IDs that appear in only one of the two frames.
score_columns = ["intrusion_score", "avoidance_score", "hypertension_score", "ID"]
df_2009 = df_2009.merge(df_questionnaire[score_columns], how='outer', on="ID")


In [None]:
# Column-name translation applied to the 2016 frame: key = name in the 2016 file,
# value = name the notebook uses for the matching column in the 2009 frame.
trans_2016_2009_features = {
    # background flags
    'bagrut': 'highschool_diploma',
    'dyslexia': 'dyslexia',
    'ADHD': 'ADHD',
    # accuracy / bias measures
    'Accuracy_threat_T1': 'T1Acc1t',
    'Accuracy_NT_T1': 'T1Acc1n',
    'Threat_Bias_T1': 'T1bias',
    # questionnaire totals
    'PHQ_T1': 'phq1',
    'Trait_T1': 'trait1',
    'State_T1': 'state1',
    'PCL_T1': 'PCL1',
    # cluster scores
    'Intrusion_T1': 'intrusion_score',
    'Avoidance_T1': 'avoidance_score',
    'Hyper_T1': 'hypertension_score',
    # column later used as the target
    'PCL_T4': 'PCL3',
}


In [None]:
# Show the distinct values of the Wave column in the 2016 frame.
df_2016.Wave.unique()

In [None]:
# Column used as the prediction target.
target_feature = 'PCL3'
# Predictor columns: every translated column name except the target itself.
X_features = [
    column_name
    for column_name in trans_2016_2009_features.values()
    if column_name != target_feature
]


In [None]:
# Turn the three yes/no columns of the 2016 frame into booleans (True only for 'yes').
# NOTE(review): any value other than 'yes' -- including missing values -- becomes False,
# and re-running this cell turns every value False because the columns are then
# already boolean. Confirm both are acceptable.
for flag_column in ('bagrut', 'dyslexia', 'ADHD'):
    df_2016[flag_column] = df_2016[flag_column] == 'yes'

In [None]:
# Rename the 2016 columns to the names listed in the translation dict.
df_2016 = df_2016.rename(columns=trans_2016_2009_features)

In [None]:
# Keep only the rows of each frame whose target column is not missing.
df_2009 = df_2009.loc[df_2009[target_feature].notna()]
df_2016 = df_2016.loc[df_2016[target_feature].notna()]

## Visualizations

In [None]:
import matplotlib.pyplot as plt

In [None]:
import seaborn as sns

In [None]:
# Show the distinct Wave values that remain in the 2016 frame at this point.
df_2016.Wave.unique()

In [None]:
# Overlaid PCL3 histograms, one per Wave value of the 2016 frame.
# The per-wave frames df_2016_1/2/3 are only created in a later cell, so this cell
# filters df_2016 directly; that way it also runs on a fresh kernel (Restart & Run All).
for wave_name, wave_label in (('august12', '1'), ('nov12', '2'), ('august13', '3')):
    wave_values = df_2016.loc[df_2016['Wave'] == wave_name, 'PCL3'].dropna()
    plt.hist(wave_values, label=wave_label, alpha=0.5)
plt.xlabel('PCL3')
# Without a legend the label= arguments above are never shown.
plt.legend()

In [None]:
# Overlay the dyslexia flag (cast to int) for both frames; the 2016 bars are
# drawn semi-transparent on top of the 2009 bars.
for cohort_frame, hist_kwargs in (
    (df_2009, {'label': '2009'}),
    (df_2016, {'label': '2016', 'alpha': 0.75}),
):
    plt.hist(cohort_frame['dyslexia'].dropna().astype(int), **hist_kwargs)
plt.legend()


In [None]:
# Overlay the highschool_diploma flag (cast to int) for both frames; the 2016 bars
# are drawn semi-transparent on top of the 2009 bars.
for cohort_frame, hist_kwargs in (
    (df_2009, {'label': '2009'}),
    (df_2016, {'label': '2016', 'alpha': 0.75}),
):
    plt.hist(cohort_frame['highschool_diploma'].dropna().astype(int), **hist_kwargs)
plt.legend()

In [None]:
# Evaluate the distinct Wave values (not displayed: it is not the cell's last expression).
df_2016['Wave'].unique()
# One frame per Wave value of the 2016 data.
df_2016_1 = df_2016.loc[df_2016['Wave'] == 'august12']
df_2016_2 = df_2016.loc[df_2016['Wave'] == 'nov12']
df_2016_3 = df_2016.loc[df_2016['Wave'] == 'august13']

In [None]:

# Scatter of T1Acc1t against T1Acc1n, one colour per wave frame.
for wave_frame, wave_label in ((df_2016_1, '1'), (df_2016_2, '2'), (df_2016_3, '3')):
    # Drop rows missing either value *together*: dropping NaNs from each column
    # separately can leave x and y with different lengths or mis-paired rows.
    paired = wave_frame[['T1Acc1t', 'T1Acc1n']].dropna()
    plt.scatter(paired['T1Acc1t'], paired['T1Acc1n'], label=wave_label, alpha=0.5)
plt.xlabel('T1Acc1t')
plt.ylabel('T1Acc1n')
# Without a legend the label= arguments above are never shown.
plt.legend()


In [None]:
# Overlaid T1Acc1t histograms, one per wave frame.
for wave_frame, wave_label in ((df_2016_1, '1'), (df_2016_2, '2'), (df_2016_3, '3')):
    plt.hist(wave_frame['T1Acc1t'].dropna(), label=wave_label, alpha=0.5)
plt.xlabel('T1Acc1t')
# Without a legend the label= arguments above are never shown.
plt.legend()

In [None]:
# Range of T1Acc1n in the first wave frame, displayed as (min, max).
# The cell previously evaluated the same .min() twice, so the first line had no effect.
df_2016_1['T1Acc1n'].min(), df_2016_1['T1Acc1n'].max()

In [None]:
# Overlaid T1Acc1n histograms, one per wave frame.
for wave_frame, wave_label in ((df_2016_1, '1'), (df_2016_2, '2'), (df_2016_3, '3')):
    plt.hist(wave_frame['T1Acc1n'].dropna(), label=wave_label, alpha=0.5)
plt.xlabel('T1Acc1n')
# Without a legend the label= arguments above are never shown.
plt.legend()

In [None]:
# Overlaid T1bias histograms, one per wave frame.
for wave_frame, wave_label in ((df_2016_1, '1'), (df_2016_2, '2'), (df_2016_3, '3')):
    plt.hist(wave_frame['T1bias'].dropna(), label=wave_label, alpha=0.5)
plt.legend()

In [None]:
# Display the translated column names (the values of the translation dict).
trans_2016_2009_features.values()

In [None]:
# Overlaid phq1 histograms, one per wave frame.
for wave_frame, wave_label in ((df_2016_1, '1'), (df_2016_2, '2'), (df_2016_3, '3')):
    plt.hist(wave_frame['phq1'].dropna(), label=wave_label, alpha=0.5)
plt.legend()

In [None]:
# Overlaid trait1 histograms, one per wave frame.
for wave_frame, wave_label in ((df_2016_1, '1'), (df_2016_2, '2'), (df_2016_3, '3')):
    plt.hist(wave_frame['trait1'].dropna(), label=wave_label, alpha=0.5)
plt.legend()

In [None]:
# Overlaid state1 histograms: the three wave frames followed by the 2009 frame.
for source_frame, source_label in (
    (df_2016_1, 'aug12'),
    (df_2016_2, 'nov12'),
    (df_2016_3, 'aug13'),
    (df_2009, '2009'),
):
    plt.hist(source_frame['state1'].dropna(), label=source_label, alpha=0.5)

plt.legend()

In [None]:
# Overlaid PCL1 histograms, one per wave frame.
for wave_frame, wave_label in (
    (df_2016_1, 'aug12'),
    (df_2016_2, 'nov12'),
    (df_2016_3, 'aug13'),
):
    plt.hist(wave_frame['PCL1'].dropna(), label=wave_label, alpha=0.5)

plt.legend()

In [None]:
# Largest PCL1 value in the 2016 frame.
df_2016.loc[:, 'PCL1'].max()