In [118]:
import numpy as np
import pandas as pd
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score,precision_score, recall_score, roc_auc_score
from sklearn.decomposition import PCA
from sklearn.utils import shuffle
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.mixture import GaussianMixture
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer
import tensorflow.keras
import keras.metrics
from keras.layers import Dense, Dropout, Conv2D, MaxPooling2D, Input
from keras.models import Sequential
from keras.callbacks import History 
from keras.utils import plot_model
from keras.optimizers import SGD
from keras.models import Model
from imblearn.under_sampling import RandomUnderSampler
from sklearn.semi_supervised import LabelSpreading, LabelPropagation
import featuretools as ft
from sklearn.preprocessing import StandardScaler

In [119]:
# Read the study spreadsheet and discard every row that has no PCL_Strict3
# value (that column is used as a target further down).
path = "PTSD.xlsx"
df = pd.read_excel(path).dropna(subset=["PCL_Strict3"])

In [120]:
# Columns that make up the model matrix, in matrix-column order.
# The order is significant: other cells translate a column name into a
# position with features.index(name).
features = [
    "ID", "age", "highschool_diploma", "Hebrew", "dyslexia", "ADHD",

    "T1ETBE", "T1Acc1t", "T1Acc1n", "T1bias",
    "T2Acc1t", "T2Acc1n", "T2bias",

    "state1", "state2", "trait1", "trait2",
    "lot1", "lot2", "phq1", "phq2",
    "PCL1", "PCL2", "cd_risc1", "ptgi2",

    # names ending in "1"
    "active_coping1", "planning1", "positive_reframing1", "acceptance1",
    "humor1", "religion1", "emotional_support1", "instrumental_support1",
    "self_distraction1", "denial1", "venting1", "substance_use1",
    "behavioral_disengagement1", "self_blame1",

    # the same names ending in "2"
    "active_coping2", "planning2", "positive_reframing2", "acceptance2",
    "humor2", "religion2", "emotional_support2", "instrumental_support2",
    "self_distraction2", "denial2", "venting2", "substance_use2",
    "behavioral_disengagement2", "self_blame2",

    "trauma_history8_1", "military_exposure_unit",

    "HML_5HTT", "HL_MAOA", "HML_NPY",
    "COMT_Hap1_recode", "COMT_Hap2_recode", "COMT_Hap1_LvsMH",
    "HML_FKBP5", "Ashken_scale", "Sephar_scale", "Unknown",

    "terror_p1", "terror_i1", "mva_p1", "mva_i1",
    "violent1", "sexual1", "rockets_p1", "rockets_i1",
    "trauma_history6_1",

    "terror_p2", "terror_i2", "mva_p2", "mva_i2",
    "violent2", "sexual2", "rockets_p2", "rockets_i2",
    "trauma6t2", "trauma8t2",

    "military_exp18_1", "military_exp18_t2",
    "commanders18", "commanders20", "commanders22",

    "t1bias_1_zero", "state1_zero", "trait1_zero", "PHQ1_zero", "PCL1_zero",
    "depression_clinical2", "avoid_bias", "ptsd1_clini", "avoidance_cop",
    "clinical_depression15", "avoidance_compa", "resilience_compa",
    "combat_compa", "emotional_cop1n", "avoidance_cop2",

    # not a spreadsheet column: assigned as df['bad_features'] in a later cell
    "bad_features",
]

In [121]:
# Columns that feed the derived 'bad_features' count: for each row, the number
# of these columns whose value is above the column mean.
bad_features = [
    "T1ETBE", "T1bias", "state1", "state2", "trait1", "trait2",
    "phq1", "phq2", "PCL1", "PCL2",
    "denial1", "substance_use1", "self_blame1",
    "denial2", "substance_use2", "self_blame2",
    "trauma_history8_1", "military_exposure_unit", "trauma6t2", "trauma8t2",
    "t1bias_1_zero", "state1_zero", "trait1_zero", "PHQ1_zero", "PCL1_zero",
    "depression_clinical2", "ptsd1_clini", "emotional_cop1n",
]

# Columns whose missing values are filled with the column mean.
numerical_features = [
    "T1ETBE", "T1Acc1t", "T1Acc1n", "T1bias", "T2Acc1t", "T2Acc1n", "T2bias",
    "state1", "state2", "trait1", "trait2",
    "lot1", "lot2", "phq1", "phq2", "cd_risc1", "PCL1", "PCL2",
]

# Columns whose missing values are filled with the most frequent value.
# Every name must appear exactly once: a repeated name makes
# df[categorical_features] return that column twice, which turns the
# write-back of the imputed block into a duplicate-key assignment.
categorical_features = [
    "age", "highschool_diploma", "Hebrew", "dyslexia", "ADHD", "ptgi2",

    "active_coping1", "planning1", "positive_reframing1", "acceptance1",
    "humor1", "religion1", "emotional_support1", "instrumental_support1",
    "self_distraction1", "denial1", "venting1", "substance_use1",
    "behavioral_disengagement1", "self_blame1",

    "active_coping2", "planning2", "positive_reframing2", "acceptance2",
    "humor2", "religion2", "emotional_support2", "instrumental_support2",
    "self_distraction2", "denial2", "venting2", "substance_use2",
    "behavioral_disengagement2", "self_blame2",

    "trauma_history8_1", "military_exposure_unit",

    "HML_5HTT", "HL_MAOA", "HML_NPY",
    "COMT_Hap1_recode", "COMT_Hap2_recode", "COMT_Hap1_LvsMH",
    "HML_FKBP5", "Ashken_scale", "Sephar_scale", "Unknown",

    "terror_p1", "terror_i1", "mva_p1", "mva_i1",
    "violent1", "sexual1", "rockets_p1", "rockets_i1", "trauma_history6_1",

    "terror_p2", "terror_i2", "mva_p2", "mva_i2",
    "violent2", "sexual2", "rockets_p2", "rockets_i2",
    "trauma6t2", "trauma8t2",

    "military_exp18_1", "military_exp18_t2",
    "commanders18", "commanders20", "commanders22",

    "t1bias_1_zero", "state1_zero", "trait1_zero", "PHQ1_zero", "PCL1_zero",
    "depression_clinical2",

    "avoid_bias", "ptsd1_clini", "avoidance_cop", "clinical_depression15",
    "avoidance_compa", "resilience_compa", "combat_compa",
    "emotional_cop1n", "avoidance_cop2",
]


In [122]:
# Remove columns, then rows, that hold no data at all.
# This is what thresh=0.5 / thresh=0.9 amount to: `thresh` is the minimum
# COUNT of non-NA values, so any fractional value below 1 is already met by a
# single non-NA entry.
# NOTE(review): if fractional coverage was intended ("at least 50% of rows",
# "at least 90% of columns"), the thresholds must be passed as integer counts
# derived from df.shape -- confirm the intent before changing, because that
# would alter which columns and rows survive.
df = df.dropna(axis=1, how="all")
df = df.dropna(axis=0, how="all")

In [123]:
# Derived column: per row, how many of the `bad_features` columns are above
# their own column mean. It is computed before imputation, so a missing value
# compares False and contributes nothing. The comparison is restricted to the
# `bad_features` columns so that unrelated (possibly non-numeric) columns of
# the sheet never reach .mean().
df['bad_features'] = (df[bad_features] > df[bad_features].mean()).sum(axis=1)

# NOTE(review): the two imputers and the scaler below are fitted on every row
# of df, i.e. before the train/test split done in a later cell, so held-out
# rows influence the fitted statistics. Fitting them on the training rows only
# would need the split to move ahead of this cell.

# Mean imputation for the numerical columns.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
df[numerical_features] = imp.fit_transform(df[numerical_features])

# Most-frequent-value imputation for the categorical columns. Names are
# de-duplicated (order kept) so a repeated entry cannot select a column twice.
categorical_columns = list(dict.fromkeys(categorical_features))
imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
df[categorical_columns] = imp.fit_transform(df[categorical_columns])

# Model matrix: the `features` columns standardised by StandardScaler.
# X is a NumPy array from here on, with columns in `features` order.
X = df[features]
ss = StandardScaler()
X = ss.fit_transform(X)

# Two target columns, kept together as a DataFrame.
Y = df[["PCL_Strict3", "PCL3"]]

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [124]:
# Two nested 90/10 splits, each stratified on the PCL_Strict3 column and using
# the same random_state:
#   X, Y             -> X_train,   y_train   | X_test,   y_test
#   X_train, y_train -> X_train_2, y_train_2 | X_test_2, y_test_2
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.1,
    random_state=271828,
    stratify=Y["PCL_Strict3"],
)
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(
    X_train,
    y_train,
    test_size=0.1,
    random_state=271828,
    stratify=y_train["PCL_Strict3"],
)


In [125]:
def create_data(X_train, y_train):
    """Return the label-0 rows produced by SMOTE on a reduced training set.

    The reduced set stacks the first 20 rows labelled 0 on top of every row
    labelled 1. SMOTE (random_state=27, default settings) resamples that set,
    and only the rows whose resampled label is 0 are handed back.

    Parameters
    ----------
    X_train : feature matrix, one row per sample.
    y_train : labels aligned with the rows of ``X_train``.
        Both are indexed with boolean masks, so NumPy arrays are assumed --
        TODO confirm against the caller.

    Returns
    -------
    (X, y) : the resampled rows with label 0 and their (all-zero) labels.
    """
    is_positive = y_train == 1
    is_negative = y_train == 0

    # First 20 label-0 rows followed by all label-1 rows.
    X_subset = np.vstack((X_train[is_negative][:20], X_train[is_positive]))
    y_subset = np.hstack((y_train[is_negative][:20], y_train[is_positive]))

    # fit_resample is the imbalanced-learn method name; the older fit_sample
    # alias no longer exists in current releases.
    X_resampled, y_resampled = SMOTE(random_state=27).fit_resample(
        X_subset, y_subset.ravel()
    )

    keep = y_resampled == 0
    return X_resampled[keep], y_resampled[keep]

In [126]:
def regressions(X_train, y_train, X_test):
    """Append a gradient-boosting prediction as an extra last column.

    A GradientBoostingRegressor with default settings is fitted on
    (X_train, y_train); its predictions for X_train and for X_test are added
    to the respective matrix as one additional column.

    Returns the widened (X_train, X_test) pair.
    """
    booster = GradientBoostingRegressor()
    booster.fit(X_train, y_train)

    train_column = booster.predict(X_train)[:, np.newaxis]
    test_column = booster.predict(X_test)[:, np.newaxis]

    widened_train = np.concatenate((X_train, train_column), axis=1)
    widened_test = np.concatenate((X_test, test_column), axis=1)
    return widened_train, widened_test

In [127]:
# Named subsets of the feature columns. `features_groups` collects them in a
# fixed order; every name used here must also be present in `features`.
DNA = [
    "HML_5HTT",
    "HL_MAOA",
    "HML_NPY",
    "COMT_Hap1_recode",
    "COMT_Hap2_recode",
    "COMT_Hap1_LvsMH",
    "HML_FKBP5",
    "Ashken_scale",
    "Sephar_scale",
    "Unknown",
]

# The same 14 base names occur once with suffix "1" and once with suffix "2",
# followed by three extra columns.
_coping_base = [
    "active_coping", "planning", "positive_reframing", "acceptance", "humor",
    "religion", "emotional_support", "instrumental_support",
    "self_distraction", "denial", "venting", "substance_use",
    "behavioral_disengagement", "self_blame",
]
coping_mechanism = (
    [base + "1" for base in _coping_base]
    + [base + "2" for base in _coping_base]
    + ["avoidance_cop", "emotional_cop1n", "avoidance_cop2"]
)

dot_probe = [
    "T1ETBE", "T1Acc1t", "T1Acc1n", "T1bias",
    "T2Acc1t", "T2Acc1n", "T2bias",
    "t1bias_1_zero", "avoid_bias",
]

PCL = [
    "PCL1", "PCL2", "cd_risc1", "ptgi2",
    "trauma_history8_1", "PCL1_zero", "ptsd1_clini",
]

military_exposure = [
    "military_exposure_unit",
    "military_exp18_1", "military_exp18_t2",
    "commanders18", "commanders20", "commanders22",
]

depression = [
    "state1", "state2", "trait1", "trait2", "phq1", "phq2",
    "state1_zero", "trait1_zero", "PHQ1_zero",
    "depression_clinical2", "clinical_depression15",
    "bad_features",
]

other = [
    "highschool_diploma", "lot1", "lot2",
    "avoidance_compa", "resilience_compa", "combat_compa",
]

features_groups = [
    DNA,
    coping_mechanism,
    dot_probe,
    PCL,
    military_exposure,
    depression,
    other,
]

In [None]:
# Cross-validated ensemble.
# For every fold, one small dense network is trained per feature group on an
# under-sampled copy of the fold's training rows. The per-group sigmoid
# outputs are summed, divided by the number of groups and compared with `lim`
# to obtain the class prediction. F1 / precision / recall are reported for the
# fold's held-out rows and for its training rows.

cv_seed = 271828  # same value the train/test split cells use as random_state

# Seeding the fold assignment and the under-sampler makes those two steps
# repeatable; the network initialisation itself is not seeded here.
kfold = StratifiedKFold(n_splits=7, shuffle=True, random_state=cv_seed)
cvscores = []
y_train_2 = np.array(y_train_2)
X_train_2 = np.array(X_train_2)

# Column 0 of the label array is PCL_Strict3 (Y is built as
# ["PCL_Strict3", "PCL3"]); it is the target for stratification, fitting
# and scoring.
cv_labels = y_train_2[:, 0]

# Column positions of every feature group inside the model matrix. The lookup
# does not depend on the fold, so it is done once.
group_columns = [[features.index(name) for name in group] for group in features_groups]
n_groups = len(group_columns)


def _build_group_model():
    """Return the compiled network used for a single feature group."""
    net = Sequential()
    for units in (15, 5, 5):
        net.add(Dense(units, activation='elu'))
        net.add(Dropout(0.5))
    net.add(Dense(1, activation='sigmoid'))
    net.compile(optimizer='adam', loss='binary_crossentropy')
    return net


def _f1_precision_recall(y_true, y_hat):
    """Return (f1, precision, recall) of predictions `y_hat` against `y_true`."""
    return (f1_score(y_true, y_hat),
            precision_score(y_true, y_hat),
            recall_score(y_true, y_hat))


# Hyper-parameter grid, iterated in the same nesting order as before.
# first_layer, loops and each_layer are only echoed in the log line below;
# the network built by _build_group_model() does not read them.
param_grid = [
    (weights, ratio, threshold, n_first, n_loops, n_each)
    for weights in [{0: 0.59, 1: 0.41}]
    for ratio in [0.9]
    for threshold in [0.5]
    for n_first in [30]
    for n_loops in [10]
    for n_each in [20]
]

for class_weight, num_smote, lim, first_layer, loops, each_layer in param_grid:
    train_scores_f, train_scores_p, train_scores_r = [], [], []
    scores_f, scores_p, scores_r = [], [], []

    print("\n\nlim", lim, "\nfirst_layer", first_layer,
          "\neach_layer", each_layer, "\nnum_smote", num_smote,
          "\nloops", loops, "\nclass_weight", class_weight)

    for train, test in kfold.split(X_train_2, cv_labels):
        # Float accumulators for the summed sigmoid outputs. They are created
        # explicitly instead of via zeros_like(labels) so the in-place `+=`
        # works whatever dtype the labels have.
        y_train_pred = np.zeros((len(train), 1))
        y_pred = np.zeros((len(test), 1))

        for columns in group_columns:
            X_fold_train = X_train_2[train][:, columns]
            X_test_cv = X_train_2[test][:, columns]

            # Under-sample the fold's training rows; `num_smote` is passed as
            # the sampler's sampling_strategy.
            sm = RandomUnderSampler(sampling_strategy=num_smote, random_state=cv_seed)
            X_train_res, y_train_res = sm.fit_resample(X_fold_train, cv_labels[train])

            model = _build_group_model()

            # No validation data is passed to fit(), so 'val_loss' is never
            # produced and a callback watching it cannot stop training.
            # The training loss is monitored instead.
            callbacks = [EarlyStopping(monitor='loss', patience=15)]
            model.fit(X_train_res, y_train_res, epochs=350,
                      class_weight=class_weight, verbose=0, callbacks=callbacks)

            # Held-out rows of this fold.
            y_pred += model.predict(X_test_cv)
            # Training error: all training rows of the fold, not only the
            # under-sampled ones.
            y_train_pred += model.predict(X_fold_train)

        # Average over the groups actually used, then threshold.
        y_pred = (y_pred / n_groups).ravel() > lim
        s_f, s_p, s_r = _f1_precision_recall(cv_labels[test], y_pred)
        print("\tscores f1", (s_f))
        print("\tscores p", (s_p))
        print("\tscores r", (s_r))
        scores_f.append(s_f)
        scores_p.append(s_p)
        scores_r.append(s_r)

        y_train_pred = (y_train_pred / n_groups).ravel() > lim
        train_s_f, train_s_p, train_s_r = _f1_precision_recall(cv_labels[train], y_train_pred)
        print("\tscores f1 train", (train_s_f))
        print("\tscores p train", (train_s_p))
        print("\tscores r train", (train_s_r))
        train_scores_f.append(train_s_f)
        train_scores_p.append(train_s_p)
        train_scores_r.append(train_s_r)

    print("mean scores f1", np.mean(scores_f))
    print("mean scores p", np.mean(scores_p))
    print("mean scores r", np.mean(scores_r))

    print("mean scores f1 train", np.mean(train_scores_f))
    print("mean scores p train", np.mean(train_scores_p))
    print("mean scores r train", np.mean(train_scores_r))



lim 0.5 
first_layer 30 
each_layer 20 
num_smote 0.9 
loops 10 
class_weight {0: 0.59, 1: 0.41}
	scores f1 0.6153846153846153
	scores p 0.5714285714285714
	scores r 0.6666666666666666
	scores f1 train 0.417910447761194
	scores p train 0.3783783783783784
	scores r train 0.4666666666666667
	scores f1 0.7272727272727272
	scores p 0.6666666666666666
	scores r 0.8
	scores f1 train 0.45901639344262296
	scores p train 0.4666666666666667
	scores r train 0.45161290322580644
	scores f1 0.0
	scores p 0.0
	scores r 0.0
	scores f1 train 0.6666666666666666
	scores p train 0.6285714285714286
	scores r train 0.7096774193548387
	scores f1 0.42857142857142855
	scores p 0.3333333333333333
	scores r 0.6
	scores f1 train 0.4266666666666667
	scores p train 0.36363636363636365
	scores r train 0.5161290322580645
	scores f1 0.25
	scores p 0.3333333333333333
	scores r 0.2
	scores f1 train 0.4666666666666667
	scores p train 0.4827586206896552
	scores r train 0.45161290322580644


In [None]:
# Final fit on the inner training split, scored on the inner hold-out split.
# Same recipe as the cross-validation cell: one small dense network per
# feature group, outputs averaged over the groups and thresholded at `lim`.
num_smote = 0.9
lim = 0.5
class_weight = {1:0.41, 0:0.59}

# The label tables carry two columns (PCL_Strict3, PCL3). Only column 0,
# PCL_Strict3, is the classification target; passing both columns on would
# double the number of labels after ravel()/reshape(-1, 1).
# np.asarray makes this work whether the tables are DataFrames or arrays.
fit_matrix = np.asarray(X_train_2)
holdout_matrix = np.asarray(X_test_2)
fit_labels = np.asarray(y_train_2)[:, 0]
holdout_labels = np.asarray(y_test_2)[:, 0]

# Float accumulators for the summed sigmoid outputs, one row per sample.
y_train_pred = np.zeros((len(fit_labels), 1))
y_pred = np.zeros((len(holdout_labels), 1))

for feature_group in features_groups:
    # Translate the group's column names into matrix column positions.
    feature_group = [features.index(i) for i in feature_group]
    X_train_group = fit_matrix[:, feature_group]
    X_test_cv = holdout_matrix[:, feature_group]

    # Under-sample the training rows; `num_smote` is the sampling_strategy.
    sm = RandomUnderSampler(sampling_strategy=num_smote)
    X_train_res, y_train_res = sm.fit_resample(X_train_group, fit_labels)

    model = Sequential()

    model.add(Dense(15, activation='elu'))
    model.add(Dropout(0.5))
    model.add(Dense(5, activation='elu'))
    model.add(Dropout(0.5))
    model.add(Dense(5, activation='elu'))
    model.add(Dropout(0.5))

    model.add(Dense(1, activation='sigmoid'))

    model.compile(optimizer='adam', loss='binary_crossentropy')

    # Fit the model. No validation data is supplied, so 'val_loss' is never
    # produced; early stopping watches the training loss instead.
    callbacks = [EarlyStopping(monitor='loss', patience=5)]
    model.fit(X_train_res, y_train_res, epochs=350,
              class_weight=class_weight, verbose=0, callbacks=callbacks)

    # Hold-out predictions.
    y_pred += model.predict(X_test_cv)
    # Training error, on all training rows (not only the under-sampled ones).
    y_train_pred += model.predict(X_train_group)

# Average over the groups actually used, then threshold.
n_groups = len(features_groups)

y_pred = (y_pred / n_groups).ravel() > lim
s_f = f1_score(holdout_labels, y_pred)
s_p = precision_score(holdout_labels, y_pred)
s_r = recall_score(holdout_labels, y_pred)
print("\tscores f1", (s_f))
print("\tscores p", (s_p))
print("\tscores r", (s_r))
# NOTE(review): these lists are created by the cross-validation cell and hold
# its per-fold scores; appending the hold-out result mixes the two and makes
# this cell depend on that cell having run. Kept for compatibility -- consider
# storing the hold-out scores separately.
scores_f.append(s_f)
scores_p.append(s_p)
scores_r.append(s_r)

y_train_pred = (y_train_pred / n_groups).ravel() > lim
train_s_f = f1_score(fit_labels, y_train_pred)
train_s_p = precision_score(fit_labels, y_train_pred)
train_s_r = recall_score(fit_labels, y_train_pred)
print("\tscores f1 train", (train_s_f))
print("\tscores p train", (train_s_p))
print("\tscores r train", (train_s_r))
train_scores_f.append(train_s_f)
train_scores_p.append(train_s_p)
train_scores_r.append(train_s_r)
