In [129]:
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score,precision_score, recall_score, roc_auc_score
from sklearn.decomposition import PCA
from sklearn.utils import shuffle
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from keras.callbacks import EarlyStopping, ModelCheckpoint

from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.mixture import GaussianMixture
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer
import tensorflow.keras
import keras.metrics
from keras.layers import Dense, Dropout, Conv2D, MaxPooling2D
from keras.models import Sequential
from keras.callbacks import History 
from keras.utils import plot_model
from keras.optimizers import SGD
from sklearn.preprocessing import StandardScaler

In [130]:
# Read the study spreadsheet; the path is relative to the notebook's working directory.
path = "PTSD.xlsx"
df = pd.read_excel(path)

# Target label: True for rows where PCL1 is present but PCL2 and PCL3 are both missing.
later_pcl_missing = df[["PCL2", "PCL3"]].isna().all(axis=1)
df["drop"] = later_pcl_missing & df["PCL1"].notna()


In [131]:
# Stems of the fourteen coping-subscale columns. Each stem contributes two
# entries to `features`: one with suffix "1" and one with suffix "2".
_coping_stems = [
    "active_coping", "planning", "positive_reframing", "acceptance", "humor",
    "religion", "emotional_support", "instrumental_support", "self_distraction",
    "denial", "venting", "substance_use", "behavioral_disengagement", "self_blame",
]

# Columns used as model inputs, in design-matrix order.
features = (
    ["age", "highschool_diploma", "Hebrew", "dyslexia", "ADHD"]
    + ["T1ETBE", "T1Acc1t", "T1Acc1n", "T1bias", "T2Acc1t", "T2Acc1n", "T2bias"]
    + ["state1", "state2", "trait1", "trait2", "lot1", "lot2", "phq1", "phq2"]
    + ["PCL1", "PCL2", "cd_risc1", "ptgi2"]
    + [stem + "1" for stem in _coping_stems]
    + [stem + "2" for stem in _coping_stems]
    + ["trauma_history8_1", "military_exposure_unit"]
    + ["HML_5HTT", "HL_MAOA", "HML_NPY", "COMT_Hap1_recode", "COMT_Hap2_recode",
       "COMT_Hap1_LvsMH", "HML_FKBP5"]
    + ["Ashken_scale", "Sephar_scale", "Unknown"]
)

In [132]:
# Columns tallied into the per-row "above the column mean" count in the
# preprocessing cell.
bad_features = [
    "T1ETBE", "T1bias", "state1", "trait1", "phq1", "PCL1",
    "denial1", "substance_use1", "self_blame1",
    "trauma_history8_1", "military_exposure_unit",
    "t1bias_1_zero", "state1_zero", "trait1_zero", "PHQ1_zero", "PCL1_zero",
    "ptsd1_clini", "emotional_cop1n",
]

# Columns whose missing values are filled with the column mean.
numerical_features = [
    "T1ETBE", "T1Acc1t", "T1Acc1n", "T1bias",
    "T2Acc1t", "T2Acc1n", "T2bias",
    "state1", "state2", "trait1", "trait2",
    "lot1", "lot2", "phq1", "phq2",
    "cd_risc1", "PCL1", "PCL2",
]

# Stems of the fourteen coping-subscale columns; each appears with suffix "1" and "2".
_categorical_coping_stems = [
    "active_coping", "planning", "positive_reframing", "acceptance", "humor",
    "religion", "emotional_support", "instrumental_support", "self_distraction",
    "denial", "venting", "substance_use", "behavioral_disengagement", "self_blame",
]

# Columns whose missing values are filled with the most frequent value.
categorical_features = (
    ["age", "highschool_diploma", "Hebrew", "dyslexia", "ADHD", "ptgi2"]
    + [f"{stem}1" for stem in _categorical_coping_stems]
    + [f"{stem}2" for stem in _categorical_coping_stems]
    + ["trauma_history8_1", "military_exposure_unit", "HML_5HTT", "HL_MAOA", "HML_NPY",
       "COMT_Hap1_recode", "COMT_Hap2_recode", "COMT_Hap1_LvsMH", "HML_FKBP5",
       "Ashken_scale", "Sephar_scale", "Unknown"]
)

In [133]:


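# Disabled experiment: pairwise interaction columns. As written it would only copy
# X[feature] into each new column (no combination with the second feature), and X
# is not defined until a later cell.
# for i, feature in enumerate(features):
#     for interaction in features[i::]:
#         X[f"interaction_{feature}_{interaction}"] = X[feature]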

In [134]:
# Per-row count of `bad_features` columns whose value exceeds that column's mean.
# Only the `bad_features` columns are compared: taking df.mean() over the whole frame
# raises on non-numeric columns in pandas >= 2.0 and computes means that are thrown away.
# This runs before imputation, so a missing value never counts (NaN > mean is False).
bad_cols = df[bad_features]
df['bad_features'] = (bad_cols > bad_cols.mean()).sum(axis=1)
# NOTE(review): 'bad_features' is not listed in `features`, so this column never reaches X.

# NOTE(review): both imputers are fitted on every row, i.e. before the train/test split,
# so held-out rows contribute to the fill values. Confirm this is acceptable.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
df[numerical_features] = imp.fit_transform(df[numerical_features])

imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
df[categorical_features] = imp.fit_transform(df[categorical_features])

# NOTE(review): the label `drop` requires PCL2 to be missing, and PCL2 (mean-imputed
# above) is also a feature, so every positive row carries the same imputed PCL2 value.
# Confirm this is not target leakage.
X = df[features]

Y = df["drop"]

In [135]:
# Standardise every feature column. The fitted scaler stays available as `ss`;
# X is rebound to the scaler's output (an array under default scikit-learn settings).
# NOTE(review): the scaler is fitted on all rows, before the train/test split.
ss = StandardScaler()
ss.fit(X)
X = ss.transform(X)

In [136]:
# Two nested stratified 90/10 splits that share one seed:
#   (X, Y)             -> (X_train, y_train)     + (X_test, y_test)
#   (X_train, y_train) -> (X_train_2, y_train_2) + (X_test_2, y_test_2)
split_options = {"test_size": 0.1, "random_state": 271828}
X_train, X_test, y_train, y_test = train_test_split(X, Y, stratify=Y, **split_options)
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(
    X_train, y_train, stratify=y_train, **split_options
)
# Disabled third-level split, kept for reference:
#X_train_3, X_test_3, y_train_3, y_test_3 = train_test_split(X_train_2, y_train_2, test_size = 0.1, random_state=271828, stratify=y_train_2)

In [137]:
def create_data(X_train, y_train):
    """Return class-0 rows built from the first ten class-0 samples via SMOTE.

    All class-1 rows and the first ten class-0 rows of the input are stacked and
    passed to ``SMOTE(random_state=27)``. Only the class-0 rows of the resampled
    set are returned: the ten real rows plus whatever class-0 rows SMOTE
    synthesised.

    Parameters
    ----------
    X_train : array-like, 2-D
        Feature matrix; must support boolean-mask row indexing.
    y_train : array-like, 1-D
        Labels aligned with ``X_train``; compared against 1 and 0.

    Returns
    -------
    (X, y) : tuple of arrays
        Class-0 rows of the resampled data and their (all-zero) labels.
    """
    positive_mask = y_train == 1
    negative_mask = y_train == 0

    X_pos = X_train[positive_mask]
    y_pos = y_train[positive_mask]
    # Only the first ten class-0 rows are used as seeds for oversampling.
    X_neg = X_train[negative_mask][:10]
    y_neg = y_train[negative_mask][:10]

    X_stacked = np.vstack((X_neg, X_pos))
    y_stacked = np.hstack((y_neg, y_pos))

    sm = SMOTE(random_state=27)
    # `fit_sample` was removed in imbalanced-learn 0.8; `fit_resample` is the
    # supported name (available since 0.4).
    X_resampled, y_resampled = sm.fit_resample(X_stacked, y_stacked.ravel())

    keep = y_resampled == 0
    return X_resampled[keep], y_resampled[keep]

In [138]:
# Collectors for out-of-fold predictions and their true labels; the
# cross-validation cell appends to both.
preds, trues = [], []

In [139]:
# Start from empty collectors so this cell can be re-run on its own: a later cell
# rebinds `preds`/`trues` to NumPy arrays, which have no .extend().
preds = []
trues = []

# Fixed seed so fold membership is repeatable between runs.
# NOTE(review): Keras weight initialisation and batch shuffling are still unseeded,
# so scores can vary from run to run.
kfold = StratifiedKFold(n_splits=7, shuffle=True, random_state=271828)
cvscores = []
y_train_2 = np.array(y_train_2)
X_train_2 = np.array(X_train_2)


def _build_mlp(n_cols, first_layer, each_layer, num_layers):
    """Build and compile the dense network: one input block, `num_layers` hidden
    blocks (each Dense + Dropout(0.5), ELU activation) and a sigmoid output unit."""
    net = Sequential()
    net.add(Dense(first_layer, activation='elu', input_dim=n_cols))
    net.add(Dropout(0.5))
    for _ in range(num_layers):
        net.add(Dense(each_layer, activation='elu'))
        net.add(Dropout(0.5))
    net.add(Dense(1, activation='sigmoid'))
    net.compile(optimizer='adam', loss='binary_crossentropy')
    return net


def _f1_precision_recall(y_true, y_hat):
    """Return (f1, precision, recall) for binary labels and thresholded predictions."""
    return (f1_score(y_true, y_hat),
            precision_score(y_true, y_hat),
            recall_score(y_true, y_hat))


# Hyper-parameter grid; every list currently holds a single value.
for num_layers in [1]:
    for first_layer in [20]:
        for loops in [20]:
            for each_layer in [5]:
                # `num_smote` and `loops` do not affect training; they are only
                # echoed in the per-fold log line.
                num_smote = 1

                train_scores_f, train_scores_p, train_scores_r = [], [], []
                scores_f, scores_p, scores_r = [], [], []

                for train, test in kfold.split(X_train_2, y_train_2):
                    print("num_layers", num_layers, "\nfirst_layer", first_layer,
                          "\neach_layer", each_layer, "\nnum_smote", num_smote, "\nloops", loops)

                    X_train_res = X_train_2[train]
                    y_train_res = y_train_2[train]

                    # create model
                    n_cols = X_train_res.shape[1]
                    model = _build_mlp(n_cols, first_layer, each_layer, num_layers)

                    # Fit the model; early stopping watches the loss on the 10%
                    # validation slice that Keras holds out of the fold's training rows.
                    callbacks = [EarlyStopping(monitor='val_loss', patience=1)]
                    model.fit(X_train_res, y_train_res, epochs=150, validation_split=.1,
                              verbose=0, callbacks=callbacks, class_weight={1: 0.45, 0: 0.55})

                    # evaluate the model
                    y_pred = model.predict(X_train_2[test])
                    y_train_pred = model.predict(X_train_2[train])
                    # Keep the raw sigmoid outputs before thresholding.
                    preds.extend(y_pred)
                    trues.extend(y_train_2[test])

                    y_pred = y_pred > 0.5
                    s_f, s_p, s_r = _f1_precision_recall(y_train_2[test], y_pred)
                    print("\tscores f1", s_f)
                    print("\tscores p", s_p)
                    print("\tscores r", s_r)
                    scores_f.append(s_f)
                    scores_p.append(s_p)
                    scores_r.append(s_r)

                    y_train_pred = y_train_pred > 0.5
                    train_s_f, train_s_p, train_s_r = _f1_precision_recall(
                        y_train_2[train], y_train_pred)
                    print("\tscores f1 train", train_s_f)
                    print("\tscores p train", train_s_p)
                    print("\tscores r train", train_s_r)
                    train_scores_f.append(train_s_f)
                    train_scores_p.append(train_s_p)
                    train_scores_r.append(train_s_r)

                print("mean scores f1", np.mean(scores_f))
                print("mean scores p", np.mean(scores_p))
                print("mean scores r", np.mean(scores_r))

                print("mean scores f1 train", np.mean(train_scores_f))
                print("mean scores p train", np.mean(train_scores_p))
                print("mean scores r train", np.mean(train_scores_r))

num_layers 1 
first_layer 20 
each_layer 5 
num_smote 1 
loops 20
	scores f1 0.9333333333333333
	scores p 0.9333333333333333
	scores r 0.9333333333333333
	scores f1 train 0.9176470588235294
	scores p train 0.9285714285714286
	scores r train 0.9069767441860465
num_layers 1 
first_layer 20 
each_layer 5 
num_smote 1 
loops 20
	scores f1 0.7499999999999999
	scores p 1.0
	scores r 0.6
	scores f1 train 0.9090909090909091
	scores p train 0.9493670886075949
	scores r train 0.872093023255814
num_layers 1 
first_layer 20 
each_layer 5 
num_smote 1 
loops 20
	scores f1 0.5833333333333334
	scores p 0.7777777777777778
	scores r 0.4666666666666667
	scores f1 train 0.9221556886227545
	scores p train 0.9506172839506173
	scores r train 0.8953488372093024
num_layers 1 
first_layer 20 
each_layer 5 
num_smote 1 
loops 20
	scores f1 0.8125000000000001
	scores p 0.7222222222222222
	scores r 0.9285714285714286
	scores f1 train 0.9371428571428572
	scores p train 0.9318181818181818
	scores r train 0.94252873

In [140]:
# Turn the accumulated out-of-fold lists into NumPy arrays so they can be
# compared and indexed element-wise.
preds, trues = np.array(preds), np.array(trues)

In [113]:
# True labels of the out-of-fold rows whose predicted score exceeds 0.8.
confident_rows = np.nonzero(preds > 0.8)[0]
trues[confident_rows]

array([], dtype=bool)

In [114]:
# Fit one network on the whole inner training split (X_train_2 / y_train_2) and score
# it on the inner hold-out split (X_test_2 / y_test_2) and on the training rows.
num_layers = 2
first_layer = 75
each_layer = 10

# create model
# Take the input width from the data this cell trains on, not from `X_train_res`,
# which only exists as a leftover of the cross-validation loop.
n_cols = X_train_2.shape[1]
model = Sequential()
model.add(Dense(first_layer, activation='elu', input_dim=n_cols))
model.add(Dropout(0.5))
for i in range(num_layers):
    model.add(Dense(each_layer, activation='elu'))
    model.add(Dropout(0.5))

model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam',
              loss='binary_crossentropy')

# Fit the model
# NOTE(review): these class weights (and the layer sizes / patience) differ from the
# cross-validated configuration, and the recorded run of this cell scored 0.0 on every
# metric, i.e. no positive predictions. Confirm the weights are intended.
callbacks = [EarlyStopping(monitor='val_loss', patience=5)]
model.fit(X_train_2, y_train_2, epochs=150, validation_split=.1, verbose=0,
          callbacks=callbacks, class_weight={1: 3, 0: 7})

# evaluate the model
y_pred = model.predict(X_test_2)
y_train_pred = model.predict(X_train_2)

y_pred = y_pred > 0.5
s_f = f1_score(y_test_2, y_pred)
s_p = precision_score(y_test_2, y_pred)
s_r = recall_score(y_test_2, y_pred)
print("\tscores f1", s_f)
print("\tscores p", s_p)
print("\tscores r", s_r)

y_train_pred = y_train_pred > 0.5
train_s_f = f1_score(y_train_2, y_train_pred)
train_s_p = precision_score(y_train_2, y_train_pred)
train_s_r = recall_score(y_train_2, y_train_pred)
print("\tscores f1 train", train_s_f)
print("\tscores p train", train_s_p)
print("\tscores r train", train_s_r)
    

	scores f1 0.0
	scores p 0.0
	scores r 0.0
	scores f1 train 0.0
	scores p train 0.0
	scores r train 0.0


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
