In [280]:
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, f1_score,precision_score, recall_score, roc_auc_score, mean_absolute_error
from sklearn.decomposition import PCA
from sklearn.utils import shuffle
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from keras.callbacks import EarlyStopping, ModelCheckpoint

from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.mixture import GaussianMixture
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer
import tensorflow.keras
import keras.metrics
from imblearn.under_sampling import RandomUnderSampler

from keras.layers import Dense, Dropout, Conv2D, MaxPooling2D
from keras.models import Sequential
from keras.callbacks import History 
from keras.utils import plot_model
from keras.optimizers import SGD
from sklearn.preprocessing import StandardScaler

In [None]:
path = "PTSD.xlsx"
df = pd.read_excel(path)
df = df[~df["d_ptsd"].isna()]

In [None]:
features =  [ "age", "highschool_diploma", "Hebrew", "dyslexia", "ADHD", "T1ETBE", "T1Acc1t",
                         "T1Acc1n", "T1bias", "T2Acc1t", "T2Acc1n", "T2bias", "state1", "state2", "trait1",
                         "trait2", "lot1", "lot2", "phq1", "phq2", "PCL1", "PCL2", "cd_risc1", "ptgi2",
                         "active_coping1", "planning1", "positive_reframing1", "acceptance1", "humor1",
                         "religion1", "emotional_support1", "instrumental_support1", "self_distraction1",
                         "denial1", "venting1", "substance_use1", "behavioral_disengagement1", "self_blame1",
                         "active_coping2", "planning2", "positive_reframing2", "acceptance2", "humor2",
                         "religion2", "emotional_support2", "instrumental_support2","self_distraction2",
                         "denial2", "venting2", "substance_use2", "behavioral_disengagement2", "self_blame2",
                         "trauma_history8_1", "military_exposure_unit", "HML_5HTT", "HL_MAOA", "HML_NPY",
                         "COMT_Hap1_recode", "COMT_Hap2_recode", "COMT_Hap1_LvsMH", "HML_FKBP5", "Ashken_scale",
                         "Sephar_scale", "Unknown"]

In [None]:
bad_features = ["T1ETBE", "T1bias", "state1", "state2", "trait1", "trait2", "phq1", "phq2", "PCL1", "PCL2",
                "denial1", "substance_use1", "self_blame1", "denial2", "substance_use2","self_blame2",
                         "trauma_history8_1", "military_exposure_unit", 'trauma6t2', 'trauma8t2',
                        't1bias_1_zero', 'state1_zero', 'trait1_zero', 'PHQ1_zero', 'PCL1_zero', 'depression_clinical2',
                        'ptsd1_clini', 'emotional_cop1n']
numerical_features = [ "T1ETBE", "T1Acc1t", "T1Acc1n", "T1bias", "T2Acc1t","T2Acc1n", "T2bias", "state1", "state2",
                        "trait1", "trait2", "lot1", "lot2", "phq1", "phq2", "cd_risc1", "PCL1", "PCL2"]
categorical_features = [ "age", "highschool_diploma", "Hebrew", "dyslexia", "ADHD",   "ptgi2",
                    "active_coping1", "planning1", "positive_reframing1", "acceptance1", "humor1",
                    "religion1", "emotional_support1", "instrumental_support1", "self_distraction1",
                    "denial1", "venting1", "substance_use1", "behavioral_disengagement1", "self_blame1",
                    "active_coping2", "planning2", "positive_reframing2", "acceptance2", "humor2",
                    "religion2", "emotional_support2", "instrumental_support2", "self_distraction2",
                    "denial2", "venting2", "substance_use2", "behavioral_disengagement2", "self_blame2",
                    "trauma_history8_1", "military_exposure_unit", "HML_5HTT", "HL_MAOA", "HML_NPY",
                    "COMT_Hap1_recode", "COMT_Hap2_recode", "COMT_Hap1_LvsMH", "HML_FKBP5", "Ashken_scale",
                    "Sephar_scale", "Unknown"]

In [None]:


# for i, feature in enumerate(features):
#     for interation in features[i::]:
#         X[f"interaction_{feature}_{interation}"] = X[feature]

In [None]:
df['bad_features'] = (df > df.mean())[bad_features].sum(axis=1)

imp = SimpleImputer(missing_values=np.nan, strategy='mean')
df[numerical_features] = imp.fit_transform(df[numerical_features])

imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
df[categorical_features] = imp.fit_transform(df[categorical_features])

X = df[features]

Y = df["d_ptsd"]

In [None]:
X["interaction_1"] = X["T1Acc1t"] * X["T2Acc1n"] * X["military_exposure_unit"]
X["interaction_2"] = X["T1Acc1n"] * X["T2Acc1t"] * X["military_exposure_unit"]
X["interaction_3"] = X["highschool_diploma"] * X["military_exposure_unit"] * X['PCL1']
X["interaction_4"] = X["T1ETBE"] * X["military_exposure_unit"] * X['HML_5HTT']

In [None]:
ss = StandardScaler()
X = ss.fit_transform(X)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.1, random_state=271828)
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X_train, y_train, test_size = 0.1, random_state=271828)
#X_train_3, X_test_3, y_train_3, y_test_3 = train_test_split(X_train_2, y_train_2, test_size = 0.1, random_state=271828, stratify=y_train_2)

In [None]:
def create_data(X_train, y_train):
    X_train_3 = X_train[y_train==1]
    y_train_3 = y_train[y_train==1]
    X_train_4 = X_train[y_train==0][:10:]
    y_train_4 = y_train[y_train==0][:10:]
    X_train_5 = np.vstack((X_train_4, X_train_3))
    y_train_5 =  np.hstack((y_train_4, y_train_3))
    sm = SMOTE(random_state=27)
    X_train_6, y_train_6 = sm.fit_sample(X_train_5, y_train_5.ravel())
    X_train_6 = X_train_6[y_train_6==0]
    y_train_6 = y_train_6[y_train_6==0]
    return X_train_6, y_train_6

In [None]:
preds = []
trues= []

In [None]:
kfold = KFold(n_splits=7, shuffle=True)
cvscores = []
y_train_2 = np.array(y_train_2)
X_train_2 = np.array(X_train_2)

for num_layers in [3]:
    for first_layer in [75]:
        for loops in [20]:
            for each_layer in [15]:
                num_layers = num_layers
                first_layer = first_layer
                each_layer = each_layer
                num_smote = 1
                loops = loops
                train_scores_f = []
                train_scores_p = []
                train_scores_r = []
                        
                scores_f = []
                scores_p = []
                scores_r = []
                
                for train, test in kfold.split(X_train_2, y_train_2):
                    print("num_layers", num_layers, "\nfirst_layer", first_layer, 
                          "\neach_layer", each_layer, "\nnum_smote", num_smote, "\nloops", loops)
                    X_train_res = X_train_2[train]
                    y_train_res = y_train_2[train]

                  # create model
                    n_cols = X_train_res.shape[1]
                    model = Sequential()
                    model.add(Dense(first_layer, activation='elu', input_dim = n_cols))
                    model.add(Dropout(0.5))

                    for i in range(num_layers):
                        model.add(Dense(each_layer, activation='elu'))
                        model.add(Dropout(0.5))

                    model.add(Dense(1, activation='sigmoid'))

                    model.compile(optimizer='adam', 
                                  loss='mean_squared_error')


                    # Fit the model
                    #callbacks = [EarlyStopping(monitor='val_loss', patience=1)]
                    model.fit(X_train_res, y_train_res, epochs = 130, validation_split = .1, verbose=0, callbacks=callbacks)
                    # evaluate the model
                    y_pred =  model.predict(X_train_2[test])
                    y_train_pred =  model.predict(X_train_2[train])
                    preds.extend(y_pred)
                    trues.extend(y_train_2[test])
                    y_pred = y_pred
                    s_f = mean_absolute_error(y_train_2[test], y_pred)
                    print("\tscores f1", (s_f))
                    scores_f.append(s_f)

                    y_train_pred = (y_train_pred)
                    train_s_f = mean_absolute_error(y_train_2[train], y_train_pred)
                    print("\tscores f1 train", (train_s_f))
 
                    train_scores_f.append(train_s_f)

                print("mean scores f1", np.mean(scores_f))

                        
                print("mean scores f1 train", np.mean(train_scores_f))


In [None]:
preds = np.array(preds)
trues = np.array(trues)

In [None]:
num_layers = 3
first_layer = 75
each_layer = 15

# create model
n_cols = X_train_res.shape[1]
model = Sequential()
model.add(Dense(first_layer, activation='elu', input_dim = n_cols))
model.add(Dropout(0.5))
for i in range(num_layers):
    model.add(Dense(each_layer, activation='elu'))
    model.add(Dropout(0.5))

model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', 
                                  loss='mean_squared_error')


                    # Fit the model
callbacks = [EarlyStopping(monitor='val_loss', patience=2)]
model.fit(X_train_2, y_train_2, epochs = 350, validation_split = .1, verbose=0, callbacks=callbacks)
                    # evaluate the model
y_pred =  model.predict(X_test_2)
y_train_pred =  model.predict(X_train_2)
#preds.extend(y_pred)
#trues.extend(y_train_2[test])
s_f = mean_absolute_error(y_test_2, y_pred)
print("\tscores f1", (s_f))
                    

train_s_f = mean_absolute_error(y_train_2, y_train_pred)
print("\tscores f1 train", (train_s_f))
    

In [None]:
df = pd.read_excel(path)


In [None]:
num_layers = 3
first_layer = 75
each_layer = 15

# create model
n_cols = X_train_res.shape[1]
model = Sequential()
model.add(Dense(first_layer, activation='elu', input_dim = n_cols))
model.add(Dropout(0.5))
for i in range(num_layers):
    model.add(Dense(each_layer, activation='elu'))
    model.add(Dropout(0.5))

model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', 
                                  loss='mean_squared_error')
train_here = df.iloc[list(y_train_2.index)]["PCL3"]
test_here =  df.iloc[list(y_test_2.index)]["PCL3"]
                    # Fit the model
callbacks = [EarlyStopping(monitor='val_loss', patience=2)]
model.fit(X_train_2, train_here, epochs = 350, validation_split = .1, verbose=0, callbacks=callbacks)
                    # evaluate the model
y_pred =  model.predict(test_here)
y_train_pred =  model.predict(X_train_2)
#preds.extend(y_pred)
#trues.extend(y_train_2[test])
s_f = mean_absolute_error(y_test_2, y_pred)
print("\tscores f1", (s_f))
                    

train_s_f = mean_absolute_error(y_train_2, y_train_pred)
print("\tscores f1 train", (train_s_f))
    

In [274]:

df = pd.read_excel(path)

In [275]:
mean_absolute_error(y_pred.reshape(-1)+df.iloc[list(y_test_2.index)]["PCL2"], df.iloc[list(y_test_2.index)]["PCL3"])

7.483467668831349

In [276]:
a = y_pred.reshape(-1)+df.iloc[list(y_test_2.index)]["PCL2"] > 50

In [277]:
b = df.iloc[list(y_test_2.index)]["PCL3"] > 50

In [278]:
precision_score(b, a)

0.16666666666666666

In [272]:
sum(a)

2

In [170]:
df["PCL3"] == df["d_ptsd"] + df["PCL2"]

0        True
1        True
2        True
3        True
4        True
6        True
7        True
8        True
9        True
10       True
11       True
12       True
13       True
14       True
16       True
19       True
20       True
21       True
23       True
25       True
28       True
29       True
31      False
33       True
34       True
35       True
36       True
37       True
38       True
39       True
        ...  
1048     True
1050     True
1053     True
1054     True
1059     True
1060     True
1062     True
1065     True
1066     True
1067     True
1068     True
1069     True
1071     True
1072     True
1073     True
1075     True
1076     True
1078     True
1080     True
1082     True
1083     True
1084     True
1085     True
1088     True
1097     True
1098     True
1099     True
1100     True
1101     True
1102     True
Length: 661, dtype: bool