In [1]:
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score,precision_score, recall_score, roc_auc_score
from sklearn.decomposition import PCA
from sklearn.utils import shuffle
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.mixture import GaussianMixture
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer
import tensorflow.keras
import keras.metrics
from keras.layers import Dense, Dropout, Conv2D, MaxPooling2D
from keras.models import Sequential
from keras.callbacks import History 
from keras.utils import plot_model
from keras.optimizers import SGD
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFECV, RFE


Using TensorFlow backend.


In [2]:
path = "PTSD.xlsx"
df = pd.read_excel(path)
df = df[~df["PCL_Strict3"].isna()]


In [3]:
features = ["age", "highschool_diploma",  "dyslexia", "ADHD", "T1Acc1t", "T1Acc1n", "T1bias", "phq1", "lot1", "trait1",
               "state1", "PCL1",  "PCL_Broad1", "PCL_Strict1", "cd_risc1", "active_coping1", "planning1", "positive_reframing1",
               "acceptance1", "humor1", "religion1", "emotional_support1", "instrumental_support1", "self_distraction1",
               "denial1", "venting1", "substance_use1", "behavioral_disengagement1", "self_blame1", "trauma_history8_1",
               "HML_5HTT", "HL_MAOA", "HML_NPY", "COMT_Ranked", "COMT_Hap1_recode", "COMT_Hap2_recode", "COMT_Hap1_LvsMH",
               "HML_FKBP5", "Ashken_scale", "Sephar_scale", "Unknown"]

In [4]:
bad_features = ["T1ETBE", "T1bias", "state1", "trait1", "phq1", "PCL1",
                "denial1", "substance_use1", "self_blame1", "trauma_history8_1"]
numerical_features = [ "T1ETBE", "T1Acc1t", "T1Acc1n", "T1bias", "T2Acc1t","T2Acc1n", "T2bias", "state1", "state2",
                        "trait1", "trait2", "lot1", "lot2", "phq1", "phq2", "cd_risc1", "PCL1", "PCL2", "bad_features"]
categorical_features = ["age", "highschool_diploma",  "dyslexia", "ADHD", "T1Acc1t", "T1Acc1n", "T1bias", "phq1", "lot1", "trait1",
               "state1", "PCL1",  "PCL_Broad1", "PCL_Strict1", "phq2", "lot2", "trait2", "state2", "PCL2", "PCL_Broad2", 
                 "PCL_Strict2", "cd_risc1", "active_coping1", "planning1", "positive_reframing1", "acceptance1", "humor1",
                 "religion1", "emotional_support1","instrumental_support1", "self_distraction1", "denial1", 
               "venting1", "substance_use1", "behavioral_disengagement1", "self_blame1", "active_coping2", "planning2",
                "positive_reframing2", "acceptance2", "humor2", "religion2", "emotional_support2", "instrumental_support2", 
                 "self_distraction2", "denial2", "venting2", "substance_use2", "behavioral_disengagement2", "self_blame2",
                 "trauma_history8_1", "HML_5HTT", "HL_MAOA", "HML_NPY", "COMT_Ranked", "COMT_Hap1_recode", 
               "COMT_Hap2_recode", "COMT_Hap1_LvsMH", "HML_FKBP5", "Ashken_scale", "Sephar_scale", "Unknown"]

df['bad_features'] = (df > df.mean())[bad_features].sum(axis=1)

imp = SimpleImputer(missing_values=np.nan, strategy='mean')
df[numerical_features] = imp.fit_transform(df[numerical_features])

imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
df[categorical_features] = imp.fit_transform(df[categorical_features])

pca = PCA(n_components = 1)
df["pcls"] = pca.fit_transform(df[["PCL1",  "PCL_Broad1", "PCL_Strict1"]])
df.drop(["PCL1",  "PCL_Broad1", "PCL_Strict1"], axis=1, inplace=True)
features.append("pcls")
features.remove("PCL1")
features.remove("PCL_Broad1")
features.remove("PCL_Strict1")


X = df[features]


ss = StandardScaler()
X = ss.fit_transform(X)


Y = df["PCL_Strict3"]

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.1, random_state=271828, stratify=Y)
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X_train, y_train, test_size = 0.1, random_state=271828, stratify=y_train)


In [None]:
kfold = StratifiedKFold(n_splits=10, shuffle=True)
cvscores = []
y_train_2 = np.array(y_train_2)
X_train_2 = np.array(X_train_2)

#eval
pres=[]
for train, test in kfold.split(X_train_2, y_train_2):

    rfe =  RFE(RandomForestClassifier(n_estimators=100), 8).fit(X_train_2[train], y_train_2[train])

    X_train_rfe = rfe.transform(X_train_2[train])
    X_test_rfe = rfe.transform(X_train_2[test])
    
    sm = SMOTE(random_state=27)
    X_train_res, y_train_res = sm.fit_sample(X_train_rfe, y_train_2[train].ravel())

    # create model
    n_cols = X_train_res.shape[1]
    model = Sequential()
    
    model.add(Dense(70, activation='elu', input_dim = n_cols))
    model.add(Dropout(0.5))

#     model.add(Dense(5, activation='elu'))
#     model.add(Dropout(0.5))

    
    model.add(Dense(1, activation='sigmoid'))

    model.compile(optimizer='adam', loss='binary_crossentropy')


    # Fit the model
    model.fit(X_train_res, y_train_res, epochs = 500, verbose=0, class_weight = {0:5, 1:1})
    
    # evaluate the model                    
    y_pred =  model.predict(X_test_rfe)
    y_pred = y_pred > 0.5

    s = precision_score(y_train_2[test], y_pred)
    print("\n\nprecision\n", s)
    pres.append(s)
    print("recall\n", recall_score(y_train_2[test], y_pred))
print("\n\nmean precision", sum(pres)/len(pres))



precision
 0.5
recall
 0.25


precision
 0.0
recall
 0.0


precision
 0.0
recall
 0.0


In [7]:

rfe =  RFE(RandomForestClassifier(n_estimators=100), 5).fit(X_train_2, y_train_2)

X_train_rfe = rfe.transform(X_train_2)
X_test_rfe = rfe.transform(X_test_2)
    
sm = SMOTE(random_state=27)
X_train_res, y_train_res = sm.fit_sample(X_train_rfe, y_train_2.ravel())

# create model
n_cols = X_train_res.shape[1]
model = Sequential()
    
model.add(Dense(7, activation='elu', input_dim = n_cols))
model.add(Dropout(0.5))

model.add(Dense(5, activation='elu'))
model.add(Dropout(0.5))

model.add(Dense(3, activation='elu'))
model.add(Dropout(0.5))

model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy')


# Fit the model
model.fit(X_train_res, y_train_res, epochs = 250, verbose=0, class_weight = {0:1, 1:1.7})
    
# evaluate the model                    
y_pred =  model.predict(X_test_rfe)
y_pred = y_pred > 0.5

s = precision_score(y_pred,y_test_2)
print("\n\nprecision\n", s)
print("recall\n", recall_score(y_pred,y_test_2))




precision
 0.75
recall
 0.08333333333333333


In [8]:

rfe =  RFE(RandomForestClassifier(n_estimators=100), 5).fit(X_train, y_train)

X_train_rfe = rfe.transform(X_train)
X_test_rfe = rfe.transform(X_test)
    
sm = SMOTE(random_state=27)
X_train_res, y_train_res = sm.fit_sample(X_train_rfe, y_train.ravel())

# create model
n_cols = X_train_res.shape[1]
model = Sequential()
    
model.add(Dense(7, activation='elu', input_dim = n_cols))
model.add(Dropout(0.5))

model.add(Dense(5, activation='elu'))
model.add(Dropout(0.5))

model.add(Dense(3, activation='elu'))
model.add(Dropout(0.5))

model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy')


# Fit the model
model.fit(X_train_res, y_train_res, epochs = 250, verbose=0, class_weight = {0:1, 1:1.7})
    
# evaluate the model                    
y_pred =  model.predict(X_test_rfe)
y_pred = y_pred > 0.5

s = precision_score(y_pred,y_test)
print("\n\nprecision\n", s)
print("recall\n", recall_score(y_pred,y_test))




precision
 1.0
recall
 0.11764705882352941
