In [1]:
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score,precision_score, recall_score, roc_auc_score
from sklearn.decomposition import PCA
from sklearn.utils import shuffle
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.mixture import GaussianMixture
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer
import tensorflow.keras
import tensorflow.keras.metrics
from tensorflow.keras.layers import Dense, Dropout, Conv2D, MaxPooling2D
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import History 
from tensorflow.keras.utils import plot_model
from tensorflow.keras.optimizers import SGD
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFECV, RFE


In [2]:
path = "PTSD.xlsx"
df = pd.read_excel(path)
df = df[~df["PCL_Strict3"].isna()]


In [3]:
features = ["age", "highschool_diploma",  "dyslexia", "ADHD", "T1Acc1t", "T1Acc1n", "T1bias", "phq1", "lot1", "trait1",
               "state1", "PCL1",  "PCL_Broad1", "PCL_Strict1", "phq2", "lot2", "trait2", "state2", "PCL2", "PCL_Broad2", 
                 "PCL_Strict2", "cd_risc1", "active_coping1", "planning1", "positive_reframing1", "acceptance1", "humor1",
                 "religion1", "emotional_support1","instrumental_support1", "self_distraction1", "denial1", 
               "venting1", "substance_use1", "behavioral_disengagement1", "self_blame1", "active_coping2", "planning2",
                "positive_reframing2", "acceptance2", "humor2", "religion2", "emotional_support2", "instrumental_support2", 
                 "self_distraction2", "denial2", "venting2", "substance_use2", "behavioral_disengagement2", "self_blame2",
                 "trauma_history8_1", "HML_5HTT", "HL_MAOA", "HML_NPY", "COMT_Ranked", "COMT_Hap1_recode", 
               "COMT_Hap2_recode", "COMT_Hap1_LvsMH", "HML_FKBP5", "bad_features"]

In [4]:
bad_features = ["T1ETBE", "T1bias", "state1", "state2", "trait1", "trait2", "phq1", "phq2", "PCL1", "PCL2",
                "denial1", "substance_use1", "self_blame1", "denial2", "substance_use2","self_blame2",
                         "trauma_history8_1"]
numerical_features = [ "T1ETBE", "T1Acc1t", "T1Acc1n", "T1bias", "T2Acc1t","T2Acc1n", "T2bias", "state1", "state2",
                        "trait1", "trait2", "lot1", "lot2", "phq1", "phq2", "cd_risc1", "PCL1", "PCL2", "bad_features"]
categorical_features = ["age", "highschool_diploma",  "dyslexia", "ADHD", "T1Acc1t", "T1Acc1n", "T1bias", "phq1", "lot1", "trait1",
               "state1", "PCL1",  "PCL_Broad1", "PCL_Strict1", "phq2", "lot2", "trait2", "state2", "PCL2", "PCL_Broad2", 
                 "PCL_Strict2", "cd_risc1", "active_coping1", "planning1", "positive_reframing1", "acceptance1", "humor1",
                 "religion1", "emotional_support1","instrumental_support1", "self_distraction1", "denial1", 
               "venting1", "substance_use1", "behavioral_disengagement1", "self_blame1", "active_coping2", "planning2",
                "positive_reframing2", "acceptance2", "humor2", "religion2", "emotional_support2", "instrumental_support2", 
                 "self_distraction2", "denial2", "venting2", "substance_use2", "behavioral_disengagement2", "self_blame2",
                 "trauma_history8_1", "HML_5HTT", "HL_MAOA", "HML_NPY", "COMT_Ranked", "COMT_Hap1_recode", 
               "COMT_Hap2_recode", "COMT_Hap1_LvsMH", "HML_FKBP5", "Ashken_scale", "Sephar_scale", "Unknown"]

df['bad_features'] = (df > df.mean())[bad_features].sum(axis=1)

imp = SimpleImputer(missing_values=np.nan, strategy='mean')
df[numerical_features] = imp.fit_transform(df[numerical_features])

imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
df[categorical_features] = imp.fit_transform(df[categorical_features])

pca = PCA(n_components = 1)
df["pcls"] = pca.fit_transform(df[["PCL1",  "PCL_Broad1", "PCL_Strict1"]])
df.drop(["PCL1",  "PCL_Broad1", "PCL_Strict1"], axis=1, inplace=True)
df["pcls2"] = pca.fit_transform(df[["PCL2",  "PCL_Broad2", "PCL_Strict2"]])
df.drop(["PCL2",  "PCL_Broad2", "PCL_Strict2"], axis=1, inplace=True)
features.append("pcls2")

features.remove("PCL1")
features.remove("PCL_Broad1")
features.remove("PCL_Strict1")


features.remove("PCL2")
features.remove("PCL_Broad2")
features.remove("PCL_Strict2")

X = df[features]


ss = StandardScaler()
X = ss.fit_transform(X)


Y = df["PCL_Strict3"]

In [5]:
unsup = 1

if unsup:

    rand_item = eval("['pcls2', 'positive_reframing1', 'trauma_history8_1', 'lot2', 'T1bias', 'emotional_support1', 'self_blame1', 'pcls2']")

    pca = PCA(n_components = 2)
    data_transformed = df[rand_item]
    data_transformed = pca.fit_transform(data_transformed)

    db = DBSCAN(eps=0.5, min_samples=10, leaf_size=30).fit(X)
    X = np.hstack((db.labels_.reshape(-1,1), X))


    rand_item = eval("['instrumental_support1', 'COMT_Hap1_recode', 'dyslexia', 'HL_MAOA', 'lot1', 'active_coping2', 'ADHD']")

    pca = PCA(n_components = 2)
    data_transformed = df[rand_item]
    data_transformed = pca.fit_transform(data_transformed)

    db = DBSCAN(eps=0.7, min_samples=10, leaf_size=30).fit(X)
    X = np.hstack((db.labels_.reshape(-1,1), X))


    rand_item = eval("['COMT_Hap1_LvsMH', 'venting2', 'positive_reframing2', 'COMT_Ranked', 'behavioral_disengagement2', 'trait1', 'HL_MAOA', 'HL_MAOA']")

    pca = PCA(n_components = 2)
    data_transformed = df[rand_item]
    data_transformed = pca.fit_transform(data_transformed)

    db = DBSCAN(eps=0.5, min_samples=10, leaf_size=30).fit(X)
    X = np.hstack((db.labels_.reshape(-1,1), X))


    rand_item = eval("['pcls2', 'emotional_support2', 'pcls2', 'emotional_support1', 'COMT_Hap1_LvsMH', 'HML_FKBP5', 'lot1', 'COMT_Hap1_LvsMH', 'COMT_Hap2_recode', 'state1']")

    pca = PCA(n_components = 2)
    data_transformed = df[rand_item]
    data_transformed = pca.fit_transform(data_transformed)

    db = DBSCAN(eps=0.6, min_samples=10, leaf_size=30).fit(X)
    X = np.hstack((db.labels_.reshape(-1,1), X))


    rand_item = eval("['phq2', 'substance_use1', 'instrumental_support2', 'substance_use1', 'trait1']")

    pca = PCA(n_components = 2)
    data_transformed = df[rand_item]
    data_transformed = pca.fit_transform(data_transformed)

    db = DBSCAN(eps=0.5, min_samples=5, leaf_size=30).fit(X)
    X = np.hstack((db.labels_.reshape(-1,1), X))


In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.1, random_state=271828, stratify=Y)
#X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X_train, y_train, test_size = 0.1, random_state=271828, stratify=y_train)


In [8]:
kfold = StratifiedKFold(n_splits=7, shuffle=True)
cvscores = []
y_train_2 = np.array(y_train)
X_train_2 = np.array(X_train)

#eval
pres=[]
rs = []
for train, test in kfold.split(X_train_2, y_train_2):

    rfe =  RFE(RandomForestClassifier(n_estimators=100), 18).fit(X_train_2[train], y_train_2[train])
    print(rfe.ranking_)
    X_train_rfe = rfe.transform(X_train_2[train])
    X_test_rfe = rfe.transform(X_train_2[test])
    
    sm = SMOTE(random_state=27, k_neighbors= 12)
    X_train_res, y_train_res = sm.fit_sample(X_train_rfe, y_train_2[train].ravel())

    model_number = 4

        
    if model_number == 4:
        # create model
        n_cols = X_train_res.shape[1]
        model = Sequential()

        
        model.add(Dense(60, activation='elu'))
        model.add(Dropout(0.5))


        model.add(Dense(1, activation='sigmoid'))

        model.compile(optimizer='adam', loss='binary_crossentropy')


        # Fit the model
        model.fit(X_train_res, y_train_res, epochs = 300, verbose=0, class_weight = {0:10, 1:1})

        # evaluate the model                    
        y_pred =  model.predict(X_test_rfe)
        y_pred = y_pred > 0.5

        
    s = precision_score(y_train_2[test], y_pred)
    print("\n\nprecision\n", s)
    pres.append(s)
    
    r = recall_score(y_train_2[test], y_pred)
    print("recall\n", r)
    rs.append(r)
    
print("\n\nmean precision", sum(pres)/len(pres))
print("\n\nmean recall", sum(rs)/len(rs))

[43 42 41 40 39  1 31 37 26 18 15  1  1  1  1  1  1  1  1  1  1 30 32 34
 29 24 16 28 27 20 36 13 19 14 11 25 23  1 10  2 22 12  7  1  1  1  1  4
 21  8  6 33  3  9 35  5 38 17  1  1]


precision
 0.0
recall
 0.0
[43 42 41 40 39  1 36 37  8 24 12  1  1  1  1  1  1  1  1  1  1 28 35 23
 21 27 17 32 20 19  9 14 34 11 25 31 29  4  3  5 16  1 10  1  1  2 15 18
 13  1  6 30  1  7 33 26 38 22  1  1]


precision
 0.5
recall
 0.16666666666666666
[43 42 41 40 39  1 10 37 27 26  5  1  1  1  1  1  1  1  1  1  1 24 33 35
 21 22  1 16  9 19  4  7 34  1 11 14 28  3 17  6 18 15 12  1  1  8  1 29
 23 13 20 30  2 25 31 32 38 36  1  1]


precision
 0.0
recall
 0.0
[43 42 41 40 39 14 32 37 15 22  9  1  1  1  1  1  1  1  1  1  1 27 33 26
 36 19 20 29 17 28  3 11 18  1 12 31 30  1  4  1 21  5 10  1  1  1  7 23
 13 16  6 34  2 35 25  8 38 24  1  1]


precision
 0.16666666666666666
recall
 0.16666666666666666
[43 42 41 40 39  1 30 37 13 26 21  1  1  1  1  1  1  1  1  1  1 32 35 34
 24  1 12 20 17 27  3 19 29

In [None]:
##

# RFE = 5 
## unsup 1 model 4 - 25/20/15
## p - 0.82
## r - 0.12


# RFE = 5 
## unsup 1 model 4 - 25/20/15    w - 10/20
## p - 0.82
## r - 0.12


# RFE = 5 
## unsup 1 model 4 - 25/20/15    w - 10/15
## p - 0.8
## r - 0.12


# RFE = 4
## unsup 1 model 4 - 25/20/15    w - 10/15
## p - 0.8
## r - 0.12

In [35]:
# RFE = 3

## unsup 1 model 4 - 
## p - 0.857
## r - 0.0995
## unsup 1 model 0 - 
## p - 0.86
## r - 0.099

## unsup 0 model 4 - 
## p - 0.839
## r - 0.0995
## unsup 0 model 0 - 
## p - 0.725
## r - 0.101


# RFE = 13

## unsup 1 model 4 - 
## p - 0.52
## r - 0.14

## unsup 0 model 4 - 
## p - 0.6
## r - 0.15


# RFE = 7

## unsup 1 model 4 - 
## p - 0.67
## r - 0.14


# RFE = 5

## unsup 1 model 4 - 
## p - 0.67
## r - 0.14

In [42]:

rfe =  RFE(RandomForestClassifier(n_estimators=100), 5).fit(X_train_2, y_train_2)

X_train_rfe = rfe.transform(X_train_2)
X_test_rfe = rfe.transform(X_test_2)
    
sm = SMOTE(random_state=27)
X_train_res, y_train_res = sm.fit_sample(X_train_rfe, y_train_2.ravel())

# create model
n_cols = X_train_res.shape[1]
model = Sequential()
    
model.add(Dense(10, activation='elu', input_dim = n_cols))
model.add(Dropout(0.5))


model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy')


# Fit the model
model.fit(X_train_res, y_train_res, epochs = 250, verbose=0, class_weight = {0:8, 1:1})
    
# evaluate the model                    
y_pred =  model.predict(X_test_rfe)
y_pred = y_pred > 0.5

s = precision_score(y_test_2, y_pred)
print("\n\nprecision\n", s)
print("recall\n", recall_score(y_test_2, y_pred))




precision
 0.0
recall
 0.0


In [None]:
y_pred

In [None]:

rfe =  RFE(RandomForestClassifier(n_estimators=100), 5).fit(X_train, y_train)

X_train_rfe = rfe.transform(X_train)
X_test_rfe = rfe.transform(X_test)
    
sm = SMOTE(random_state=27)
X_train_res, y_train_res = sm.fit_sample(X_train_rfe, y_train.ravel())

# create model
n_cols = X_train_res.shape[1]
model = Sequential()
    
model.add(Dense(7, activation='elu', input_dim = n_cols))
model.add(Dropout(0.5))

model.add(Dense(5, activation='elu'))
model.add(Dropout(0.5))

model.add(Dense(3, activation='elu'))
model.add(Dropout(0.5))

model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy')


# Fit the model
model.fit(X_train_res, y_train_res, epochs = 250, verbose=0, class_weight = {0:1, 1:1.7})
    
# evaluate the model                    
y_pred =  model.predict(X_test_rfe)
y_pred = y_pred > 0.5

s = precision_score(y_pred,y_test)
print("\n\nprecision\n", s)
print("recall\n", recall_score(y_pred,y_test))
