In [27]:
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score,precision_score, recall_score, roc_auc_score
from sklearn.decomposition import PCA
from sklearn.utils import shuffle
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.mixture import GaussianMixture
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer
import tensorflow.keras
import keras.metrics
from keras.layers import Dense, Dropout, Conv2D, MaxPooling2D, Input
from keras.models import Sequential
from keras.callbacks import History 
from keras.utils import plot_model
from keras.optimizers import SGD
from keras.models import Model
from imblearn.under_sampling import RandomUnderSampler
from sklearn.semi_supervised import LabelSpreading, LabelPropagation
import featuretools as ft

In [28]:
path = "PTSD.xlsx"
df = pd.read_excel(path)
df = df[~df["PCL_Strict3"].isna()]

In [29]:
features = ["ID" ,"age", "highschool_diploma", "Hebrew", "dyslexia", "ADHD", "T1ETBE", "T1Acc1t",
                         "T1Acc1n", "T1bias", "T2Acc1t", "T2Acc1n", "T2bias", "state1", "state2", "trait1",
                         "trait2", "lot1", "lot2", "phq1", "phq2", "PCL1", "PCL2", "cd_risc1", "ptgi2",
                         "active_coping1", "planning1", "positive_reframing1", "acceptance1", "humor1",
                         "religion1", "emotional_support1", "instrumental_support1", "self_distraction1",
                         "denial1", "venting1", "substance_use1", "behavioral_disengagement1", "self_blame1",
                         "active_coping2", "planning2", "positive_reframing2", "acceptance2", "humor2",
                         "religion2", "emotional_support2", "instrumental_support2","self_distraction2",
                         "denial2", "venting2", "substance_use2", "behavioral_disengagement2", "self_blame2",
                         "trauma_history8_1", "military_exposure_unit", "HML_5HTT", "HL_MAOA", "HML_NPY",
                         "COMT_Hap1_recode", "COMT_Hap2_recode", "COMT_Hap1_LvsMH", "HML_FKBP5", "Ashken_scale",
                         "Sephar_scale", "Unknown", 'terror_p1', 'terror_i1', 'mva_p1', 'mva_i1',                   
                         'violent1', 'sexual1', 'rockets_p1', 'rockets_i1', 'trauma_history6_1',
                        'terror_p2','terror_i2','mva_p2', 'mva_i2', 'violent2', 'sexual2', 'rockets_p2',
                        'rockets_i2', 'trauma6t2', 'trauma8t2', 'military_exp18_1','military_exp18_t2',
                        'commanders18','commanders20', 'commanders22', 't1bias_1_zero', 'state1_zero',
                        'trait1_zero', 'PHQ1_zero', 'PCL1_zero', 'depression_clinical2', 'avoid_bias',
                        'ptsd1_clini', 'avoidance_cop', 'clinical_depression15', 'avoidance_compa',
                        'resilience_compa', 'combat_compa', 'emotional_cop1n', 'avoidance_cop2']

In [30]:
numerical_features = [ "T1ETBE", "T1Acc1t", "T1Acc1n", "T1bias", "T2Acc1t","T2Acc1n", "T2bias", "state1", "state2",
                        "trait1", "trait2", "lot1", "lot2", "phq1", "phq2", "cd_risc1", "PCL1", "PCL2"]
categorical_features = [ "age", "highschool_diploma", "Hebrew", "dyslexia", "ADHD",   "ptgi2",
                    "active_coping1", "planning1", "positive_reframing1", "acceptance1", "humor1",
                    "religion1", "emotional_support1", "instrumental_support1", "self_distraction1",
                    "denial1", "venting1", "substance_use1", "behavioral_disengagement1", "self_blame1",
                    "active_coping2", "planning2", "positive_reframing2", "acceptance2", "humor2",
                    "religion2", "emotional_support2", "instrumental_support2", "self_distraction2",
                    "denial2", "venting2", "substance_use2", "behavioral_disengagement2", "self_blame2",
                    "trauma_history8_1", "military_exposure_unit", "HML_5HTT", "HL_MAOA", "HML_NPY",
                    "COMT_Hap1_recode", "COMT_Hap2_recode", "COMT_Hap1_LvsMH", "HML_FKBP5", "Ashken_scale",
                    "Sephar_scale", "Unknown", 'terror_p1', 'terror_i1', 'mva_p1', 'mva_i1',                   
                    'violent1', 'sexual1', 'rockets_p1', 'rockets_i1', 'trauma_history6_1', 'terror_p2','terror_i2',
                    'mva_p2', 'mva_i2', 'violent2', 'sexual2', 'rockets_p2', 'rockets_i2', 'trauma6t2', 'trauma8t2',
                    'military_exp18_1','military_exp18_t2', 'commanders18','commanders20','commanders22', 
                    't1bias_1_zero', 'state1_zero', 'trait1_zero', 'PHQ1_zero', 'PCL1_zero', 'depression_clinical2',
                    'avoid_bias', 'ptsd1_clini', 'avoidance_cop', 'clinical_depression15', 'avoidance_compa',
                    'resilience_compa', 'combat_compa', 'emotional_cop1n', 'avoidance_cop2', 'avoidance_cop']


In [31]:
df.dropna(thresh=0.5, axis=1,inplace=True)
df.dropna(thresh=0.9, axis=0,inplace=True)

In [32]:
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
df[numerical_features] = imp.fit_transform(df[numerical_features])

imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
df[categorical_features] = imp.fit_transform(df[categorical_features])

X = df[features]
X = X - X.mean()

Y = df["PCL_Strict3"]

In [33]:
es = ft.EntitySet(id = 'ID')
es = es.entity_from_dataframe(entity_id='ID', dataframe = X)
features, feature_names = ft.dfs(entityset = es, target_entity = 'ID',  max_depth = 5)



In [34]:
X_train, X_test, y_train, y_test = train_test_split(features, Y, test_size = 0.1, random_state=271828, stratify=Y)
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X_train, y_train, test_size = 0.05, random_state=271828, stratify=y_train)


In [35]:
def create_data(X_train, y_train):
    X_train_3 = X_train[y_train==1]
    y_train_3 = y_train[y_train==1]
    X_train_4 = X_train[y_train==0][:20:]
    y_train_4 = y_train[y_train==0][:20:]
    X_train_5 = np.vstack((X_train_4, X_train_3))
    y_train_5 =  np.hstack((y_train_4, y_train_3))
    sm = SMOTE(random_state=27)
    X_train_6, y_train_6 = sm.fit_sample(X_train_5, y_train_5.ravel())
    X_train_6 = X_train_6[y_train_6==0]
    y_train_6 = y_train_6[y_train_6==0]
    return X_train_6, y_train_6

In [36]:
kfold = StratifiedKFold(n_splits=3, shuffle=True)
cvscores = []
y_train_2 = np.array(y_train_2)
X_train_2 = np.array(X_train_2)

for class_weight in [{1:0.35, 0:0.65}, {1:0.25, 0:0.75}, {1:0.15, 0:0.85}]:
    for num_smote in [0.9]:
        for lim in [0.5, 0.6]:
            for first_layer in [30]:
                for loops in [1]:
                    for each_layer in [20]:
                        n_pca = 0.99
                        first_layer = first_layer
                        each_layer = each_layer
                        num_smote = num_smote
                        loops = loops
                        scores_f = []
                        scores_p = []
                        scores_r = []
                        print("\n\nlim", lim, "\nfirst_layer", first_layer, 
                                  "\neach_layer", each_layer, "\nnum_smote", num_smote,
                              "\nloops", loops, "\nclass_weight", class_weight)
                        for train, test in kfold.split(X_train_2, y_train_2):
                            
                            y_pred = np.zeros_like(y_train_2[test]).reshape(-1, 1)
                            t = 15
                            for depth in range(t):
                                sm = SMOTE(num_smote)
                                X_train_res, y_train_res = sm.fit_sample(X_train_2[train], y_train_2[train].ravel())

                                pca = PCA(n_components = n_pca)
                                X_train_res = pca.fit_transform(X_train_res)
                                X_test_2 = pca.transform(X_train_2[test])


                                n_cols = X_train_res.shape[1]
                                model = Sequential()
                                model.add(Dense(8, activation='elu', input_dim = n_cols))
                                model.add(Dropout(0.5))


#                                 model.add(Dense(15, activation='elu'))
#                                 model.add(Dropout(0.5))

                                model.add(Dense(5 , activation='elu'))
                                model.add(Dropout(0.5))
                                
                                
                                model.add(Dense(3, activation='elu'))
                                model.add(Dropout(0.5))
                                           
                                model.add(Dense(1, activation='sigmoid'))

                                model.compile(optimizer='adam', 
                                                  loss='binary_crossentropy')

                                    # Fit the model
                                model.fit(X_train_res, y_train_res, epochs = 700, class_weight = class_weight, verbose=0)
                                    # evaluate the model
                                #print (model.predict(X_test_2).shape)
                                y_pred +=  model.predict(X_test_2)

                        
                            y_pred = (y_pred/t) > lim
                            s_f = f1_score(y_train_2[test], y_pred)
                            s_p = precision_score(y_train_2[test], y_pred)
                            s_r = recall_score(y_train_2[test], y_pred)
                            print("\tscores f1", (s_f))
                            print("\tscores p", (s_p))
                            print("\tscores r", (s_r))
                            scores_f.append(s_f)
                            scores_p.append(s_p)
                            scores_r.append(s_r)

                        print("mean scores f1", np.mean(scores_f))
                        print("mean scores p", np.mean(scores_p))
                        print("mean scores r", np.mean(scores_r))



lim 0.5 
first_layer 30 
each_layer 20 
num_smote 0.9 
loops 1 
class_weight {1: 0.35, 0: 0.65}
	scores f1 0.3225806451612903
	scores p 0.2777777777777778
	scores r 0.38461538461538464
	scores f1 0.1904761904761905
	scores p 0.25
	scores r 0.15384615384615385
	scores f1 0.14814814814814814
	scores p 0.13333333333333333
	scores r 0.16666666666666666
mean scores f1 0.22040166126187632
mean scores p 0.22037037037037036
mean scores r 0.23504273504273507


lim 0.6 
first_layer 30 
each_layer 20 
num_smote 0.9 
loops 1 
class_weight {1: 0.35, 0: 0.65}
	scores f1 0.21052631578947367
	scores p 0.3333333333333333
	scores r 0.15384615384615385
	scores f1 0.09090909090909093
	scores p 0.1111111111111111
	scores r 0.07692307692307693
	scores f1 0.2727272727272727
	scores p 0.3
	scores r 0.25
mean scores f1 0.19138755980861244
mean scores p 0.24814814814814815
mean scores r 0.16025641025641027


lim 0.5 
first_layer 30 
each_layer 20 
num_smote 0.9 
loops 1 
class_weight {1: 0.25, 0: 0.75}
	score

KeyboardInterrupt: 

In [11]:
? f1_score