In [8]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier,VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score,precision_score, recall_score, roc_auc_score
from sklearn.decomposition import PCA
from sklearn.utils import shuffle
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [9]:
path = "PTSD.xlsx"
df = pd.read_excel(path)
df = df[~df["PCL_Strict3"].isna()]


In [10]:
features = [ "age", "highschool_diploma", "Hebrew", "dyslexia", "ADHD", "T1ETBE", "T1Acc1t",
                         "T1Acc1n", "T1bias", "T2Acc1t", "T2Acc1n", "T2bias", "state1", "state2", "trait1",
                         "trait2", "lot1", "lot2", "phq1", "phq2", "PCL1", "PCL2", "cd_risc1", "ptgi2",
                         "active_coping1", "planning1", "positive_reframing1", "acceptance1", "humor1",
                         "religion1", "emotional_support1", "instrumental_support1", "self_distraction1",
                         "denial1", "venting1", "substance_use1", "behavioral_disengagement1", "self_blame1",
                         "active_coping2", "planning2", "positive_reframing2", "acceptance2", "humor2",
                         "religion2", "emotional_support2", "instrumental_support2","self_distraction2",
                         "denial2", "venting2", "substance_use2", "behavioral_disengagement2", "self_blame2",
                         "trauma_history8_1", "military_exposure_unit", "HML_5HTT", "HL_MAOA", "HML_NPY",
                         "COMT_Hap1_recode", "COMT_Hap2_recode", "COMT_Hap1_LvsMH", "HML_FKBP5", "Ashken_scale",
                         "Sephar_scale", "Unknown", 'terror_p1', 'terror_i1', 'mva_p1', 'mva_i1',                   
                        'violent1', 'sexual1', 'rockets_p1', 'rockets_i1', 'trauma_history6_1',
                        'terror_p2','terror_i2','mva_p2', 'mva_i2', 'violent2', 'sexual2', 'rockets_p2', 'rockets_i2', 'trauma6t2', 'trauma8t2',
'military_exp18_1','military_exp18_t2', 'commanders18','commanders20','commanders22', 
't1bias_1_zero', 'state1_zero', 'trait1_zero', 'PHQ1_zero', 'PCL1_zero', 'depression_clinical2',
                        'avoid_bias', 'd_ptsd', 'ptsd1_clini', 'avoidance_cop',
'clinical_depression15', 'avoidance_compa', 'resilience_compa', 'combat_compa', 'emotional_cop1n', 'avoidance_cop2', 'avoidance_cop']

In [11]:
numerical_features = [ "T1ETBE", "T1Acc1t", "T1Acc1n", "T1bias", "T2Acc1t","T2Acc1n", "T2bias", "state1", "state2",
                        "trait1", "trait2", "lot1", "lot2", "phq1", "phq2", "cd_risc1", "PCL1", "PCL2"]
categorical_features = [ "age", "highschool_diploma", "Hebrew", "dyslexia", "ADHD",   "ptgi2",
                    "active_coping1", "planning1", "positive_reframing1", "acceptance1", "humor1",
                    "religion1", "emotional_support1", "instrumental_support1", "self_distraction1",
                    "denial1", "venting1", "substance_use1", "behavioral_disengagement1", "self_blame1",
                    "active_coping2", "planning2", "positive_reframing2", "acceptance2", "humor2",
                    "religion2", "emotional_support2", "instrumental_support2", "self_distraction2",
                    "denial2", "venting2", "substance_use2", "behavioral_disengagement2", "self_blame2",
                    "trauma_history8_1", "military_exposure_unit", "HML_5HTT", "HL_MAOA", "HML_NPY",
                    "COMT_Hap1_recode", "COMT_Hap2_recode", "COMT_Hap1_LvsMH", "HML_FKBP5", "Ashken_scale",
                    "Sephar_scale", "Unknown", 'terror_p1', 'terror_i1', 'mva_p1', 'mva_i1',                   
                        'violent1', 'sexual1', 'rockets_p1', 'rockets_i1', 'trauma_history6_1',
                        'terror_p2','terror_i2','mva_p2', 'mva_i2', 'violent2', 'sexual2', 'rockets_p2', 'rockets_i2', 'trauma6t2', 'trauma8t2',
'military_exp18_1','military_exp18_t2', 'commanders18','commanders20','commanders22', 
't1bias_1_zero', 'state1_zero', 'trait1_zero', 'PHQ1_zero', 'PCL1_zero', 'depression_clinical2',
                        'avoid_bias', 'd_ptsd', 'ptsd1_clini', 'avoidance_cop',
'clinical_depression15', 'avoidance_compa', 'resilience_compa', 'combat_compa', 'emotional_cop1n', 'avoidance_cop2', 'avoidance_cop']
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
df[numerical_features] = imp.fit_transform(df[numerical_features])

imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
df[categorical_features] = imp.fit_transform(df[categorical_features])

X = df[features]
X = X - X.mean()

Y = df["PCL_Strict3"]

In [12]:
X["interaction_1"] = X["T1Acc1t"] * X["T2Acc1n"] * X["military_exposure_unit"]
X["interaction_2"] = X["T1Acc1n"] * X["T2Acc1t"] * X["military_exposure_unit"]

X["interaction_9"] = X["highschool_diploma"] * X["military_exp18_1"]
X["interaction_10"] = X["highschool_diploma"] * X["military_exp18_t2"]
X["interaction_11"] = X["highschool_diploma"] * X["military_exposure_unit"] * X['PCL1']
X["interaction_12"] = X["T1ETBE"] * X["military_exposure_unit"] * X['HML_5HTT']
X["interaction_13"] = X["T1ETBE"] * X['HML_5HTT']
X["interaction_14"] = X["T1ETBE"] * X['military_exposure_unit']

X["interaction_15"] = X["T1ETBE"] * X["PCL1"]

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.1, random_state=271828, stratify=Y)
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X_train, y_train, test_size = 0.05, random_state=271828, stratify=y_train)

In [20]:
pipe = Pipeline(steps=[
            ('SMOTE', SMOTE(0.95, random_state=27)),
#              ('PCA',  PCA(n_components = 0.9)),
                ('classifier', VotingClassifier(estimators=[], voting='soft'))])

In [21]:

params_grid = [
           {'SMOTE__k_neighbors': [5],
            #'PCA__n_components': [0.99, 0.95, 0.9, 0.85, 0.8],
            'classifier__estimators':[[('clf1',MLPClassifier(max_iter= 500, hidden_layer_sizes=(70, 40, 20, 10, 5),alpha= 0.0001)),
                                    ('clf2',MLPClassifier(max_iter= 500, hidden_layer_sizes=(30, 30, 30, 30, 5),alpha= 0.001)),
                                    ('clf3',MLPClassifier(max_iter= 500, hidden_layer_sizes=(70, 10, 10, 10, 5),alpha= 0.00001))
                                        ]]}]

In [22]:
cv = StratifiedKFold()

gs = GridSearchCV(pipe, params_grid, cv=cv, scoring='f1')



In [15]:
vot = VotingClassifier(estimators=[('clf0', clf0), ('clf1', clf1)], voting='soft')


score = cross_val_score(pipe, X_train_2, y_train_2, scoring="f1", cv=cv)

print("score", score)
# clf.fit(X_train_res, y_train_res)
# X_train = pca.transform(X_train)
# y_pred = clf.predict(X_train)
# print("recall_score", recall_score(y_pred,y_train))
# print("precision_score",precision_score(y_pred,y_train))
# print("accuracy_score",accuracy_score(y_pred,y_train))
# print("f1_score",f1_score(y_pred,y_train))
# print("roc_auc_score",roc_auc_score(y_pred,y_train))


score [0.52631579 0.41666667 0.51428571]


In [23]:
gs.fit(X_train_2, y_train_2)



GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=None, shuffle=False),
       error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('SMOTE', SMOTE(k_neighbors=5, kind='deprecated', m_neighbors='deprecated', n_jobs=1,
   out_step='deprecated', random_state=27, ratio=None,
   sampling_strategy=0.95, svm_estimator='deprecated')), ('classifier', VotingClassifier(estimators=[], flatten_transform=None, n_jobs=None,
         voting='soft', weights=None))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'SMOTE__k_neighbors': [5], 'classifier__estimators': [[('clf1', MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(70, 40, 20, 10, 5), learning_rate='constant',
       learnin...ue, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False))]]}],
       pre_dispatch='2*n_jobs', refit=True, r

In [24]:
gs.best_score_

0.5072518398851193

In [25]:
gs.best_estimator_.steps[1]

('classifier',
 VotingClassifier(estimators=[('clf1', MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
        beta_2=0.999, early_stopping=False, epsilon=1e-08,
        hidden_layer_sizes=(70, 40, 20, 10, 5), learning_rate='constant',
        learning_rate_init=0.001, max_iter=500, momentum=0.9,
        n_...=True, solver='adam', tol=0.0001,
        validation_fraction=0.1, verbose=False, warm_start=False))],
          flatten_transform=None, n_jobs=None, voting='soft', weights=None))

In [27]:
f1_score(gs.predict(X_test_2), y_test_2)

0.6666666666666666

In [7]:
? StratifiedKFold 