In [67]:
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, f1_score,precision_score, recall_score, roc_auc_score
from sklearn.decomposition import PCA
from sklearn.utils import shuffle
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.mixture import GaussianMixture
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer

In [68]:
path = "PTSD.xlsx"
df = pd.read_excel(path)
df = df[~df["PCL_Strict3"].isna()]

In [69]:
features = ["age", "highschool_diploma", "Hebrew", "dyslexia", "ADHD", "T1ETBE", "T1Acc1t",
                         "T1Acc1n", "T1bias", "state1", "trait1",
                         "lot1", "phq1",  "PCL1", "cd_risc1",
                         "active_coping1", "planning1", "positive_reframing1", "acceptance1", "humor1",
                         "religion1", "emotional_support1", "instrumental_support1", "self_distraction1",
                         "denial1", "venting1", "substance_use1", "behavioral_disengagement1", "self_blame1",
                         "trauma_history8_1", "military_exposure_unit", "HML_5HTT", "HL_MAOA", "HML_NPY",
                         "COMT_Hap1_recode", "COMT_Hap2_recode", "COMT_Hap1_LvsMH", "HML_FKBP5", "Ashken_scale",
                         "Sephar_scale", "Unknown"]

In [70]:
numerical_features = [ "T1ETBE", "T1Acc1t", "T1Acc1n", "T1bias", "T2Acc1t","T2Acc1n", "T2bias", "state1", "state2",
                        "trait1", "trait2", "lot1", "lot2", "phq1", "phq2", "cd_risc1", "PCL1", "PCL2"]
categorical_features = [ "age", "highschool_diploma", "Hebrew", "dyslexia", "ADHD",   "ptgi2",
                    "active_coping1", "planning1", "positive_reframing1", "acceptance1", "humor1",
                    "religion1", "emotional_support1", "instrumental_support1", "self_distraction1",
                    "denial1", "venting1", "substance_use1", "behavioral_disengagement1", "self_blame1",
                    "active_coping2", "planning2", "positive_reframing2", "acceptance2", "humor2",
                    "religion2", "emotional_support2", "instrumental_support2", "self_distraction2",
                    "denial2", "venting2", "substance_use2", "behavioral_disengagement2", "self_blame2",
                    "trauma_history8_1", "military_exposure_unit", "HML_5HTT", "HL_MAOA", "HML_NPY",
                    "COMT_Hap1_recode", "COMT_Hap2_recode", "COMT_Hap1_LvsMH", "HML_FKBP5", "Ashken_scale",
                    "Sephar_scale", "Unknown"]
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
df[numerical_features] = imp.fit_transform(df[numerical_features])

imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
df[categorical_features] = imp.fit_transform(df[categorical_features])

X = df[features]
Y = df["PCL_Strict3"]

In [71]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.15, random_state=271828, stratify=Y)


In [72]:
sm = SMOTE(random_state=27)
X_train_res, y_train_res = sm.fit_sample(X_train, y_train.ravel())

In [73]:
# pca = PCA(n_components = 0.99)
# X_train_res = pca.fit_transform(X_train_res)

In [74]:
clf = RandomForestClassifier()
score = cross_val_score(clf, X_train_res, y_train_res, scoring="f1")
print("score", score)
clf.fit(X_train_res, y_train_res)
#X_train = pca.transform(X_train)
y_pred = clf.predict(X_train)
print("recall_score", recall_score(y_pred,y_train))
print("precision_score",precision_score(y_pred,y_train))
print("accuracy_score",accuracy_score(y_pred,y_train))
print("f1_score",f1_score(y_pred,y_train))
print("roc_auc_score",roc_auc_score(y_pred,y_train))


score [0.90704225 0.9870801  0.98969072]
recall_score 1.0
precision_score 0.8648648648648649
accuracy_score 0.9918831168831169
f1_score 0.927536231884058
roc_auc_score 0.9957191780821918




In [75]:
sorted(zip(features, clf.feature_importances_), key= lambda x: x[1], reverse=1)

[('phq1', 0.11016168283182415),
 ('PCL1', 0.09448512864963302),
 ('highschool_diploma', 0.07589378148733614),
 ('military_exposure_unit', 0.06078024215541864),
 ('age', 0.05173493661153239),
 ('COMT_Hap2_recode', 0.041679843168317285),
 ('self_distraction1', 0.038313017917350194),
 ('planning1', 0.03764109727619229),
 ('humor1', 0.03231549931869055),
 ('HML_NPY', 0.031244739280565214),
 ('trait1', 0.026516537969932624),
 ('lot1', 0.021720839776554583),
 ('denial1', 0.021561885993390175),
 ('T1Acc1t', 0.019883267899670896),
 ('HML_FKBP5', 0.019514577275508937),
 ('positive_reframing1', 0.0189367130719957),
 ('T1ETBE', 0.018339886794628786),
 ('HML_5HTT', 0.01818397741671097),
 ('T1bias', 0.018156484742980008),
 ('state1', 0.017599979870782427),
 ('venting1', 0.017521312095032517),
 ('HL_MAOA', 0.016385375508471002),
 ('self_blame1', 0.01596104818684388),
 ('religion1', 0.014631522728324506),
 ('T1Acc1n', 0.014416458231404833),
 ('Sephar_scale', 0.01387157534747017),
 ('behavioral_diseng

In [18]:
print("negative before smote", len(y_train[y_train==0]))
print("positive before smote", len(y_train[y_train==1]))
print("negative after smote", len(y_train_res[y_train_res==0]))
print("positive after smote", len(y_train_res[y_train_res==1]))

negative before smote 579
positive before smote 37
negative after smote 579
positive after smote 579
