In [41]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score,precision_score, recall_score, roc_auc_score
from sklearn.decomposition import PCA
from sklearn.utils import shuffle
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.impute import SimpleImputer
from imblearn.pipeline import Pipeline
from sklearn.feature_selection import RFECV, RFE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.semi_supervised import LabelSpreading, LabelPropagation
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
import random 

In [42]:
path = "PTSD.xlsx"
df = pd.read_excel(path)
df = df[~df["PCL_Strict3"].isna()]
df2 = pd.read_excel("C:\‏‏PycharmProjects\PTSD\Data\questionnaire6PCL3.xlsx")
df = df.merge(df2.drop(['mean', 'pcl3', 'intrusion', 'avoidance', 'hypertention', 'PCL3_Strict', 'PrimaryLast'],axis=1), on="ID")

In [43]:
features = ["age", "highschool_diploma",  "dyslexia", "ADHD", "T1Acc1t", "T1Acc1n", "T1bias", "phq1", "lot1", "trait1",
               "state1", "PCL1",  "PCL_Broad1", "PCL_Strict1", "phq2", "lot2", "trait2", "state2", "PCL2", "PCL_Broad2", 
                 "PCL_Strict2", "cd_risc1", "active_coping1", "planning1", "positive_reframing1", "acceptance1", "humor1",
                 "religion1", "emotional_support1","instrumental_support1", "self_distraction1", "denial1", 
               "venting1", "substance_use1", "behavioral_disengagement1", "self_blame1", "active_coping2", "planning2",
                "positive_reframing2", "acceptance2", "humor2", "religion2", "emotional_support2", "instrumental_support2", 
                 "self_distraction2", "denial2", "venting2", "substance_use2", "behavioral_disengagement2", "self_blame2",
                 "trauma_history8_1", "HML_5HTT", "HL_MAOA", "HML_NPY", "COMT_Ranked", "COMT_Hap1_recode", 
               "COMT_Hap2_recode", "COMT_Hap1_LvsMH", "HML_FKBP5", "bad_features"]

In [44]:
bad_features = ["T1ETBE", "T1bias", "state1", "state2", "trait1", "trait2", "phq1", "phq2", "PCL1", "PCL2",
                "denial1", "substance_use1", "self_blame1", "denial2", "substance_use2","self_blame2",
                         "trauma_history8_1"]
numerical_features = [ "T1ETBE", "T1Acc1t", "T1Acc1n", "T1bias", "T2Acc1t","T2Acc1n", "T2bias", "state1", "state2",
                        "trait1", "trait2", "lot1", "lot2", "phq1", "phq2", "cd_risc1", "PCL1", "PCL2", "bad_features"]
categorical_features = ["age", "highschool_diploma",  "dyslexia", "ADHD", "T1Acc1t", "T1Acc1n", "T1bias", "phq1", "lot1", "trait1",
               "state1", "PCL1",  "PCL_Broad1", "PCL_Strict1", "phq2", "lot2", "trait2", "state2", "PCL2", "PCL_Broad2", 
                 "PCL_Strict2", "cd_risc1", "active_coping1", "planning1", "positive_reframing1", "acceptance1", "humor1",
                 "religion1", "emotional_support1","instrumental_support1", "self_distraction1", "denial1", 
               "venting1", "substance_use1", "behavioral_disengagement1", "self_blame1", "active_coping2", "planning2",
                "positive_reframing2", "acceptance2", "humor2", "religion2", "emotional_support2", "instrumental_support2", 
                 "self_distraction2", "denial2", "venting2", "substance_use2", "behavioral_disengagement2", "self_blame2",
                 "trauma_history8_1", "HML_5HTT", "HL_MAOA", "HML_NPY", "COMT_Ranked", "COMT_Hap1_recode", 
               "COMT_Hap2_recode", "COMT_Hap1_LvsMH", "HML_FKBP5", "Ashken_scale", "Sephar_scale", "Unknown"]

df['bad_features'] = (df > df.mean())[bad_features].sum(axis=1)

imp = SimpleImputer(missing_values=np.nan, strategy='mean')
df[numerical_features] = imp.fit_transform(df[numerical_features])

imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
df[categorical_features] = imp.fit_transform(df[categorical_features])

pca = PCA(n_components = 1)
df["pcls"] = pca.fit_transform(df[["PCL1",  "PCL_Broad1", "PCL_Strict1"]])
df.drop(["PCL1",  "PCL_Broad1", "PCL_Strict1"], axis=1, inplace=True)
features.append("pcls")
features.remove("PCL1")
features.remove("PCL_Broad1")
features.remove("PCL_Strict1")


X = df[features]


ss = StandardScaler()
X = ss.fit_transform(X)


Y = df["q6.11_NUMB"] >1

In [115]:
rand_item = eval("['PCL2', 'positive_reframing1', 'trauma_history8_1', 'lot2', 'T1bias', 'emotional_support1', 'self_blame1', 'PCL2']")

pca = PCA(n_components = 2)
data_transformed = df[rand_item]
data_transformed = pca.fit_transform(data_transformed)

db = DBSCAN(eps=0.5, min_samples=10, leaf_size=30).fit(X)
X = np.hstack((db.labels_.reshape(-1,1), X))


rand_item = eval("['instrumental_support1', 'COMT_Hap1_recode', 'dyslexia', 'HL_MAOA', 'lot1', 'active_coping2', 'ADHD']")

pca = PCA(n_components = 2)
data_transformed = df[rand_item]
data_transformed = pca.fit_transform(data_transformed)

db = DBSCAN(eps=0.7, min_samples=10, leaf_size=30).fit(X)
X = np.hstack((db.labels_.reshape(-1,1), X))


rand_item = eval("['COMT_Hap1_LvsMH', 'venting2', 'positive_reframing2', 'COMT_Ranked', 'behavioral_disengagement2', 'trait1', 'HL_MAOA', 'HL_MAOA']")

pca = PCA(n_components = 2)
data_transformed = df[rand_item]
data_transformed = pca.fit_transform(data_transformed)

db = DBSCAN(eps=0.5, min_samples=10, leaf_size=30).fit(X)
X = np.hstack((db.labels_.reshape(-1,1), X))


rand_item = eval("['PCL_Strict2', 'emotional_support2', 'PCL_Broad2', 'emotional_support1', 'COMT_Hap1_LvsMH', 'HML_FKBP5', 'lot1', 'COMT_Hap1_LvsMH', 'COMT_Hap2_recode', 'state1']")

pca = PCA(n_components = 2)
data_transformed = df[rand_item]
data_transformed = pca.fit_transform(data_transformed)

db = DBSCAN(eps=0.6, min_samples=10, leaf_size=30).fit(X)
X = np.hstack((db.labels_.reshape(-1,1), X))


rand_item = eval("['phq2', 'substance_use1', 'instrumental_support2', 'substance_use1', 'trait1']")

pca = PCA(n_components = 2)
data_transformed = df[rand_item]
data_transformed = pca.fit_transform(data_transformed)

db = DBSCAN(eps=0.5, min_samples=5, leaf_size=30).fit(X)
X = np.hstack((db.labels_.reshape(-1,1), X))


In [116]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.1, random_state=271828, stratify=Y)


In [122]:
pipe = Pipeline(steps=[
            ('SMOTE', SMOTE(random_state=27)),
              #('PCA',  PCA(n_components = 0.85)),
       ('RFE',  RFE(RandomForestClassifier(n_estimators=100), 14)),
                ('classifier', GradientBoostingClassifier(n_estimators=100))])

In [123]:

params_grid = [
           {
            'SMOTE__k_neighbors':[3,  2,1],
            'SMOTE__sampling_strategy':[0.9, 0.85, 0.95],
            'RFE__n_features_to_select':[11,18, 25, 35]
            }]

In [124]:
gs = GridSearchCV(pipe, params_grid, cv=5, scoring='precision')

In [125]:
score = cross_val_score(pipe, X_train, y_train, scoring="precision", cv=5)

print("score", score)
# clf.fit(X_train_res, y_train_res)
# X_train = pca.transform(X_train)
# y_pred = clf.predict(X_train)
# print("recall_score", recall_score(y_pred,y_train))
# print("precision_score",precision_score(y_pred,y_train))
# print("accuracy_score",accuracy_score(y_pred,y_train))
# print("f1_score",f1_score(y_pred,y_train))
# print("roc_auc_score",roc_auc_score(y_pred,y_train))


score [0.525      0.55813953 0.43333333 0.54545455 0.51282051]


In [126]:
gs.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
gs.best_score_

In [None]:
gs.best_params_

In [82]:
? SMOTE