In [7]:
import random
import numpy as np
import pandas as pd
from fancyimpute import IterativeImputer
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import f1_score, precision_score, recall_score
np.random.seed(271828)
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures

https://towardsdatascience.com/multitask-learning-teach-your-ai-more-to-make-it-better-dde116c2cd40

In [10]:
def PCL_calculator(df):

    symptomatic_cutoff = 2

    intrusion = ['q6.1_INTRU', 'q6.2_DREAM', 'q6.3_FLASH', 'q6.4_UPSET', 'q6.5_PHYS']
    avoidance = ['q6.6_AVTHT', 'q6.7_AVSIT', 'q6.8_AMNES', 'q6.9_DISINT', 'q6.10_DTACH',
                 'q6.11_NUMB', 'q6.12_FUTRE']
    tred = ['q6.1_INTRU', 'q6.2_DREAM', 'q6.3_FLASH']
    only_avoidance = ['q6.6_AVTHT', 'q6.7_AVSIT', 'q6.8_AMNES']

    hypertension = ['q6.13_SLEEP', 'q6.14_ANGER', 'q6.15_CONC', 'q6.16_HYPER', 'q6.17_STRTL']

    depression = ['q6.9_DISINT', 'q6.10_DTACH', 'q6.11_NUMB', 'q6.12_FUTRE']

    df[intrusion + avoidance + hypertension].fillna(df[intrusion + avoidance + hypertension].mean(axis=1))
    intrusion_cuoff = 1
    avoidance_cuoff = 3
    hypertension_cuoff = 2
    only_avoidance_cutoff = 1
    depression_cutoff = 2
    tred_cutoff = 1

    df['sum'] = (df[intrusion + avoidance + hypertension]).sum(axis=1)

    df['intrusion'] = (df[intrusion] > symptomatic_cutoff).sum(axis=1)
    df['intrusion_cutoff'] = df['intrusion'] >= intrusion_cuoff

    df['avoidance'] = (df[avoidance] > symptomatic_cutoff).sum(axis=1)
    df['avoidance_cutoff'] = df['avoidance'] >= avoidance_cuoff

    df['depression'] = (df[depression] > symptomatic_cutoff).sum(axis=1)
    df['depression_cutoff'] = df['depression'] >= depression_cutoff

    df['hypertention'] = (df[hypertension] > symptomatic_cutoff).sum(axis=1)
    df['hypertention_cutoff'] = df['hypertention'] >= hypertension_cuoff

    df['tred'] = (df[tred] > symptomatic_cutoff).sum(axis=1)
    df['tred_cutoff'] = df['tred'] >= tred_cutoff

    df['only_avoidance'] = (df[only_avoidance] > symptomatic_cutoff).sum(axis=1)
    df['only_avoidance_cutoff'] = df['only_avoidance'] >= only_avoidance_cutoff

    df['regression_cutoff_33'] = df['sum'] >= 33
    df['regression_cutoff_50'] = df['sum'] >= 50
    df['diagnosis'] = ((df['hypertention_cutoff']) & (df['avoidance_cutoff']) & (df['intrusion_cutoff']) & (df['sum'] >= 50))

    return df



In [11]:

class PTSD_Model:

    features = ["age", "highschool_diploma", "dyslexia", "ADHD", "T1Acc1t", "T1Acc1n", "T1bias", "phq1", "lot1",
                "trait1",
                "state1", "PCL1", "PCL_Broad1", "PCL_Strict1", "phq2", "lot2", "trait2", "state2", "PCL2", "PCL_Broad2",
                "PCL_Strict2", "cd_risc1", "active_coping1", "planning1", "positive_reframing1", "acceptance1",
                "humor1",
                "religion1", "emotional_support1", "instrumental_support1", "self_distraction1", "denial1",
                "venting1", "substance_use1", "behavioral_disengagement1", "self_blame1", "active_coping2", "planning2",
                "positive_reframing2", "acceptance2", "humor2", "religion2", "emotional_support2",
                "instrumental_support2",
                "self_distraction2", "denial2", "venting2", "substance_use2", "behavioral_disengagement2",
                "self_blame2",
                "trauma_history8_1", "HML_5HTT", "HL_MAOA", "HML_NPY", "COMT_Hap1_recode",
                "COMT_Hap2_recode", "COMT_Hap1_LvsMH", "HML_FKBP5"]

    features_2 = ['q6.1_INTRU_pcl1', 'q6.2_DREAM_pcl1',
                  'q6.3_FLASH_pcl1', 'q6.4_UPSET_pcl1',
                  'q6.5_PHYS_pcl1', 'q6.6_AVTHT_pcl1', 'q6.7_AVSIT_pcl1', 'q6.8_AMNES_pcl1', 'q6.9_DISINT_pcl1',
                  'q6.10_DTACH_pcl1', 'q6.11_NUMB_pcl1', 'q6.12_FUTRE_pcl1', 'q6.13_SLEEP_pcl1',
                  'q6.14_ANGER_pcl1', 'q6.15_CONC_pcl1', 'q6.16_HYPER_pcl1', 'q6.17_STRTL_pcl1',
                  'intrusion_pcl1', 'avoidance_pcl1', 'hypertention_pcl1', 'depression_pcl1', 'tred_pcl1',
                  'q6.1_INTRU_pcl2', 'q6.2_DREAM_pcl2',
                  'q6.3_FLASH_pcl2', 'q6.4_UPSET_pcl2',
                  'q6.5_PHYS_pcl2', 'q6.6_AVTHT_pcl2', 'q6.7_AVSIT_pcl2', 'q6.8_AMNES_pcl2', 'q6.9_DISINT_pcl2',
                  'q6.10_DTACH_pcl2', 'q6.11_NUMB_pcl2', 'q6.12_FUTRE_pcl2', 'q6.13_SLEEP_pcl2',
                  'q6.14_ANGER_pcl2', 'q6.15_CONC_pcl2', 'q6.16_HYPER_pcl2', 'q6.17_STRTL_pcl2',
                  'intrusion_pcl2', 'avoidance_pcl2', 'hypertention_pcl2', 'depression_pcl2', 'tred_pcl2']
    target_features = ["PCL_Strict3", "PCL3"]
    target_features_2 = ["intrusion_cutoff", "avoidance_cutoff", "hypertention_cutoff",
                         'depression_cutoff', 'diagnosis', "PCL3", "only_avoidance_cutoff", "tred_cutoff",
                         "regression_cutoff_33", "regression_cutoff_50"]
    ID = ["ID"]
    dataset_path = r"../Data/PTSD.xlsx"

    multiple_features_no_imputation = ['q6.16_HYPER_pcl1',  'hypertention_pcl2', 'q6.5_PHYS_pcl2', 'q6.12_FUTRE_pcl1',
                                       'cd_risc1',  'q6.2_DREAM_pcl2',  'q6.14_ANGER_pcl2', 'positive_reframing2',
                                       'venting2', 'q6.15_CONC_pcl1', 'q6.8_AMNES_pcl1',
                                       'q6.15_CONC_pcl2', 'PCL_Broad2', 'phq2', 'q6.4_UPSET_pcl2']

    def __init__(self):
        path = "C:\‏‏PycharmProjects\PTSD\Data\PTSD.xlsx"
        df = pd.read_excel(path)
        df = df[~df['PCL_Strict3'].isna()]
        #df = df[~ ((df["military_exp18_t3"] == 0) & (df["military_exp18_t2"] == 0))]
        df = df[self.features + self.ID + self.target_features]
        df_pcl3 = pd.read_excel("C:\‏‏PycharmProjects\PTSD\Data\questionnaire6PCL3.xlsx")
        df_pcl3 = PCL_calculator(df_pcl3)
        df_pcl2 = pd.read_excel("C:\‏‏PycharmProjects\PTSD\Data\questionnaire6PCL2.xlsx")
        df_pcl2 = PCL_calculator(df_pcl2)
        df_pcl1 = pd.read_excel("C:\‏‏PycharmProjects\PTSD\Data\questionnaire6PCL1.xlsx")
        df_pcl1 = PCL_calculator(df_pcl1)

        df = df.merge(df_pcl1, on="ID", how='outer')
        df = df.merge(df_pcl2, suffixes=('_pcl1', '_pcl2'), on="ID", how='outer')
        df = df.merge(df_pcl3.drop(['PCL3_Strict', 'pcl3', 'PCL3_Broad'], axis=1), on="ID", how='outer')

        df = df[~df['PCL_Strict3'].isna()]
        #df = df[~df['tred_cutoff'].isna()]
        df.drop(self.ID, inplace=True, axis=1)
        mice = IterativeImputer()
        df = pd.DataFrame(mice.fit_transform(df), columns=df.columns)

        all_x_col = self.features + self.features_2 + self.target_features_2 + questions
        #all_x_col = self.features + self.features_2
        #y_col = ["tred_cutoff"]
        y_col = ["PCL_Strict3"]
        X = df[all_x_col]
        Y = df[y_col]
        X_train_0, X_test_0, y_train_0, y_test_0 = train_test_split(X, Y, test_size=0.25, random_state=271828, stratify=Y)
        X_train, X_test, y_train, y_test = train_test_split(X_train_0, y_train_0, test_size=0.25, random_state=271828, stratify=y_train_0)
        df = pd.concat([X_train, y_train], axis=1)
        self.X_test = X_test
        self.y_test =y_test

        self.X_train_0 = X_train_0
        self.X_test_0 = X_test_0
        self.y_train_0 = y_train_0
        self.y_test_0 = y_test_0

        self.df = df


In [24]:
m = PTSD_Model()
features = m.features + m.features_2
X = m.df[features]
Y01 = m.df["PCL_Strict3"].apply(lambda x: int(x))
X = PolynomialFeatures(2).fit_transform(X)
features = X.columns

AttributeError: 'numpy.ndarray' object has no attribute 'columns'

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, Y01, stratify=Y01)

In [None]:
rf = RandomForestClassifier(n_estimators=1000, class_weight={1:13, 0:1}, max_depth=2).fit(X_train, y_train)
y_pred = rf.predict(X_test)
print(f1_score(y_test, y_pred))

for i in sorted(zip(rf.feature_importances_, features), reverse=1)[:15:]:
    print(i[1])

In [None]:
lr = LogisticRegression(class_weight={1:100, 0:1}).fit(X_train, y_train)
y_pred = lr.predict(X_test)
print(f1_score(y_test, y_pred))

for i in sorted(zip(np.abs(lr.coef_).reshape(-1,1), features), reverse=1)[:15:]:
    print(i[1])

In [None]:
gb = GradientBoostingClassifier(n_estimators=1000, max_depth=2).fit(X_train, y_train)
y_pred = gb.predict(X_test)
print(f1_score(y_test, y_pred))

for i in sorted(zip(gb.feature_importances_, features), reverse=1)[:15:]:
    print(i[1])

In [None]:
et = ExtraTreesClassifier(n_estimators=1000,class_weight={1:13, 0:1}, max_depth=3).fit(X_train, y_train)
y_pred = et.predict(X_test)
print(f1_score(y_test, y_pred))

for i in sorted(zip(et.feature_importances_, features), reverse=1)[110:210:]:
    print(i[1])

In [None]:
X_train.shape