In [14]:
import numpy as np
import pandas as pd
from sklearn.semi_supervised import LabelSpreading, LabelPropagation
from imblearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score,precision_score, recall_score, roc_auc_score
from sklearn.decomposition import PCA
from sklearn.utils import shuffle
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.impute import SimpleImputer

In [15]:
path = "PTSD.xlsx"
df = pd.read_excel(path)
#df = df[~df["PCL_Strict3"].isna()]


In [16]:
features = [ "age", "highschool_diploma", "Hebrew", "dyslexia", "ADHD", "T1ETBE", "T1Acc1t",
                         "T1Acc1n", "T1bias", "T2Acc1t", "T2Acc1n", "T2bias", "state1", "state2", "trait1",
                         "trait2", "lot1", "lot2", "phq1", "phq2", "PCL1", "PCL2", "cd_risc1", "ptgi2",
                         "active_coping1", "planning1", "positive_reframing1", "acceptance1", "humor1",
                         "religion1", "emotional_support1", "instrumental_support1", "self_distraction1",
                         "denial1", "venting1", "substance_use1", "behavioral_disengagement1", "self_blame1",
                         "active_coping2", "planning2", "positive_reframing2", "acceptance2", "humor2",
                         "religion2", "emotional_support2", "instrumental_support2","self_distraction2",
                         "denial2", "venting2", "substance_use2", "behavioral_disengagement2", "self_blame2",
                         "trauma_history8_1", "military_exposure_unit", "HML_5HTT", "HL_MAOA", "HML_NPY",
                         "COMT_Hap1_recode", "COMT_Hap2_recode", "COMT_Hap1_LvsMH", "HML_FKBP5", "Ashken_scale",
                         "Sephar_scale", "Unknown"]

In [17]:
numerical_features = [ "T1ETBE", "T1Acc1t", "T1Acc1n", "T1bias", "T2Acc1t","T2Acc1n", "T2bias", "state1", "state2",
                        "trait1", "trait2", "lot1", "lot2", "phq1", "phq2", "cd_risc1", "PCL1", "PCL2"]
categorical_features = [ "age", "highschool_diploma", "Hebrew", "dyslexia", "ADHD",   "ptgi2",
                    "active_coping1", "planning1", "positive_reframing1", "acceptance1", "humor1",
                    "religion1", "emotional_support1", "instrumental_support1", "self_distraction1",
                    "denial1", "venting1", "substance_use1", "behavioral_disengagement1", "self_blame1",
                    "active_coping2", "planning2", "positive_reframing2", "acceptance2", "humor2",
                    "religion2", "emotional_support2", "instrumental_support2", "self_distraction2",
                    "denial2", "venting2", "substance_use2", "behavioral_disengagement2", "self_blame2",
                    "trauma_history8_1", "military_exposure_unit", "HML_5HTT", "HL_MAOA", "HML_NPY",
                    "COMT_Hap1_recode", "COMT_Hap2_recode", "COMT_Hap1_LvsMH", "HML_FKBP5", "Ashken_scale",
                    "Sephar_scale", "Unknown"]
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
df[numerical_features] = imp.fit_transform(df[numerical_features])

imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
df[categorical_features] = imp.fit_transform(df[categorical_features])

#df = df[~df["PCL_Strict3"].isna()]
X = df[~df["PCL_Strict3"].isna()][features]
X = X - X.mean()

Y = df[~df["PCL_Strict3"].isna()]["PCL_Strict3"]

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.1, random_state=271828, stratify=Y)
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X_train, y_train, test_size = 0.05, random_state=271828, stratify=y_train)

In [29]:
sm = SMOTE(random_state=27)
X_train_2, y_train_2 = sm.fit_sample(X_train_2, y_train_2.ravel())

In [30]:
X_train_2 = np.vstack((X_train_2, df[df["PCL_Strict3"].isna()][features]))


In [31]:
unlabeled = df[df["PCL_Strict3"].isna()]['PCL_Strict3'].fillna(-1)

In [32]:
y_train_2 = np.hstack((y_train_2, unlabeled))


In [33]:
? LabelSpreading

In [34]:

clf0 = LabelPropagation(gamma=20, n_neighbors=150,alpha=0.0005,kernel='knn')
clf1 = LabelSpreading(gamma=20, n_neighbors=30,alpha=0.05,kernel='knn')
vot = VotingClassifier(estimators=[('clf0', clf0), ('clf1', clf1)], voting='soft')
#score = cross_val_score(vot, X_train_2, y_train_2, cv=StratifiedKFold())

#print("score", score)



vot.fit(X_train_2, y_train_2)
# X_train = pca.transform(X_train)
# y_pred = clf.predict(X_train)
y_pred =  vot.predict(X_test_2)
print(f1_score(y_pred,y_test_2))
# print("recall_score", recall_score(y_pred,y_train))
# print("precision_score",precision_score(y_pred,y_train))
# print("accuracy_score",accuracy_score(y_pred,y_train))
# print("f1_score",f1_score(y_pred,y_train))
# print("roc_auc_score",roc_auc_score(y_pred,y_train))




0.11764705882352941


In [194]:
sum(y_test_2)

6.0