### Model performance before and after feature selection. 

### Update in this version: 
#### 1. look at balancing the cases and controls in training and optimizing the feature weights. 

#### Weights are the coefficients from an elastic-net model (L1 & L2 penalty); features were selected using the k-best approach

#### 1. Import settings

In [None]:
# Settings imported from other notebook Settings.ipynb
%run Settings.ipynb

In [None]:
# Load libraries
from pyarrow import feather
import nbimporter
import joblib
import os, datetime
from makedirectory import make_directory

In [None]:
# Load the combined, QC'd data only if it is not already in the session,
# so re-running this cell does not reload the pickle.
# qcd_data is indexed by dataset name; element [0] of each entry is used
# below as the feature data frame and element [1] as the outcome labels.
# NOTE(review): the timestamps in the trailing comment look like earlier
# data versions — confirm.
if 'qcd_data' not in globals(): # 2021-11-27_19-16-35, 2021-10-02_23-12-43, 2021-07-19_13-32-34
    print("Loading data")
    qcd_data = joblib.load("G:/PGC ML/Combined Data/2022-03-30_15-07-11/DNHS_GTP_MRS_ArmyS_Prismo_combined.pkl")

In [None]:
[(k,qcd_data[k][0].iloc[:,0:4 ]) for k in qcd_data.keys()]

In [None]:
# Male female count: 1 - M, 2 - F
qcd_data['ptsdpm_wo_NonCpGs'][0]['Gender'].value_counts()

In [None]:
qcd_data['ptsdlife_wo_NonCpGs'][0]['Gender'].value_counts()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.svm import LinearSVC, SVC
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Load the accuracy table and the important feature sets (k-best approach).
# Each object is loaded only when it is not already defined in the session.
# NOTE: the model load below is commented out, so the first branch only
# prints a message and does not define models_after_fs.
if 'models_after_fs' not in globals(): 
    print("Loading models") # 2021-11-27_19-23-28 2021-10-03_06-04-14
#     models_after_fs = joblib.load('G:/PGC ML/Trained Models/2021-10-03_06-04-14/RandomForest_after_fs.pkl')

# Accuracy per variable and feature-set size (arranged into a data frame below)
if 'accuracy' not in globals(): #2021-10-03_06-04-14
    accuracy = joblib.load('G:/PGC ML/Trained Models/2022-08-31_22-07-47/logisticregressioncv_accuracy_after_fs.pkl')

# Important feature sets; indexed below as important_fea[key][set_index]
if 'important_fea' not in globals(): # 2021-11-28_14-05-29 2021-10-03_18-17-57
    important_fea = joblib.load('G:/PGC ML/Feature Sets/2022-09-02_00-23-31/Important Feature sets.pkl')

In [None]:
# As we had features with methylation values stored previously
# We actually need only the name of features so that we can pull the info
# from QCd data. So let's pull the names and save them 
# Note => this version onwards we will save only feature names

# --------------------------
# It was needed in the earlier versions where I had both, feature names and data
# important_fea_names = dict()
# for k in important_fea.keys():
#     f_index = np.arange(len(important_fea[k]))
#     important_fea_names[k] = [(important_fea[k][indx][0], 
#                                important_fea[k][indx][1].columns) 
#                               for indx in f_index]
    

In [None]:
# save the modified important feature sets - save feature names
# We dont need to run this code for the set where we saved only feature names
# joblib.dump(important_fea_names, 'G:/PGC ML/Feature Sets/2022-03-31_09-55-46/Important Feature sets.pkl')

In [None]:
# when we don't need
# del important_fea

In [None]:
# Features using kbest approach
# Arrange features/accuracy in a data frame
def arrange_ouput(acc, cols):
    """
    Flatten a per-variable accuracy mapping into one data frame.

    Parameters:
    acc: dictionary mapping a variable name to an iterable of tuples
         (one tuple per result, e.g. number of features and accuracy)
    cols: column names of the resulting data frame; the first column
          receives the dictionary key, the rest the tuple entries

    Returns:
    data frame with one row per tuple
    """
    rows = []
    for variable, results in acc.items():
        for result in results:
            # prepend the variable name to every result tuple
            rows.append((variable, *result))
    return pd.DataFrame(rows, columns=cols)

In [None]:
# For every variable keep the row with the highest accuracy
import pandas as pd
accuracy_df = arrange_ouput(acc = accuracy, 
                           cols = ['Variable','features_num','accuracy'] )
# idxmax returns, per variable, the row label of the maximum accuracy
idx = accuracy_df.loc[accuracy_df.groupby('Variable')['accuracy'].idxmax()]
# restore the original row order of accuracy_df
idx = idx.sort_index()
idx

In [None]:
accuracy_df

In [None]:
imp_ls = list(idx.itertuples(index=False))
imp_ls

In [None]:
# Now get the index of the important feature set
def get_index(imp, var):
    """
    Return the position of the best feature set for one variable.

    Parameters:
    imp: iterable of records exposing ``Variable`` and ``features_num``
    var: variable name to look up

    Returns:
    zero-based index of the feature set, derived from the feature count
    of the first record whose ``Variable`` equals ``var``
    """
    print(var)
    matches = []
    for record in imp:
        if record.Variable == var:
            matches.append(record.features_num)
    first_match = matches[0]
    # features in sets of 10, index at 0
    return first_match // 10 - 1

In [None]:
# var_names = [x.Variable for x in imp_ls]
var_names = ['ptsdpm_wo_NonCpGs', 'ptsdpm_wo_NonCpGsXY']
var_names

In [None]:
fea_indx = [get_index(imp=imp_ls, var= x) for x in var_names]
fea_indx

In [None]:
# save cases and controls to use in the 
# 
# ptsd_group = pd.DataFrame(qcd_data['ptsdpm_wo_NonCpGs'][1], columns=['group'])
# ptsd_group

In [None]:
# save group info
# ptsd_group.to_csv("G:/PGC ML/Combined Data/2022-03-30_15-07-11/Ptsdpm_case_control_info.csv",
#                                    index=False)

#### Plot confusion matrix

In [None]:
# Confusion matrix for ptsdpm
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from copy import deepcopy
# Data with most significant feature sets
# [][] indicates test data and labels

def build_confusion_matrix(predictions, 
                           ytest, 
                           labels, 
                           title,
                           plot_label = None,
                           ax=None,
                           xax=True,
                           yax=True, 
                           colbar=True):
    
    """
    Compute, print and plot a confusion matrix.
    
    Input:
    predictions: Prediction from the model
    ytest: Test labels to match with predicted values
    labels: Labels for confusion matrix (also used as display labels)
    title: Title for plot
    plot_label: Panel label for plot (eg, A, B), default None
    ax: axes to plot on; when None a new figure/axes is created
    xax: Option to turn off x axis label, default on
    yax: Option to turn off y axis label, default on
    colbar: Option to turn off colorbar, default on
    
    Output: None; the matrix is printed and drawn on the axes
    """
    # confusion matrix
    cm = confusion_matrix(ytest, predictions, labels=labels)
    print(cm)

    # plot confusion matrix
    disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                                  display_labels=labels)

    # ax=None is the default of plot(), so one call covers both the
    # "own axes" and the "new figure" case
    disp.plot(ax=ax)
    if not colbar:
        disp.im_.colorbar.remove()

    # set title
    disp.ax_.set_title(title)

    # Off xaxis label
    if not xax:
        disp.ax_.get_xaxis().get_label().set_visible(False)

    # Off yaxis label
    if not yax:
        disp.ax_.get_yaxis().get_label().set_visible(False)

    # set plot label; use the axes actually drawn on, because `ax`
    # is None when the caller did not supply one
    if plot_label is not None:
        disp.ax_.text(-0.1, 1.15, plot_label, transform=disp.ax_.transAxes,
                      fontsize=16, fontweight='bold', va='top', ha='right')


In [None]:
# split data into train & test sets, build the model
# and run predictions
def run_model(top_fea, 
              qcd_data_ml, 
              clf, 
              key, 
              title,
              plot_label=None,
              ax=None,
              xax=True,
              yax=True,
              colbar=True):
    """
    Split one dataset into train and test sets, fit a copy of the
    classifier, report the test performance and plot the confusion matrix.

    Input:
    top_fea: dictionary; top_fea[key][0] is the feature data frame
    qcd_data_ml: dictionary; qcd_data_ml[key][1] holds the outcome labels
    clf: unfitted classifier or pipeline; it is deep-copied before
         fitting, so the object passed in stays unfitted
    key: dataset name used to index both dictionaries
    title, plot_label, ax, xax, yax, colbar: passed on to
         build_confusion_matrix

    Output: the fitted copy of clf
    """
    print("Traing data :.....")
    print("Training on ", key)

    # split into train and test (75/25, stratified on the outcome)
    labels = qcd_data_ml[key][1]
    X_train, X_test, y_train, y_test = train_test_split(top_fea[key][0],
                                                        labels,
                                                        test_size=0.25,
                                                        random_state=0,
                                                        stratify=labels)

    # Remove sample identifier
    X_train, X_test = [x.loc[:, ~x.columns.str.contains("Basename")]
                       for x in [X_train, X_test]]

    print("Train data shape: ", X_train.shape)
    print("Test data shape: ", X_test.shape)

    # train and predict
    clf_n = deepcopy(clf)
    clf_n.fit(X_train, y_train)
    prediction = clf_n.predict(X_test)

    # the original passed the un-formatted template to print(), so the
    # literal "{:.3f}" was shown instead of a number
    print('Classification accuracy on test data: {:.3f}\n'.format(
        clf_n.score(X_test, y_test)))
    print(classification_report(y_test, prediction))

    # function call to make confusion matrix
    build_confusion_matrix(predictions=prediction,
                           ytest=y_test,
                           labels=[0, 1],
                           title=title,
                           plot_label=plot_label,
                           ax=ax,
                           xax=xax,
                           yax=yax,
                           colbar=colbar)
    return clf_n

### Performance before feature selection and plot confusion matrix for the model with all the features

In [None]:
# Classifier pipelines compared in this notebook. Every pipeline first
# rescales the features with MinMaxScaler and then fits the estimator.

# Random forest
clf_rf = make_pipeline(MinMaxScaler(),
                       BalancedRandomForestClassifier(n_estimators=100, 
                                                      random_state=42,
                                                      n_jobs = -1))
# Gradient boosting
# NOTE(review): loss='deviance' was renamed to 'log_loss' in newer
# scikit-learn releases — confirm against the installed version.
clf_gb = make_pipeline(MinMaxScaler(),
                       GradientBoostingClassifier(loss='deviance', 
                                                  learning_rate=0.1,
                                                  n_estimators=100, 
                                                  subsample=1.0,
                                                  criterion='friedman_mse',
                                                  random_state=42)
                      )
# Lasso (logistic regression with L1 penalty, balanced class weights)
clf_lso = make_pipeline(MinMaxScaler(),  
                        LogisticRegression(solver="liblinear", 
                                           penalty="l1",
                                           max_iter=500,
                                           class_weight = "balanced")
                       )

# elasticNet (logistic regression with mixed L1/L2 penalty, fixed C and l1_ratio)
# NOTE(review): no random_state is set for the saga solver, so results
# may differ slightly between runs — confirm whether that is intended.
clf_EN = make_pipeline(MinMaxScaler(),  
                        LogisticRegression(solver="saga",
                                           penalty="elasticnet",
                                           C = .95,
                                           l1_ratio=0.1,
                                           max_iter=1000,
                                           class_weight = "balanced")
                      )

# Elastic net with cross validation
# 10-fold CV over the grid of C values (cs) and l1 ratios (l1_r) below
cs = [0.5, 1, 2, 4, 5, 8, 10, 15]
l1_r = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
clf_EN_CV =  make_pipeline(MinMaxScaler(), 
                           LogisticRegressionCV(cv = 10,
                                                Cs = cs,
                                                solver="saga",
                                                penalty="elasticnet",
                                                l1_ratios=l1_r,
                                                max_iter=2000,
                                                class_weight = "balanced")
                          )
# Current ptsd
# keys = 'ptsdpm'

#### Running models before feature selection

In [None]:
# Elastic net CV
# Random forest
# fig, (ax1, ax2) = plt.subplots(1, 2, dpi=300) # axes for two figures
# fig.set_figheight(3.8)
# fig.set_figwidth(8)

# EN_CV = [run_model(top_fea = qcd_data, 
#                     qcd_data_ml = qcd_data,
#                     clf = clf_EN_CV, 
#                     key = key, 
#                     title = "Current PTSD (elastic net) \nwithout nonCpG probes",
#                     ax=ax) for key, ax in zip(var_names, (ax1, ax2))]

In [None]:
# Random forest
fig, (ax1, ax2) = plt.subplots(1, 2, dpi=300) # axes for two figures
fig.set_figheight(3.8)
fig.set_figwidth(8)

full_df_names = ["without nonCpG probes", "without nonCpG and XY probes"]


rf1_all = [run_model(top_fea = qcd_data, 
                    qcd_data_ml = qcd_data,
                    clf = clf_rf, 
                    key = key, 
                    title = "Current PTSD (Random forest) \n" + nm,
                    ax=ax,
                    yax = yax,
                    colbar = cb) 
           for key, ax, nm, yax, cb in zip(var_names, 
                                           (ax1, ax2),
                                           full_df_names,
                                          [True, False],
                                          [False, True])]

plt.tight_layout()


In [None]:
# Gradient Boosting
# [run_model(top_fea = qcd_data, 
#           qcd_data_ml = qcd_data,
#           clf = clf_gbb, key = key, 
#          title = "current PTSD (Gradient Boost)" + key) for key in var_names]

In [None]:
# Lasso
# run_model(top_fea = qcd_data, 
#           qcd_data_ml = qcd_data,
#           clf = clf_lso, key = "ptsdpm_cov_adj", 
#          title = "current PTSD (Lasso)")

In [None]:
# elasticNet
fig, (ax1, ax2) = plt.subplots(1, 2, dpi=300) # axes for two figures
fig.set_figheight(3.8)
fig.set_figwidth(8)

EN_all = [run_model(top_fea = qcd_data, 
                    qcd_data_ml = qcd_data,
                    clf = clf_EN, 
                    key = key, 
                    title = "Current PTSD (elasticNet) \n" + nm,
                    ax=ax) for key, ax, nm in zip(var_names, (ax1, ax2),
                                                 full_df_names)]

In [None]:
# fea_indx = [fea_indx[1]] # get only covariate adjusted data index
fea_indx

In [None]:
# keys = list(important_fea.keys())
# keys[0]
qcd_data['ptsdpm_wo_NonCpGs'][1]

### Performance on top features

In [None]:
fea_indx[0]

In [None]:
# loop over keys and important feature list at the same time
# Display number of features in each
[(k, important_fea[k][indx][0]) for k, indx in zip(var_names, fea_indx)]

In [None]:
# create dic of top features, add df and number of features
# fea_indx = fea_indx[0]
fea_ls = [important_fea[k][indx][1]
          for k, indx in zip(var_names, fea_indx)]
fea_ls

In [None]:
# common features in the two dfs used in training
common_fea = set(fea_ls[0]) & set(fea_ls[1])
print("# of common features:", len(common_fea))

In [None]:
qcd_data['ptsdlife_wo_NonCpGs'][0].shape

In [None]:
# create a dictionary of top features,
# each with key and a tuple (important features and outcome labels)

def get_top_features(df_name, f_index):
    
    """
    Function to create a dictionary of top features from qcd data and 
    important features.

    Reads the module-level ``important_fea`` (feature names) and
    ``qcd_data`` (feature data frame and outcome labels per key).

    Input: 
    df_name : names (keys) of qcd data
    f_index: list of index of top features, one per key
    
    output: 
    dictionary mapping each key to a tuple
    (data frame restricted to the important features, outcome labels)
    
    """
    top_f = dict()
    for key, ind in zip(df_name, f_index):
        print("Working on :{}, with imp feature index :{}".format(key, ind))
        get_features = list(important_fea[key][ind][1])
        x_df = qcd_data[key][0]
        labels = qcd_data[key][1].copy() # outcome labels
        # Exact name matching: a regex built with '|'.join() also matches
        # substrings and, for an empty feature list, every column.
        selected = x_df.columns.isin(get_features)
        # copy only the selected columns instead of the whole data frame
        top_f[key] = (x_df.loc[:, selected].copy(), labels)
        
    return top_f


top_fea = get_top_features(df_name = var_names, 
                            f_index = fea_indx)


In [None]:
# check if we got all the important features
# matching top features should be all true
[(top_fea[k][0].columns == important_fea[k][i][1]).all() 
 for k, i in zip(top_fea.keys(), fea_indx)]

In [None]:
# top features in the earlier version
# top_fea = dict({k:(important_fea[k][indx][1], qcd_data[k][1]) for k, indx in zip(var_names, fea_indx)})

In [None]:
# top_fea

In [None]:
keys = list(top_fea.keys())
keys

In [None]:
# check if ids are matching
(qcd_data['ptsdpm_wo_NonCpGs'][0]['Basename'] == qcd_data['ptsdpm_wo_NonCpGsXY'][0]['Basename']).all() 

In [None]:
# -------------------------------
# Now save the data in csv so that we could use
# it in the getting demographic characteristics
current_ptsd = [pd.concat([qcd_data[k][0]['Basename'], 
                         top_fea[k][0]], axis=1) for k in top_fea.keys()]

In [None]:
# Plain loop: a list comprehension used only for printing builds a
# throw-away list of None that the notebook then echoes.
for x in current_ptsd:
    print("Dim of current ptsd dfs {}".format(x.shape))

In [None]:
# Add outcome variable (current ptsd) to each df
for i,val in enumerate(current_ptsd):
    current_ptsd[i]['current_ptsd'] = top_fea[keys[i]][1]

In [None]:
# Function to check shape
def print_shape(in_df):
    """
    Print the number of data frames and the shape of the first two,
    then return a preview of the first one.

    Input:
    in_df: sequence with at least two data frames
           (first: without nonCpGs, second: without nonCpGsXY)

    Output:
    first five rows and first five columns of in_df[0]; returned so the
    notebook displays it (the original computed it and dropped it)
    """
    print("Length:", len(in_df))
    print("Shape without nonCpGs :", in_df[0].shape)
    print("Shape without nonCpGsXY :", in_df[1].shape)
    return in_df[0].iloc[:, 0:5].head()

In [None]:
# Current PTSD details
print_shape(current_ptsd)

In [None]:
# Write the current-PTSD important features to an Excel workbook,
# one sheet per data set, using XlsxWriter as the engine.
import pandas as pd
path = "G:/PGC ML/Combined Data/2022-03-30_15-07-11/"

# The context manager saves and closes the workbook on exit, also when
# an export fails; ExcelWriter.save() is no longer available in pandas 2.x.
with pd.ExcelWriter(path + 'ElasticNet_Current_ptsd_important_features.xlsx',
                    engine='xlsxwriter') as writer:
    current_ptsd[0].to_excel(writer, sheet_name="Without NonCpG Probes",
                             index=False)
    current_ptsd[1].to_excel(writer, sheet_name="Without NonCpGXY Probes",
                             index=False)

# current_ptsd

In [None]:
# keys of lifetime ptsd
ptsdlife_keys = [k for k in qcd_data.keys() if 'ptsdlife' in k]
ptsdlife_keys

In [None]:
lifetime_ptsd = [qcd_data[k][0] for k in ptsdlife_keys]
[x.shape for x in lifetime_ptsd]

In [None]:
# Now get important features only.
# Match column names exactly: a regex built with '|'.join() also matches
# substrings and, for an empty feature list, every column.
lifetime_ptsd = [x.loc[:, x.columns.isin(list(y))]
                 for x, y in zip(lifetime_ptsd, fea_ls)]
[x.shape for x in lifetime_ptsd]

In [None]:
# Now add sample id and lifetime ptsd
lifetime_ptsd = [pd.concat([qcd_data[k][0]['Basename'], y], axis=1) 
                 for k, y in zip(ptsdlife_keys, lifetime_ptsd)]

In [None]:
[x.shape for x in lifetime_ptsd]

In [None]:
# Add outcome variable (lifetime ptsd) to each df
for i,val in enumerate(lifetime_ptsd):
    lifetime_ptsd[i]['lifetime_ptsd'] = qcd_data[ptsdlife_keys[i]][1]

In [None]:
# Current Lifetime PTSD details
print_shape(lifetime_ptsd)

In [None]:
# Write the lifetime-PTSD important features to an Excel workbook,
# one sheet per data set. Change file name to save if needed.
# The context manager saves and closes the workbook on exit, also when
# an export fails; ExcelWriter.save() is no longer available in pandas 2.x.
with pd.ExcelWriter(path + 'ElasticNet_Lifetime_ptsd_important_features.xlsx',
                    engine='xlsxwriter') as writer_life:
    lifetime_ptsd[0].to_excel(writer_life, sheet_name="Without NonCpG Probes",
                              index=False)
    lifetime_ptsd[1].to_excel(writer_life, sheet_name="Without NonCpGXY Probes",
                              index=False)

In [None]:
top_fea.keys()

In [None]:
# Random forest on top features
fig, (ax1, ax2) = plt.subplots(1, 2, dpi=300) # axes for two figures
fig.set_figheight(3.8)
fig.set_figwidth(8)
model_rf1 = run_model(top_fea = top_fea, 
                      qcd_data_ml = qcd_data,
                      clf = clf_rf, 
                      key = 'ptsdpm_wo_NonCpGs', 
                      title = "Current PTSD (Random forest)- \nwithout NonCpG probes",
                      plot_label = "A",
                      ax=ax1,
                      colbar=False)
# plt.colorbar().remove()

model_rf2 = run_model(top_fea = top_fea, 
                      qcd_data_ml = qcd_data,
                      clf = clf_rf, 
                      key = 'ptsdpm_wo_NonCpGsXY', 
                      title = "Current PTSD (Random forest)-\nwithout NonCpG and XY probes",
                      ax=ax2,
                      yax = False)
plt.tight_layout()


In [None]:
(168+55)/(168+55+60+24)

In [None]:
# Gradient Boosting
fig, (ax1, ax2) = plt.subplots(1, 2, dpi=300) # axes for two figures
fig.set_figheight(3.8)
fig.set_figwidth(8)
model_gb1 = run_model(top_fea = top_fea, 
                      qcd_data_ml = qcd_data,
                      clf = clf_gb, 
                      key = 'ptsdpm_wo_NonCpGs', 
                      title = "Current PTSD (Gradient Boost)- \nwithout NonCpG probes",
                      plot_label = "A",
                      ax=ax1,
                      colbar=False)
# plt.colorbar().remove()

model_gb2 = run_model(top_fea = top_fea, 
                      qcd_data_ml = qcd_data,
                      clf = clf_gb, 
                      key = 'ptsdpm_wo_NonCpGsXY', 
                      title = "Current PTSD (Gradient Boost)-\nwithout NonCpG and XY probes",
                      ax=ax2,
                      yax = False)
plt.tight_layout()


In [None]:

# from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
# import matplotlib.pyplot as plt
# import seaborn as sn
# import pandas as pd
# def build_confusion_matrix_test(ax, fig_title):
#     cm = np.array([[279,  49],
#                    [ 18 , 161]])
#     print(cm)
    
#     disp = ConfusionMatrixDisplay(confusion_matrix=cm,
#                                   display_labels=[0,1])
        
#     title_font = {'size':'13.5'}  # Adjust to fit
    
#     disp.plot(ax = ax, colorbar=False) # pass axis and colorbar here
#     disp.ax_.set_title(fig_title, fontdict = title_font)

#     disp.ax_.grid(None)

In [None]:
# fig, (ax1, ax2) = plt.subplots(1, 2, dpi=200)
# # fig.set_figheight(4)
# # fig.set_figwidth(10)
# build_confusion_matrix_test(ax=ax1, fig_title= "Fig A")
# # build_confusion_matrix_test(ax=ax2, fig_title= "Fig B")
# # plt.tight_layout()
# # test_plot



In [None]:
# import inspect
# lines = inspect.getsource(ConfusionMatrixDisplay)
# print(lines)

In [None]:
# get feature importance from the model 
# used above on top features
def get_feature_importance(modl, coef = False):
    """
    Extract the feature importance from a fitted pipeline.

    Input:
    modl: fitted pipeline; the estimator is taken from its last step
    coef: if True read ``coef_`` (first row, converted to a list),
          otherwise read ``feature_importances_``

    Output:
    importance values, one per feature
    """
    # last step is the estimator (same object as steps[1][1] for the
    # two-step scaler + estimator pipelines used here)
    estimator = modl.steps[-1][1]
    if coef:
        importance = estimator.coef_
        importance = importance[0].tolist() # convert numpy to list
    else:
        importance = estimator.feature_importances_

    # show the first 10 values; the former slice [1:10] skipped the
    # first feature and printed only 9
    print("10 Feature importance: \n", importance[:10])
    print("Length: \n", len(importance))
    return(importance)

In [None]:
# Feature importance
non_cpgs_imp_rf = get_feature_importance(modl = model_rf1)

In [None]:
# Feature importance
non_cpgsXY_imp_rf = get_feature_importance(modl = model_rf2)

In [None]:
top_fea['ptsdpm_wo_NonCpGs'][0].columns

In [None]:
top_fea['ptsdpm_wo_NonCpGsXY'][0].columns

In [None]:
# Creat dataframes of important features with score
def make_df(fea_importance, top_features):
    """
    Build data frame(s) of feature importance indexed by feature name.

    Input: 
    fea_importance: importance values, or a list with one set of
                    importance values per data set
    top_features: when a list is passed, dictionary whose values hold
                  the feature data frame at position 0 (its columns
                  become the index); otherwise the index itself

    Output:
    list of data frames for list input, a single data frame otherwise
    """
    # single set of importance values: top_features is used as the index
    if not isinstance(fea_importance, list):
        return pd.DataFrame(fea_importance, top_features)

    print("list passed as input...")
    frames = []
    for scores, key in zip(fea_importance, top_features.keys()):
        feature_names = top_features[key][0].columns
        frames.append(pd.DataFrame(scores, feature_names))
    return frames
    

In [None]:
importance_dfs_rf = make_df(fea_importance=[non_cpgs_imp_rf, non_cpgsXY_imp_rf],
                            top_features=top_fea)

In [None]:
importance_dfs_rf

In [None]:
def arrange_df(df):
    """
    Turn an importance data frame into a sorted two-column table.

    The data frame is modified in place: its index becomes the first
    column, the columns are named 'Feature' and 'Importance', and the
    rows are sorted by ascending importance.

    Input:
    df: data frame with the feature names as index and one value column

    Output:
    the same data frame object
    """
    column_names = ['Feature', "Importance"]
    df.reset_index(inplace=True)  # feature names: index -> column
    df.columns = column_names
    df.sort_values(by="Importance", inplace=True)
    return df

In [None]:
importance_dfs_rf = [arrange_df(df = x) for x in importance_dfs_rf]

In [None]:
importance_dfs_rf

In [None]:
importance_dfs_rf[0]

In [None]:
# save feature importance in csv
importance_dfs_rf[0].to_csv("G:/PGC ML/Combined Data/2022-03-30_15-07-11/Important_features_wo_non_CpGs_RF_selected_wd_EN.csv",
                                   index=False)

importance_dfs_rf[1].to_csv("G:/PGC ML/Combined Data/2022-03-30_15-07-11/Important_features_wo_non_CpGsXY_RF_selected_wd_EN.csv",
                                   index=False)

In [None]:
# Lasso ------------------
fig, (ax1, ax2) = plt.subplots(1, 2, dpi=300) # axes for two figures
fig.set_figheight(3.8)
fig.set_figwidth(8)
model_lasso1 = run_model(top_fea = top_fea, 
                         qcd_data_ml = qcd_data,
                         clf = clf_lso, 
                         key = "ptsdpm_wo_NonCpGs", 
                         title = "Current PTSD (Lasso)-\nwithout NonCpG probes",
                         plot_label='B',
                         ax=ax1,
                         colbar=False
                        )


model_lasso2 = run_model(top_fea = top_fea, 
                         qcd_data_ml = qcd_data,
                         clf = clf_lso, 
                         key = "ptsdpm_wo_NonCpGsXY", 
                         title = "Current PTSD (Lasso)-\nwithout NonCpG and XY probes",
                         ax = ax2,
                         yax = False)

plt.tight_layout()

In [None]:
# feature importance - coefficients
non_cpgs_imp_lasso, non_cpgsXY_imp_lasso = [get_feature_importance(modl=x, coef = True) 
                            for x in [model_lasso1, model_lasso2]]

importance_dfs_lso = make_df(fea_importance=[non_cpgs_imp_lasso, non_cpgsXY_imp_lasso],
                            top_features=top_fea)

In [None]:
importance_dfs_lso = [arrange_df(df = x) for x in importance_dfs_lso]
importance_dfs_lso

In [None]:
# save feature importance in csv
importance_dfs_lso[0].to_csv("G:/PGC ML/Combined Data/2022-03-30_15-07-11/Important_features_wo_non_CpGs_Lasso_selected_wd_EN.csv",
                                   index=False)

importance_dfs_lso[1].to_csv("G:/PGC ML/Combined Data/2022-03-30_15-07-11/Important_features_wo_non_CpGsXY_Lasso_selected_wd_EN.csv",
                                   index=False)

In [None]:
# elesticNet -----------------------
fig, (ax1, ax2) = plt.subplots(1, 2, dpi=200) # axes for two figures
fig.set_figheight(3)
fig.set_figwidth(6)
model_elasticnet1 = run_model(top_fea = top_fea, 
                              qcd_data_ml = qcd_data,
                              clf = clf_EN, 
                              key = "ptsdpm_wo_NonCpGs",
                              title= "Current PTSD (ElasticNet)-\nwithout NonCpG probes",
                              plot_label='C',
                              ax=ax1,
                              colbar=False)

model_elasticnet2 = run_model(top_fea = top_fea, 
                              qcd_data_ml = qcd_data,
                              clf = clf_EN, 
                              key = "ptsdpm_wo_NonCpGsXY",
                              title= "Current PTSD (ElasticNet)-\nwithout NonCpG and XY probes",
                              ax=ax2,
                              yax=False)
plt.tight_layout()

### ISTSS plot - confusion matrix

In [None]:
fig, ax1 = plt.subplots(1, 1, dpi=300) # axes for two figures
fig.set_figheight(2)
fig.set_figwidth(2.5)
model_elasticnet_istss = run_model(top_fea = top_fea, 
                                   qcd_data_ml = qcd_data,
                                   clf = clf_EN, 
                                   key = "ptsdpm_wo_NonCpGsXY",
                                   title= "Confusion matrix (ElasticNet)",
                                   ax = ax1)
ax1.set_xticklabels(["No PTSD", "PTSD"])
ax1.set_yticklabels(["No PTSD", "PTSD"])
plt.tight_layout()



In [None]:
# feature importance - coefficients
non_cpgs_imp_en, non_cpgsXY_imp_en = [get_feature_importance(modl=x, coef = True) 
                            for x in [model_elasticnet1, model_elasticnet2]]

importance_dfs_en = make_df(fea_importance=[non_cpgs_imp_en, non_cpgsXY_imp_en],
                            top_features=top_fea)

In [None]:
importance_dfs_en

In [None]:
importance_dfs_en = [arrange_df(df = x) for x in importance_dfs_en]
importance_dfs_en

In [None]:
# save feature importance in csv
# First was using l1_ratio = 0.05
# importance_dfs_en[0].to_csv("G:/PGC ML/Combined Data/2022-03-30_15-07-11/Important_features_wo_non_CpGs_EN_selected_wd_EN.csv",
#                                    index=False)

# importance_dfs_en[1].to_csv("G:/PGC ML/Combined Data/2022-03-30_15-07-11/Important_features_wo_non_CpGsXY_EN_selected_wd_EN.csv",
#                                    index=False)


importance_dfs_en[0].to_csv("G:/PGC ML/Combined Data/2022-03-30_15-07-11/Important_features_wo_non_CpGs_EN_selected_wd_EN_l1_r_0.1.csv",
                                   index=False)

importance_dfs_en[1].to_csv("G:/PGC ML/Combined Data/2022-03-30_15-07-11/Important_features_wo_non_CpGsXY_EN_selected_wd_EN_l1_r_0.1.csv",
                                   index=False)

In [None]:
## Elastic net with cv
# This was Cross validation to choose l1_ratio
# It shows l1_ratio = 0.1 works better
fig, (ax1, ax2) = plt.subplots(1, 2, dpi=300) # axes for two figures
fig.set_figheight(3.8)
fig.set_figwidth(8)
EN_CV_1 = run_model(top_fea = top_fea, 
                              qcd_data_ml = qcd_data,
                              clf = clf_EN_CV, 
                              key = "ptsdpm_wo_NonCpGs",
                              title= "Current PTSD (ElasticNet CV)-\nwithout NonCpG probes",
                              plot_label='C',
                              ax=ax1,
                              colbar=False)

EN_CV_2 = run_model(top_fea = top_fea, 
                              qcd_data_ml = qcd_data,
                              clf = clf_EN_CV, 
                              key = "ptsdpm_wo_NonCpGsXY",
                              title= "Current PTSD (ElasticNet CV)-\nwithout NonCpG and XY probes",
                              ax=ax2,
                              yax=False)
plt.tight_layout()


In [None]:
# We will save these models to use in future
EN_CV_models = {'ptsdpm_wo_NonCpGs': EN_CV_1, 
                "ptsdpm_wo_NonCpGsXY": EN_CV_2}

In [None]:
# Save
joblib.dump(EN_CV_models, "G:/PGC ML/Combined Data/2022-03-30_15-07-11/CV_EN_models.pkl")

In [None]:
EN_CV_1[1].intercept_

In [None]:
EN_CV_1[1].l1_ratio_

In [None]:
EN_CV_1[1].Cs_

In [None]:
EN_CV_1[1].scores_

#### As many features have a zero coefficient, let's check model accuracy using only the features with non-zero coefficients

In [None]:
# how many have non-zero coefficient
en_fea_nonzero_imp = [(x.loc[x["Importance"] != 0]) for x in importance_dfs_en]
en_fea_nonzero_imp

In [None]:
# How many  have zero coeff
[(x["Importance"] == 0).sum() for x in importance_dfs_en]

In [None]:
for i in range(len(en_fea_nonzero_imp)):
    print(i)
en_fea_nonzero_imp[1]

In [None]:
def get_nonzero_features(top, fea):
    
    """
    Function to get non zero coefficient features
    
    Input:
    top: top features in dictionary; each value holds the feature data
         frame at position 0 and the outcome labels at position 1
    fea: nonzero coef features, one data frame with a "Feature" column
         per key of ``top`` (same order)
    
    Output: 
    Dictionary mapping each key to a tuple
    (data frame restricted to the listed features, outcome labels)
    """
    nonzero = dict()
    for key, fea_df in zip(top.keys(), fea):
        features = fea_df["Feature"].tolist()
        print(key)
        df = top[key][0]
        labels = top[key][1].copy()
        # Exact name matching: a regex built with '|'.join() also matches
        # substrings and, when no feature is left, every column.
        selected = df.columns.isin(features)
        # copy only the selected columns instead of the whole data frame
        nonzero[key] = (df.loc[:, selected].copy(), labels)
    return nonzero

In [None]:
top_fea_nonzero = get_nonzero_features(top = top_fea,
                                       fea = en_fea_nonzero_imp)

In [None]:
# check if we got all the important features
# matching top features should be all true
[(top_fea_nonzero[k][0].columns.isin(en_fea_nonzero_imp[i]["Feature"].tolist())).all() 
 for k, i in zip(top_fea.keys(), range(len(en_fea_nonzero_imp)))]

In [None]:
[top_fea_nonzero[k][0].shape for k in top_fea_nonzero.keys()]

In [None]:
# Now run the model for features with nonzero coefficients
fig, (ax1, ax2) = plt.subplots(1, 2, dpi=300) # axes for two figures
fig.set_figheight(3.8)
fig.set_figwidth(8)
model_en_nonzero1 = run_model(top_fea = top_fea_nonzero, 
                              qcd_data_ml = qcd_data,
                              clf = clf_EN, 
                              key = "ptsdpm_wo_NonCpGs",
                              title= "Current PTSD (ElasticNet-nonzero)-\nwithout NonCpG probes",
                              plot_label='C',
                              ax=ax1,
                              colbar=False)

model_en_nonzero2 = run_model(top_fea = top_fea_nonzero, 
                              qcd_data_ml = qcd_data,
                              clf = clf_EN, 
                              key = "ptsdpm_wo_NonCpGsXY",
                              title= "Current PTSD (ElasticNet-nonzero)-\nwithout NonCpG and XY probes",
                              ax=ax2,
                              yax=False)
plt.tight_layout()

#### AUC curve

In [None]:
from sklearn.metrics import auc
from sklearn.metrics import plot_roc_curve
from sklearn.model_selection import StratifiedKFold
from sklearn import svm

def get_auc(imp_data, ptsd_labels, classifier, model,
            name = False, ax = False):
    """Plot cross-validated ROC curves for one classifier.

    A fresh deep copy of ``classifier`` is fitted on each of 10 stratified
    folds. Every fold's ROC curve is drawn, together with the chance
    diagonal, the mean ROC curve (mean AUC and the standard deviation of the
    fold AUCs are shown in the legend) and a +/- 1 standard deviation band.

    Parameters
    ----------
    imp_data : array-like
        Feature matrix with one row per sample. Rows are selected by position.
    ptsd_labels : array-like
        Class labels, one per row of ``imp_data``.
    classifier : estimator
        Estimator to evaluate. It is deep-copied for every fold, so the
        object passed in is not fitted by this function.
    model : str
        Model name used in the plot title.
    name : str or False, optional
        Dataset name. When given, the title is ``"ROC: <name> (<model>)"``;
        otherwise the title is just ``model``.
    ax : matplotlib Axes or False, optional
        Axes to draw on. When omitted, a small figure is created and shown.

    Returns
    -------
    None
    """
    from copy import deepcopy

    import matplotlib.pyplot as plt

    # Remember who owns the figure *before* `ax` is rebound below; testing
    # `ax` after the figure has been created could never trigger plt.show().
    owns_figure = not ax

    # Convert once so that fold indices are always applied positionally,
    # even if a pandas object with a non-default index is passed in.
    features = np.asarray(imp_data)
    labels = np.asarray(ptsd_labels)

    tprs = []
    aucs = []
    print("Classifier :", classifier)
    print("Dimension:", features.shape)
    print("Dataset :", name)

    # Run classifier with cross-validation and plot ROC curves
    cv = StratifiedKFold(n_splits=10)

    # Common FPR grid onto which every fold's curve is interpolated
    mean_fpr = np.linspace(0, 1, 100)

    if owns_figure:
        fig, ax = plt.subplots(dpi=300)
        fig.set_figheight(1.7)
        fig.set_figwidth(2.1)

    for i, (train, test) in enumerate(cv.split(features, labels)):
        classifier_copy = deepcopy(classifier)
        classifier_copy.fit(features[train], labels[train])
        viz = plot_roc_curve(classifier_copy, features[test], labels[test],
                             name='ROC fold {}'.format(i),
                             alpha=0.8, lw=.6, ax=ax)
        interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
        interp_tpr[0] = 0.0  # force every curve to start at the origin
        tprs.append(interp_tpr)
        aucs.append(viz.roc_auc)

    ax.plot([0, 1], [0, 1], linestyle='--', lw=.8, color='r',
            label='Chance', alpha=1)

    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0  # force the mean curve to end at (1, 1)
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(aucs)
    ax.plot(mean_fpr, mean_tpr, color='b',
            label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
            lw=.8, alpha=.8)

    # Shaded band: mean TPR +/- one standard deviation, clipped to [0, 1]
    std_tpr = np.std(tprs, axis=0)
    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
    ax.fill_between(mean_fpr, tprs_lower, tprs_upper, 
                    color='grey', alpha=.2,
                    label=r'$\pm$ 1 std. dev.')

    # if no name passed as an argument for title
    if not name:
        ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05],
               title= model)
    else:
        ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05],
               title="ROC: "+ 
               name+ " ("+ model + ")")

    # legend size
    ax.legend(loc="lower right", prop={'size': 4})

    # Only show the figure when it was created here; a caller that passed
    # its own axes decides when to render.
    if owns_figure:
        plt.show()
    

In [None]:
top_fea.keys()

In [None]:
len(qcd_data['ptsdpm_wo_NonCpGs'][1])

In [None]:
# clfs = [clf_rf, clf_lso, clf_EN]
# models = [ "Random Forest", "Lasso", "Elastic Net"]

In [None]:
# ROC curves for every model on every dataset, using all samples
# (train + test). Test data must not be used to tune the models, to avoid
# information leakage.

for key in top_fea.keys():
    p_labels = qcd_data[key][1]
    data = top_fea[key][0].values

    # classifiers and the matching display names, in the same order
    clfs = [clf_rf, clf_lso, clf_EN]
    models = [ "Random Forest", "Lasso", "Elastic Net"]

    # one ROC figure per classifier on the top features
    for j, current_clf in enumerate(clfs):
        get_auc(imp_data=data,
                ptsd_labels=p_labels,
                name="\nCurrent PTSD -" + key,
                classifier=current_clf,
                model=models[j])

### ISTSS - AUC

In [None]:
# Labels and top-feature matrix for the dataset without NonCpG and XY probes
p_labels_istss = qcd_data['ptsdpm_wo_NonCpGsXY'][1]
data_istss = top_fea['ptsdpm_wo_NonCpGsXY'][0].values

fig, ax1 = plt.subplots(1, 1, dpi=300) # a single axes for the ROC plot
fig.set_figheight(2)
fig.set_figwidth(2.5)

# Draw the cross-validated ROC curves on the axes created above
get_auc(imp_data=data_istss, 
        ptsd_labels= p_labels_istss,       
        name = "\n", 
        classifier=clf_EN,
        model = "ElasticNet",
        ax=ax1)

In [None]:
# ------------Test
# get_auc(imp_data=data, ptsd_labels= p_labels, 
#         name = "\nCurrent PTSD -" + 'ptsdpm_wo_NonCpGs', 
#         classifier=clfs[0],
#         model = models[0])

In [None]:
# ROC curves for the top features that have a nonzero coefficient
for key in top_fea_nonzero.keys():
    p_labels = qcd_data[key][1]
    data = top_fea_nonzero[key][0].values

    # classifiers and the matching display names, in the same order
    clfs = [clf_rf, clf_lso, clf_EN]
    models = [ "Random forest", "Lasso", "ElasticNet"]

    # one ROC figure per classifier on the nonzero-coefficient features
    for j, current_clf in enumerate(clfs):
        get_auc(imp_data=data,
                ptsd_labels=p_labels,
                name="\nCurrent PTSD nonzero -" + key,
                classifier=current_clf,
                model=models[j])

### ROC without Trauma and Childhood mt

In [None]:
top_wo_pheno = top_fea['ptsdpm_wo_NonCpGsXY'][0].copy()
top_wo_pheno = top_wo_pheno.drop(["Traumanum", "Childhood_Mt"], axis=1)
top_wo_pheno 

### ISTSS accuracy without childhood trauma and cumulative trauma

In [None]:
top_wo_pheno_istss = {"ptsdpm_wo_NonCpGsXY": (top_wo_pheno, 
                                               top_fea['ptsdpm_wo_NonCpGsXY'][1].copy())
                      }
fig, ax3 = plt.subplots(1, 1, dpi=300)
fig.set_figheight(2)
fig.set_figwidth(2.5)
elasticnet_istss_wo_exp = run_model(top_fea = top_wo_pheno_istss, 
                                   qcd_data_ml = qcd_data,
                                   clf = clf_EN, 
                                   key = "ptsdpm_wo_NonCpGsXY",
                                   title= "Confusion matrix (ElasticNet) \n without exposure variables",
                                   ax = ax3)
ax3.set_xticklabels(["No PTSD", "PTSD"])
ax3.set_yticklabels(["No PTSD", "PTSD"])
plt.tight_layout()

### Get weights of CpGs from the model which was run without exposure variables

In [None]:

# feature importance - coefficients
non_cpgsXY_imp_wo_exp = get_feature_importance(modl=elasticnet_istss_wo_exp, coef = True) 

importance_dfs_en_wo_exp = make_df(fea_importance=[non_cpgsXY_imp_wo_exp],
                            top_features=top_wo_pheno_istss )

In [None]:
importance_dfs_en_wo_exp[0]

In [None]:
importance_dfs_en_wo_exp = arrange_df(df = importance_dfs_en_wo_exp[0])
importance_dfs_en_wo_exp
importance_dfs_en_wo_exp

In [None]:
importance_dfs_en_wo_exp.to_csv("G:/PGC ML/Combined Data/2022-03-30_15-07-11/Important_features_wo_non_CpGsXY_EN_WO_Exposure_Vars_selected_wd_EN_l1_r_0.1.csv",
                                   index=False)

In [None]:
# (top_wo_pheno_istss['ptsdpm_wo_NonCpGsXY'][1] == qcd_data['ptsdpm_wo_NonCpGsXY'][1]).all()

In [None]:
p_labels_istss = qcd_data['ptsdpm_wo_NonCpGsXY'][1]
data_istss = top_wo_pheno.values.copy()
get_auc(imp_data=data_istss, 
        ptsd_labels= p_labels_istss,       
        name = "\n", 
        classifier=clf_EN,
        model = "ElasticNet, without exposure variables")

In [None]:
top_wo_pheno = top_wo_pheno.values
get_auc(imp_data=top_wo_pheno, ptsd_labels= p_labels, 
        name = "\nCurrent PTSD -" + 'ptsdpm_wo_NonCpGs', 
        classifier=clfs[0],
        model = models[0])

In [None]:
get_auc(imp_data=top_wo_pheno, ptsd_labels= p_labels, 
        name = "\nCurrent PTSD -" + 'ptsdpm_wo_NonCpGs', 
        classifier=clfs[1],
        model = models[1])

In [None]:
get_auc(imp_data=top_wo_pheno, ptsd_labels= p_labels, 
        name = "\nCurrent PTSD -" + 'ptsdpm_wo_NonCpGs', 
        classifier=clfs[2],
        model = models[2])

### Plots for manuscript

In [None]:

# Confusion matrices for the manuscript: Elastic Net, Lasso and Random
# Forest side by side on the top features of 'ptsdpm_wo_NonCpGsXY'.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, width_ratios=[1,1,1.25], dpi=300) # three panels; the last one is wider
fig.set_figheight(2.5)
fig.set_figwidth(8)

# keyword arguments shared by the three run_model calls
shared_args = dict(top_fea=top_fea,
                   qcd_data_ml=qcd_data,
                   key="ptsdpm_wo_NonCpGsXY")

en = run_model(clf=clf_EN,
               title="Elastic Net",
               ax=ax1,
               colbar=False,
               **shared_args)

lasso = run_model(clf=clf_lso,
                  title="Lasso",
                  ax=ax2,
                  yax=False,
                  colbar=False,
                  **shared_args)

rf = run_model(clf=clf_rf,
               title="Random Forest",
               ax=ax3,
               yax=False,
               **shared_args)

# Class names on every x axis; y tick labels only on the left-most panel
for x in (ax1, ax2, ax3):
    x.set_yticklabels(["No PTSD", "PTSD"] if x is ax1 else [])
    x.set_xticklabels(["No PTSD", "PTSD"])

plt.tight_layout()

### AUC for manuscript

In [None]:
# ROC curves for the manuscript, one panel per model
p_labels = qcd_data['ptsdpm_wo_NonCpGsXY'][1]
data = top_fea['ptsdpm_wo_NonCpGsXY'][0].values

fig, (ax1, ax2, ax3) = plt.subplots(1, 3, dpi=300) # axes for three figures
fig.set_figheight(2.7)
fig.set_figwidth(7)

# Panels from left to right use entries 2, 1 and 0 of clfs / models
# (elastic net, lasso, random forest as those lists are built in earlier cells)
for panel, idx in zip((ax1, ax2, ax3), (2, 1, 0)):
    get_auc(imp_data=data,
            ptsd_labels=p_labels,
            classifier=clfs[idx],
            model=models[idx],
            ax=panel)

# Only the left-most panel keeps its y tick labels and y axis label
for x in [ax2, ax3]:
    x.set_yticklabels([])
    x.set(ylabel=None)

plt.tight_layout()

### Both confusion matrix and AUC together

In [None]:
# Combined manuscript figure: confusion matrices in the top row and
# cross-validated ROC curves in the bottom row, one column per model.
p_labels = qcd_data['ptsdpm_wo_NonCpGsXY'][1]
data = top_fea['ptsdpm_wo_NonCpGsXY'][0].values

fig, axes = plt.subplots(2, 3, width_ratios=[1,1,1.25], dpi=300) # 2 x 3 grid of panels
ax1, ax2, ax3, ax4, ax5, ax6 = axes.flatten()
fig.set_figheight(5)
fig.set_figwidth(8)

# keyword arguments shared by the three run_model calls
shared_args = dict(top_fea=top_fea,
                   qcd_data_ml=qcd_data,
                   key="ptsdpm_wo_NonCpGsXY")

en = run_model(clf=clf_EN,
               title="Elastic Net",
               plot_label='A',
               ax=ax1,
               colbar=False,
               **shared_args)

lasso = run_model(clf=clf_lso,
                  plot_label='B',
                  title="Lasso",
                  ax=ax2,
                  yax=False,
                  colbar=False,
                  **shared_args)

rf = run_model(clf=clf_rf,
               title="Random Forest",
               plot_label='C',
               ax=ax3,
               yax=False,
               **shared_args)

# Top row: class names on every x axis, y tick labels only on the first panel
for x in (ax1, ax2, ax3):
    x.set_yticklabels(["No PTSD", "PTSD"] if x is ax1 else [])
    x.set_xticklabels(["No PTSD", "PTSD"])

# Bottom row: ROC curves in the same model order as the top row
for x, roc_clf in zip((ax4, ax5, ax6), (clf_EN, clf_lso, clf_rf)):
    get_auc(imp_data=data,
            ptsd_labels=p_labels,
            classifier=roc_clf,
            model='',
            ax=x)

# Only the first ROC panel keeps its y tick labels and y axis label
for x in [ax5, ax6]:
    x.set_yticklabels([])
    x.set(ylabel=None)

plt.tight_layout()

### Accuracy and AUC for Model 1

In [None]:
# Plot AUC for manuscript
p_labels = qcd_data['ptsdpm_wo_NonCpGsXY'][1]
data = top_fea['ptsdpm_wo_NonCpGsXY'][0].values

fig, axes = plt.subplots(1, 2, width_ratios=[1.2, 1], dpi=300) # axes for two figures
ax1, ax2 = axes.flatten()
fig.set_figheight(2.2)
fig.set_figwidth(5)
en = run_model(top_fea = top_fea, 
                qcd_data_ml = qcd_data,
                clf = clf_EN, 
                key = "ptsdpm_wo_NonCpGsXY",
                title= "Confusion matrix: Elastic Net",
                ax=ax1,
                colbar=True)

ax1.set_xticklabels(["No PTSD", "PTSD"])
ax1.set_yticklabels(["No PTSD", "PTSD"])


    
get_auc(imp_data=data, 
        ptsd_labels= p_labels, 
        classifier=clf_EN,
        model = 'ROC: Elastic Net',
        ax = ax2)


plt.tight_layout()

### Accuracy and AUC for Model 2 

In [None]:
# top_wo_pheno_istss = {"ptsdpm_wo_NonCpGsXY": (top_wo_pheno, 
#                                                top_fea['ptsdpm_wo_NonCpGsXY'][1].copy())
#                       }

# Labels for the dataset the exposure-free feature table was built from;
# defined here so the cell does not depend on a distant earlier cell.
p_labels_istss = qcd_data['ptsdpm_wo_NonCpGsXY'][1]

# np.asarray works whether `top_wo_pheno` is still a DataFrame or has already
# been converted to an ndarray, whereas `.values` exists only on the DataFrame.
data_istss = np.asarray(top_wo_pheno).copy()

fig, axes = plt.subplots(1, 2, width_ratios=[1.2, 1], dpi=300) # confusion matrix (left) and ROC (right)
ax1, ax2 = axes.flatten()
fig.set_figheight(2.2)
fig.set_figwidth(5)
en = run_model(top_fea = top_wo_pheno_istss, 
                qcd_data_ml = qcd_data,
                clf = clf_EN, 
                key = "ptsdpm_wo_NonCpGsXY",
                title= "Confusion matrix: Elastic Net",
                ax=ax1,
                colbar=True)

ax1.set_xticklabels(["No PTSD", "PTSD"])
ax1.set_yticklabels(["No PTSD", "PTSD"])

get_auc(imp_data=data_istss, 
        ptsd_labels= p_labels_istss, 
        classifier=clf_EN,
        model = 'ROC: Elastic Net',
        ax = ax2)

plt.tight_layout()

In [None]:
get_auc(imp_data=data, ptsd_labels= p_labels, 
        classifier=clfs[0],
        model = models[0])

In [None]:
models


#### As we have trained the models on current PTSD (PTSDpm), let's use them to predict lifetime PTSD


In [None]:
qcd_data.keys()

In [None]:
# Keys of the lifetime-PTSD datasets, and the matching subset of qcd_data.
# The former filter `if k in ptsd_life_k` was always true (k is drawn from
# that very list), so it is dropped; a missing key still raises KeyError.
ptsd_life_k = ['ptsdlife_wo_NonCpGs', 'ptsdlife_wo_NonCpGsXY']
ptsd_life = {k: qcd_data[k] for k in ptsd_life_k}

In [None]:
ptsd_life.keys()

In [None]:
ptsd_life['ptsdlife_wo_NonCpGs'][0].iloc[:, 0:5]

In [None]:
qcd_data["ptsdlife_wo_NonCpGs"][1]

In [None]:
fea_indx

In [None]:
# Function to train on current PTSD and test on lifetime PTSD
def run_model_lifetime(important_fea, qcd_data, clf, key, 
                        life_key, imp_indx, title, 
                       random = True, percent = None):
    """Fit ``clf`` on current-PTSD data and evaluate it on lifetime-PTSD data.

    ``qcd_data[key]`` is split 75/25 (stratified, ``random_state=0``) and
    ``clf`` is fitted in place on the training part, restricted to the
    selected features. The fitted model then predicts the labels of samples
    taken from ``qcd_data[life_key]``; the accuracy, a classification report
    and a confusion matrix are produced for those predictions.

    Parameters
    ----------
    important_fea : mapping
        ``important_fea[key][imp_indx][1]`` holds the selected feature names.
    qcd_data : mapping
        ``qcd_data[k][0]`` is a DataFrame with a ``Basename`` sample-id column
        plus feature columns; ``qcd_data[k][1]`` holds the matching labels.
    clf : estimator
        Classifier to fit; it is fitted in place.
    key : str
        Key of the current-PTSD dataset used for training.
    life_key : str
        Key of the lifetime-PTSD dataset used for testing.
    imp_indx : int
        Index into ``important_fea[key]`` selecting the feature set.
    title : str
        Title passed to ``build_confusion_matrix``.
    random : bool, optional
        If True, test on a stratified random 25% split of the lifetime data
        (which may overlap with the training samples; the overlap count is
        printed). If False, test on all lifetime samples whose ``Basename``
        is not in the training split.
    percent : float or None, optional
        Only used when ``random`` is False: fraction (0-1) of the
        non-overlapping samples to keep.

    Returns
    -------
    pandas.DataFrame
        The lifetime test samples restricted to the selected features, plus
        ``ptsdlife`` (label) and ``Basename`` (sample id) columns.
    """
    # Training on current PTSD
    print("Training on :", key)
    X_train, X_test, y_train, _ = train_test_split(qcd_data[key][0], 
                                                   qcd_data[key][1],
                                                   test_size = 0.25,
                                                   random_state = 0,
                                                   stratify=qcd_data[key][1])

    # Samples used in training
    train_s = X_train['Basename']

    # Remove sample identifier
    X_train, X_test = [x.loc[:, ~x.columns.str.contains("Basename")]
                       for x in [X_train, X_test]]

    # Index of significant features
    f_indx = imp_indx
    print(f_indx)

    # Get significant features from train and test
    features = important_fea[key][f_indx][1]
    print("# of features:", len(features))
    X_train = X_train.loc[:, X_train.columns.isin(features)].values
    X_test = X_test.loc[:, X_test.columns.isin(features)].values

    print("Dimension current ptsd, train and test:", 
          X_train.shape,X_test.shape)

    # fit on current ptsd
    clf.fit(X_train, y_train)

    # Now test on lifetime PTSD
    # NOTE(review): features are picked by membership, so train and test
    # columns line up only if both DataFrames order them identically - confirm.
    k = life_key
    print("Testing on lifetime ptsd :", k)
    if random:  # select samples randomly
        print("Randomly selecting the samples...")
        _, X_test_l, _, y_test_l = train_test_split(qcd_data[k][0],
                                                    qcd_data[k][1],
                                                    test_size = 0.25,
                                                    random_state = 0,
                                                    stratify=qcd_data[k][1])
        X_test_imp_fea = X_test_l.loc[:, X_test_l.columns.isin(features)].copy()
        X_test_imp_fea_comb = X_test_imp_fea.copy()
        X_test_imp_fea_comb["ptsdlife"] = y_test_l
        X_test_imp_fea_comb["Basename"] = X_test_l["Basename"]
        X_test_l = X_test_imp_fea.values
        print("In Training current PTSD from testset in lifetime :",
              X_test_imp_fea_comb['Basename'].isin(train_s).sum())

    else:  # select samples not used in training
        # Now get the samples from lifetime ptsd that 
        # are not used in train (using current ptsd)
        print("Selecting the samples not overlaping with training set...")
        comb = qcd_data[k][0].copy()
        comb['ptsdlife'] = qcd_data[k][1]
        others = comb.loc[~comb['Basename'].isin(train_s)]  # remove those in training

        if percent is not None:
            # `percent` is a fraction, so format it as a percentage
            print("Using {:.0%} of data".format(percent))
            # seeded like the splits above so the subsample is reproducible
            others = others.sample(frac=percent, random_state=0).copy()

        # make test data including important features
        X_test_imp_fea_comb = others.loc[:, others.columns.isin(features)].copy()
        X_test_l = X_test_imp_fea_comb.values
        y_test_l = np.array(others['ptsdlife'])

        # make a copy to return to look at case/controls with remitted
        X_test_imp_fea_comb['ptsdlife'] = others['ptsdlife']
        X_test_imp_fea_comb['Basename'] = others['Basename']

    print("Dimension lifetime ptsd, test:", 
          X_test_l.shape)

    pred = clf.predict(X_test_l)

    # The original message carried an unformatted '{:.3f}' placeholder;
    # compute the accuracy it was meant to show, then print the full report.
    accuracy = np.mean(np.asarray(pred) == np.asarray(y_test_l).ravel())
    print('Classification accuracy using ptsdpm model: {:.3f}\n'.format(accuracy))
    print(classification_report(y_test_l, pred))

    build_confusion_matrix(predictions = pred,
                           ytest=y_test_l,
                           labels=[0,1], 
                           title = title)

    return X_test_imp_fea_comb

In [None]:
fea_indx[0]

In [None]:
important_fea.keys()

In [None]:
# Predict lifetime PTSD with every model trained on current PTSD, testing on
# a random split of the lifetime data. The returned test set is stored under
# the training key; each classifier overwrites the previous entry.
ptsdlife_testset = {}
for i, val in enumerate(ptsd_life_k):
    for j, current_clf in enumerate(clfs):
        title = "\nLifetime PTSD (" + models[j] +  ")-" + val 
        out = run_model_lifetime(important_fea=important_fea,
                                 qcd_data=qcd_data,
                                 clf=current_clf, key=keys[i],
                                 life_key=val, imp_indx=fea_indx[i],
                                 title=title)
        ptsdlife_testset[keys[i]] = out

#### Test lifetime PTSD with samples that are not selected randomly and do not overlap with the current PTSD training set

In [None]:
# Predict lifetime PTSD with every model trained on current PTSD, this time
# testing on the lifetime samples that were not used for training instead of
# drawing the test samples at random.
ptsdlife_testset_wo_rand = {}
for i, val in enumerate(ptsd_life_k):
    for j, current_clf in enumerate(clfs):
        title = "\nLifetime PTSD (" + models[j] +  ")-" + val 
        out_wo_rand = run_model_lifetime(important_fea=important_fea,
                                         qcd_data=qcd_data,
                                         clf=current_clf, key=keys[i],
                                         life_key=val, imp_indx=fea_indx[i],
                                         title=title, random=False)
        ptsdlife_testset_wo_rand[keys[i]] = out_wo_rand

In [None]:
ptsdlife_testset.keys()

In [None]:
ptsdlife_testset['ptsdpm_wo_NonCpGs']

In [None]:
# Check how many of the samples are in current PTSD
# ptsdlife_testset['ptsdpm_wo_NonCpGs']['Basename'].isin(qcd_data['ptsdpm_wo_NonCpGs'][0]['Basename']).sum()

In [None]:
# Get current PTSD information from phenotype file
pheno = pd.read_csv("G:/PGC ML/Pre_Processed Data/2021-11-15_21-41-53/DNHS_GTP_MRS_ArmyS_Prismo_Pheno.csv") 

# Use only one from ptsdpm_wo_NonCpGs/ptsdpm_wo_NonCpGXY
# Both have the same samples but different features
comn_wd_ptsdpm = [pd.merge(x['ptsdpm_wo_NonCpGs'], 
        pheno[["Ptsdpm", "Basename"]], on='Basename') for x in [ptsdlife_testset, ptsdlife_testset_wo_rand]]

In [None]:
[x.shape for x in comn_wd_ptsdpm]

In [None]:
for df in comn_wd_ptsdpm:
    df['Ptsdpm'] = df['Ptsdpm'].astype('Int64')

[x.shape for x in comn_wd_ptsdpm]


In [None]:
# Number of remitted - ptsdpm == 0 & ptsdlife ==1
[(((x['Ptsdpm'] == 0) & (x['ptsdlife'] == 1)).sum()) for x in comn_wd_ptsdpm]

In [None]:
# Number of cases and controls in lifetime PTSD testset
count = [(x['ptsdlife'].value_counts()) for x in comn_wd_ptsdpm]
[(x, x/x.sum()) for x in count] # percent

In [None]:
# -----------------------
# when percentage as input for test set 
# ptsdlife_testset_wo_rand_per = dict()
# for i, val in enumerate(ptsd_life_k):
#     for j in range(len(clfs)):
#         title = "\nLifetime PTSD (" + models[j] +  ")-" + val 
#         out_wo_rand_per = run_model_lifetime(important_fea=important_fea,
#                        qcd_data= qcd_data, 
#                        clf = clfs[j], key = keys[i], 
#                        life_key=ptsd_life_k[i], imp_indx=fea_indx[i],
#                        title = title, random=False, percent = 0.30)
#         ptsdlife_testset_wo_rand_per[keys[i]] = out_wo_rand

In [None]:
fea_indx
keys[0]

In [None]:
ptsd_life_k

In [None]:
# out_wo_rand_per1 = run_model_lifetime_test(important_fea=important_fea,
#                        qcd_data= qcd_data, 
#                        clf = clfs[2], key = keys[1], 
#                        life_key=ptsd_life_k[1], imp_indx=fea_indx[1],
#                        title = title, random=False)

In [None]:
# out_wo_rand_per1 = run_model_lifetime_test(important_fea=important_fea,
#                        qcd_data= qcd_data, 
#                        clf = clfs[2], key = keys[1], 
#                        life_key=ptsd_life_k[1], imp_indx=fea_indx[1],
#                        title = title, random=False)

In [None]:
# Save the lifetime-PTSD test split, restricted to the important features,
# so that methylation risk scores can be generated from it.
# NOTE(review): `ptsdlife_keys` is not defined in the cells shown here (the
# list above is called `ptsd_life_k`) - confirm it exists before running.

path = "G:/PGC ML/Combined Data/2022-03-30_15-07-11/"
for i, k in enumerate(ptsdlife_keys):
    print("Testing on lifetime ptsd :", k)
    # Same split settings (25% test, random_state 0, stratified) as used elsewhere
    X_train_l, X_test_l, y_train_l, y_test_l = train_test_split(qcd_data[k][0], 
                                                                qcd_data[k][1],
                                                                test_size = 0.25,
                                                                random_state = 0,
                                                           stratify=qcd_data[k][1])

    # Keep only the columns listed as important for the matching current-PTSD key
    ptsd_life_test = X_test_l.loc[:, X_test_l.columns.isin(important_fea[keys[i]][fea_indx[i]][1])]

    # Element-wise comparison of column names against the feature list;
    # should print True when names and order agree
    print("All matching :",
         (ptsd_life_test.columns == important_fea[keys[i]][fea_indx[i]][1]).all())

    
    # Attach the lifetime label and the sample identifier to the feature table
    ptsdlife_test_imp = ptsd_life_test.copy()
    ptsdlife_test_imp['PtsdLife'] = y_test_l
    ptsdlife_test_imp['Basename'] = X_test_l['Basename']
    
    
    # Save one CSV per key; note there is no separator between the key and
    # "testset" in the file name
    ptsdlife_test_imp.to_csv(path + "ElasticNet_"+ k + "testset_important_features.csv", index=False)

    

### Check the accuracy without childhood maltreatment

In [None]:
top_fea.keys(), top_fea['ptsdpm_wo_NonCpGs'][0].shape

In [None]:
# Drop childhood maltreatment: one single-entry dict per dataset, mapping the
# key to (top features without 'Childhood_Mt', labels from qcd_data)
top_wo_cm = [
    {k: (top_fea[k][0].drop(columns=['Childhood_Mt']).copy(), qcd_data[k][1])}
    for k in top_fea
]

In [None]:
# Shape after removing childhood maltreatment
for i, k in enumerate(keys):
    print(f"# of features in {k} df: {top_wo_cm[i][k][0].shape}")

In [None]:
# Model top features without childhood mt
en_wo_cm = run_model(top_wo_cm[0], qcd_data, clf_EN, keys[0],
         title = f"\nCurrent PTSD without CM (ElasticNet)-{keys[0]}")

In [None]:
en_wo_cm1 = run_model(top_wo_cm[1], qcd_data, clf_EN, keys[1],
         title = f"\nCurrent PTSD without CM (ElasticNet)-{keys[1]}")

In [None]:
# Random forest
# rf_wo_cm = run_model(top_wo_cm[0], qcd_data, clf_rf, keys[0],
#          title = f"\nCurrent PTSD without CM (Random forest)-{keys[0]}")

In [None]:
# rf_wo_cm1 = run_model(top_wo_cm[1], qcd_data, clf_rf, keys[1],
#          title = f"\nCurrent PTSD without CM (Random forest)-{keys[1]}")

In [None]:
# rf_wo_cm_imp

In [None]:
# get feature importance
# rf_wo_cm_imp = [x.steps[1][1].feature_importances_ for x in [rf_wo_cm, rf_wo_cm1]]
# print(f"# of feature imp dfs: {len(rf_wo_cm_imp)}")
# print(f"# of features in first df: {len(rf_wo_cm_imp[0])}")
# print(f"# of features in second df: {len(rf_wo_cm_imp[1])}")

In [None]:
def create_importance_df(imp_df, fea):
    """Build a feature-importance table sorted by ascending importance.

    Parameters
    ----------
    imp_df : array-like
        Importance values, one per column of ``fea``.
    fea : pandas.DataFrame
        Frame whose column names label the importance values.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``Feature`` and ``Importance``, sorted by ``Importance``.
    """
    # Column names become the index first, then move into a regular column
    table = pd.DataFrame(imp_df, index=fea.columns).reset_index()
    table.columns = ['Feature', "Importance"]
    return table.sort_values(by=['Importance'])
    

In [None]:
# call function to create dfs
# imp_df_wo_cm = [create_importance_df(imp_df=rf_wo_cm_imp[i],
#                                      fea=top_wo_cm[i][keys[i]][0]) for i in range(len(rf_wo_cm_imp))]

In [None]:
# [x.shape for x in imp_df_wo_cm]

In [None]:
# [x.head() for x in imp_df_wo_cm]

#### Now we calculate methylation risk scores based on the weights (feature importance). The methylation risk score is the weighted sum of the important features for each individual

In [None]:
importance_dfs_en

In [None]:
# Split the data into train and test, pull the top features and put them in
# the order of the importance table. Only the test set is returned; it is
# the part used to calculate methylation risk scores.
def get_top_features_testset(key, imp_df):
    """Return the test split of ``qcd_data[key]`` restricted to the top features.

    Parameters
    ----------
    key : str
        Dataset key, looked up in the global ``qcd_data`` and ``top_fea``.
    imp_df : pandas.DataFrame
        Importance table; its ``Feature`` column fixes the column order.

    Returns
    -------
    tuple
        ``(top_fea_test, y_test, ids)``: the ordered test features, the test
        labels and the ``Basename`` identifiers of the test samples.
    """
    frame, labels = qcd_data[key][0], qcd_data[key][1]
    X_train, X_test, y_train, y_test = train_test_split(frame,
                                                        labels,
                                                        test_size=0.25,
                                                        random_state=0,
                                                        stratify=labels)

    # Keep only the columns that are among the top features for this key
    top_columns = top_fea[key][0].columns
    top_fea_train = X_train.loc[:, X_train.columns.isin(top_columns)].copy()
    top_fea_test = X_test.loc[:, X_test.columns.isin(top_columns)].copy()

    print(f"X_test dim: {top_fea_test.shape}")
    print(f"# of test labels: {len(y_test)}")

    # Reorder the columns to follow the feature-importance table
    ordered_features = imp_df['Feature']
    top_fea_train = top_fea_train[ordered_features].copy()
    top_fea_test = top_fea_test[ordered_features].copy()

    # Align the rows with the original test split and keep the sample ids
    top_fea_test = top_fea_test.reindex(X_test.index)
    ids = X_test['Basename'].copy()

    # Confirm that both frames follow the order of the importance table
    print("Training set in order :", (top_fea_train.columns == ordered_features).all())
    print("Test set in order :", (top_fea_test.columns == ordered_features).all())
    return top_fea_test, y_test, ids
    
    

In [None]:
top_fea_test1, top_fea_test2 = [get_top_features_testset(key = keys[i], 
                                   imp_df = importance_dfs_en[i]) for i in range(len(keys))]
print(f"dim first df : {top_fea_test1[0].shape}, # test labels: {len(top_fea_test1[1])}")
print(f"dim second df : {top_fea_test2[0].shape}, # test labels: {len(top_fea_test2[1])}")

In [None]:
# Convert feature column to index
importance_dfs_new = [x.set_index('Feature') for x in importance_dfs_en]
[x.head() for x in importance_dfs_new ]

In [None]:
# Now scale two columns
from sklearn.preprocessing import MinMaxScaler
scl_cols = ["Traumanum", "Childhood_Mt"]

def scale_data(df, cols):
    """Min-max scale the given columns of ``df``.

    The scaler is fitted on ``df[cols]`` itself. The result is a new
    DataFrame holding only ``cols``, indexed like ``df``.
    """
    scaled_values = MinMaxScaler().fit_transform(df[cols])
    return pd.DataFrame(scaled_values, index=df.index, columns=cols)
    
    

In [None]:
# call scale function on the required columns of test data
scaled_test = [scale_data(df = x, 
                          cols=scl_cols) for x in [top_fea_test1[0], 
                                                   top_fea_test2[0]]]
scaled_test

In [None]:
# reset index to replace the columns with scaled
# [x.reset_index(drop=True, inplace=True) for x in [top_fea_test1[0],
#                                                  top_fea_test2[0]]]
top_fea_test1[0][scl_cols] = scaled_test[0][scl_cols]
top_fea_test2[0][scl_cols] = scaled_test[1][scl_cols]


In [None]:
top_fea_test1

In [None]:
# Get risk scores for both train and test data
# get weighted sum
# top_fea_train["risk_score"] = top_fea_train.dot(importance_df_new['Importance'])
top_fea_test1[0]["risk_score"] = top_fea_test1[0].dot(importance_dfs_new[0]['Importance'])
top_fea_test2[0]["risk_score"] = top_fea_test2[0].dot(importance_dfs_new[1]['Importance'])
top_fea_test1

In [None]:
top_fea_test2[0]

In [None]:
# make copy
rsk_scores_test =  [top_fea_test1[0].copy(), top_fea_test2[0].copy()]

In [None]:
for x in rsk_scores_test:
    print(f"dim : {x.shape}")

In [None]:
top_fea_test1[0]

In [None]:
top_fea_test1[2]

In [None]:
# Adding PTSD labels. We also need to reset the index of 
# basenames to add the ids to risk_scores df
rsk_scores_test[0]["ptsdpm"] = top_fea_test1[1]
# top_fea_test1[2].reset_index(drop=True, inplace = True)
rsk_scores_test[0]["Basename"] = top_fea_test1[2]

# top_fea_test2[2].reset_index(drop=True, inplace = True)
rsk_scores_test[1]["ptsdpm"] = top_fea_test2[1]
rsk_scores_test[1]["Basename"] = top_fea_test2[2]

In [None]:
# print head
print(f"First df : {rsk_scores_test[0].shape}")
print(f"Second df : {rsk_scores_test[1].shape}")

In [None]:
rsk_scores_test[0]

In [None]:
# Check if we have all the important features before writing the data
rsk_scores_test[0].columns.isin(top_fea['ptsdpm_wo_NonCpGs'][0].columns).sum()

In [None]:
rsk_scores_test[1].columns.isin(top_fea['ptsdpm_wo_NonCpGsXY'][0].columns).sum()

In [None]:
# save risk scores on test data 2021-10-02_23-12-43
# rsk_scores_train.to_csv("G:/PGC ML/Combined Data/2021-11-27_19-16-35/risk scores ptsdpm training data.csv",
#                        index=False)

# Write both risk-score tables to one workbook, one sheet per dataset.
# The context manager saves and closes the file on exit (even on error) and
# replaces ExcelWriter.save(), which is no longer available in pandas 2.x.
with pd.ExcelWriter(path + 'Elasticnet risk scores ptsdpm test data.xlsx', 
                    engine='xlsxwriter') as writer_test:
    rsk_scores_test[0].to_excel(writer_test, sheet_name="Without NonCpG Probes",
                                index=False)
    rsk_scores_test[1].to_excel(writer_test, sheet_name="Without NonCpGXY Probes",
                                index=False)

# rsk_scores_test.to_csv("G:/PGC ML/Combined Data/2021-11-27_19-16-35/risk scores ptsdpm test data.csv", 
#                       index=False)

In [None]:
# Linear model on the test data: regress the PTSD label on the risk score.
# Columns are picked by position, which assumes the last three columns of
# rsk_scores_test[0] are risk_score, ptsdpm, Basename in that order
# (as appended by the earlier cells) - TODO confirm if those cells change.
from sklearn.linear_model import LinearRegression
rsk_scores = np.array(rsk_scores_test[0].iloc[:, -3:-2]) # third-from-last column as a 2-D predictor
df = rsk_scores_test[0].iloc[:, :-1] # everything except the last column; not used by the fit below
y_test = np.array(rsk_scores_test[0].iloc[:, -2:-1]) # ptsd as outcome
reg = LinearRegression().fit(rsk_scores, y_test)

In [None]:
# R^2 of the linear fit, computed on the same data the model was fitted on.
reg.score(rsk_scores, y_test)

### Other old code

In [None]:
# Bundle the updated data frame (it carries a risk score column) with its
# number of features, keyed by dataset name, to train and test the model.
top_fea_wd_rs = {'ptsdpm_cov_adj': (top_fea_test, 301)}
top_fea_wd_rs


In [None]:
# key = 'ptsdpm_cov_adj'
# run_model(top_fea = top_fea_wd_rs, 
#           qcd_data_ml = qcd_data,
#           clf = clf_rf, key = key, 
#          title = "current PTSD (Random forest)")

In [None]:
# roc curve
# p_labels = qcd_data[key][1]
# data = top_fea_wd_rs[key][0].values
# get_auc(imp_data=data, ptsd_labels= p_labels, 
#         name = "current PTSD", classifier = clf_rf,
#        model = "Random forest")

In [None]:
# Keep only the last column (the risk score) as the single feature used to
# predict ptsd; the second tuple element is the feature count.
risk_sc = {"ptsdpm_cov_adj": (top_fea_test.iloc[:, -1:], 1)}
risk_sc

In [None]:
# using weights on training data
top_fea_all = top_fea['ptsdpm_cov_adj'][0].copy()
# Reorder/select the columns to follow importance_df['Feature']. The explicit
# copy makes the result an independent frame, so the in-place edits applied to
# it afterwards (index reset, scaled-column assignment, risk_score column)
# cannot trigger SettingWithCopyWarning or depend on view/copy heuristics.
top_fea_sor_all = top_fea_all[importance_df['Feature']].copy()
top_fea_sor_all.shape

In [None]:
# Min-max scale the columns listed in scl_cols and wrap the resulting array
# in a DataFrame carrying the same column names (default integer index).
scaler = MinMaxScaler()
all_scaled = pd.DataFrame(
    scaler.fit_transform(top_fea_sor_all[scl_cols]),
    columns=scl_cols,
)
all_scaled

In [None]:
# Replace the scl_cols columns with their scaled versions.
# all_scaled was built without an explicit index, so it has a default integer
# index; the frame's own index is reset (old index dropped) first so the
# column assignment, which aligns on index, matches row by row.
top_fea_sor_all.reset_index(drop=True, inplace=True)
top_fea_sor_all[scl_cols] = all_scaled[scl_cols]
top_fea_sor_all

In [None]:
# Risk score per sample: dot product of the ordered feature frame with the
# 'Importance' weights, stored as a "risk_score" column on the same frame.
# NOTE(review): the weights come from importance_df_new while the column
# order came from importance_df -- presumably both list the same features in
# a compatible index; confirm.
risk_sc_all = top_fea_sor_all.dot(importance_df_new['Importance'])
top_fea_sor_all['risk_score'] = risk_sc_all
top_fea_sor_all

In [None]:
# Display the ordered training feature frame.
top_fea_sor_all

In [None]:
# Wrap the risk-score column (last column of the frame) in the
# {key: (frame, number_of_features)} layout used by the other feature dicts.
# The count is 1 because the frame holds a single column. The previous
# len(risk_sc_all) gave the number of samples on the first run and 1 on a
# re-run, because this cell rebinds risk_sc_all from the Series to the dict.
risk_sc_all = {'ptsdpm_cov_adj': (top_fea_sor_all.iloc[:, -1:], 1)}
risk_sc_all

In [None]:
from sklearn.metrics import auc
from sklearn.metrics import plot_roc_curve
from sklearn.model_selection import StratifiedKFold
from sklearn import svm


def get_auc_varient(imp_data, ptsd_labels, name,
                    classifier, classifier1, model):
    """Plot 10-fold cross-validated ROC curves for a model fed risk scores.

    In every stratified fold, ``classifier1`` is fitted on the training rows
    and the feature importances of its ``balancedrandomforestclassifier``
    step are used as weights: each sample's risk score is the dot product of
    its features with those weights. ``classifier`` is then fitted on the
    training risk scores alone and its ROC curve on the held-out risk scores
    is drawn. The chance diagonal, the mean ROC curve and a +/- 1 standard
    deviation band are added before the figure is shown.

    Parameters
    ----------
    imp_data
        Feature matrix; must support indexing with the integer index arrays
        produced by the splitter and ``.dot`` (presumably a 2-D numpy array
        of samples x features -- confirm with callers).
    ptsd_labels
        Class labels, indexable with the same index arrays.
    name : str
        Outcome name inserted in the plot title.
    classifier
        Estimator fitted on the one-column risk-score frame and evaluated.
    classifier1
        Pipeline exposing
        ``named_steps.balancedrandomforestclassifier.feature_importances_``
        once fitted.
    model : str
        Model label inserted in the plot title.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    import matplotlib.pyplot as plt

    print("Classifier :", classifier)

    fold_tprs, fold_aucs = [], []
    # Common false-positive-rate grid so per-fold curves can be averaged.
    fpr_grid = np.linspace(0, 1, 100)

    # Run classifier with cross-validation and plot ROC curves
    splitter = StratifiedKFold(n_splits=10)
    fig, ax = plt.subplots()
    for fold, (trn_idx, tst_idx) in enumerate(splitter.split(imp_data, ptsd_labels)):
        x_trn, y_trn = imp_data[trn_idx], ptsd_labels[trn_idx]
        x_tst, y_tst = imp_data[tst_idx], ptsd_labels[tst_idx]

        # Fit the importance model. The weights are read before `classifier`
        # is fitted, which matters when both arguments are the same object.
        classifier1.fit(x_trn, y_trn)
        weights = classifier1.named_steps.balancedrandomforestclassifier.feature_importances_

        # Weights learned on the training rows are applied to both splits.
        risk_df_trn = pd.DataFrame(x_trn.dot(weights), columns=['risk'])
        risk_df_test = pd.DataFrame(x_tst.dot(weights), columns=['risk'])

        # Fit the evaluated model on the risk scores only.
        classifier.fit(risk_df_trn, y_trn)

        # NOTE(review): plot_roc_curve was deprecated in scikit-learn 1.0 and
        # removed in 1.2 (successor: RocCurveDisplay.from_estimator).
        viz = plot_roc_curve(classifier, risk_df_test, y_tst,
                             name='ROC fold {}'.format(fold),
                             alpha=0.3, lw=1, ax=ax)

        tpr_on_grid = np.interp(fpr_grid, viz.fpr, viz.tpr)
        tpr_on_grid[0] = 0.0  # force the curve to start at the origin
        fold_tprs.append(tpr_on_grid)
        fold_aucs.append(viz.roc_auc)

    # Diagonal reference line.
    ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
            label='Chance', alpha=.8)

    # Mean curve across folds, forced to end at (1, 1).
    mean_tpr = np.mean(fold_tprs, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(fpr_grid, mean_tpr)
    std_auc = np.std(fold_aucs)
    ax.plot(fpr_grid, mean_tpr, color='b',
            label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
            lw=2, alpha=.8)

    # +/- 1 standard deviation band, clipped to the [0, 1] range.
    std_tpr = np.std(fold_tprs, axis=0)
    band_upper = np.minimum(mean_tpr + std_tpr, 1)
    band_lower = np.maximum(mean_tpr - std_tpr, 0)
    ax.fill_between(fpr_grid, band_lower, band_upper,
                    color='grey', alpha=.2,
                    label=r'$\pm$ 1 std. dev.')

    plot_title = "Receiver operating characteristic: " + name + " (" + model + ")"
    ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05], title=plot_title)
    ax.legend(loc="lower right")
    plt.show()
    


In [None]:
# Run clf_rf on the risk-score-only feature dict.
# NOTE(review): `key` is expected to be set by an earlier cell -- confirm.
run_model(top_fea = risk_sc_all, 
          qcd_data_ml = qcd_data,
          clf = clf_rf, key = key, 
         title = "current PTSD (Random forest)")

In [None]:
# p_labels = qcd_data[key][1]
# data = top_fea[key][0].values
# get_auc_varient(imp_data=data, ptsd_labels= p_labels, 
#                 name = "current PTSD", 
#                 classifier = clf_rf,
#                 classifier1 = clf_rf,
#                 model = "Random forest")

In [None]:
# Run clf_lso on the risk-score-only feature dict.
# NOTE(review): `key` is expected to be set by an earlier cell -- confirm.
run_model(top_fea = risk_sc_all, 
          qcd_data_ml = qcd_data,
          clf = clf_lso, key = key, 
         title = "current PTSD (Lasso)")

In [None]:
# data = top_fea[key][0].values
# get_auc_varient(imp_data=data, ptsd_labels= p_labels, 
#                 name = "current PTSD", 
#                 classifier = clf_lso,
#                 classifier1= clf_rf,
#                 model = "Lasso")

In [None]:
# Run clf_EN on the risk-score-only feature dict.
# NOTE(review): `key` is expected to be set by an earlier cell -- confirm.
run_model(top_fea = risk_sc_all, 
          qcd_data_ml = qcd_data,
          clf = clf_EN, key = key, 
         title = "current PTSD (ElasticNet)")

In [None]:
# using only risk scores to predict: one ROC plot per classifier in clfs,
# each titled with the entry of `models` at the same position
p_labels = qcd_data[key][1]
data = risk_sc_all[key][0].values
for j, candidate_clf in enumerate(clfs):
    get_auc(imp_data=data, ptsd_labels=p_labels,
            name="current PTSD", classifier=candidate_clf,
            model=models[j])

In [None]:
# data = top_fea[key][0].values
# get_auc_varient(imp_data=data, ptsd_labels= p_labels, 
#         name = "current PTSD", 
#         classifier = clf_EN,
#         classifier1= clf_rf,
#         model = "ElasticNet")

In [None]:
# imp_data = top_fea[key][0].values
# ptsd_labels = qcd_data[key][1]
# cv = StratifiedKFold(n_splits=2)
# for i, (train, test) in enumerate(cv.split(imp_data, ptsd_labels)):
#         clf_rf.fit(imp_data[train], ptsd_labels[train])
        
        

In [None]:
# imp_per =  clf_rf.named_steps.balancedrandomforestclassifier.feature_importances_
# risk = imp_data[train].dot(imp_per)
# risk_df = pd.DataFrame(risk, columns=['risk'])

# clf_rf.fit(risk_df, ptsd_labels[train])


### Using best parameters

In [None]:
# Load the saved model-tuning results from disk.
# joblib.load unpickles the file, so only load files from a trusted source.
tuned_parms = joblib.load("G:/PGC ML/Model tuning/2021-10-20_15-35-10/Finetuned_models.pkl")

In [None]:
# Best hyper-parameters stored for the 'ptsdpm_cov_adj' dataset.
# 'clf_tun' is presumably a fitted search object, since it exposes
# best_params_ -- confirm.
best_prms = tuned_parms['ptsdpm_cov_adj']['clf_tun'].best_params_
best_prms

#### Now using the important features selected with the kbest approach together with the tuned hyperparameters

In [None]:
# Appropriate model according to feature set: min-max scaling followed by a
# balanced random forest configured with the tuned hyper-parameters.
# NOTE(review): assumes the keys of best_prms are plain
# BalancedRandomForestClassifier argument names (no pipeline step prefix such
# as "balancedrandomforestclassifier__") -- confirm.
tuned_clf = make_pipeline(MinMaxScaler(),
                BalancedRandomForestClassifier(**best_prms))
run_model(top_fea = top_fea, 
          qcd_data_ml = qcd_data,
          clf = tuned_clf, key = key, 
         title = "current PTSD (Random forest)")


In [None]:
# ROC plot for the tuned pipeline on the selected features of `key`.
# p_labels is reused from the earlier risk-score cell.
data = top_fea[key][0].values
get_auc(imp_data=data, ptsd_labels= p_labels, 
        name = "current PTSD", classifier = tuned_clf,
       model = "Random forest")