In [None]:
import numpy as np
import pandas as pd
# Show every row and every column when a DataFrame is printed (no truncation).
pd.options.display.max_rows = None
pd.options.display.max_columns = None

# IPython magic (only valid inside IPython/Jupyter): runs functions.py and
# makes the names it defines available in this namespace.
# The names df_drop, stat_test, highcorr_stats, tr_predict, preprocessing and
# LogisticRegression are used below but never imported in this file, so they
# are presumably provided by functions.py -- TODO confirm.
%run functions.py

# name of the outcome (label) column
col_y = 'sab'

# load the dataset and immediately drop the variables that the static models
# do not use
df_new = df_drop(
    pd.read_csv('dataset_with_all_vars_converted_to_numerical_values.csv'),
    ['startweek', 'censorweek'],
)
# (rows, columns) of the loaded data; a bare expression like this is only
# displayed when it is the last statement of a notebook cell
df_new.shape

# do the needed preparation
## Center the "number of pregnancies" variable for the gravid cohort
## (rows with b_everpregnant == 1) by subtracting the median.
# NOTE(review): median_numpregs is computed over ALL rows of df_new, not only
# the rows with b_everpregnant == 1 -- confirm this is the intended "median
# for the gravid cohort".
median_numpregs = df_new['b_numpregs'].median()
gravid_rows = df_new['b_everpregnant'] == 1
df_new.loc[gravid_rows, 'b_numpregs'] = (
    df_new.loc[gravid_rows, 'b_numpregs'] - median_numpregs
)

# Statistical Feature Selection (SFS)
# step 1
# drop variables with a (near-)zero standard deviation, i.e. std below 0.001
df_std = df_new.std()
near_constant = (df_std < 0.001).to_numpy()
drop_cols = df_std.index[near_constant].values
df_new = df_drop(df_new, drop_cols)

# outcome as an integer Series, then the per-variable statistics table.
# `result` is used further down through its 'Variable' and 'p-value' columns
# and is merged with the model coefficients for reporting.
y = df_new[col_y].astype(int)
result = stat_test(df_new, y)

# step 2
# Pairwise correlation between variables: pairs with corr > 0.9 are treated
# as highly correlated. For those pairs the correlation with the outcome and
# its p-value are computed as well, and one variable of each pair is removed.
selected_col_to_remove, df_highcorr = highcorr_stats(
    df_new,
    col_y,
    thres=0.9,
)
df_new = df_drop(df_new, selected_col_to_remove)

# step 3
# We test the association between each variable and the outcome
# Chi2 test is used for non-continuous variables, and KS test is used for continuous variables
# We remove variables that are not independently associated with the outcome based on p-value > 0.05
#
# `result` was computed before step 2 removed the highly correlated columns,
# so it can still list variables that are no longer in df_new. Only request
# columns that are still present, and never drop the outcome column itself
# (everything below needs it).
nonsig_vars = result.loc[result['p-value'] > 0.05, 'Variable']
still_present = nonsig_vars.isin(df_new.columns) & (nonsig_vars != col_y)
drop_cols = nonsig_vars[still_present].values
df_new = df_drop(df_new, drop_cols)


# STATIC MODEL DEVELOPMENT
# metrics we are interested in
cols_rep = [
    'AUC',
    'Accuracy',
    'weighted F1-score',
    'weighted_precision_score',
    'weighted_recall_score',
]
# scoring metric for GridSearchCV
my_scoring = 'roc_auc'

# silence every warning from here on
from warnings import filterwarnings
filterwarnings(action='ignore')

# FULL MODEL
# Train and evaluate each model on all remaining variables, in this order:
#   LR  -> Logistic Regression with l1 norm regularization
#   SVM -> Support Vector Machines with l1 norm regularization
#   LGB -> Gradient Boosted Decision Trees, Light Gradient Boosting Machine
#   RF  -> Random Forest
# Each entry is (model code passed to tr_predict, row label in df_AUCs).
# penalty='l1' is passed for every model, as before; how tr_predict treats it
# for the tree-based models cannot be told from this file.
full_model_specs = [
    ('LR', 'LR-L1'),
    ('SVM', 'SVM-L1'),
    ('LGB', 'GBT'),
    ('RF', 'RF'),
]
auc_frames = []
for model_code, row_label in full_model_specs:
    df_coef_, metrics_df = tr_predict(
        df_new,
        col_y=col_y,
        target_names=['0', '1'],
        model=model_code,
        penalty='l1',
        cv_folds=5,
        scoring=my_scoring,
    )
    auc_frames.append(metrics_df.rename(index={0: row_label}))
    if model_code == 'LR':
        # report the LR coefficients next to the per-variable statistics
        result__ = pd.merge(df_coef_, result, how='left', on='Variable')
        print(result__[['Variable', 'coef_', 'y_corr', 'p-value', 'y1_mean', 'y0_mean', 'All_mean', 'All_std']])

# one row of metrics per model
df_AUCs = pd.concat(auc_frames)
print(df_AUCs)

# SPARSE MODEL
# Feature selection by recursive feature elimination (RFE)
from sklearn.feature_selection import RFE,RFECV


def _fit_rfe(X_scaled, target, C, n_features, penalty):
    """Fit RFE around a class-balanced liblinear logistic regression.

    X_scaled   -- standardized feature matrix
    target     -- 1-D array of outcome labels
    C          -- regularization hyperparameter C of the LR estimator
    n_features -- number of features RFE should keep
    penalty    -- penalty passed to the LR estimator

    Returns (estimator, fitted RFE selector).
    """
    estimator = LogisticRegression(C=C, penalty=penalty, tol=0.01, class_weight='balanced', solver='liblinear')
    # select features by recursively considering smaller and smaller sets of features
    selector = RFE(estimator=estimator, n_features_to_select=n_features, step=1)
    selector.fit(X_scaled, target)
    return estimator, selector


names = df_new.drop(col_y, axis=1).columns
num_of_cols = len(df_new.columns)
metric_all_rfe = []
Xraw = df_new.drop(col_y, axis=1).values
# candidate numbers of features: 1 .. number of predictors
my_range = range(1, num_of_cols)
my_penalty = 'l1'

# Standardize features by removing the mean and scaling to unit variance.
# The scaled matrix does not depend on my_C or n_select, so it is computed
# once instead of on every iteration of the search below.
scaler = preprocessing.StandardScaler()
X_scaled = scaler.fit_transform(Xraw)
# Series.ravel() is deprecated in recent pandas; to_numpy() gives the same
# 1-D array of labels.
y_array = y.to_numpy()

# NOTE(review): RFE is fitted on all rows before tr_predict evaluates the
# selected features; if tr_predict cross-validates on the same rows the
# reported metrics may be optimistic -- confirm this is intended.
for my_C in [0.1, 1]: # try different hyperparameter C with the LR model which we use as the estimator in RFE
    for n_select in my_range: # try different numbers of features to find how many features result in best performance
        clf, rfe = _fit_rfe(X_scaled, y_array, my_C, n_select, my_penalty)
        # Selected (i.e., estimated best) features are assigned rank 1
        # so we drop features ranked greater than 1
        df_selected = df_new.drop(names[rfe.ranking_ > 1], axis=1)
        # evaluate the dataset of selected features using 'LR' model with 'l2' norm regularization
        df_coef_RFE, metric_df_RFE = tr_predict(df_selected, col_y=col_y, target_names=['0', '1'], model='LR', penalty='l2', cv_folds=5, scoring=my_scoring)
        metric_all_rfe.append([my_C, n_select] + metric_df_RFE.values.tolist()[0])
metric_all_rfe = pd.DataFrame(metric_all_rfe, columns=['my_C','n_select','AUC-mean','AUC-std','Accuracy-mean','Accuracy-std','weighted_F1_score-mean','weighted_F1_score-std','weighted_precision_score-mean','weighted_precision_score-std','weighted_recall_score-mean','weighted_recall_score-std'])

# we pick the my_C and n_select that lead to the model with highest 'AUC-mean' minus 'AUC-std'
metric_all_rfe['AUC_'] = metric_all_rfe['AUC-mean'] - metric_all_rfe['AUC-std']
best_idx = metric_all_rfe['AUC_'].idxmax()
# scalar lookups keep n_select an integer; cast to plain Python types
best_C = float(metric_all_rfe.loc[best_idx, 'my_C'])
best_n_select = int(metric_all_rfe.loc[best_idx, 'n_select'])

# refit RFE with the chosen settings
clf, rfe = _fit_rfe(X_scaled, y_array, best_C, best_n_select, my_penalty)
X = df_new.drop(names[rfe.ranking_ > 1], axis=1) # our dataframe after feature selection by recursive feature elimination

# Logistic Regression with l2 norm regularization, trained on the features
# kept by RFE
df_coef_, metrics_df_ = tr_predict(
    X,
    col_y=col_y,
    target_names=['0', '1'],
    model='LR',
    penalty='l2',
    cv_folds=5,
    scoring=my_scoring,
)
# coefficients side by side with the per-variable statistics
result_RFE = pd.merge(df_coef_, result, how='left', on='Variable')
report_cols = ['Variable', 'coef_', 'y_corr', 'p-value', 'y1_mean', 'y0_mean', 'All_mean', 'All_std']
print(result_RFE[report_cols])
print(metrics_df_)