In [1]:
import psycopg2
import ipynb.fs.full.create_tables_model_a as cta
import ipynb.fs.full.create_tables_model_b as ctb
import ipynb.fs.full.lab_features as labf
import ipynb.fs.full.medications as meds
import ipynb.fs.full.daily as da
import ipynb.fs.full.gcs as gcs
import ipynb.fs.full.weight as we
import ipynb.fs.full.cultures as cu
import ipynb.fs.full.lines as li
import pandas as pd
import pickle
from sklearn.impute import KNNImputer
# Silence pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.options.mode.chained_assignment = None
# Cohort sizes read by module_2_preprocessing when it writes the exclusion report.
# The values below are only fallbacks: model_a_feature_generation and
# model_b_feature_generation overwrite them with the length of the cohort they load.
total_a = 3100
total_b = 120

In [2]:
def model_a_feature_generation(conn):
    """Run every model-A feature builder on the tables prepared in Postgres.

    The updated cohort is loaded through ``conn``, its size is stored in the
    global ``total_a`` and the cohort itself is written to CSV. Each source
    table is then loaded and handed, together with the cohort, to the matching
    feature builder, which is passed the CSV path for its output.

    conn: open database connection usable by ``pd.io.sql.read_sql``.
    """
    global total_a

    def fetch(query):
        # all database tables are pulled in full through the same connection
        return pd.io.sql.read_sql(query, conn)

    # cohort: loaded once, its size recorded, then persisted as CSV
    cohort = fetch("select * from model_a_cohort_updated")
    total_a = len(cohort)
    cohort.to_csv("C:/Temp/submission_tables/model_a_cohort_updated.csv", encoding='utf-8', index=False)

    # relevant events are the only input read from a CSV file rather than from the database
    relevant_events = pd.read_csv("C:/Temp/submission_tables/a_relevant_events_clean.csv")
    labf.create_relev_features(
        cohort, relevant_events,
        output_file_name="C:/Temp/submission_tables/a_relevant_events_clean_for_modeling.csv",
        table_name='a_relevant_events_clean')

    respiratory = fetch("select * from respiratory_checks_clean")
    labf.create_resp_features(
        cohort, respiratory,
        output_file_name="C:/Temp/submission_tables/a_respiratory_checks_clean_for_modeling.csv",
        table_name='a_respiratory_checks_clean')

    antibiotics = fetch("select * from antibiotics")
    meds.create_meds_features(
        cohort, antibiotics,
        output_file_name="C:/Temp/submission_tables/a_antibiotics_for_modeling.csv",
        table_name='a_antibiotics', kind='Antibiotics')

    pressor_sedatives = fetch("select * from pressor_sedatives")
    meds.create_meds_features(
        cohort, pressor_sedatives,
        output_file_name="C:/Temp/submission_tables/a_pressor_sedatives_for_modeling.csv",
        table_name='a_pressor_sedatives', kind='Pressor-Sedatives')

    gcs_scores = fetch("select * from gcs_score")
    gcs.create_gcs_features_limited(
        cohort, gcs_scores,
        output_file_name="C:/Temp/submission_tables/a_gcs_for_modeling.csv",
        table_name="a_gcs")

    # stay information needs no feature builder: it is exported unchanged
    stays_info = fetch("select * from cohort_a_stays_info_limited_relevantcols")
    stays_info.to_csv("C:/Temp/submission_tables/cohort_a_stays_info_limited_relevantcols.csv", encoding='utf-8', index=False)
    print("done creating cohort_a_stays_info_limited_relevantcols.csv")
    print("shape: ",stays_info.shape)

    blood_cultures = fetch("select * from a_blood_cultures")
    cu.create_culture_features(
        cohort, blood_cultures,
        output_file_name="C:/Temp/submission_tables/a_blood_cultures_for_modeling.csv",
        culture_kind='Blood Culture', model='a', table_name='a_blood_cultures')

    invasive_lines = fetch("select * from lines")
    li.create_line_features(
        cohort, invasive_lines,
        output_file_name="C:/Temp/submission_tables/a_lines_for_modeling.csv",
        table_name='a_lines')

In [2]:
def model_b_feature_generation(conn):
    """Run every model-B feature builder on the tables prepared in Postgres.

    The updated cohort is loaded through ``conn``, its size is stored in the
    global ``total_b`` and the cohort itself is written to CSV. Each source
    table is then loaded and handed, together with the cohort, to the matching
    feature builder, which is passed the CSV path for its output. Two
    informational tables are exported to CSV unchanged.

    conn: open database connection usable by ``pd.io.sql.read_sql``.
    """
    global total_b

    def fetch(query):
        # all database tables are pulled in full through the same connection
        return pd.io.sql.read_sql(query, conn)

    # cohort: loaded once, its size recorded, then persisted as CSV
    cohort = fetch("select * from model_b_cohort_updated")
    total_b = len(cohort)
    cohort.to_csv("C:/Temp/submission_tables/model_b_cohort_updated.csv", encoding='utf-8', index=False)

    # relevant events are the only input read from a CSV file rather than from the database
    relevant_events = pd.read_csv("C:/Temp/submission_tables/b_relevant_events_clean.csv")
    labf.create_lab_features(
        cohort, relevant_events,
        output_file_name="C:/Temp/submission_tables/b_relevant_events_clean_for_modeling.csv",
        table_name='b_relevant_events_clean')

    respiratory = fetch("select * from b_respiratory_checks_clean")
    labf.create_lab_features(
        cohort, respiratory,
        output_file_name="C:/Temp/submission_tables/b_respiratory_checks_clean_for_modeling.csv",
        table_name='b_respiratory_checks_clean')

    antibiotics = fetch("select * from b_antibiotics")
    meds.create_meds_features(
        cohort, antibiotics,
        output_file_name="C:/Temp/submission_tables/b_antibiotics_for_modeling.csv",
        table_name='b_antibiotics', kind='Antibiotics')

    pressor_sedatives = fetch("select * from b_pressor_sedatives")
    meds.create_meds_features(
        cohort, pressor_sedatives,
        output_file_name="C:/Temp/submission_tables/b_pressor_sedatives_for_modeling.csv",
        table_name='b_pressor_sedatives', kind='Pressor-Sedatives')

    liquid_output = fetch("select * from b_output_liquid_table")
    da.create_liquid_output_features(
        cohort, liquid_output,
        output_file_name="C:/Temp/submission_tables/b_output_liquid_daily_for_modeling.csv",
        table_name="b_output_liquid_daily", label='daily urine output')

    liquid_input = fetch("select * from b_input_liquid_clean")
    da.create_liquid_input_features(
        cohort, liquid_input,
        output_file_name="C:/Temp/submission_tables/b_input_liquid_daily_for_modeling.csv",
        table_name="b_input_liquid_daily", model_type='b', label='daily liquid input')

    gcs_scores = fetch("select * from b_gcs_score")
    gcs.create_gcs_features_full(
        cohort, gcs_scores,
        output_file_name="C:/Temp/submission_tables/b_gcs_for_modeling.csv",
        table_name="b_gcs")

    bmi = fetch("select * from b_bmi_clean")
    we.calculate_bmi(
        cohort, bmi,
        output_file_name="C:/Temp/submission_tables/b_bmi_for_modeling.csv",
        table_name='b_bmi')

    daily_weight = fetch("select * from b_daily_weight_clean")
    we.create_daily_weight(
        cohort, daily_weight,
        output_file_name="C:/Temp/submission_tables/b_daily_weight_clean_for_modeling.csv",
        table_name='b_daily_weight_clean')

    # the next two tables need no feature builder: they are exported unchanged
    stays_info = fetch("select * from cohort_b_stays_info_limited_relevantcols")
    stays_info.to_csv("C:/Temp/submission_tables/cohort_b_stays_info_limited_relevantcols.csv", encoding='utf-8', index=False)
    print("done creating cohort_b_stays_info_limited_relevantcols.csv")
    print("shape: {}\n".format(stays_info.shape))

    general_info = fetch("select * from cohort_b_general_info")
    general_info.to_csv("C:/Temp/submission_tables/cohort_b_general_info.csv", encoding='utf-8', index=False)
    print("done creating cohort_b_general_info.csv")
    print("shape: {}\n".format(general_info.shape))

    blood_cultures = fetch("select * from b_blood_cultures")
    cu.create_culture_features(
        cohort, blood_cultures,
        output_file_name="C:/Temp/submission_tables/b_blood_cultures_for_modeling.csv",
        culture_kind='Blood Culture', model='b', table_name='b_blood_cultures')

    other_cultures = fetch("select * from b_cultures")
    cu.create_culture_features(
        cohort, other_cultures,
        output_file_name="C:/Temp/submission_tables/b_cultures_for_modeling.csv",
        culture_kind='Other Culture', model='b', table_name='b_cultures')

    invasive_lines = fetch("select * from lines")
    li.create_line_features(
        cohort, invasive_lines,
        output_file_name="C:/Temp/submission_tables/b_lines_for_modeling.csv",
        table_name='b_lines')

In [7]:
# make the final external validation set out of the created files

def make_output_df_from_files(model):
    """Assemble external_validation_set.csv for one model from the per-table feature CSVs.

    Every feature file of the requested model is read, identifier-like columns
    are dropped, and the remaining columns are concatenated side by side
    (``axis=1``, i.e. rows are matched by their position in each file).
    For model 'a' the columns are additionally sorted by name.

    Args:
        model: 'a' or 'b'.

    Raises:
        ValueError: if ``model`` is neither 'a' nor 'b'.
    """
    # Fail early and clearly instead of hitting an unbound local further down.
    if model not in ('a', 'b'):
        raise ValueError("model must be 'a' or 'b', got {!r}".format(model))

    if model == 'a':
        url_to_save = "C:/Temp/submission_tables/model_a/external_validation_set.csv"
        # Only the tables needed for the 40 selected features model A was trained on are
        # included; liquid input/output, bmi, daily weight, other cultures and general
        # info are deliberately left out of model A's external_validation_set.csv.
        feature_files = [
            "C:/Temp/submission_tables/a_relevant_events_clean_for_modeling.csv",
            "C:/Temp/submission_tables/a_antibiotics_for_modeling.csv",
            "C:/Temp/submission_tables/a_pressor_sedatives_for_modeling.csv",
            "C:/Temp/submission_tables/a_lines_for_modeling.csv",
            "C:/Temp/submission_tables/a_gcs_for_modeling.csv",
            "C:/Temp/submission_tables/a_respiratory_checks_clean_for_modeling.csv",
            "C:/Temp/submission_tables/a_blood_cultures_for_modeling.csv",
            "C:/Temp/submission_tables/cohort_a_stays_info_limited_relevantcols.csv",
        ]
        general_info_file = None
    else:
        url_to_save = "C:/Temp/submission_tables/model_b/external_validation_set.csv"
        feature_files = [
            "C:/Temp/submission_tables/b_relevant_events_clean_for_modeling.csv",
            "C:/Temp/submission_tables/b_antibiotics_for_modeling.csv",
            "C:/Temp/submission_tables/b_pressor_sedatives_for_modeling.csv",
            "C:/Temp/submission_tables/b_lines_for_modeling.csv",
            "C:/Temp/submission_tables/b_output_liquid_daily_for_modeling.csv",
            "C:/Temp/submission_tables/b_input_liquid_daily_for_modeling.csv",
            "C:/Temp/submission_tables/b_gcs_for_modeling.csv",
            "C:/Temp/submission_tables/b_bmi_for_modeling.csv",
            "C:/Temp/submission_tables/b_daily_weight_clean_for_modeling.csv",
            "C:/Temp/submission_tables/b_respiratory_checks_clean_for_modeling.csv",
            "C:/Temp/submission_tables/b_blood_cultures_for_modeling.csv",
            "C:/Temp/submission_tables/b_cultures_for_modeling.csv",
            "C:/Temp/submission_tables/cohort_b_stays_info_limited_relevantcols.csv",
        ]
        general_info_file = "C:/Temp/submission_tables/cohort_b_general_info.csv"

    # cleaning unnecessary columns:
    cols_to_remove = ['identifier', 'unitstayid_in_cohort', 'Unnamed: 0', 'subject_id', 'hadm_id',
                      'patienthealthsystemstayid', 'patientunitstayid']
    # errors='ignore' skips columns a given file does not have, without hiding
    # any other failure the way a bare ``except: pass`` would.
    arr_dfs = [pd.read_csv(path).drop(columns=cols_to_remove, errors='ignore')
               for path in feature_files]

    if general_info_file is not None:
        # only age and gender are taken from the general-info table
        general_info = pd.read_csv(general_info_file)
        arr_dfs.extend([general_info['age'], general_info['gender']])

    # concat all cleaned dfs:
    result = pd.concat(arr_dfs, axis=1)
    if model == 'a':
        # the model was trained on a df with sorted columns; needed for matching the saved scaler
        result = result.reindex(sorted(result.columns), axis=1)
    result.to_csv(url_to_save, encoding='utf-8', index=False)

In [4]:
# Input:
# 1. file_path - a CSV file with the same format as the ‘model_a_mimic_cohort_v2’ without the target column.
# 2. db_conn - psycopg2.connect object for the new database, which will have exactly the same schemas and tables as in MIMIC.
# 3. model_type - 'a' or 'b'
# Output: 
# external_validation_set.csv - a CSV file with all the features to use in the model.

# Functionality:
# In this module we conducted feature engineering that includes: 
# non-human values removal, generation of statistics features and time features.
# For model A, we generate only a small subset of the features that were generated for the model creation phase,
# in order to shorten the running time, as was requested in the submission guidelines.
# For model B, we generate all of the features that were generated for the model creation phase,
# since there are far fewer patients in the cohort and the running time is short anyway.

def module_1_cohort_creation(file_path, db_conn, model_type):
    """Module 1: build the raw tables, clean them, generate features and write
    external_validation_set.csv for the requested model.

    file_path:  cohort CSV forwarded to the table-creation step.
    db_conn:    database connection; it is always closed before returning.
    model_type: 'a' or 'b'; any other value skips all processing steps.

    Any exception raised along the way is printed, not propagated.
    """
    try:
        cursor = db_conn.cursor()
        cursor.execute('''set search_path to mimiciii''')

        # per-model steps: raw tables -> cleaning -> feature files
        if model_type == 'a':
            cta.create_all_tables_model_a(file_path, db_conn, cursor)
            cta.clean_tables(cursor, db_conn)
            model_a_feature_generation(db_conn)
        elif model_type == 'b':
            ctb.create_all_tables_model_b(file_path, db_conn, cursor)
            ctb.clean_tables(cursor, db_conn)
            model_b_feature_generation(db_conn)

        # shared last step: merge the feature files into external_validation_set.csv
        if model_type in ('a', 'b'):
            make_output_df_from_files(model_type)

        cursor.close()
        db_conn.commit()
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
    finally:
        db_conn.close()
        print('Database connection closed.')

    


In [9]:
# Output: 
# 1. cohort_exclusion.txt - includes:
#    “Exclusion Criteria are: ….”
#    “On Model model_type: Y patients were excluded (X% of the cohort)”
#    “Z patients were removed in the external validation set (M%)”
# 2. processed_external_validation_set.csv - the processed cohort (normalization, imputation...), 
#    which should be the input for your model.

def _impute_and_scale(frame, neighbors, scaler_file):
    """KNN-impute ``frame``, apply the scaler pickled at ``scaler_file`` and return
    the result as a DataFrame with the same column names as ``frame``."""
    matrix = frame.to_numpy()
    # the imputer is fitted on the data being processed, not on the training data
    imputer = KNNImputer(n_neighbors=neighbors, weights='uniform', metric='nan_euclidean').fit(matrix)
    imputed = imputer.transform(matrix)
    # NOTE: pickle.load can execute arbitrary code - the scaler file must be a trusted artifact.
    with open(scaler_file, 'rb') as scaler_fh:
        loaded_scaler = pickle.load(scaler_fh)
    return pd.DataFrame(loaded_scaler.transform(imputed), columns=frame.columns)


def module_2_preprocessing(external_validation_set_csv, model_type):
    """Module 2: write cohort_exclusion.txt and processed_external_validation_set.csv.

    Args:
        external_validation_set_csv: path of the CSV produced by module 1.
        model_type: 'a' or 'b'.

    Outputs (under C:/Temp/submission_tables/model_<a|b>/):
        cohort_exclusion.txt - exclusion criteria, the exclusion line for the model,
            and how many cohort patients (global ``total_a`` / ``total_b``) are
            missing from the external validation set.
        processed_external_validation_set.csv - KNN-imputed and scaled features,
            restricted to the features the model was trained on.

    Raises:
        ValueError: if ``model_type`` is neither 'a' nor 'b'.
    """
    if model_type not in ('a', 'b'):
        raise ValueError("model_type must be 'a' or 'b', got {!r}".format(model_type))

    criteria_str = "Exclusion Criteria are: patients that have no records in relevant_events table.\n"
    if model_type == 'a':
        exclusion_file = "C:/Temp/submission_tables/model_a/cohort_exclusion.txt"
        model_str = "On Model A: 25 patients were excluded ({:.2%} of the cohort).\n".format(25/3100)
        total = total_a
    else:
        exclusion_file = "C:/Temp/submission_tables/model_b/cohort_exclusion.txt"
        model_str = "On Model B: 0 patients were excluded ({:.2%} of the cohort).\n".format(0/105)
        total = total_b

    # the validation set is read once and reused for both outputs
    ext_val_set = pd.read_csv(external_validation_set_csv)
    excluded = total - len(ext_val_set)
    # an empty cohort must not crash the report with a ZeroDivisionError
    excluded_share = excluded / total if total else 0.0
    ext_str = "{} patients were excluded in the external validation set ({:.2%}).\n".format(excluded, excluded_share)

    # create cohort_exclusion.txt; ``with`` closes the file even if a write fails
    with open(exclusion_file, "w") as report:
        report.write(criteria_str)
        report.write(model_str)
        report.write(ext_str)

    # create processed_external_validation_set.csv:
    if model_type == 'a':
        neighbors = 10 # for KNN imputation
        scaler_file = "C:/Temp/submission_tables/scaler_model_a.sav" #saved scaler that was fitted on the train data
        # the features that the model was trained on:
        features = ['Anion Gap amount', 'Arterial Line, amount during hospital stay',
                    'Bicarbonate, delta bw last, min',
                    'Calculated Total CO2, hours to target from min',
                    'Central Venous Line, amount during hospital stay',
                    'Chloride, delta bw last, max',
                    'Doses of any Antibiotics in the last day before target time',
                    'Doses of any Pressor-Sedatives in the last day before target time',
                    'Doses of any Pressor-Sedatives the patient got',
                    'GCS Total, hours to target from first', 'Glucose amount',
                    'Got any Antibiotics during hospital stay',
                    'Got any Antibiotics in the last day before target time',
                    'Got any Pressor-Sedatives during hospital stay',
                    'Got any Pressor-Sedatives in the last day before target time',
                    'Heart Rate last', 'Heart Rate max', 'Hematocrit amount',
                    'Hours from first Antibiotics dose to target time',
                    'Lactate, hours to target from max', 'Magnesium amount',
                    'Number of Blood Cultures taken in the ICU prior to target',
                    'Number of different Antibiotics in the last day before target time',
                    'Number of different Antibiotics the patient got',
                    'Number of different Pressor-Sedatives in the last day before target time',
                    'Number of different Pressor-Sedatives the patient got',
                    'Platelet Count last', 'Platelet Count min',
                    'Platelet Count, hours to target from max', 'Potassium amount',
                    'Respiratory Rate, hours to target from min', 'Sodium amount',
                    'hours_from_icu_intime_to_targettime',
                    'number of invasive lines inserted during hospital stay',
                    'pH, hours to target from min', 'pO2 75th percentile', 'pO2 average',
                    'pO2 median', 'patient had Arterial Line during hospital stay',
                    'patient had Central Venous Line during hospital stay']
        output_file = "C:/Temp/submission_tables/model_a/processed_external_validation_set.csv"

        # model A's scaler was fitted on exactly these features, so select them before scaling
        final_data = _impute_and_scale(ext_val_set[features], neighbors, scaler_file)

    else:
        selected_cols_file = "C:/Temp/submission_tables/model_b/selected_cols.csv" #the features the scaler was fitted on
        neighbors = 5 # for KNN imputation
        scaler_file = "C:/Temp/submission_tables/scaler_model_b.sav" #saved scaler that was fitted on the train data
        # the features that the model was trained on:
        features = ['Lymphocytes average', 'Lymphocytes median', 'Lymphocytes 75th percentile',
                    'Hematocrit, delta bw last, 25th percentile',
                    'INR(PT), hours to target from min',
                    'Tidal Volume (spontaneous), delta bw last, median',
                    'Tidal Volume (spontaneous), delta bw last, 25th percentile',
                    'Respiratory Rate, hours to target from last',
                    'Tidal Volume (observed), hours to target from min',
                    'Tidal Volume (spontaneous), hours to target from last']
        output_file = "C:/Temp/submission_tables/model_b/processed_external_validation_set.csv"

        # model B's scaler was fitted on a wider column set (selected_cols.csv), so
        # impute and scale on those columns first, then keep only the model's features
        selected_cols = pd.read_csv(selected_cols_file)['columns'].tolist()
        data_imp_df = _impute_and_scale(ext_val_set[selected_cols], neighbors, scaler_file)
        final_data = data_imp_df[features]

    final_data.to_csv(output_file, encoding='utf-8', index=False)
        
      

In [11]:
# This module uses the trained model that was trained on model_a_mimic_cohort_v2, model_b_mimic_cohort_v2, and model_a_eicu_cohort_v2.
# processed_external_validation_set.csv is the test data for the model. 
# Output: 
# a continuous predicted risk score (like model_a/b_mimic_cohort_risk_score_group_N.csv) (our N = 9)
# This file, along with model_a/b_mimic_cohort_target.csv which contains the target of each patient, 
# will be the input for "validation_set_evaluation.ipynb" that will evaluate the model and calculate AUPR and AUROC.

def module_3_model(processed_external_validation_set_csv, model_type):
    """Module 3: score the processed validation set with the saved model.

    Loads the pickled classifier for ``model_type``, takes the second column of
    ``predict_proba`` as the risk score and writes an ``identifier, risk_score``
    CSV. Identifiers come from the saved cohort file sorted by ``identifier``
    and are paired with the scores by position.
    NOTE(review): this assumes the rows of the processed set are in the same
    (identifier-sorted) order as the cohort - confirm against the feature builders.

    Args:
        processed_external_validation_set_csv: path of the CSV written by module 2.
        model_type: 'a' or 'b'.

    Raises:
        ValueError: if ``model_type`` is unknown, or if the number of predictions
            differs from the number of cohort rows.
    """
    if model_type == 'a':
        cohort_file = "C:/Temp/submission_tables/model_a_cohort_updated.csv"
        model_file = "C:/Temp/submission_tables/final_model_a.sav"
        output_file = "C:/Temp/submission_tables/model_a_mimic_cohort_risk_score_group_9.csv"
    elif model_type == 'b':
        cohort_file = "C:/Temp/submission_tables/model_b_cohort_updated.csv"
        model_file = "C:/Temp/submission_tables/final_model_b.sav"
        output_file = "C:/Temp/submission_tables/model_b_mimic_cohort_risk_score_group_9.csv"
    else:
        raise ValueError("model_type must be 'a' or 'b', got {!r}".format(model_type))

    test_data = pd.read_csv(processed_external_validation_set_csv)
    cohort_df = pd.read_csv(cohort_file).sort_values(by=['identifier'])

    # load the model; ``with`` closes the file handle once unpickling is done.
    # NOTE: pickle.load can execute arbitrary code - the model file must be a trusted artifact.
    with open(model_file, 'rb') as model_fh:
        loaded_model = pickle.load(model_fh)

    Y_pred_prob = loaded_model.predict_proba(test_data)[:, 1]
    if len(Y_pred_prob) != len(cohort_df):
        # scores are attached by position, so a count mismatch cannot be paired meaningfully
        raise ValueError(
            "number of predictions ({}) does not match number of cohort rows ({})".format(
                len(Y_pred_prob), len(cohort_df)))

    df = pd.DataFrame({'identifier': cohort_df['identifier'].to_numpy(),
                       'risk_score': Y_pred_prob})
    df.to_csv(output_file, encoding='utf-8', index=False)
    
 


In [7]:
# This module creates the submitted trained model B, based on model_b_mimic_cohort_v2.csv. 
# Please supply a path for the cohort.

import numpy as np
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from imblearn.under_sampling import TomekLinks 

import six
import sys
# Register `six` under the legacy module path `sklearn.externals.six`, so that code importing
# that path receives this module (presumably required by a dependency written against an
# older scikit-learn - TODO confirm which one).
sys.modules['sklearn.externals.six'] = six
import joblib
# Same aliasing for the legacy path `sklearn.externals.joblib`.
sys.modules['sklearn.externals.joblib'] = joblib

# Module-level scratch variable: remove_rare_data_cols and remove_low_variance_cols store the
# current training DataFrame here, and feature_selection reads column names from it.
all_data = 0
# Path of the model-B MIMIC cohort CSV; change it to where the cohort file is stored.
model_b_mimic_cohort_v2_csv = "C:/Temp/model_b_mimic_cohort_v2.csv"


def module_5_model_b_creation(model_type, model_b_mimic_cohort_v2_csv):
    """Train model B on the MIMIC cohort and save the fitted scaler and classifier.

    Pipeline: build the feature tables, load features and target, drop sparse
    and low-variance columns, KNN-impute, undersample, standardise, select
    features and fit the final logistic regression.

    Args:
        model_type: kept for interface compatibility; not used by this function.
        model_b_mimic_cohort_v2_csv: path of the model-B cohort CSV (with target column).
    """
    create_tables(model_b_mimic_cohort_v2_csv) # connect to mimic db and collect all the needed data by creating tables
    X_train, Y_train = load_data_model_b(model_b_mimic_cohort_v2_csv) # X_train as df, Y_train as np array
    X_train = remove_rare_data_cols(X_train) # remove features with over 70% missing values rate
    X_train = remove_low_variance_cols(X_train) # remove features with variance lower than 0.15, now X_train is np array
    X_train = impute_data(X_train) # impute using KNN with 5 neighbors
    X_train, Y_train = balance(X_train, Y_train) # undersampling using TomekLinks
    X_train, scaler = standardization(X_train) # using StandardScaler on the train data
    # save the scaler for scaling the test data later; ``with`` guarantees the
    # pickle is flushed and the handle closed before the pipeline continues
    with open("C:/Temp/submission_tables/scaler_model_b.sav", 'wb') as scaler_fh:
        pickle.dump(scaler, scaler_fh)
    X_train = feature_selection(X_train, Y_train) # using logistic regression for selection, keeping at most 10 features
    clf = train(X_train, Y_train) # using logistic regression
    # save the trained model
    with open("C:/Temp/submission_tables/final_model_b.sav", 'wb') as model_fh:
        pickle.dump(clf, model_fh)


        
def create_tables(model_b_mimic_cohort_v2_csv):
    """Build the model-B training feature set from the local MIMIC database.

    Connects to Postgres, creates and cleans the model-B tables (with the
    target column), generates all feature files and merges them into model B's
    external_validation_set.csv, which load_data_model_b then reads as the
    training data. Any exception is printed, not propagated; the connection is
    always closed.

    Args:
        model_b_mimic_cohort_v2_csv: path of the model-B cohort CSV.
    """
    # NOTE(review): connection settings, including the password, are hard-coded here;
    # consider moving them to configuration or environment variables.
    db_conn = psycopg2.connect(
        host="localhost",
        database="mimic",
        user="postgres",
        password="postgres")
    try:
        cur = db_conn.cursor() # create a cursor
        cur.execute('''set search_path to mimiciii''')
        # create tables with raw data from the db:
        ctb.create_all_tables_model_b(model_b_mimic_cohort_v2_csv, db_conn, cur, target=1)
        # clean data and remove non-human values:
        ctb.clean_tables(cur, db_conn)
        # generate all features for modeling:
        model_b_feature_generation(db_conn)
        # create and save external_validation_set.csv file. This function only ever
        # builds model B; the previous ``model_type`` argument was not defined in this
        # scope, so the resulting NameError was swallowed below and the file never written.
        make_output_df_from_files('b')
        cur.close() # close communication with the PostgreSQL database server
        db_conn.commit() # commit the changes
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
    finally:
        db_conn.close()
        print('Database connection closed.')

        
def load_data_model_b(cohort_file):
    """Load the model-B training features and the matching target vector.

    Features come from model B's external_validation_set.csv (the whole MIMIC-B
    dataset is used as train). The target is the cohort's ``target`` column,
    ordered by ``identifier``, with 'Inappropriate' replaced by 1 and
    'Appropriate' by 0.

    Returns:
        (features DataFrame, target numpy array)
    """
    features = pd.read_csv("C:/Temp/submission_tables/model_b/external_validation_set.csv")
    cohort_sorted = pd.read_csv(cohort_file).sort_values(by=['identifier'])
    labels = cohort_sorted['target']
    # recode the two textual classes in place, one masked assignment per class
    for text, code in (('Inappropriate', 1), ('Appropriate', 0)):
        labels[labels == text] = code
    return features, labels.to_numpy()


def remove_rare_data_cols(df):
    """Drop columns whose missing-value rate is above 70%.

    A column is kept when its number of non-missing values is at least 30% of
    the number of rows. The filtered frame is also stored in the global
    ``all_data`` so later steps can look up column names.

    Args:
        df: feature DataFrame.

    Returns:
        The DataFrame without the sparse columns.
    """
    global all_data
    print("before removing high rate missing values columns: ", len(df.columns))
    # Only ``thresh`` is passed: ``how`` and ``thresh`` are mutually exclusive in
    # pandas >= 2.0 (TypeError), and older versions ignored ``how`` when ``thresh`` was given.
    df = df.dropna(thresh=df.shape[0]*0.3, axis=1) # remove features with over 70% missing values rate
    print("after removing high rate missing values columns: ", len(df.columns))
    # save in a global variable the entire data after removing rare data columns, as dataframe:
    all_data = df

    return df


def remove_low_variance_cols(data):
    """Drop features whose variance is lower than 0.15.

    The names of the surviving columns are written to selected_cols.csv (they
    are the columns the scaler is fitted on later), and the reduced DataFrame
    is stored in the global ``all_data`` for feature_selection.

    Args:
        data: feature DataFrame.

    Returns:
        numpy array holding only the columns that passed the variance threshold.
    """
    global all_data
    data_np = data.to_numpy() # transforms the data df to numpy array
    print("before removing low variance columns: ", data_np.shape)
    selector = VarianceThreshold(threshold=0.15)
    data_np = selector.fit_transform(data_np) # remove features with variance lower than 0.15
    print("after removing low variance columns: ", data_np.shape)
    cols = selector.get_support(indices=True) # the indices of the remaining features

    # Take the names from the frame that was actually filtered, not from whatever
    # ``all_data`` currently holds: the indices in ``cols`` refer to ``data``'s columns.
    selected_cols_lst = data.columns.values[cols] # the names of the remaining features
    all_data = data[selected_cols_lst] # save in a global variable the entire remaining data as dataframe
    # save in csv file the remaining features names, to remember the features the scaler will be fitted on later on:
    selected_cols_df = pd.DataFrame({'columns':selected_cols_lst})
    selected_cols_df.to_csv("C:/Temp/submission_tables/model_b/selected_cols.csv")

    return data_np

    
def impute_data(data):
    """Fill missing values with a 5-neighbour KNN imputer fitted on ``data`` itself."""
    knn = KNNImputer(n_neighbors=5, weights='uniform', metric='nan_euclidean')
    # fit() returns the fitted imputer, so fitting and transforming can be chained
    return knn.fit(data).transform(data)


def balance(X, y):
    """Undersample with TomekLinks and return the resampled ``(X, y)``; ``y`` is cast to int first."""
    resampler = TomekLinks()
    X_balanced, y_balanced = resampler.fit_resample(X, y.astype('int'))
    return X_balanced, y_balanced


def standardization(data):
    """Fit a StandardScaler on ``data`` and return ``(scaled data, fitted scaler)``."""
    fitted = StandardScaler().fit(data)  # fit() returns the scaler itself
    return fitted.transform(data), fitted


def feature_selection(x, y, print_features=1):
    """Select features with a logistic-regression based SelectFromModel (max_features=10).

    x: feature matrix; y: target array (cast to int).
    print_features: when truthy, the selected feature names (looked up in the
        global ``all_data``), their coefficients and column indices are displayed,
        sorted by coefficient, and written to selected_features_model_b.csv.

    Returns the matrix restricted to the selected columns.
    """
    global all_data

    labels = y.astype('int')
    ranking_model = LogisticRegression().fit(x, labels) # using logistic regression for selection
    picker = SelectFromModel(ranking_model, max_features=10, prefit=True)
    reduced = picker.transform(x)
    kept_idx = picker.get_support(indices=True)
    kept_coefs = ranking_model.coef_[0][kept_idx]

    if not print_features:
        return reduced

    # report: one row per selected feature, ordered by ascending coefficient
    report = pd.DataFrame({
        'Feat_name': all_data.columns.values[kept_idx],
        'score': kept_coefs,
        'index': kept_idx,
    })
    report = report.sort_values('score', ascending=True)
    display(report)
    report.to_csv("C:/Temp/selected_features_model_b.csv")

    return reduced


def train(X_train, Y_train):
    """Fit and return a logistic regression (lbfgs, up to 1000 iterations) on the int-cast target."""
    model = LogisticRegression(solver='lbfgs', max_iter=1000)
    # fit() returns the fitted estimator itself
    return model.fit(X_train, Y_train.astype('int'))

   
    
    
    
