### Import all the required libraries

In [None]:
from scipy.sparse import hstack
from boruta import BorutaPy
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
import warnings
import matplotlib.pyplot as plt
import pickle
import pandas as pd
import numpy as np
import os
import sys
import re
import json

import time

from tqdm import tqdm

print(sys.version_info)

home_dir = os.getenv("HOME")
print(os.getenv("PYTHONPATH"))

warnings.filterwarnings("ignore")

## Helper functions

In [None]:
def save_obj(obj, name):
    """Pickle ``obj`` to the file ``name + '.pkl'``.

    Args:
        obj: Any picklable Python object.
        name: Destination path without the ``.pkl`` suffix (it is appended
            here).
    """
    target_path = name + '.pkl'
    with open(target_path, 'wb') as handle:
        # Use the most recent pickle protocol available to this interpreter.
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)


def load_obj(name):
    """Load and return the object pickled in ``name + '.pkl'``.

    Args:
        name: Source path without the ``.pkl`` suffix (it is appended here).

    Returns:
        The unpickled object.
    """
    # NOTE(review): unpickling executes arbitrary code from the file; only use
    # this on files from a trusted source.
    source_path = name + '.pkl'
    with open(source_path, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded

In [None]:
def reduce_mem_usage(df, use_float16=True):
    """Downcast the numeric columns of ``df`` to the smallest dtype that fits.

    Only plain NumPy signed-integer, unsigned-integer and float columns are
    touched. Every other dtype (object, bool, datetime, timedelta,
    categorical, other extension dtypes) is left unchanged. A column is never
    converted to a wider dtype than it already has.

    The choice of dtype is based purely on the column's min/max range, so a
    float column that fits the float16/float32 range may still lose precision.

    Args:
        df: DataFrame to optimise. It is modified in place.
        use_float16: When True (the default, matching the historical
            behaviour) float columns may be reduced to float16. Pass False to
            stop at float32.

    Returns:
        The same ``df`` object, after the in-place downcast.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype
        # Extension dtypes (categorical, tz-aware datetimes, nullable ints...)
        # are not NumPy dtypes and cannot be range-checked this way.
        if not isinstance(col_type, np.dtype):
            continue

        # Dispatch on the dtype kind rather than on the dtype's name, so that
        # unsigned ints and bools are not mistaken for floats.
        if col_type.kind == 'i':
            candidates, limits = signed_types, np.iinfo
        elif col_type.kind == 'u':
            candidates, limits = unsigned_types, np.iinfo
        elif col_type.kind == 'f':
            candidates, limits = float_types, np.finfo
        else:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        # Empty or all-NaN columns have no range to test; keep their dtype.
        if np.isnan(c_min) or np.isnan(c_max):
            continue

        for candidate in candidates:
            # Candidates are ordered narrowest first; stop once they are no
            # narrower than the current dtype so nothing is ever upcast.
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break
            bounds = limits(candidate)
            # Inclusive bounds: the extreme values themselves are representable.
            if bounds.min <= c_min and c_max <= bounds.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    if start_mem > 0:
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) /
                                            start_mem))

    return df


def import_data(file):
    """Read a CSV file into a DataFrame and downcast its numeric columns.

    Args:
        file: Path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        The DataFrame returned by ``reduce_mem_usage``.
    """
    # ``keep_date_col`` was dropped: it only applies when several columns are
    # combined into one date column, which this call never requests, and the
    # keyword is deprecated in recent pandas releases.
    # ``parse_dates=True`` asks pandas to try parsing the index as dates.
    df = pd.read_csv(file, parse_dates=True)
    return reduce_mem_usage(df)

### Data loading

In [None]:
# Path of the pickled dataset, without the '.pkl' suffix that load_obj appends.
main_new_dataset = '../data/train_final_5f_all_labs_ext_med_hash'

# Name of the column used as the target variable for the model.
target = 'INPT_DEATH_YN'
# Alternative target: 'AKIN_EVENT'

df = load_obj(main_new_dataset)
print(df.shape)

# Rename the identifier columns to the names used in the rest of the notebook.
column_aliases = {"OR_CASE_ID": "case_id", "PAT_ID": "patientid"}
df.rename(columns=column_aliases, inplace=True)

In [None]:
# Keep only the first occurrence of every column label.
repeated_label = df.columns.duplicated()
df = df.loc[:, ~repeated_label]

In [None]:
# Downcast the numeric columns of the loaded frame to cut memory usage
# (prints the memory footprint before and after).
df = reduce_mem_usage(df)

### Data processing

In [None]:
# NOTE(review): AKIN_THRESHOLD is not used anywhere in the visible code;
# AKIN_EVENT below is a plain copy of AKI_AKIN_CLASS. Confirm whether a
# thresholded event flag was intended.
AKIN_THRESHOLD = 0

df['AKIN_EVENT'] = df['AKI_AKIN_CLASS']

# Map the one non-numeric provider id ('E1032') to 1032 so the whole column
# can be cast to int.
provider_ids = df['PRIM_SURG_PROV_ID'].replace('E1032', 1032)
df['PRIM_SURG_PROV_ID'] = provider_ids.astype(int)

# Temporal split: service dates before 2019-01-01 form the training set, the
# rest the test set.
before_cutoff = df['DATE_OF_SERVICE'] < '2019-01-01'
from_cutoff = df['DATE_OF_SERVICE'] >= '2019-01-01'
df_train = df[before_cutoff]
df_test = df[from_cutoff]

### Data cleaning and preparation

In [None]:
# Numeric columns that must not be used as model features.
numeric_excluded = [
    'encounter_id', 'ADMSN_ID', 'ASA_STATUS', 'CASE_START', 'CASE_END',
    'LAST_EF_RESULT_DATE', 'TOT_RBC', 'CRYSTALLOID_ML', 'COLLOID_ML',
    'FLOOR_2_ICU_YN', 'POSTOP_AKI_AKIN_CLASS', 'AKI_AKIN_CLASS', 'AKIN_EVENT',
    'INPT_DEATH_YN', 'PRIM_SURG_PROV_MINUTES', 'OPEN_ACCESS_YN',
    'GYN_ONC_ERAS_YN', 'patientid', 'or_case_id',
]

# Every remaining numeric column is a candidate feature. drop() raises a
# KeyError if any excluded name is not a numeric column of df.
numeric_features = (
    df.select_dtypes(include='number')
    .drop(columns=numeric_excluded)
    .columns
)

In [None]:
# Object/category columns that must not be used as model features.
categorical_excluded = [
    'case_id', 'SEX', 'LAST_EF', 'HCUP_DESC', 'PRIMARY_CPT', 'CPT_DESC',
    'DATE_OF_SERVICE', 'CASE_SRV_NAME', 'PROC_NAME',
]

# Every remaining object/category column is a categorical feature. drop()
# raises a KeyError if any excluded name is not among those columns.
cat_features = (
    df.select_dtypes(include=['object', 'category'])
    .drop(columns=categorical_excluded)
    .columns
)

In [None]:
# Dummy-encode each categorical feature of df in turn.
for var in cat_features:
    print(var)
    if var not in df.columns.values:
        print("Missing ", var)
    try:
        # drop_first keeps k-1 dummy columns for k categories.
        df = pd.get_dummies(df, columns=[var], drop_first=True)
    except (ValueError, KeyError):
        # Best effort: a column that cannot be encoded is skipped silently.
        pass

# Remove any categorical (string-valued) column that is still present.
for var in cat_features:
    try:
        df.drop(var, axis=1, inplace=True)
    except (ValueError, KeyError):
        print(var, 'already dropped')

In [None]:
# The encoding cells below read `X`; it must be the training split so that
# its rows line up with y_train.
X = df_train
# Labels for the training split, taken from the configured target column.
y_train = df_train[target].values

In [None]:
# Fit a one-hot encoder on the categorical columns, requesting a dense array.
# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output`, so try the
# new spelling first and fall back to the old one on older releases.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)
encoder.fit(X[cat_features])

# Despite its name, X_sparse is a dense array of indicator columns.
X_sparse = encoder.transform(X[cat_features])

In [None]:
# Label the encoded columns. `get_feature_names` was removed in
# scikit-learn 1.2 in favour of `get_feature_names_out`; use whichever the
# installed release provides.
if hasattr(encoder, 'get_feature_names_out'):
    encoded_columns = encoder.get_feature_names_out(cat_features)
else:
    encoded_columns = encoder.get_feature_names(cat_features)

X_sparse = pd.DataFrame(X_sparse, columns=encoded_columns)

In [None]:
# Place the numeric features and the one-hot columns side by side.
numeric_part = X[numeric_features]
X_enc = np.hstack((numeric_part, X_sparse))

In [None]:
# Column names of X_enc: numeric feature names followed by the one-hot names.
# `get_feature_names` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out`; use whichever the installed release provides.
if hasattr(encoder, 'get_feature_names_out'):
    encoded_columns = encoder.get_feature_names_out(cat_features)
else:
    encoded_columns = encoder.get_feature_names(cat_features)

feature_names = list(numeric_features.astype(str)) + list(encoded_columns)

In [None]:
# Wrap the stacked matrix in a labelled frame, then zero-fill missing values.
X_enc_1 = pd.DataFrame(X_enc, columns=feature_names)
X_enc_1 = X_enc_1.fillna(0)

In [None]:
# Treat infinite values like missing ones: replace them with 0.
infinite_values = [np.inf, -np.inf]
X_enc_1 = X_enc_1.replace(infinite_values, 0)

In [None]:
# Downcast the encoded feature matrix to cut memory usage before feature
# selection (prints the memory footprint before and after).
X_enc_1 = reduce_mem_usage(X_enc_1)

### Apply Feature selection algorithm 

In [None]:
# Base estimator for Boruta: a shallow, class-balanced random forest that
# uses all available cores.
rf = RandomForestClassifier(
    class_weight='balanced',
    max_depth=7,
    n_jobs=-1,
)

# Run the Boruta feature selection on the training matrix.
feat_selector = BorutaPy(rf, verbose=2, random_state=1, n_estimators='auto')
boruta_matrix = X_enc_1.astype('float32').values
feat_selector.fit(boruta_matrix, y_train)

In [None]:
# Names of all columns that were offered to the selector.
features = list(X_enc_1.columns)

# Keep the names whose entry in the selector's boolean support mask is set.
# Pairing names with the mask also works when nothing was selected, whereas
# np.nditer raises ValueError on an empty index array.
final_features = [
    name
    for name, selected in zip(feature_names, feat_selector.support_)
    if selected
]

### Save results

In [None]:
# Write the selected feature names as a JSON list. The context manager makes
# sure the file is flushed and closed.
# NOTE(review): the file name mentions 'akin' while the target configured
# above is 'INPT_DEATH_YN' - confirm the intended output name.
with open('feature_importance_final_akin.txt', 'w') as out_file:
    json.dump(final_features, out_file)