In [1]:
# install libraries not included
!pip install catboost
!pip install bayesian-optimization

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m24.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8
Collecting bayesian-optimization
  Downloading bayesian_optimization-3.1.0-py3-none-any.whl.metadata (11 kB)
Collecting colorama>=0.4.6 (from bayesian-optimization)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading bayesian_optimization-3.1.0-py3-none-any.whl (36 kB)
Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: colorama, bayesian-optimization
Successfully installed bayesian-optimization-3.1.0 colorama-0.4.6


In [2]:
import numpy as np
import pandas as pd
import os
from google.colab import drive

# import libraries
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder
from sklearn.metrics import roc_auc_score
from sklearn.decomposition import PCA

from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

from xgboost import XGBClassifier
from sklearn.utils import class_weight
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE, SMOTENC
from imblearn.combine import SMOTEENN, SMOTETomek

from catboost import CatBoostClassifier, CatBoostRegressor, Pool

from bayes_opt import BayesianOptimization

In [18]:
# script parameters
alpha = 1 # exp smoothing (1 = no smoothing)
# data_augmentation = 'none'
data_augmentation = 'subsample-majority'
# data_augmentation = 'smotenc'
# data_augmentation = 'adasyn'

In [19]:
drive.mount('/content/drive', force_remount=False)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [20]:
# os.chdir('/content/drive/My Drive/ISYE6740_project')

# for dirname, _, filenames in os.walk('/content/drive/My Drive/ISYE6740_project/data/'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [21]:
# # debug purpose
# person_id_debug = 72083416

In [27]:
def process_sepsis_data(folder = 'train'):
    path_base = '/content/drive/My Drive/ISYE6740_project/data/'
    fname='SepsisLabel_' + folder
    if folder == 'train':
        sub_path = 'training_data/'
    else:
        sub_path = 'testing_data/'

    fpath = path_base + sub_path + fname + '.csv'

    # load data
    df = pd.read_csv(fpath)

    # convert to datetime
    df['measurement_datetime'] = pd.to_datetime(df['measurement_datetime'], errors='coerce')
    df['measurement_datetime_hour'] = df['measurement_datetime'].dt.floor('h')

    # sort table
    # df = df.sort_values(by=['person_id', 'measurement_datetime'])

    # if folder == 'train':
    #     # create new column with cumulative sum of Sepsis events
    #     # (just the first eevent will be considered, as we are not provided information about previous occurence as feature)
    #     df['sepsis_hours'] = (df.groupby(['person_id'], as_index=False)['SepsisLabel'].cumsum())

        # derivative of sepsis events - not used
        # df['sep_diff'] = df['SepsisLabel'].rolling(window=2).apply(np.diff)
        # df['sep_diff'] = df['SepsisLabel'].diff()
        # df['win_delimeters'] = 0
        # df.loc[df['sep_diff'] == 1, 'win_delimeters'] = 1
        # df['windows'] = df.groupby(['person_id'], as_index=False)['win_delimeters'].cumsum()

        # df = df[df['sepsis_hours']<=1]

    return df

def process_measure_data(folder = 'train', meas_type = 'lab', alpha=0.2):
    path_base = '/content/drive/My Drive/ISYE6740_project/data/'
    fname='measurement_'+ meas_type + '_' + folder
    if folder == 'train':
        sub_path = 'training_data/'
    else:
        sub_path = 'testing_data/'

    fpath = path_base + sub_path + fname + '.csv'

    # load data
    df = pd.read_csv(fpath)

    # convert to datetime
    df['measurement_datetime'] = pd.to_datetime(df['measurement_datetime'], errors='coerce')

    # map categorical variable for observatiuon table
    if meas_type == 'observation':
        di = {'Normal capillary filling':0, 'Decreased capillary filling time':-1, 'Increased capillary filling time':1}
        df['Capillary refill [Time]'] = df['Capillary refill [Time]'].map(di)

        di = {'Absent':-1, 'Present':1, 'Weak':0}
        df['Pulse'] = df['Pulse'].map(di)
        df['Arterial pulse pressure'] = df['Arterial pulse pressure'].map(di)

        di = {'Normal':0, 'Sluggish':1}
        df['Right pupil Pupillary response'] = df['Right pupil Pupillary response'].map(di)
        df['Left pupil Pupillary response'] = df['Left pupil Pupillary response'].map(di)

    # sort table
    df = df.sort_values(by=['person_id', 'visit_occurrence_id', 'measurement_datetime'])

    df = df.set_index('measurement_datetime')
    df = df.groupby(['person_id', 'visit_occurrence_id'], as_index=False).resample('h').asfreq().interpolate(method='linear')

    # df[['person_id', 'visit_occurrence_id']] = df[['person_id', 'visit_occurrence_id']].fillna(method='ffill')

    df = df.reset_index()

    # cols = list(df.columns)
    # cols.remove('person_id')
    # cols.remove('measurement_datetime')
    # cols.remove('visit_occurrence_id')

    # df[cols] = df.groupby(['person_id'])[cols].ffill()
    # df[cols] = df.groupby(['person_id'])[cols].bfill()

    # df = df.fillna(0) # no data imputation possible

    # df = df.reset_index()
    df.rename(columns = {'measurement_datetime': 'measurement_datetime_hour'}, inplace=True)

    df['person_id'] = df['person_id'].astype(int)
    df['visit_occurrence_id'] = df['visit_occurrence_id'].astype(int)

    # exponential moving average
    cols = list(df.columns)
    cols.remove('person_id')
    cols.remove('measurement_datetime_hour')
    cols.remove('visit_occurrence_id')

    # df[cols] = df[cols].ewm(alpha=alpha).mean()

    for col in cols:
        df[col] = df.groupby(['person_id'])[col].ewm(alpha=alpha).mean().values

    df = df.reset_index()


    # df.drop('level_0', 'visit_occurrence_id'], axis=1, inplace=True)
    df.drop(['level_0', 'visit_occurrence_id', 'index'], axis=1, inplace=True)

    return df

def process_demo_data(folder = 'train', one_hot_encoding=False):
    path_base = '/content/drive/My Drive/ISYE6740_project/data/'
    fname='person_demographics_episode' + '_' + folder

    if folder == 'train':
        sub_path = 'training_data/'
    else:
        sub_path = 'testing_data/'

    fpath = path_base + sub_path + fname + '.csv'

    # load data
    df = pd.read_csv(fpath)

    # convert to datetime
    # df['measurement_datetime'] = pd.to_datetime(df['measurement_datetime'], errors='coerce')
    # df['measurement_datetime_day'] = df['measurement_datetime'].dt.floor('d')

    if one_hot_encoding:
      # map categorical variable for observatiuon table
      df = pd.get_dummies(df, columns=['gender'], dtype=float)
      # di = {'MALE':0, 'FEMALE':1}
      # df['gender'] = df['gender'].map(di)

    df.drop(['visit_occurrence_id', 'visit_start_date', 'birth_datetime'], axis=1, inplace=True)

    return df

def process_drug_data(folder = 'train'):
    path_base = '/content/drive/My Drive/ISYE6740_project/data/'
    fname='drugsexposure' + '_' + folder

    if folder == 'train':
        sub_path = 'training_data/'
    else:
        sub_path = 'testing_data/'

    fpath = path_base + sub_path + fname + '.csv'

    # load data
    df = pd.read_csv(fpath)

    # convert to datetime
    df['drug_datetime_hourly'] = pd.to_datetime(df['drug_datetime_hourly'], errors='coerce')

    # create new column by concatenating drug_concept_id and route_concept id
    df['drug_route'] = df['drug_concept_id'].astype(str) + '_' + df['route_concept_id'].astype(str)

    # remove drug_concept_id and route_cocept_id columns
    df.drop(['drug_concept_id', 'route_concept_id'], axis=1, inplace=True)



    # one-hot encode
    df = pd.get_dummies(df, columns=['drug_route'], dtype=float)

    # sort table
    df = df.sort_values(by=['person_id', 'visit_occurrence_id', 'drug_datetime_hourly'])

    df = df.drop_duplicates(subset=['person_id', 'visit_occurrence_id', 'drug_datetime_hourly'])

    df = df.set_index(['drug_datetime_hourly'])

    # fill rules
    cols_sum_sum = list(df.columns)
    cols_sum_sum.remove('person_id')
    # cols_sum_sum.remove('drug_datetime_hourly')
    cols_sum_sum.remove('visit_occurrence_id')


    df = df.groupby(['person_id', 'visit_occurrence_id'], as_index=False).resample('h').asfreq()

    df[['person_id', 'visit_occurrence_id']] = df[['person_id', 'visit_occurrence_id']].ffill()
    df[cols_sum_sum] = df[cols_sum_sum].fillna(0)



    # group by person_id and visit_occurence_id cum sum of other columns
    df[cols_sum_sum] = df.groupby(['person_id', 'visit_occurrence_id'], as_index=False)[cols_sum_sum].cumsum()


    # # df = df.resample('d', on='measurement_datetime').mean()

    # df[['person_id', 'visit_occurrence_id']] = df[['person_id', 'visit_occurrence_id']].fillna(method='ffill')

    # df = df.fillna(0)

    df = df.reset_index()

    # cols = list(df.columns)
    # cols.remove('person_id')
    # cols.remove('drug_datetime_hourly')
    # cols.remove('visit_occurrence_id')

    # df[cols] = df.groupby(['person_id'])[cols].ffill()
    # df[cols] = df.groupby(['person_id'])[cols].bfill()

    # df = df.fillna(0) # no data imputation possible

    df = df.reset_index()
    df.rename(columns = {'drug_datetime_hourly': 'measurement_datetime_hour'}, inplace=True)
    df['person_id'] = df['person_id'].astype(int)



    # # exponential moving average
    cols = list(df.columns)
    cols.remove('person_id')
    cols.remove('measurement_datetime_hour')
    cols.remove('visit_occurrence_id')


    for col in cols:
        df[col] = df.groupby(['person_id'])[col].ewm(alpha=alpha).mean().values

    # df = df.reset_index()


    # df.drop(['index', 'level_0', 'visit_occurrence_id'], axis=1, inplace=True)
    df.drop(['index', 'level_0', 'visit_occurrence_id'], axis=1, inplace=True)

    return df

def process_procedure_data(folder = 'train', one_hot_encode=False):
    path_base = '/content/drive/My Drive/ISYE6740_project/data/'
    fname='proceduresoccurrences' + '_' + folder

    if folder == 'train':
        sub_path = 'training_data/'
    else:
        sub_path = 'testing_data/'

    fpath = path_base + sub_path + fname + '.csv'

    # load data
    df = pd.read_csv(fpath)

    # convert to datetime
    df['procedure_datetime_hourly'] = pd.to_datetime(df['procedure_datetime_hourly'], errors='coerce')

    # one-hot encode
    if one_hot_encode:
      df = pd.get_dummies(df, columns=['procedure'], dtype=float)

    # sort table
    df = df.sort_values(by=['person_id', 'visit_occurrence_id', 'procedure_datetime_hourly'])

    df = df.drop_duplicates(subset=['person_id', 'visit_occurrence_id', 'procedure_datetime_hourly'])

    df = df.set_index(['procedure_datetime_hourly'])

    df = df.groupby(['person_id', 'visit_occurrence_id'], as_index=False).resample('h').asfreq().fillna(method='ffill')
    # # df = df.resample('d', on='measurement_datetime').mean()

    # df[['person_id', 'visit_occurrence_id']] = df[['person_id', 'visit_occurrence_id']].fillna(method='ffill')

    df = df.reset_index()

    # cols = list(df.columns)
    # cols.remove('person_id')
    # cols.remove('procedure_datetime_hourly')
    # cols.remove('visit_occurrence_id')

    # df[cols] = df.groupby(['person_id'])[cols].ffill()
    # df[cols] = df.groupby(['person_id'])[cols].bfill()
    # df = df.fillna(0) # no data imputation possible

    # df = df.reset_index()

    df.rename(columns = {'procedure_datetime_hourly': 'measurement_datetime_hour'}, inplace=True)

    # # # exponential moving average
    # cols = list(df.columns)
    # cols.remove('person_id')
    # cols.remove('measurement_datetime_hour')
    # cols.remove('visit_occurrence_id')

    # # # df[cols] = df[cols].ewm(alpha=alpha).mean()

    # for col in cols:
    #     df[col] = df.groupby(['person_id'])[col].ewm(alpha=alpha).mean().values

    # df = df.reset_index()

    df['person_id'] = df['person_id'].astype(int)

    df.drop([ 'level_0', 'visit_occurrence_id'], axis=1, inplace=True)

    return df

def process_device_data(folder = 'train', one_hot_encode=False):
    path_base = '/content/drive/My Drive/ISYE6740_project/data/'
    fname='devices' + '_' + folder

    if folder == 'train':
        sub_path = 'training_data/'
    else:
        sub_path = 'testing_data/'

    fpath = path_base + sub_path + fname + '.csv'

    # load data
    df = pd.read_csv(fpath)

    # convert to datetime
    df['device_datetime_hourly'] = pd.to_datetime(df['device_datetime_hourly'], errors='coerce')

    # one-hot encode
    if one_hot_encode:
      df = pd.get_dummies(df, columns=['device'], dtype=float)

    # sort table
    df = df.sort_values(by=['person_id', 'visit_occurrence_id', 'device_datetime_hourly'])

    df = df.drop_duplicates(subset=['person_id', 'visit_occurrence_id', 'device_datetime_hourly'])

    df = df.set_index(['device_datetime_hourly'])

    df = df.groupby(['person_id', 'visit_occurrence_id'], as_index=False).resample('h').asfreq().fillna(method='ffill')
    # # df = df.resample('d', on='measurement_datetime').mean()


    df = df.reset_index()
    # cols = list(df.columns)
    # cols.remove('person_id')
    # cols.remove('device_datetime_hourly')
    # cols.remove('visit_occurrence_id')

    # df[cols] = df.groupby(['person_id'])[cols].ffill()
    # df[cols] = df.groupby(['person_id'])[cols].bfill()
    # df = df.fillna(0) # no data imputation possible

    # df = df.reset_index()
    df.rename(columns = {'device_datetime_hourly': 'measurement_datetime_hour'}, inplace=True)
    df['person_id'] = df['person_id'].astype(int)

    # # # exponential moving average
    # cols = list(df.columns)
    # cols.remove('person_id')
    # cols.remove('measurement_datetime_hour')
    # cols.remove('visit_occurrence_id')

    # # # df[cols] = df[cols].ewm(alpha=alpha).mean()

    # for col in cols:
    #     df[col] = df.groupby(['person_id'])[col].ewm(alpha=alpha).mean().values

    # df = df.reset_index()


    df.drop(['level_0', 'visit_occurrence_id'], axis=1, inplace=True)

    return df

def process_obs_data(folder = 'train', one_hot_encode=False):
    path_base = '/content/drive/My Drive/ISYE6740_project/data/'
    fname='observation' + '_' + folder

    if folder == 'train':
        sub_path = 'training_data/'
    else:
        sub_path = 'testing_data/'

    fpath = path_base + sub_path + fname + '.csv'

    # load data
    df = pd.read_csv(fpath)

    # convert to datetime
    df['observation_datetime'] = pd.to_datetime(df['observation_datetime'], errors='coerce')

    # one-hot encode
    if one_hot_encode:
      df = pd.get_dummies(df, columns=['valuefilled'], dtype=float)

    # sort table
    df = df.sort_values(by=['person_id', 'visit_occurrence_id', 'observation_datetime'])

    df = df.drop_duplicates(subset=['person_id', 'visit_occurrence_id', 'observation_datetime'])

    df = df.set_index(['observation_datetime'])

    df = df.groupby(['person_id', 'visit_occurrence_id'], as_index=False).resample('h').asfreq().fillna(method='ffill')
    # # df = df.resample('d', on='measurement_datetime').mean()

    # df = df.fillna(method='ffill')

    df = df.reset_index()
    # cols = list(df.columns)
    # cols.remove('person_id')
    # cols.remove('observation_datetime')
    # cols.remove('visit_occurrence_id')

    # df[cols] = df.groupby(['person_id'])[cols].ffill()
    # df[cols] = df.groupby(['person_id'])[cols].bfill()
    # df = df.fillna(0) # no data imputation possible

    # df = df.reset_index()
    df.rename(columns = {'observation_datetime': 'measurement_datetime_hour', 'valuefilled': 'admission_reason'}, inplace=True)
    df['person_id'] = df['person_id'].astype(int)

    # # # exponential moving average
    # cols = list(df.columns)
    # cols.remove('person_id')
    # cols.remove('measurement_datetime_hour')
    # cols.remove('visit_occurrence_id')
    # cols.remove('observation_concept_id')
    # cols.remove('observation_concept_name')

    # # # df[cols] = df[cols].ewm(alpha=alpha).mean()

    # for col in cols:
    #     df[col] = df.groupby(['person_id'])[col].ewm(alpha=alpha).mean().values

    # df = df.reset_index()


    df.drop(['level_0', 'visit_occurrence_id', 'observation_concept_id', 'observation_concept_name'], axis=1, inplace=True)

    return df


def merge_with_main_table(main, sub, folder = 'train', keys=['person_id', 'measurement_datetime_hour']):
    main = main.merge(sub.drop_duplicates(subset=keys), on=keys, how='left')

    main = main.sort_values(by=['person_id', 'measurement_datetime_hour'])

    # main = main.reset_index()

    cols = list(main.columns)
    cols.remove('person_id')
    cols.remove('measurement_datetime')
    cols.remove('measurement_datetime_hour')

    # cols.remove('visit_occurrence_id')
    if folder == 'train':
        cols.remove('SepsisLabel')

    # main[cols] = main.groupby(['person_id'], as_index=False)[cols].\
    #     transform(lambda x: x.fillna(x.mean()))

    # main[cols] = main.groupby(['person_id'])[cols].ffill()
    # main[cols] = main.groupby(['person_id'])[cols].bfill()

    # main[cols] = main[cols].fillna(0) # no data imputation possible

    return main

In [30]:
# smoothing average alpha

train_df = process_sepsis_data('train')

# train_df_sub.tail(10)
# train_df_sub.info()
# train_df.info()
# train_df_sub.to_csv('train_df_sub.csv')
meas_lab_train_df = process_measure_data('train', meas_type='lab', alpha=alpha)
meas_meds_train_df = process_measure_data('train', meas_type='meds', alpha=alpha)
meas_obs_train_df = process_measure_data('train', meas_type='observation', alpha=alpha)

demo_train_df = process_demo_data('train')

drugs_train_df = process_drug_data('train')

procedure_train_df = process_procedure_data('train')

device_train_df = process_device_data('train')

obs_train_df = process_obs_data('train')

# meas_lab_train_df_sub = meas_lab_train_df[meas_lab_train_df['person_id'] == person_id_debug]
# train_df_sub.head(50)

# merge tables
train_df = merge_with_main_table(folder = 'train', main=train_df, sub = meas_lab_train_df, keys=['person_id', 'measurement_datetime_hour'])
train_df = merge_with_main_table(folder = 'train', main=train_df, sub = meas_meds_train_df, keys=['person_id', 'measurement_datetime_hour'])
train_df = merge_with_main_table(folder = 'train', main=train_df, sub = meas_obs_train_df, keys=['person_id', 'measurement_datetime_hour'])

train_df = merge_with_main_table(folder = 'train', main=train_df, sub = demo_train_df, keys=['person_id'])

train_df = merge_with_main_table(folder = 'train', main=train_df, sub = drugs_train_df, keys=['person_id', 'measurement_datetime_hour'])

train_df = merge_with_main_table(folder = 'train', main=train_df, sub = procedure_train_df, keys=['person_id', 'measurement_datetime_hour'])

train_df = merge_with_main_table(folder = 'train', main=train_df, sub = device_train_df, keys=['person_id', 'measurement_datetime_hour'])

train_df = merge_with_main_table(folder = 'train', main=train_df, sub = obs_train_df, keys=['person_id', 'measurement_datetime_hour'])

# # train_df_sub = train_df[train_df['person_id'] == person_id_debug]
# # train_df_sub = train_df[train_df['person_id'] == person_id_debug]

# train_df[['route_concept_id_Intraperitoneal', 'route_concept_id_Intratracheal']]=0

# train_df['measurement_datetime'] = train_df['measurement_datetime'].astype(str)
# train_df['person_id_datetime'] = train_df['person_id'].str.cat(submission['measurement_datetime'], sep='_')

# train_df = train_df.set_index(['measurement_datetime'])
# train_df = train_df.groupby(['person_id'], as_index=False).resample('h').asfreq().interpolate()

# train_df = train_df.reset_index()
cols = list(train_df.columns)
cols.remove('person_id')
cols.remove('measurement_datetime')
cols.remove('measurement_datetime_hour')

train_df[cols] = train_df.groupby(['person_id'])[cols].ffill()
train_df[cols] = train_df.groupby(['person_id'])[cols].bfill()
train_df[['gender', 'procedure', 'device', 'admission_reason']] = \
  train_df[['gender', 'procedure', 'device', 'admission_reason']].fillna('unknown')
train_df = train_df.fillna(0) # no data imputation possible

train_df = train_df.reset_index()

train_df.drop(['index'], axis=1, inplace=True)

  df = df.groupby(['person_id', 'visit_occurrence_id'], as_index=False).resample('h').asfreq().fillna(method='ffill')
  df = df.groupby(['person_id', 'visit_occurrence_id'], as_index=False).resample('h').asfreq().fillna(method='ffill')
  df = df.groupby(['person_id', 'visit_occurrence_id'], as_index=False).resample('h').asfreq().fillna(method='ffill')


In [32]:
# predict on test dataset
# smooething average alpha
test_df = process_sepsis_data('test')

# train_df_sub.tail(10)
# train_df_sub.info()
# train_df.info()
# train_df_sub.to_csv('train_df_sub.csv')
meas_lab_test_df = process_measure_data('test', meas_type='lab', alpha=alpha)
meas_meds_test_df = process_measure_data('test', meas_type='meds', alpha=alpha)
meas_obs_test_df = process_measure_data('test', meas_type='observation', alpha=alpha)

demo_test_df = process_demo_data('test')
# meas_lab_train_df_sub = meas_lab_train_df[meas_lab_train_df['person_id'] == person_id_debug]
# train_df_sub.head(50)

drugs_test_df = process_drug_data('test')

procedure_test_df = process_procedure_data('test')

device_test_df = process_device_data('test')

obs_test_df = process_obs_data('test')

# merge tables
test_df = merge_with_main_table(folder = 'test', main=test_df, sub = meas_lab_test_df, keys=['person_id', 'measurement_datetime_hour'])
test_df = merge_with_main_table(folder = 'test', main=test_df, sub = meas_meds_test_df, keys=['person_id', 'measurement_datetime_hour'])
test_df = merge_with_main_table(folder = 'test', main=test_df, sub = meas_obs_test_df, keys=['person_id', 'measurement_datetime_hour'])

test_df = merge_with_main_table(folder = 'test', main=test_df, sub = demo_test_df, keys=['person_id'])

test_df = merge_with_main_table(folder = 'test', main=test_df, sub = drugs_test_df, keys=['person_id', 'measurement_datetime_hour'])

test_df = merge_with_main_table(folder = 'test', main=test_df, sub = procedure_test_df, keys=['person_id', 'measurement_datetime_hour'])

test_df = merge_with_main_table(folder = 'test', main=test_df, sub = device_test_df, keys=['person_id', 'measurement_datetime_hour'])

test_df = merge_with_main_table(folder = 'test', main=test_df, sub = obs_test_df, keys=['person_id', 'measurement_datetime_hour'])

# test_df[['drug_concept_id_ceftolozane', 'drug_concept_id_isoproterenol', \
#          'drug_concept_id_nitrofurantoin', 'route_concept_id_Rectal']]=0

# cols_order = list(train_df.columns)
# cols_order.remove('SepsisLabel')
# test_df = test_df[cols_order]

cols_test = list(test_df.columns)
cols_test.remove('person_id')
cols_test.remove('measurement_datetime')
cols_test.remove('measurement_datetime_hour')

test_df[cols_test] = test_df.groupby(['person_id'])[cols_test].ffill()
test_df[cols_test] = test_df.groupby(['person_id'])[cols_test].bfill()
test_df[['gender', 'procedure', 'device', 'admission_reason']] = \
  test_df[['gender', 'procedure', 'device', 'admission_reason']].fillna('unknown')
test_df = test_df.fillna(0) # no data imputation possible

test_df = test_df.reset_index()
test_df.drop(['index'], axis=1, inplace=True)
# test_df['categorical_column_name'] = test_df['categorical_column_name'].fillna('unknown_category')

  df = df.groupby(['person_id', 'visit_occurrence_id'], as_index=False).resample('h').asfreq().fillna(method='ffill')
  df = df.groupby(['person_id', 'visit_occurrence_id'], as_index=False).resample('h').asfreq().fillna(method='ffill')
  df = df.groupby(['person_id', 'visit_occurrence_id'], as_index=False).resample('h').asfreq().fillna(method='ffill')


In [33]:
# some columns are not there in test_df, some are not in train_df, run model on intersection
intersection_test = list(set(test_df.columns).intersection(set(train_df.columns)))
intersection_train = intersection_test + ['SepsisLabel']

# re-order accroding to train dataset (otherwise XGBoost prediction won't work)
train_df = train_df[intersection_train]
test_df = test_df[intersection_test]

In [35]:
# define list of features columns
cols_feat = list(train_df.columns)

cols_feat.remove('person_id')
cols_feat.remove('measurement_datetime')
cols_feat.remove('SepsisLabel')
cols_feat.remove('measurement_datetime_hour')

# define list of categorical columns
non_numeric_columns = train_df.select_dtypes(exclude=['number']).columns.tolist()
non_numeric_columns.remove('measurement_datetime')
non_numeric_columns.remove('measurement_datetime_hour')

# define list of integer columns
integer_columns = train_df.select_dtypes(include=['int', 'int64', 'int32']).columns.tolist()

# define float + int column list
cols_num = list(set(cols_feat) - set(non_numeric_columns))

In [36]:
# data augmentation strategy
if data_augmentation == 'subsample-majority':
  train_df_0 = train_df[train_df['SepsisLabel'] == 0]
  train_df_0 = train_df_0.sample(frac=0.2, random_state=1) #0.05
  train_df_1 = train_df[train_df['SepsisLabel'] == 1]
  train_df_pruned = pd.concat([train_df_0, train_df_1], axis=0)

  # test_df_pruned = test_df.sort_values(by=['person_id', 'measurement_datetime'])
  # test_df_pruned = test_df.copy()


  # X_train = np.array(train_df_pruned[cols_feat].values)
  # y_train = np.array(train_df_pruned['SepsisLabel'].values)
  X_train = train_df_pruned[cols_feat]
  y_train = train_df_pruned['SepsisLabel']

elif data_augmentation == 'smotenc':
  sm = SMOTENC(categorical_features=non_numeric_columns, random_state=42)
  X = train_df[cols_feat]
  y = train_df['SepsisLabel']
  X_train, y_train = sm.fit_resample(X, y)

elif data_augmentation == 'adasyn':
  adasyn = ADASYN(sampling_strategy='minority', random_state=42)
  encoder_ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
  encoded_nominal = encoder_ohe.fit_transform(train_df[non_numeric_columns])
  encoded_nominal_df = pd.DataFrame(encoded_nominal, columns=encoder_ohe.get_feature_names_out(non_numeric_columns))
  X = pd.concat([train_df[cols_num], encoded_nominal_df], axis=1)
  y = train_df['SepsisLabel']
  X_train, y_train  = adasyn.fit_resample(X,y)

else:
  X_train = train_df[cols_feat]
  y_train = train_df['SepsisLabel']

if data_augmentation != 'adasyn':
  for col in ['gender', 'procedure', 'device', 'admission_reason']:
          X_train[col] = X_train[col].astype('category')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[col] = X_train[col].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[col] = X_train[col].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[col] = X_train[col].astype('category')
A value is trying to be set on a copy of a slice from a DataFram

In [37]:
# Define the objective function for Bayesian Optimization
def catboost_cv(depth, learning_rate, iterations, subsample, l2_leaf_reg):
    # Convert hyperparameters to the right format
    depth = int(depth)
    iterations = int(iterations)
    l2_leaf_reg = int(l2_leaf_reg)
    # Initialize the CatBoost model (CatBoostRegressor for regression)
    if data_augmentation != 'adasyn':
      model = CatBoostClassifier(
          depth=depth,
          learning_rate=learning_rate,
          iterations=iterations,
          subsample=subsample,
          l2_leaf_reg=l2_leaf_reg,
          verbose=False,
          cat_features=non_numeric_columns,
          max_ctr_complexity=2,
          border_count = 64,
          task_type='GPU',
          early_stopping_rounds=50,
          bootstrap_type = 'Bernoulli'
        )
    else:
      model = CatBoostClassifier(
          depth=depth,
          learning_rate=learning_rate,
          iterations=iterations,
          subsample=subsample,
          l2_leaf_reg=l2_leaf_reg,
          verbose=False,
          max_ctr_complexity=2,
          border_count = 64,
          task_type='GPU',
          early_stopping_rounds=50,
          bootstrap_type = 'Bernoulli')


    # Perform cross-validation and return the mean R-squared score (for regression)
    cross_val_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='roc_auc')

    return cross_val_scores.mean()

In [38]:
# Train Catboost using CV Bayesian search
# Define the hyperparameter search space with data types
param_space_cb = {
    'depth': (4, 10),             # Integer values for depth
    'learning_rate': (0.003, 0.2),  # Float values for learning rate
    'iterations': (500, 800),    # Integer values for iterations
    'subsample': (0.1, 0.5),       # Float values for subsample
    'l2_leaf_reg': (1, 9)        # Integer values for l2_leaf_reg
}

# Create the BayesianOptimization object and maximize it
bayesian_opt_cb = BayesianOptimization(
    f=catboost_cv, pbounds=param_space_cb, random_state=1)
bayesian_opt_cb.maximize(init_points=5, n_iter=10)
results_cb = pd.DataFrame(bayesian_opt_cb.res)
results_cb.sort_values(by='target', ascending=False, inplace=True)

|   iter    |  target   |   depth   | learni... | iterat... | subsample | l2_lea... |
-------------------------------------------------------------------------------------
| [39m1        [39m | [39m0.8194378[39m | [39m6.5021320[39m | [39m0.1449039[39m | [39m500.03431[39m | [39m0.2209330[39m | [39m2.1740471[39m |
| [35m2        [39m | [35m0.8957025[39m | [35m4.5540315[39m | [35m0.0396932[39m | [35m603.66821[39m | [35m0.2587069[39m | [35m5.3105338[39m |
| [39m3        [39m | [39m0.8308238[39m | [39m6.5151670[39m | [39m0.1379882[39m | [39m561.33567[39m | [39m0.4512469[39m | [39m1.2191007[39m |
| [39m4        [39m | [39m0.8464114[39m | [39m8.0228050[39m | [39m0.0852090[39m | [39m667.60694[39m | [39m0.1561547[39m | [39m2.5848119[39m |
| [39m5        [39m | [39m0.8674865[39m | [39m8.8044674[39m | [39m0.1937475[39m | [39m594.02725[39m | [39m0.3769290[39m | [39m8.0111132[39m |
| [39m6        [39m | [39m0.8649584[39m | [

In [39]:
# Print the best hyperparameters and their corresponding R2 score
best_hyperparameters_cb = bayesian_opt_cb.max
best_hyperparameters_cb['params'] = {param: int(value) if param in [
    'depth', 'iterations', 'l2_leaf_reg'] else value for param, value in best_hyperparameters_cb['params'].items()}
# print(&quot;Best hyperparameters:&quot;, best_hyperparameters['params'])
# print(f&quot;Best R-squared Score: {best_hyperparameters['target']:.4f}&quot;)
best_hyperparameters_cb['params']

{'depth': 10,
 'learning_rate': np.float64(0.003),
 'iterations': 604,
 'subsample': np.float64(0.1),
 'l2_leaf_reg': 5}

In [40]:
# run CatBoost trained model w/ best hyperparameters on test dataset
path_base = '/content/drive/My Drive/ISYE6740_project/results/'
fname = path_base + 'test_predictions_alpha=' + str(alpha) + '_CatBoost_augmentation=' \
            + data_augmentation + '_common-features_' + '.csv'
#
p = best_hyperparameters_cb['params']
if data_augmentation != 'adasyn':
  clf_best = CatBoostClassifier(verbose = False, depth = p['depth'], iterations=p['iterations'], \
                              l2_leaf_reg = p['l2_leaf_reg'], \
                              learning_rate = p['learning_rate'], subsample = p['subsample'], \
                              cat_features=non_numeric_columns, max_ctr_complexity=2,
                              border_count = 64,
                              task_type='GPU',
                              early_stopping_rounds=50,
                              bootstrap_type = 'Bernoulli')
else:
  clf_best = CatBoostClassifier(verbose = False, depth = p['depth'], iterations=p['iterations'], \
                              l2_leaf_reg = p['l2_leaf_reg'], \
                              learning_rate = p['learning_rate'], subsample = p['subsample'], \
                              max_ctr_complexity=2,
                              border_count = 64,
                              task_type='GPU',
                              early_stopping_rounds=50,
                              bootstrap_type = 'Bernoulli')


clf_best.fit(X_train, y_train)

if data_augmentation != 'adasyn':
  X_test = test_df[cols_feat]
else:
  encoder_ohe_test = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
  encoded_nominal_test = encoder_ohe_test.fit_transform(test_df[non_numeric_columns])
  encoded_nominal_test_df = pd.DataFrame(encoded_nominal_test, columns=encoder_ohe_test.get_feature_names_out(non_numeric_columns))
  X_test = pd.concat([test_df[cols_num], encoded_nominal_test_df], axis=1)


pred_labels = clf_best.predict_proba(X_test)
pred_labels = pred_labels[:,1]

# create pandas dataframe for submission
# submission = test_df.copy()
# s = pd.read_csv('/kaggle/input/phems-hackathon-early-sepsis-prediction/testing_data/SepsisLabel_test.csv')
submission = test_df.copy()
submission['person_id'] = submission['person_id'].astype(str)
submission['measurement_datetime'] = submission['measurement_datetime'].astype(str)
submission['person_id_datetime'] = submission['person_id'].str.cat(submission['measurement_datetime'], sep='_')
submission = submission[['person_id_datetime']]
submission['SepsisLabel'] = pred_labels
submission.to_csv(fname, index=False)

In [41]:
# XG Boost model

# data preparation
X_train_xgb = X_train.copy()
# fix feature name issue
import re
regex = re.compile(r"\[|\]|<", re.IGNORECASE)

X_train_xgb.columns = [regex.sub("_", col) if any(x in str(col) for x in set(('[', ']', '<'))) \
                       else col for col in X_train_xgb.columns.values]

# for col in ['gender', 'procedure', 'device', 'admission_reason']:
#         X_train_xgb[col] = X_train_xgb[col].astype('category')

# X_train_xgb=X_train_xgb[['Base excess in Venous blood by calculation', \
#                          'Base excess in Arterial blood by calculation', 'gender']]


def xgb_cv(max_depth, learning_rate, n_estimators, subsample, colsample_bytree, scale_pos_weight, gamma, reg_lambda, alpha):
    max_depth = int(max_depth)
    n_estimators = int(n_estimators)

    model = XGBClassifier(
        max_depth=max_depth,
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        subsample = subsample,
        colsample_bytree = colsample_bytree,
        scale_pos_weight=scale_pos_weight,
        min_split_loss = gamma,
        reg_lambda = reg_lambda,
        alpha = alpha,
        enable_categorical=True,
        # tree_method='gpu_hist',
        # predictor='gpu_predictor',
        device='cuda')

    cross_val_scores = cross_val_score(model, X_train_xgb, y_train, cv=5, scoring='roc_auc', error_score='raise')

    return cross_val_scores.mean()

In [42]:
# Train XGB using CV Bayesain
# Define the hyperparameter search space with data types
param_space_xgb = {
    'max_depth': (3, 10),             # Integer values for depth
    'learning_rate': (0.005, 0.2),  # Float values for learning rate
    'n_estimators': (100, 1000),    # Integer values for iterations
    'subsample': (0.5, 1),       # Float values for subsample
    'colsample_bytree': (0.5, 1),
    'scale_pos_weight': (0.5, 1),
    'gamma': (0, 1),
    'reg_lambda': (0, 1),
    'alpha': (0, 1)
}

# Create the BayesianOptimization object and maximize it
bayesian_opt_xgb = BayesianOptimization(
    f=xgb_cv, pbounds=param_space_xgb, random_state=1)
bayesian_opt_xgb.maximize(init_points=5, n_iter=10)
results_xgb = pd.DataFrame(bayesian_opt_xgb.res)
results_xgb.sort_values(by='target', ascending=False, inplace=True)

|   iter    |  target   | max_depth | learni... | n_esti... | subsample | colsam... | scale_... |   gamma   | reg_la... |   alpha   |
-------------------------------------------------------------------------------------------------------------------------------------


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)


| [39m1        [39m | [39m0.8734523[39m | [39m5.9191540[39m | [39m0.1454632[39m | [39m100.10293[39m | [39m0.6511662[39m | [39m0.5733779[39m | [39m0.5461692[39m | [39m0.1862602[39m | [39m0.3455607[39m | [39m0.3967674[39m |
| [35m2        [39m | [35m0.8844331[39m | [35m6.7717171[39m | [35m0.0867429[39m | [35m716.69755[39m | [35m0.6022261[39m | [35m0.9390587[39m | [35m0.5136937[39m | [35m0.6704675[39m | [35m0.4173048[39m | [35m0.5586898[39m |
| [39m3        [39m | [39m0.8364379[39m | [39m3.9827085[39m | [39m0.0436297[39m | [39m820.67011[39m | [39m0.9841307[39m | [39m0.6567120[39m | [39m0.8461613[39m | [39m0.8763891[39m | [39m0.8946066[39m | [39m0.0850442[39m |
| [39m4        [39m | [39m0.8554048[39m | [39m3.2733834[39m | [39m0.0381169[39m | [39m890.32825[39m | [39m0.5491734[39m | [39m0.7105538[39m | [39m0.9789447[39m | [39m0.5331652[39m | [39m0.6918771[39m | [39m0.3155156[39m |
| [39m5        [39m | 

In [43]:
# Print the best hyperparameters and their corresponding R2 score
best_hyperparameters_xgb = bayesian_opt_xgb.max
best_hyperparameters_xgb['params'] = {param: int(value) if param in [
    'depth', 'iterations', 'l2_leaf_reg'] else value for param, value in best_hyperparameters_xgb['params'].items()}
# print(&quot;Best hyperparameters:&quot;, best_hyperparameters['params'])
# print(f&quot;Best R-squared Score: {best_hyperparameters['target']:.4f}&quot;)
best_hyperparameters_xgb['params']

{'max_depth': np.float64(9.434052139199991),
 'learning_rate': np.float64(0.005),
 'n_estimators': np.float64(717.7650234964225),
 'subsample': np.float64(0.5),
 'colsample_bytree': np.float64(0.5),
 'scale_pos_weight': np.float64(1.0),
 'gamma': np.float64(1.0),
 'reg_lambda': np.float64(0.0),
 'alpha': np.float64(1.0)}

In [44]:
# run XGBoost trained model w/ best hyperparameters on test dataset
path_base = '/content/drive/My Drive/ISYE6740_project/results/'
fname = path_base + 'test_predictions_alpha=' + str(alpha) + '_XGBoost_augmentation=' \
            + data_augmentation + '_common-features_' +  '.csv'
#
p = best_hyperparameters_xgb['params']
xgb_best = XGBClassifier(verbose = False,
                         max_depth = int(p['max_depth']),
                         learning_rate = p['learning_rate'],
                         n_estimators = int(p['n_estimators']),
                         subsample = p['subsample'],
                         colsample_bytree = p['colsample_bytree'],
                         scale_pos_weight = p['scale_pos_weight'],
                         min_split_loss = p['gamma'],
                         reg_lambda = p['reg_lambda'],
                         alpha = p['alpha'],
                         enable_categorical=True,
                        #  tree_method='gpu_hist',
                        # predictor='gpu_predictor',
                        device='cuda')


xgb_best.fit(X_train_xgb, y_train)

X_test_xgb = X_test.copy()

X_test_xgb.columns = [regex.sub("_", col) if any(x in str(col) for x in set(('[', ']', '<'))) \
                       else col for col in X_test_xgb.columns.values]

if data_augmentation != 'adasyn':
  for col in ['gender', 'procedure', 'device', 'admission_reason']:
          X_test_xgb[col] = X_test_xgb[col].astype('category')

pred_labels = xgb_best.predict_proba(X_test_xgb)
pred_labels = pred_labels[:,1]

# create pandas dataframe for submission
# submission = test_df.copy()
# s = pd.read_csv('/kaggle/input/phems-hackathon-early-sepsis-prediction/testing_data/SepsisLabel_test.csv')
submission = test_df.copy()
submission['person_id'] = submission['person_id'].astype(str)
submission['measurement_datetime'] = submission['measurement_datetime'].astype(str)
submission['person_id_datetime'] = submission['person_id'].str.cat(submission['measurement_datetime'], sep='_')
submission = submission[['person_id_datetime']]
submission['SepsisLabel'] = pred_labels
submission.to_csv(fname, index=False)

Parameters: { "verbose" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [45]:
# # pre-process data for NN
# batch_size = 512 # The number of samples per batch

# # random subset to remove label bias (1 are just 2% of total label, difficult to train)

# train_df_0 = train_df[train_df['SepsisLabel'] == 0]
# # train_df_0 = train_df_0.sample(frac=0.2, random_state=1) #0.05
# train_df_1 = train_df[train_df['SepsisLabel'] == 1]

# train_df_pruned = pd.concat([train_df_0, train_df_1], axis=0)
# # train_df_pruned = train_df_pruned.sort_values(by=['person_id', 'measurement_datetime'])

# class MyDataset(torch.utils.data.Dataset):
#     def __init__(self, X, y):
#         self.X = X
#         self.y = y

#     def __len__(self):
#         return len(self.X)

#     def __getitem__(self, idx):
#         return self.X[idx], self.y[idx]

# # prepare data for ML
# cols_feat = list(train_df.columns)

# cols_feat.remove('person_id')
# cols_feat.remove('measurement_datetime')
# cols_feat.remove('SepsisLabel')
# cols_feat.remove('measurement_datetime_hour')

# # manually removed features
# col_rem = ['Base excess in Venous blood by calculation', 'Base excess in Arterial blood by calculation', \
#           'Phosphate [Moles/volume] in Serum or Plasma', 'Bilirubin.total [Moles/volume] in Serum or Plasma',\
#           'Potassium [Moles/volume] in Blood', \
#           'Neutrophil Ab [Units/volume] in Serum', 'Bicarbonate [Moles/volume] in Arterial blood', \
#           'Systolic blood pressure', 'Diastolic blood pressure', 'Respiratory rate', 'Heart rate', \
#           'Measurement of oxygen saturation at periphery', 'Oxygen/Gas total [Pure volume fraction] Inhaled gas', \
#           'Capillary refill [Time]']

# for col in col_rem:
#     cols_feat.remove(col)

# # cols_feat.remove('visit_occurrence_id')

# cols_all = cols_feat.copy()
# cols_all.append('SepsisLabel')

# # normalize
# scaler = MinMaxScaler()
# train_df_pruned[cols_feat] = scaler.fit_transform(train_df_pruned[cols_feat])
# # train_df_pruned[cols_feat] = (train_df_pruned[cols_feat] - train_df_pruned[cols_feat].mean()) / train_df_pruned[cols_feat].std()

# # split in validation and train dataset
# t_df, v_df = train_test_split(train_df_pruned, test_size=0.4, random_state=11)

# t_c = MyDataset(torch.tensor(t_df[cols_feat].values, dtype=torch.float32), torch.tensor(t_df['SepsisLabel'].values, dtype=torch.long))
# v_c = MyDataset(torch.tensor(v_df[cols_feat].values, dtype=torch.float32), torch.tensor(v_df['SepsisLabel'].values, dtype=torch.long))

# # t_pt = torch.tensor(t_df[cols_all].values)
# # v_pt = torch.tensor(v_df[cols_all].values)

# # Create the data loaders for batching and shuffling the data
# train_loader = torch.utils.data.DataLoader(t_c, batch_size=batch_size, shuffle=True) # The training loader
# test_loader = torch.utils.data.DataLoader(v_c, batch_size=len(v_df), shuffle=False) # The test loader


In [None]:
# #resampling strategy
# smote=SMOTE(sampling_strategy='minority')
# x_smote,y_smote=smote.fit_resample(t_df[cols_feat],t_df['SepsisLabel'])
# # y_t.value_counts()

# adasyn = ADASYN(sampling_strategy='minority')
# x_adasyn, y_adasyn = adasyn.fit_resample(t_df[cols_feat],t_df['SepsisLabel'])

# blsmote = BorderlineSMOTE(sampling_strategy='minority', kind='borderline-1')
# x_blsmote, y_blsmote = blsmote.fit_resample(t_df[cols_feat],t_df['SepsisLabel'])

# # smote_enn = SMOTEENN()  # commented----too slow
# # x_smote_enn, y_smote_enn = smote_enn.fit_resample(t_df[cols_feat],t_df['SepsisLabel'])

# # smt = SMOTETomek(sampling_strategy='auto')  # commented----too slow
# # x_smt, y_smt = smt.fit_resample(t_df[cols_feat],t_df['SepsisLabel'])

In [None]:
# # search PCA that explains 99% of data
# n_comp_list = [50]
# for n_comp in n_comp_list:
#     pca = PCA(n_components=n_comp)
#     principalComponents = pca.fit_transform(t_df[cols_feat])
#     var = np.sum(pca.explained_variance_ratio_)
#     print('# comp = ' + str(n_comp) + ', explained variance = ' + str(var))

# x_pca_train = pd.DataFrame(data = principalComponents)
# x_pca_val = pd.DataFrame(data = pca.transform(v_df[cols_feat]))

In [None]:
# # pre-process data for NN
# # batch_size = 128 # The number of samples per batch

# # random subset to remove label bias (1 are just 2% of total label, difficult to train)

# # train_df_0 = train_df[train_df['SepsisLabel'] == 0]
# # # train_df_0 = train_df_0.sample(frac=0.2, random_state=1) #0.05
# # train_df_1 = train_df[train_df['SepsisLabel'] == 1]

# # train_df_pruned = pd.concat([train_df_0, train_df_1], axis=0)
# # test_df_pruned = test_df.sort_values(by=['person_id', 'measurement_datetime'])
# test_df_pruned = test_df.copy()

# class MyDataset(torch.utils.data.Dataset):
#     def __init__(self, X, y):
#         self.X = X
#         self.y = y

#     def __len__(self):
#         return len(self.X)

#     def __getitem__(self, idx):
#         return self.X[idx], self.y[idx]

# # prepare data for ML
# cols_feat = list(test_df.columns)

# cols_feat.remove('person_id')
# cols_feat.remove('measurement_datetime')
# # cols_feat.remove('SepsisLabel')
# cols_feat.remove('measurement_datetime_hour')
# # cols_feat.remove('visit_occurrence_id')

# #manually removed features
# col_rem = ['Base excess in Venous blood by calculation', 'Base excess in Arterial blood by calculation', \
#           'Phosphate [Moles/volume] in Serum or Plasma', 'Bilirubin.total [Moles/volume] in Serum or Plasma',\
#           'Potassium [Moles/volume] in Blood', \
#           'Neutrophil Ab [Units/volume] in Serum', 'Bicarbonate [Moles/volume] in Arterial blood', \
#           'Systolic blood pressure', 'Diastolic blood pressure', 'Respiratory rate', 'Heart rate', \
#           'Measurement of oxygen saturation at periphery', 'Oxygen/Gas total [Pure volume fraction] Inhaled gas', \
#           'Capillary refill [Time]']

# for col in col_rem:
#     cols_feat.remove(col)

# # normalize
# # scaler = MinMaxScaler()
# test_df_pruned[cols_feat] = scaler.transform(test_df_pruned[cols_feat])
# # train_df_pruned[cols_feat] = (train_df_pruned[cols_feat] - train_df_pruned[cols_feat].mean()) / train_df_pruned[cols_feat].std()


# # tt_c = MyDataset(torch.tensor(test_df_pruned[cols_feat].values, dtype=torch.float32))

# # t_pt = torch.tensor(t_df[cols_all].values)
# # v_pt = torch.tensor(v_df[cols_all].values)

# # Create the data loaders for batching and shuffling the data
# test_final_loader = torch.utils.data.DataLoader(torch.tensor(test_df_pruned[cols_feat].values, dtype=torch.float32), batch_size=len(test_df_pruned), shuffle=False) # The test loader

# # x_pca_test = pd.DataFrame(data = pca.transform(test_df_pruned[cols_feat]))

In [None]:
# def xgb_train_and_validate(X_train, y_train, X_val, y_val, max_depth=15, subsample=1, \
#                            colsample_bytree=1, scale_pos_weight = 1, booster='gbtree', \
#                           gamma = 0, eta = 0.3, lam = 1, alpha = 0):

#     xgb = XGBClassifier(max_depth=max_depth, subsample = subsample, \
#                         colsample_bytree = colsample_bytree, scale_pos_weight=scale_pos_weight, \
#                        booster = booster, min_split_loss = gamma, eta = eta, reg_lambda = lam, alpha = alpha)
#     xgb.fit(X_train, y_train)
#     y_xgb_pred = xgb.predict(X_val)

#     cm = confusion_matrix(y_val, y_xgb_pred)
#     tn, fp, fn, tp = cm.ravel()
#     # sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
#     # plt.xlabel('Predicted')
#     # plt.ylabel('Actual')
#     # plt.title('Confusion Matrix')
#     # plt.show()

#     fp_tp_ratio = fp/(fp+tp)
#     print('fp/(fp+tp) = ' + str(fp_tp_ratio))

#     return (fp_tp_ratio, xgb)

In [None]:
# # X_train = np.array(t_df[cols_feat].values)
# # y_train = np.array(t_df['SepsisLabel'].values)

# # X_train = np.array(train_df_pruned[cols_feat].values)
# # y_train = np.array(train_df_pruned['SepsisLabel'].values)

# sampling_strategy = 'none'

# if sampling_strategy == 'smote':
#     X_train = np.array(x_smote.values)
#     y_train = np.array(y_smote.values)
# elif sampling_strategy == 'adasyn':
#     X_train = np.array(x_adasyn.values)
#     y_train = np.array(y_adasyn.values)
# elif sampling_strategy == 'blsmote':
#     X_train = np.array(x_blsmote.values)
#     y_train = np.array(y_blsmote.values)
# elif sampling_strategy == 'enn':
#     X_train = np.array(x_smote_enn.values)
#     y_train = np.array(y_smote_enn.values)
# elif sampling_strategy == 'smt':
#     X_train = np.array(x_smt.values)
#     y_train = np.array(y_smt.values)
# elif sampling_strategy == 'pca':
#     X_train = np.array(x_pca_train.values)
#     y_train = np.array(t_df['SepsisLabel'].values)
# else :
#     X_train = np.array(t_df[cols_feat].values)
#     y_train = np.array(t_df['SepsisLabel'].values)


# # classes_weights = class_weight.compute_sample_weight(
# #     class_weight='balanced',
# #     y=y_train
# # )
# if sampling_strategy == 'pca':
#     X_val = np.array(x_pca_val.values)
#     X_test = np.array(x_pca_test.values)
# else:
#     X_val = np.array(v_df[cols_feat].values)
#     X_test = np.array(test_df_pruned[cols_feat].values)
# y_val = np.array(v_df['SepsisLabel'].values)

# # best seeting no PCA
# max_depth_list = [10, 11, 12, 13, 14, 15]# [12]
# subsample_list = [0.5]
# colsample_bytree_list = [0.5]
# scale_pos_weight_list = [1]
# gamma_list = [1] #[0, 1, 5]
# eta_list = [0.1] #[0.1, 0.5, 0.8]
# lam_list = [0.5] #[0, 0.5, 1]
# alp_list = [0] #[0, 0.5, 1]

# # max_depth_list = [16]
# # subsample_list = [1]
# # colsample_bytree_list = [0.5]
# # scale_pos_weight_list = [1]
# # gamma_list = [1] #[0, 1, 5]
# # eta_list = [0.1] #[0.1, 0.5, 0.8]
# # lam_list = [0.5] #[0, 0.5, 1]
# # alp_list = [0] #[0, 0.5, 1]


# data_to_append = []
# i = 0
# for max_depth in max_depth_list:
#     for subsample in subsample_list:
#         for colsample_bytree in colsample_bytree_list:
#             for scale_pos_weight in scale_pos_weight_list:
#                 for gamma in gamma_list:
#                     for eta in eta_list:
#                         for lam in lam_list:
#                             for alp in alp_list:
#                                 fp_tp_ratio, xgb = xgb_train_and_validate(X_train, y_train, X_val, y_val, max_depth, \
#                                             subsample, colsample_bytree, scale_pos_weight, \
#                                             booster='gbtree', gamma=gamma, eta=eta, lam = lam, alpha = alp)
#                                 new_row = {'max_depth': max_depth, 'subsample': subsample, 'colsample_bytree': \
#                                           colsample_bytree, 'scale_pos_weight': scale_pos_weight, \
#                                            'gamma': gamma, 'eta': eta, 'lambda': lam, 'alpha': alp, \
#                                            'fp_tp_ratio':fp_tp_ratio}
#                                 data_to_append.append(new_row)

#                                 # run on test dataset
#                                 # X_test = np.array(test_df_pruned[cols_feat].values)
#                                 pred_labels = xgb.predict_proba(X_test)
#                                 pred_labels = pred_labels[:,1]

#                                 # create pandas dataframe for submission
#                                 # submission = test_df.copy()
#                                 # s = pd.read_csv('/kaggle/input/phems-hackathon-early-sepsis-prediction/testing_data/SepsisLabel_test.csv')
#                                 submission = test_df_pruned.copy()
#                                 submission['person_id'] = submission['person_id'].astype(str)
#                                 submission['measurement_datetime'] = submission['measurement_datetime'].astype(str)
#                                 submission['person_id_datetime'] = submission['person_id'].str.cat(submission['measurement_datetime'], sep='_')
#                                 submission = submission[['person_id_datetime']]
#                                 submission['SepsisLabel'] = pred_labels
#                                 submission.to_csv('A0p3submission_'+str(i) +'.csv', index=False)

#                                 i = i+1

# r_df = pd.DataFrame(data_to_append)
# r_df.to_csv('r_df_4.csv')

In [None]:
# # r_df = r_df.sort_values(by=['fp_tp_ratio'])
# # r_df.to_csv('r_df_3.csv')
# r_df.head(10)

In [None]:
# # catboost grid search
# from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
# from sklearn.metrics import classification_report
# from sklearn.metrics import make_scorer, f1_score, auc

# X_train = np.array(train_df_pruned[cols_feat].values)
# y_train = np.array(train_df_pruned['SepsisLabel'].values)

# param_grid = {
#     'learning_rate': [0.1],
#     'depth':[7,8,9,10,11],
#     'l2_leaf_reg': [2, 3, 4],
#     'boosting_type': ['Ordered', 'Plain']
# }

# # clf = CatBoostClassifier(iterations=100, learning_rate=0.1, depth = 10, \
# #                          loss_function='CrossEntropy', eval_metric='AUC') #LogLoss

# scorer = make_scorer(auc, average='weighted')

# clf = CatBoostClassifier(iterations=30, loss_function='CrossEntropy', eval_metric='AUC') #LogLoss

# # clf_grid_results = clf.grid_search(grid, X_train, y_train, cv=5, verbose=20)

# grid_search = GridSearchCV(clf, param_grid, cv=5, scoring=scorer, n_jobs=-1)
# grid_search.fit(X_train, y_train)
# print("Grid Search - Best Hyperparameters:", grid_search.best_params_)

# # pred_labels = clf_grid_results.predict_proba(X_test)

# # pred_labels = pred_labels[:,1]

# # # create pandas dataframe for submission
# # # submission = test_df.copy()
# # # s = pd.read_csv('/kaggle/input/phems-hackathon-early-sepsis-prediction/testing_data/SepsisLabel_test.csv')
# # submission = test_df_pruned.copy()
# # submission['person_id'] = submission['person_id'].astype(str)
# # submission['measurement_datetime'] = submission['measurement_datetime'].astype(str)
# # submission['person_id_datetime'] = submission['person_id'].str.cat(submission['measurement_datetime'], sep='_')
# # submission = submission[['person_id_datetime']]
# # submission['SepsisLabel'] = pred_labels
# # submission.to_csv('CBA0p3submission_pct-val='+str(pct_val) +'.csv', index=False)

In [None]:
# pred_labels = clf.predict_proba(X_test)

# pred_labels = pred_labels[:,1]

# # create pandas dataframe for submission
# # submission = test_df.copy()
# # s = pd.read_csv('/kaggle/input/phems-hackathon-early-sepsis-prediction/testing_data/SepsisLabel_test.csv')
# submission = test_df_pruned.copy()
# submission['person_id'] = submission['person_id'].astype(str)
# submission['measurement_datetime'] = submission['measurement_datetime'].astype(str)
# submission['person_id_datetime'] = submission['person_id'].str.cat(submission['measurement_datetime'], sep='_')
# submission = submission[['person_id_datetime']]
# submission['SepsisLabel'] = pred_labels
# submission.to_csv('CBA0p3submission_'+str(28) +'.csv', index=False)

In [None]:
# # train with best hyperparameters
# r_df_best = r_df.iloc[[0]]
# max_depth = r_df_best['max_depth'].iloc[0]
# subsample = r_df_best['subsample'].iloc[0]
# colsample_bytree = r_df_best['colsample_bytree'].iloc[0]
# scale_pos_weight = r_df_best['scale_pos_weight'].iloc[0]

# fp_tp_ratio_best, xgb_best = xgb_train_and_validate(X_train, y_train, X_val, y_val, max_depth, \
#                             subsample, colsample_bytree, scale_pos_weight)

In [None]:
# # run on test dataset
# X_test = np.array(test_df_pruned[cols_feat].values)
# pred_labels = xgb_best.predict_proba(X_test)
# pred_labels = pred_labels[:,1]

# # create pandas dataframe for submission
# # submission = test_df.copy()
# # s = pd.read_csv('/kaggle/input/phems-hackathon-early-sepsis-prediction/testing_data/SepsisLabel_test.csv')
# submission = test_df_pruned.copy()
# submission['person_id'] = submission['person_id'].astype(str)
# submission['measurement_datetime'] = submission['measurement_datetime'].astype(str)
# submission['person_id_datetime'] = submission['person_id'].str.cat(submission['measurement_datetime'], sep='_')
# submission = submission[['person_id_datetime']]
# submission['SepsisLabel'] = pred_labels
# submission.to_csv('submission.csv', index=False)

In [None]:
# # Define the hyperparameters
# # rev1
# num_epochs = 3 # The number of times to iterate over the whole dataset
# learning_rate = 2e-3 # The learning rate for the optimizer
# n_L1 = 15
# n_L2 = 5
# n_o = 2
# l2_reg = 1e-4
# derate = 0.01

# # # rev2
# # num_epochs = 10 # The number of times to iterate over the whole dataset
# # learning_rate = 1e-3 # The learning rate for the optimizer
# # n_L1 = 40
# # n_L2 = 15
# # n_o = 2
# # l2_reg = 1e-5
# # derate = 0.02

# class_weights = torch.tensor([derate, 1.0], dtype=torch.float) # to compesnate for bias in label distribution


In [None]:
# # create pytorch object to train NN
# class Net(nn.Module):
#     def __init__(self):
#         super(Net, self).__init__()
#         # The network has two fully connected layers
#         self.fc1 = nn.Linear(len(cols_feat), n_L1) # The first layer takes the flattened image as input and outputs 512 features
#         self.fc2 = nn.Linear(n_L1, n_L2) # The second layer takes the 512 features as input and outputs 10 classes
#         # self.fc3 = nn.Linear(n_L2, n_L3)
#         self.output_layer = nn.Linear(n_L2, n_o)
#         self.sigmoid = nn.Sigmoid()

#     def forward(self, x):
#         # The forward pass of the network
#         # x = x.view(-1, 28*28) # Flatten the image into a vector
#         x = F.relu(self.fc1(x)) # Apply the ReLU activation function to the first layer
#         x = F.relu(self.fc2(x)) # Apply the second layer
#         # x = F.relu(self.fc3(x)) # Apply the second layer
#         x = F.softmax(self.output_layer(x), dim=1)
#         # x = self.sigmoid(x)
#         return x # Return the output logits

In [None]:
# # # Create an instance of the model and move it to the device (CPU or GPU)
# # device = torch.device(&quot;cuda&quot; if torch.cuda.is_available() else &quot;cpu&quot;) # Get the device
# model = Net() # Move the model to the device
# # print(model) # Print the model summary

# # Define the loss function and the optimizer
# criterion = nn.CrossEntropyLoss(weight = class_weights) # The cross entropy loss for multi-class classification
# # criterion = nn.BCELoss(weight = class_weights)
# optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=l2_reg) # The stochastic gradient descent optimizer

# # Define a function to calculate the accuracy of the model
# def accuracy(outputs, labels, plot_confusion=False):
#     # The accuracy is the percentage of correct predictions
#     _, preds = torch.max(outputs, 1) # Get the predicted classes from the output logits
#     o_np = outputs.detach().numpy()
#     # pd.DataFrame(o_np).to_csv('calculated_output.csv')
#     p = o_np[:,1]/(o_np[:,0]+o_np[:,1])
#     # pd.DataFrame(p).to_csv('calculated_probability.csv')
#     if plot_confusion:
#         cm = confusion_matrix(labels.numpy(), np.argmax(o_np,1))
#         tn, fp, fn, tp = cm.ravel()
#         sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
#         plt.xlabel('Predicted')
#         plt.ylabel('Actual')
#         plt.title('Confusion Matrix')
#         plt.show()
#     auc = roc_auc_score(labels.numpy(), p)
#     return tp/(tp+fp) # torch.sum(preds == labels).item() / len(labels)

In [None]:
# # Define the training loop
# def train(model, train_loader, criterion, optimizer, epoch, plot_confusion=False):
#     # Set the model to training mode
#     model.train()
#     # Initialize the running loss and accuracy
#     running_loss = 0.0
#     running_acc = 0.0
#     # Loop over the batches of data
#     for i, (inputs, labels) in enumerate(train_loader):
#         # print(i)
#         # print(labels)
#         # Move the inputs and labels to the device
#         # inputs = inputs
#         # labels = labels
#         # Zero the parameter gradients
#         optimizer.zero_grad()
#         # Forward pass
#         outputs = model(inputs) # Get the output logits from the model
#         # print(outputs)
#         loss = criterion(outputs, labels) # Calculate the loss
#         # print(loss)
#         # Backward pass and optimize
#         loss.backward() # Compute the gradients
#         optimizer.step() # Update the parameters
#         # Print the statistics
#         running_loss += loss.item() # Accumulate the loss
#         try:
#            acc = accuracy(outputs, labels, plot_confusion)
#         except:
#             acc = 0
#         running_acc += acc # Accumulate the accuracy
#         if (i+1 ) % 200 == 0: # Print every 200 batches
#             print(f'Epoch {epoch}, Batch {i + 1}, Loss: {running_loss / 200:.4f}, AUC: {running_acc / 200:.4f}')
#             running_loss = 0.0
#             running_acc = 0.0

# # Define the test loop
# def test(model, test_loader, criterion, plot_confusion=True):
#     # Set the model to evaluation mode
#     model.eval()
#     # Initialize the loss and accuracy
#     test_loss = 0.0
#     test_acc = 0.0
#     # Loop over the batches of data
#     with torch.no_grad(): # No need to track the gradients
#         for inputs, labels in test_loader:
#             # Move the inputs and labels to the device
#             # inputs = inputs.to(device)
#             # labels = labels.to(device)
#             # Forward pass
#             outputs = model(inputs) # Get the output logits from the model
#             loss = criterion(outputs, labels) # Calculate the loss
#             # Print the statistics
#             try:
#                acc = accuracy(outputs, labels, plot_confusion)
#             except:
#                 acc = 0
#             test_loss += loss.item() # Accumulate the loss
#             test_acc += acc # Accumulate the accuracy
#     # Print the average loss and accuracy
#     print(f'Test Loss: {test_loss / len(test_loader):.4f}, Test Accuracy: {test_acc / len(test_loader):.4f}')
#     # pd.DataFrame(np.argmax(outputs.detach().numpy(), 1)).to_csv('output_val.csv')
#     # pd.DataFrame(labels.detach().numpy()).to_csv('label_val.csv')
# for epoch in range(1, num_epochs + 1):
#     # print(epoch)
#     train(model, train_loader, criterion, optimizer, epoch) # Train the model
#     test(model, test_loader, criterion, plot_confusion=True) # Test the model

In [None]:
# from sklearn.ensemble import AdaBoostClassifier
# ada = AdaBoostClassifier(n_estimators=50, learning_rate=1, random_state=42)

# ada.fit(X_train, y_train)

# # Make predictions on the test data
# y_xgb_pred = ada.predict(X_val)

# cm = confusion_matrix(y_val, y_xgb_pred)
# tn, fp, fn, tp = cm.ravel()
# sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
# plt.xlabel('Predicted')
# plt.ylabel('Actual')
# plt.title('Confusion Matrix')
# plt.show()

In [None]:
# X_test = np.array(test_df_pruned[cols_feat].values)
# pred_labels = xgb.predict_proba(X_test)
# pred_labels = pred_labels[:,1]

# # create pandas dataframe for submission
# # submission = test_df.copy()
# # s = pd.read_csv('/kaggle/input/phems-hackathon-early-sepsis-prediction/testing_data/SepsisLabel_test.csv')
# submission = test_df_pruned.copy()
# submission['person_id'] = submission['person_id'].astype(str)
# submission['measurement_datetime'] = submission['measurement_datetime'].astype(str)
# submission['person_id_datetime'] = submission['person_id'].str.cat(submission['measurement_datetime'], sep='_')
# submission = submission[['person_id_datetime']]
# submission['SepsisLabel'] = pred_labels
# submission.to_csv('submission.csv', index=False)

In [None]:
# test_df.tail(50)

In [None]:
# # evluate model on test dataset

# def test_final(model, test_loader, criterion):
#     # Set the model to evaluation mode
#     model.eval()

#     # Loop over the batches of data
#     with torch.no_grad(): # No need to track the gradients
#         for inputs in test_loader:
#             # Move the inputs and labels to the device
#             # inputs = inputs.to(device)
#             # labels = labels.to(device)
#             # Forward pass
#             outputs = model(inputs) # Get the output logits from the model
#             # loss = criterion(outputs, labels) # Calculate the loss
#             # Print the statistics
#             # try:
#             #    acc = accuracy(outputs, labels)
#             # except:
#             #     acc = 0
#             # test_loss += loss.item() # Accumulate the loss
#             # test_acc += acc # Accumulate the accuracy
#     # Print the average loss and accuracy
#     # print(f'Test Loss: {test_loss / len(test_loader):.4f}, Test Accuracy: {test_acc / len(test_loader):.4f}')
#     # print(inputs)
#     o_np = outputs.detach().numpy()
#     p = o_np[:,1]/(o_np[:,0]+o_np[:,1])
#     return p

# pred_labels = test_final(model, test_final_loader, criterion) # Test the model

In [None]:
# # create pandas dataframe for submission
# # submission = test_df.copy()
# # s = pd.read_csv('/kaggle/input/phems-hackathon-early-sepsis-prediction/testing_data/SepsisLabel_test.csv')
# submission = test_df_pruned.copy()
# submission['person_id'] = submission['person_id'].astype(str)
# submission['measurement_datetime'] = submission['measurement_datetime'].astype(str)
# submission['person_id_datetime'] = submission['person_id'].str.cat(submission['measurement_datetime'], sep='_')
# submission = submission[['person_id_datetime']]
# submission['SepsisLabel'] = pred_labels
# submission.to_csv('submission.csv', index=False)

In [None]:
# shift by 6 hours and update table
# submission_2 = test_df_pruned.copy()
# # submission['person_id'] = submission['person_id'].astype(str)
# # submission['measurement_datetime'] = submission['measurement_datetime'].astype(str)
# # submission['person_id_datetime'] = submission['person_id'].str.cat(submission['measurement_datetime'], sep='_')
# # submission = submission[['person_id_datetime']]
# submission_2['measurement_datetime_shifted'] = submission_2['measurement_datetime'] + pd.to_timedelta('6 hours')
# submission_2['SepsisLabel'] = pred_labels

# submission_2_shifted = submission_2[['person_id','measurement_datetime_shifted', 'SepsisLabel']].copy()
# submission_2 = submission_2[['person_id', 'measurement_datetime']]

# submission_2 = submission_2.merge(submission_2_shifted, left_on=['person_id','measurement_datetime'], \
#                            right_on=['person_id','measurement_datetime_shifted'], how='left')

# # submission_2.head()
# submission_2['person_id'] = submission_2['person_id'].astype(str)
# submission_2['measurement_datetime'] = submission_2['measurement_datetime'].astype(str)
# submission_2['person_id_datetime'] = submission_2['person_id'].str.cat(submission_2['measurement_datetime'], sep='_')
# submission_2 = submission_2[['person_id_datetime', 'SepsisLabel']]
# submission_2['SepsisLabel'] = submission_2['SepsisLabel'].fillna(0)
# submission_2.to_csv('submission.csv', index=False)
# submission_2.sort_values(by=['person_id_datetime']).head()

# # submission_2.head()

In [None]:
# list(test_df_pruned.columns)