In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from catboost import CatBoostClassifier
import time

from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import StratifiedKFold

import wandb
from wandb.lightgbm import wandb_callback

pd.set_option("display.max_columns", 181)
pd.set_option("display.min_rows", 200)

In [8]:
data_dictionary = pd.read_csv("DataDictionaryWiDS2021.csv")
unlabeled = pd.read_csv("UnlabeledWiDS2021.csv")
training = pd.read_csv("TrainingWiDS2021.csv")

In [9]:
column_datatype_mapping = dict(zip(data_dictionary['Variable Name'], data_dictionary['Data Type']))

del training['Unnamed: 0']
del unlabeled['Unnamed: 0']

In [10]:
all_data = training.append(unlabeled).drop(['encounter_id', 
                                            'hospital_id', 
                                            'diabetes_mellitus'], axis=1)

all_data.shape

(140391, 177)

In [11]:
cat_cols = ['ethnicity', 'gender', 'hospital_admit_source',
           'icu_admit_source', 'icu_stay_type', 'icu_type',
           'apache_2_diagnosis', 'apache_3j_diagnosis','icu_id']


binary_cols = data_dictionary.loc[data_dictionary['Data Type'] == 'binary','Variable Name'].values
num_cols = data_dictionary.loc[data_dictionary['Data Type'] == 'numeric','Variable Name'].values

    

for col in all_data.columns:
    if col in cat_cols:
        print(col)
        all_data[col] = LabelEncoder().fit_transform(all_data[col].astype('str'))
        all_data[col]= all_data[col].astype('category')        

ethnicity
gender
hospital_admit_source
icu_admit_source
icu_id
icu_stay_type
icu_type
apache_2_diagnosis
apache_3j_diagnosis


In [12]:
df_train = all_data[:len(training)]
df_pred = all_data[len(training):].reset_index(drop=True)
Y = training['diabetes_mellitus']

In [13]:
X_train, X_val, y_train, y_val = train_test_split(df_train, Y, test_size=0.20, random_state=42,shuffle=True )

In [None]:
cat_features = ['ethnicity', 'gender', 'hospital_admit_source',
           'icu_admit_source', 'icu_stay_type', 'icu_type',
           'apache_2_diagnosis', 'apache_3j_diagnosis','icu_id']

# Initialize CatBoostClassifier
model = CatBoostClassifier(iterations=1000,
                           learning_rate=0.1,
                           depth=6,
                           eval_metric='AUC')

# Fit model
model.fit(df_train, Y, cat_features)

# Get predicted classes
preds_class = model.predict(X_val)

In [56]:
preds_proba = model.predict_proba(X_val)
metrics.roc_auc_score(y_val, preds_proba[:,1])

0.9208794027336712

In [57]:
submittion = pd.DataFrame([unlabeled.encounter_id,model.predict_proba(df_pred)[:,1]]).T
submittion.encounter_id = submittion.encounter_id.astype('int32')
submittion.set_index('encounter_id',inplace=True)
submittion.columns = ['diabetes_mellitus']
submittion.to_csv('submissions/SolutionWiDS2021_CatBoost.csv')

### CV with random search

In [20]:
# A parameter grid for CatBoost
params = {
        'depth': [4, 5, 6],
        'learning_rate': [0.01, 0.1, 0.2, 0.4],
        'l2_leaf_reg': [1, 3, 5]
        }

In [21]:
model = CatBoostClassifier(iterations=250)

folds = 4
param_comb = 250

skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 1001)

random_search = RandomizedSearchCV(model, param_distributions=params, n_iter=param_comb, scoring='roc_auc', 
                                   n_jobs=32, cv=skf.split(df_train.values, Y), verbose=3, 
                                   random_state=1001)

In [22]:
# Here we go
start=time.time()
random_search.fit(df_train.values, Y)
print(time.time()-start)

Fitting 4 folds for each of 36 candidates, totalling 144 fits


[Parallel(n_jobs=32)]: Using backend LokyBackend with 32 concurrent workers.


TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.

The exit codes of the workers are {SIGKILL(-9)}