In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from catboost import CatBoostClassifier, Pool
import time

from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import StratifiedKFold

import wandb
from wandb.lightgbm import wandb_callback

pd.set_option("display.max_columns", 181)
pd.set_option("display.min_rows", 200)

In [2]:
def reduce_mem_usage(df: pd.DataFrame,
                     verbose: bool = True) -> pd.DataFrame:
    
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtypes

        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()

            if str(col_type)[:3] == 'int':

                if (c_min > np.iinfo(np.int32).min
                      and c_max < np.iinfo(np.int32).max):
                    df[col] = df[col].astype(np.int32)
                elif (c_min > np.iinfo(np.int64).min
                      and c_max < np.iinfo(np.int64).max):
                    df[col] = df[col].astype(np.int64)
            else:
                if (c_min > np.finfo(np.float16).min
                        and c_max < np.finfo(np.float16).max):
                    df[col] = df[col].astype(np.float16)
                elif (c_min > np.finfo(np.float32).min
                      and c_max < np.finfo(np.float32).max):
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    reduction = (start_mem - end_mem) / start_mem

    msg = f'Mem. usage decreased to {end_mem:5.2f} MB ({reduction * 100:.1f} % reduction)'
    if verbose:
        print(msg)

    return df

In [3]:
y = pd.read_csv('TrainingWiDS2021.csv', index_col=[0])
y = y.diabetes_mellitus.values

unlabeled = pd.read_csv('UnlabeledWiDS2021.csv', index_col=[0])
unlabeled = unlabeled.encounter_id.values

In [4]:
train = pd.read_csv('2020features_train.csv')
test = pd.read_csv('2020features_test.csv')

In [5]:
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)
#gc.collect()
train.shape, test.shape

Mem. usage decreased to 369.65 MB (67.1 % reduction)
Mem. usage decreased to 29.08 MB (67.0 % reduction)


NameError: name 'gc' is not defined

In [6]:
del train['Unnamed: 0']
del test['Unnamed: 0']

In [7]:
X_train, X_val, y_train, y_val = train_test_split(train, y, test_size=0.20, random_state=42,shuffle=True )

In [None]:
cat_features = ['elective_surgery', 'icu_id', 'arf_apache', 'intubated_apache', 'ventilated_apache', 'cirrhosis','hepatic_failure', 'immunosuppression', 'leukemia', 'solid_tumor_with_metastasis', 'apache_3j_diagnosis_x','apache_2_diagnosis_x', 'apache_3j', 'apache_3j_diagnosis_split1', 'apache_2_diagnosis_split1', 'gcs_sum_type','hospital_admit_source', 'glucose_rate', 'glucose_wb', 'gcs_eyes_apache', 'glucose_normal', 'total_cancer_immuno','gender', 'total_chronic', 'icu_stay_type', 'apache_2_diagnosis_type', 'apache_3j_diagnosis_type']


#{'depth': 7, 'l2_leaf_reg': 16, 'learning_rate': 0.05}
#{'depth': 9, 'l2_leaf_reg': 14, 'learning_rate': 0.05}

# Initialize CatBoostClassifier
model = CatBoostClassifier(iterations=2500,
                           learning_rate=0.05,
                           depth=9,
                           l2_leaf_reg=14,
                           eval_metric='AUC',
                           od_type='IncToDec',
                           od_pval=.05,
                          od_wait=15)


eval_dataset = Pool(X_val,y_val,cat_features=cat_features)

# Fit model
#model.fit(X_train, y_train, cat_features,
#         eval_set=eval_dataset)

# Fit model
model.fit(train,y, cat_features,
         eval_set=eval_dataset)

# Get predicted classes
preds_class = model.predict(X_val)

In [9]:
preds_proba = model.predict_proba(X_val)
metrics.roc_auc_score(y_val, preds_proba[:,1])

0.9873081461322909

In [12]:
y

array([1, 1, 0, ..., 1, 0, 0])

In [13]:
train_for_ensemble = pd.DataFrame(zip(y,model.predict_proba(train)[:,1]), columns=['diabetes_mellitus','pred'])
train_for_ensemble.to_csv('ensemble/TRAIN_CatBoost_new_features_rand_search_2500_epochs.csv')

submittion = pd.DataFrame([unlabeled,model.predict_proba(test)[:,1]]).T
submittion.columns = ['encounter_id', 'diabetes_mellitus']
submittion.encounter_id = submittion.encounter_id.astype('int32')
submittion.set_index('encounter_id',inplace=True)
submittion.to_csv('ensemble/SOLUTION_CatBoost_new_features_rand_search_2500_epochs.csv')

In [22]:
submittion = pd.DataFrame([unlabeled,model.predict_proba(test)[:,1]]).T
submittion.columns = ['encounter_id', 'diabetes_mellitus']
submittion.encounter_id = submittion.encounter_id.astype('int32')
submittion.set_index('encounter_id',inplace=True)
submittion.to_csv('submissions/SolutionWiDS2021_CatBoost_new_features_rand_search_2500_epochs.csv')

### CV with random search

In [None]:
import time
start = time.time()

from catboost import CatBoost

cats = ['elective_surgery', 'icu_id', 'arf_apache', 'intubated_apache', 'ventilated_apache', 'cirrhosis','hepatic_failure', 'immunosuppression', 'leukemia', 'solid_tumor_with_metastasis', 'apache_3j_diagnosis_x','apache_2_diagnosis_x', 'apache_3j', 'apache_3j_diagnosis_split1', 'apache_2_diagnosis_split1', 'gcs_sum_type','hospital_admit_source', 'glucose_rate', 'glucose_wb', 'gcs_eyes_apache', 'glucose_normal', 'total_cancer_immuno','gender', 'total_chronic', 'icu_stay_type', 'apache_2_diagnosis_type', 'apache_3j_diagnosis_type']

model = CatBoostClassifier(cat_features=cats)

grid = {'learning_rate': [0.01, 0.025, 0.05, 0.075, 0.1, 0.125, 0.2],
        'depth': [4, 5, 6, 7, 8, 9],
        'l2_leaf_reg': [10, 12, 14, 16, 17, 18]}

randomized_search_result = model.randomized_search(grid, n_iter=250,
                                               X=X_train,
                                               y=y_train,
                                               partition_random_seed=0,
                                               plot=True)

#grid = {'learning_rate': [0.01, 0.5, 0.1, 0.2],'depth': [4, 6, 8, 10],'l2_leaf_reg': [5, 7, 9, 11, 13]}
#randomized_search_result = model.randomized_search(grid,X=X_train,y=y_train,n_iter=250,partition_random_seed=0,plot=True)

print('Time taken:', time.time() - start)
print(randomized_search_result['params'])