In [1]:
!pip install lightgbm -q

[0m

In [2]:
import os
import pickle
import numpy as np
import pandas as pd
import random as r
import seaborn as sns
import lightgbm as lgb
import matplotlib.pyplot as plt

from time import time
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, GridSearchCV

from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score, classification_report

In [3]:
class CFG(object):
    def __init__(self,
         seed: int = 42,
         n_splits: int = 5,
         show_info: bool = False,
         ):

        self.seed = seed
        self.n_splits = n_splits
        self.show_info = show_info
        self.tr_data = "../input/heart-disease-prediction-tfug-chd-oct-2022/train.csv"
        self.ss_data = "../input/heart-disease-prediction-tfug-chd-oct-2022/sample_submission.csv"
        self.ts_data = "../input/heart-disease-prediction-tfug-chd-oct-2022/test.csv"
        self.model_save_path = "models"
        if not os.path.exists(self.model_save_path): os.makedirs(self.model_save_path)

cfg = CFG(seed=42, show_info=True)

In [4]:
def breaker(num: int=50, char: str="*") -> None:
    print("\n" + num*char + "\n")
    

def print_scores(accuracy: float, auc: float, precision: np.ndarray, recall: np.ndarray, f_score: np.ndarray) -> None:
    print(f"Accuracy  : {accuracy:.5f}")
    print(f"ROC-AUC   : {auc:.5f}")
    print(f"Precision : {precision}")
    print(f"Recall    : {recall}")
    print(f"F-Score   : {f_score}")
    

def get_scores(y_true: np.ndarray, y_pred: np.ndarray) -> tuple:
    accuracy = accuracy_score(y_pred, y_true)
    auc = roc_auc_score(y_pred, y_true)
    precision, recall, f_score, _ = precision_recall_fscore_support(y_pred, y_true)

    return accuracy, auc, precision, recall, f_score

In [5]:
tr_df = pd.read_csv(cfg.tr_data)
tr_df = tr_df.drop(columns=["index"])

X = tr_df.iloc[:, :-1].copy().values
y = tr_df.iloc[:, -1].copy().values

sc_X = StandardScaler()
X = sc_X.fit_transform(X)

if cfg.show_info:
    breaker()
    for val in set(y):
        print(f"Class {val} count : {y[y == val].shape[0]}")
breaker()

best_auc: float = 0.0
fold: int = 1
    
for tr_idx, va_idx in KFold(n_splits=cfg.n_splits, random_state=cfg.seed, shuffle=True).split(X):
    X_train, X_valid, y_train, y_valid = X[tr_idx], X[va_idx], y[tr_idx], y[va_idx]
    model = lgb.LGBMClassifier(random_state=cfg.seed).fit(X_train, y_train)

    y_pred = model.predict(X_valid)
    if cfg.show_info:
        acc, auc, pre, rec, f1 = get_scores(y_valid, y_pred)
        print(f"Fold {fold}\n")
        print_scores(acc, auc, pre, rec, f1)
        print("")
    else:
        _, auc, _, _, _ = get_scores(y_valid, y_pred)

    if auc > best_auc:
        best_auc = auc
        best_fold = f"{fold}"

        with open(os.path.join(cfg.model_save_path, f"best_model.pkl"), "wb") as fp: pickle.dump(model, fp)

    with open(os.path.join(cfg.model_save_path, f"Fold_{fold}.pkl"), "wb") as fp: pickle.dump(model, fp)

    fold += 1

if cfg.show_info: 
    breaker()
    print(f"Best Fold : {best_fold}")

breaker()


**************************************************

Class 0 count : 386
Class 1 count : 454

**************************************************

Fold 1

Accuracy  : 0.52381
ROC-AUC   : 0.54149
Precision : [0.37362637 0.7012987 ]
Recall    : [0.59649123 0.48648649]
F-Score   : [0.45945946 0.57446809]

Fold 2

Accuracy  : 0.51190
ROC-AUC   : 0.51084
Precision : [0.50724638 0.51515152]
Recall    : [0.42168675 0.6       ]
F-Score   : [0.46052632 0.55434783]

Fold 3

Accuracy  : 0.53571
ROC-AUC   : 0.52761
Precision : [0.46575342 0.58947368]
Recall    : [0.46575342 0.58947368]
F-Score   : [0.46575342 0.58947368]

Fold 4

Accuracy  : 0.50595
ROC-AUC   : 0.50174
Precision : [0.43037975 0.57303371]
Recall    : [0.47222222 0.53125   ]
F-Score   : [0.45033113 0.55135135]

Fold 5

Accuracy  : 0.48810
ROC-AUC   : 0.48810
Precision : [0.48648649 0.4893617 ]
Recall    : [0.42857143 0.54761905]
F-Score   : [0.4556962  0.51685393]


**************************************************

Best Fold : 1

*

In [6]:
ts_df = pd.read_csv(cfg.ts_data)
ts_df = ts_df.drop(columns=["index"])
X_test = ts_df.copy().values

X_test = sc_X.transform(X_test)
y_pred = model.predict_proba(X_test)[:, 1]

ss_df = pd.read_csv(cfg.ss_data)
ss_df["target"] = y_pred
ss_df.to_csv("submission.csv", index=False)