In [7]:
import pandas as pd 
import numpy as np 
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split, KFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, f1_score, accuracy_score, log_loss, roc_auc_score
from lightgbm import LGBMClassifier, early_stopping, log_evaluation
from pprint import pprint

import warnings

warnings.filterwarnings(
    "ignore",
    message="X does not have valid feature names, but LGBMClassifier was fitted with feature names",
    category=UserWarning,
)

In [8]:
data = pd.read_csv("Data/insurance.csv")

data["target"] = np.select([
    data["charges"] <= 5000,
    (data["charges"] > 5000) & (data["charges"] <= 15000),
    data["charges"] > 15000],
    [0, 1, 2]
)

print(data["target"].value_counts())
data.head(3)

target
1    621
0    359
2    358
Name: count, dtype: int64


Unnamed: 0,age,sex,bmi,children,smoker,region,charges,target
0,19,female,27.9,0,yes,southwest,16884.924,2
1,18,male,33.77,1,no,southeast,1725.5523,0
2,28,male,33.0,3,no,southeast,4449.462,0


In [9]:
X = data.drop(columns = ["charges", "target"])
y = np.asarray(data["target"])

pre = ColumnTransformer([
    ("num", Pipeline([
        ("imp", SimpleImputer(strategy = "median")),
        ("scaler", StandardScaler())
    ]), X.select_dtypes(include = "number").columns),

    ("cat", Pipeline([
        ("imp", SimpleImputer(strategy = "most_frequent")),
        ("ohe", OneHotEncoder(handle_unknown = "ignore", sparse_output = False))
    ]), X.select_dtypes(exclude = "number").columns)
])

X_train_full_raw, X_test_raw, y_train_full, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)

X_train_full = pre.fit_transform(X_train_full_raw)
X_test = pre.transform(X_test_raw)

In [12]:
cv = StratifiedKFold(n_splits = 5, random_state = 42, shuffle = True)
learning_rates = [0.01, 0.05, 0.1, 0.5]

lr_results = {}

for lr in learning_rates:
    print(f"Learning Rate {lr}")
    print()

    best_iteration = []
    fold_accuracy = []
    fold_logloss = []

    for fold_idx, (train_idx, val_idx) in enumerate(cv.split(X_train_full, y_train_full)):
        X_train = X_train_full[train_idx]
        y_train = y_train_full[train_idx]
        X_val = X_train_full[val_idx]
        y_val = y_train_full[val_idx]

        model = LGBMClassifier(
            objective = "multiclass",
            num_class = 3,
            n_estimators = 1000,
            learning_rate = lr,
            num_leaves = 31,
            subsample = 0.8,
            colsample_bytree = 0.8,
            random_state = 42,
            n_jobs = -1,
            verbosity = -1
        )

        model.fit(
            X_train, y_train,
            eval_set = [(X_val, y_val)],
            eval_metric = "multi_logloss",
            callbacks = [
                early_stopping(stopping_rounds = 50, verbose= False),
                log_evaluation(period = 0)
            ]
        )
        best_iter = model.best_iteration_
        best_iteration.append(best_iter)

        X_val_probs = model.predict_proba(X_val, num_iteration = best_iter)  #find out about [:, 1]
        X_val_preds = model.predict(X_val, num_iteration= best_iter)

        accuracy = accuracy_score(y_val, X_val_preds)
        fold_accuracy.append(round(accuracy, 2))

        logloss = log_loss(y_val, X_val_probs)
        fold_logloss.append(round(logloss,2))


    lr_results[lr] = {
        "Best Iterations" : best_iteration,
        "Fold Accuracys" : fold_accuracy,
        "Average Accuracy" : round(float(np.mean(fold_accuracy)), 2),
        "Fold Loglosses" : fold_logloss,
        "Average Logloss" : round(float(np.mean(fold_logloss)), 2)
    }

    print(f"Results for Learning Rate {lr}")
    print()

    pprint(lr_results[lr])
    print()
    print()

Learning Rate 0.01

Results for Learning Rate 0.01

{'Average Accuracy': 0.9,
 'Average Logloss': 0.36,
 'Best Iterations': [408, 407, 298, 366, 288],
 'Fold Accuracys': [0.91, 0.88, 0.87, 0.91, 0.91],
 'Fold Loglosses': [0.36, 0.37, 0.42, 0.32, 0.34]}


Learning Rate 0.05

Results for Learning Rate 0.05

{'Average Accuracy': 0.89,
 'Average Logloss': 0.37,
 'Best Iterations': [71, 81, 60, 64, 65],
 'Fold Accuracys': [0.91, 0.88, 0.86, 0.9, 0.91],
 'Fold Loglosses': [0.36, 0.37, 0.42, 0.33, 0.35]}


Learning Rate 0.1

Results for Learning Rate 0.1

{'Average Accuracy': 0.89,
 'Average Logloss': 0.37,
 'Best Iterations': [33, 53, 30, 43, 33],
 'Fold Accuracys': [0.9, 0.87, 0.87, 0.9, 0.92],
 'Fold Loglosses': [0.36, 0.38, 0.42, 0.33, 0.35]}


Learning Rate 0.5

Results for Learning Rate 0.5

{'Average Accuracy': 0.89,
 'Average Logloss': 0.38,
 'Best Iterations': [6, 4, 9, 6, 6],
 'Fold Accuracys': [0.9, 0.88, 0.87, 0.91, 0.9],
 'Fold Loglosses': [0.36, 0.41, 0.42, 0.34, 0.35]}




### LightGBM Classifier with 5 Fold Cross Validation & Grid Search

In [13]:
pre = ColumnTransformer([
    ("num", Pipeline([
        ("imp", SimpleImputer(strategy = "median")),
        ("scaler", StandardScaler())
    ]), X.select_dtypes(include = "number").columns),

    ("cat", Pipeline([
        ("imp", SimpleImputer(strategy = "most_frequent")),
        ("ohe", OneHotEncoder(handle_unknown= "ignore", sparse_output=False))
    ]), X.select_dtypes(exclude = "number").columns)
])


classifier = Pipeline([
    ("pre", pre),
    ("model", LGBMClassifier(
        objective = "multiclass",
        random_state = 42,
        n_jobs = -1,
        verbosity = -1,
        max_depth = -1
    ))
])

param_grid = {
    "model__n_estimators" : [300, 500],
    "model__learning_rate" : [0.01, 0.1],
    "model__feature_fraction" : [0.8],
    "model__num_leaves" : [31],
    "model__bagging_fraction" : [0.8],
    "model__bagging_freq" : [5]

}

In [14]:
cv = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42)

grid = GridSearchCV(
    estimator = classifier,
    param_grid = param_grid,
    cv = cv,
    scoring = {
        "accuracy" : "accuracy",
    },
    refit = "accuracy",
    n_jobs = -1,
    verbose = 1
)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)

grid.fit(X_train, y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


0,1,2
,estimator,Pipeline(step...rbosity=-1))])
,param_grid,"{'model__bagging_fraction': [0.8], 'model__bagging_freq': [5], 'model__feature_fraction': [0.8], 'model__learning_rate': [0.01, 0.1], ...}"
,scoring,{'accuracy': 'accuracy'}
,n_jobs,-1
,refit,'accuracy'
,cv,StratifiedKFo... shuffle=True)
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.01
,n_estimators,500
,subsample_for_bin,200000
,objective,'multiclass'
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [15]:
grid.best_params_

{'model__bagging_fraction': 0.8,
 'model__bagging_freq': 5,
 'model__feature_fraction': 0.8,
 'model__learning_rate': 0.01,
 'model__n_estimators': 500,
 'model__num_leaves': 31}