In [1]:
import joblib
import pandas as pd
from sklearn.ensemble import  VotingClassifier
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

import warnings
warnings.simplefilter(action="ignore")


In [2]:
def grab_col_names(dataframe, cat_th=10, car_th=20):
    """
     It gives the names of categorical, numerical and categorical but cardinal variables in the data set.
     Note: Categorical variables with numerical appearance are also included.

     parameters
     ------
         dataframe: dataframe
                 Dataframe from which variable names are to be taken
         cat_th: int, optional
                 Class threshold value for variables that are numeric but categorical
         car_th: int, optional
                 class threshold for categorical but cardinal variables

     returns
     ------
         cat_cols: list
                 Categorical variable list
         num_cols: list
                 Numerical variable list
         cat_but_car: list
                 List of cardinal variables with categorical view

     examples
     ------
         import seaborn as sns
         df = sns.load_dataset("iris")
         print(grab_col_names(df))


     Notes
     ------
         cat_cols + num_cols + cat_but_car = total number of variables
         num_but_cat is inside cat_cols.

    """
    # cat_cols, cat_but_car
    cat_cols = [col for col in dataframe.columns if str(dataframe[col].dtypes) in ["category", "object", "bool"]]

    num_but_cat = [col for col in dataframe.columns if
                   dataframe[col].nunique() < 10 and dataframe[col].dtypes in ["int64", "float64", "int32", "float32"]]

    cat_but_car = [col for col in dataframe.columns if
                   dataframe[col].nunique() > 20 and str(dataframe[col].dtypes) in ["category", "object"]]

    cat_cols = cat_cols + num_but_cat
    cat_cols = [col for col in cat_cols if col not in cat_but_car]

    # num_cols
    num_cols = [col for col in dataframe.columns if dataframe[col].dtypes in ["int64", "float64", "int32", "float32"]]
    num_cols = [col for col in num_cols if col not in cat_cols]

    print(f"Observations: {dataframe.shape[0]}\nVariables: {dataframe.shape[1]}\ncat_cols: {len(cat_cols)}\n"
          f"num_cols: {len(num_cols)}\ncat_but_car: {len(cat_but_car)}\nnum_but_cat: {len(num_but_cat)}")

    return cat_cols, num_cols, cat_but_car

def outlier_thresholds(dataframe, col_name, q1=0.25, q3=0.75):
    quartile1 = dataframe[col_name].quantile(q1)
    quartile3 = dataframe[col_name].quantile(q3)
    interquantile_range = quartile3 - quartile1
    up_limit = quartile3 + 1.5 * interquantile_range
    low_limit = quartile1 - 1.5 * interquantile_range
    return low_limit, up_limit

def replace_with_thresholds(dataframe, variable, q1=0.05, q3=0.95):
    low_limit, up_limit = outlier_thresholds(dataframe, variable, q1=0.05, q3=0.95)
    dataframe.loc[(dataframe[variable] < low_limit), variable] = low_limit
    dataframe.loc[(dataframe[variable] > up_limit), variable] = up_limit
    
def one_hot_encoder(dataframe, categorical_cols, drop_first=False):
    dataframe = pd.get_dummies(dataframe, columns=categorical_cols, drop_first=drop_first)
    return dataframe

def diabetes_data_prep(dataframe):
    dataframe.columns = [col.upper() for col in dataframe.columns]

    # Creation of new variables.
    
    # Age : Categorizing the age variable and creating a new age variable.
    dataframe.loc[(dataframe['AGE'] < 35), "NEW_AGE_CAT"] = 'young'
    dataframe.loc[(dataframe['AGE'] >= 35) & (dataframe['AGE'] <= 55), "NEW_AGE_CAT"] = 'middleage'
    dataframe.loc[(dataframe['AGE'] > 55), "NEW_AGE_CAT"] = 'old'
    
    # Glucose : Convert glucose value to a categorical variable
    dataframe['NEW_GLUCOSE_CAT'] = pd.cut(x=dataframe['GLUCOSE'], bins=[-1, 139, 200, 300], labels=["normal", "prediabetes", "diabetes"])
    
    # BMI below 18.5 is underweight, between 18.5 and 24.9 is healthy, between 24.9 and 29.9 is overweight and above 30 is obese
    dataframe['NEW_BMI_RANGE'] = pd.cut(x=dataframe['BMI'], bins=[-1, 18.5, 24.9, 29.9, 100],
                                 labels=["underweight", "healthy", "overweight", "obese"])
    
    # BloodPressure
    dataframe['NEW_BLOODPRESSURE'] = pd.cut(x=dataframe['BLOODPRESSURE'], bins=[-1, 79, 89, 123], labels=["normal", "hs1", "hs2"])

    # # Yaş ve beden kitle indeksini bir arada düşünerek kategorik değişken oluşturma
    dataframe.loc[(dataframe['AGE'] < 35) & (dataframe['BMI'] < 18.5), "NEW_AGE_BMI_CAT"] = 'young_underweight'
    dataframe.loc[(dataframe['AGE'] < 35) & (dataframe['BMI'] >= 18.5) & (dataframe['BMI'] < 24.9), "NEW_AGE_BMI_CAT"] = 'young_healthy'
    dataframe.loc[(dataframe['AGE'] < 35) & (dataframe['BMI'] >= 24.9) & (dataframe['BMI'] < 29.9), "NEW_AGE_BMI_CAT"] = 'young_overweight'
    dataframe.loc[(dataframe['AGE'] < 35) & (dataframe['BMI'] >= 29.9), "NEW_AGE_BMI_CAT"] = 'young_obese'
    
    dataframe.loc[(dataframe['AGE'] >= 35) & (dataframe['AGE'] <= 55) & (dataframe['BMI'] < 18.5), "NEW_AGE_BMI_CAT"] = 'middleage_underweight'
    dataframe.loc[(dataframe['AGE'] >= 35) & (dataframe['AGE'] <= 55) & (dataframe['BMI'] >= 18.5) & (dataframe['BMI'] < 24.9), "NEW_AGE_BMI_CAT"] = 'middleage_healthy'
    dataframe.loc[(dataframe['AGE'] >= 35) & (dataframe['AGE'] <= 55) & (dataframe['BMI'] >= 24.9) & (dataframe['BMI'] < 29.9), "NEW_AGE_BMI_CAT"] = 'middleage_overweight'
    dataframe.loc[(dataframe['AGE'] >= 35) & (dataframe['AGE'] <= 55) & (dataframe['BMI'] >= 29.9), "NEW_AGE_BMI_CAT"] = 'middleage_obese'
    
    dataframe.loc[(dataframe['AGE'] > 55) & (dataframe['BMI'] < 18.5), "NEW_AGE_BMI_CAT"] = 'old_underweight'
    dataframe.loc[(dataframe['AGE'] > 55) & (dataframe['BMI'] >= 18.5) & (dataframe['BMI'] < 24.9), "NEW_AGE_BMI_CAT"] = 'old_healthy'
    dataframe.loc[(dataframe['AGE'] > 55) & (dataframe['BMI'] >= 24.9) & (dataframe['BMI'] < 29.9), "NEW_AGE_BMI_CAT"] = 'old_overweight'
    dataframe.loc[(dataframe['AGE'] > 55) & (dataframe['BMI'] >= 29.9), "NEW_AGE_BMI_CAT"] = 'old_obese'
    
    # Yaş ve Glikoz değerlerini bir arada düşünerek kategorik değişken oluşturma
    dataframe.loc[(dataframe['AGE'] < 35) & (dataframe['GLUCOSE'] < 139), "NEW_AGE_GLUCOSE_CAT"] = 'young_normal'
    dataframe.loc[(dataframe['AGE'] < 35) & (dataframe['GLUCOSE'] >= 139) & (dataframe['GLUCOSE'] < 200), "NEW_AGE_GLUCOSE_CAT"] = 'young_prediabetes'
    dataframe.loc[(dataframe['AGE'] < 35) & (dataframe['GLUCOSE'] >= 200), "NEW_AGE_GLUCOSE_CAT"] = 'young_diabetes'
    
    dataframe.loc[(dataframe['AGE'] >= 35) & (dataframe['AGE'] <= 55) & (dataframe['GLUCOSE'] < 139), "NEW_AGE_GLUCOSE_CAT"] = 'middleage_normal'
    dataframe.loc[(dataframe['AGE'] >= 35) & (dataframe['AGE'] <= 55) & (dataframe['GLUCOSE'] >= 139) & (dataframe['GLUCOSE'] < 200), "NEW_AGE_GLUCOSE_CAT"] = 'middleage_prediabetes'
    dataframe.loc[(dataframe['AGE'] >= 35) & (dataframe['AGE'] <= 55) & (dataframe['GLUCOSE'] >= 200), "NEW_AGE_GLUCOSE_CAT"] = 'middleage_diabetes'
    
    dataframe.loc[(dataframe['AGE'] > 55) & (dataframe['GLUCOSE'] < 139), "NEW_AGE_GLUCOSE_CAT"] = 'old_normal'
    dataframe.loc[(dataframe['AGE'] > 55) & (dataframe['GLUCOSE'] >= 139) & (dataframe['GLUCOSE'] < 200), "NEW_AGE_GLUCOSE_CAT"] = 'old_prediabetes'
    dataframe.loc[(dataframe['AGE'] > 55) & (dataframe['GLUCOSE'] >= 200), "NEW_AGE_GLUCOSE_CAT"] = 'old_diabetes'
    

    cat_cols, num_cols, cat_but_car = grab_col_names(dataframe, cat_th=5, car_th=20)
    
    cat_cols = [col for col in cat_cols if "OUTCOME" not in col]

    df = one_hot_encoder(dataframe, cat_cols, drop_first=True)

    df.columns = [col.upper() for col in df.columns]

    cat_cols, num_cols, cat_but_car = grab_col_names(df, cat_th=5, car_th=20)

    cat_cols = [col for col in cat_cols if "OUTCOME" not in col]

    replace_with_thresholds(df, "INSULIN")

    X_scaled = StandardScaler().fit_transform(df[num_cols])
    df[num_cols] = pd.DataFrame(X_scaled, columns=df[num_cols].columns)

    y = df["OUTCOME"]
    X = df.drop(["OUTCOME"], axis=1)

    return X, y

def base_models(X, y, scoring="roc_auc"):
    print("Base Models....")
    classifiers = [('XGBoost', XGBClassifier(use_label_encoder=False, eval_metric='logloss')),
                   ('LightGBM', LGBMClassifier(verbose=-1)),
                   ('CatBoost', CatBoostClassifier(verbose=False))
                   ]

    for name, classifier in classifiers:
        cv_results = cross_validate(classifier, X, y, cv=3, scoring=scoring)
        print(f"{scoring}: {round(cv_results['test_score'].mean(), 4)} ({name}) ")


        
xgboost_params = {"learning_rate": [0.1, 0.01],
                  "max_depth": [5, 8],
                  "n_estimators": [100, 200]}

lightgbm_params = {"learning_rate": [0.01, 0.1],
                   "n_estimators": [300, 500]}

catboost_params = {"iterations": [200, 500],
                   "learning_rate": [0.01, 0.1]}

classifiers = [('XGBoost', XGBClassifier(use_label_encoder=False, eval_metric='logloss'), xgboost_params),
               ('LightGBM', LGBMClassifier(verbose=-1), lightgbm_params),
               ('CatBoost', CatBoostClassifier(verbose=False), catboost_params)]  

def hyperparameter_optimization(X, y, cv=3, scoring="roc_auc"):
    print("Hyperparameter Optimization....")
    best_models = {}
    for name, classifier, params in classifiers:
        print(f"########## {name} ##########")
        cv_results = cross_validate(classifier, X, y, cv=cv, scoring=scoring)
        print(f"{scoring} (Before): {round(cv_results['test_score'].mean(), 4)}")

        gs_best = GridSearchCV(classifier, params, cv=cv, n_jobs=-1, verbose=False).fit(X, y)
        final_model = classifier.set_params(**gs_best.best_params_)

        cv_results = cross_validate(final_model, X, y, cv=cv, scoring=scoring)
        print(f"{scoring} (After): {round(cv_results['test_score'].mean(), 4)}")
        print(f"{name} best params: {gs_best.best_params_}", end="\n\n")
        best_models[name] = final_model
    return best_models

def voting_classifier(best_models, X, y):
    print("Voting Classifier...")

    voting_clf = VotingClassifier(estimators=[('XGBoost', best_models["XGBoost"]),
                                              ('LightGBM', best_models["LightGBM"]),
                                              ('CatBoost', best_models["CatBoost"])],
                                  voting='soft').fit(X, y)

    cv_results = cross_validate(voting_clf, X, y, cv=3, scoring=["accuracy", "f1", "roc_auc"])
    print(f"Accuracy: {cv_results['test_accuracy'].mean()}")
    print(f"F1Score: {cv_results['test_f1'].mean()}")
    print(f"ROC_AUC: {cv_results['test_roc_auc'].mean()}")
    return voting_clf


In [9]:
def main():
    df = pd.read_csv("/home/mustafa/PycharmProjects/diabetes_pred/diabetes.csv")
    X, y = diabetes_data_prep(df)
    base_models(X, y)
    best_models = hyperparameter_optimization(X, y)
    voting_clf = voting_classifier(best_models, X, y)
    joblib.dump(voting_clf, "voting_clf.pkl")
    
    return voting_clf


if __name__ == "__main__":
    print("Start...")
    main()

Start...
Observations: 768
Variables: 15
cat_cols: 7
num_cols: 8
cat_but_car: 0
num_but_cat: 1
Observations: 768
Variables: 34
cat_cols: 26
num_cols: 8
cat_but_car: 0
num_but_cat: 1
Base Models....
roc_auc: 0.8042 (XGBoost) 
roc_auc: 0.8027 (LightGBM) 
roc_auc: 0.8364 (CatBoost) 
Hyperparameter Optimization....
########## XGBoost ##########
roc_auc (Before): 0.818
roc_auc (After): 0.818
XGBoost best params: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}

########## LightGBM ##########
roc_auc (Before): 0.8183
roc_auc (After): 0.8183
LightGBM best params: {'learning_rate': 0.01, 'n_estimators': 300}

########## CatBoost ##########
roc_auc (Before): 0.8381
roc_auc (After): 0.8381
CatBoost best params: {'iterations': 500, 'learning_rate': 0.01}

Voting Classifier...
Accuracy: 0.7721354166666666
F1Score: 0.6546764846378396
ROC_AUC: 0.8296285488076537
