In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
import numpy as np

In [3]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.metrics import brier_score_loss, roc_auc_score
from sklearn.feature_selection import RFECV

In [4]:
import lightgbm as lgb

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [5]:
import seaborn as sns
import matplotlib.pyplot as plt

In [6]:
from code import weight_of_evidence

In [7]:
import importlib

# Re-import the local module so that edits made to weight_of_evidence.py are
# picked up without restarting the kernel.
importlib.reload(weight_of_evidence)

<module 'code.weight_of_evidence' from '/Users/olivercairns/Desktop/code/WeightOfEvidenceDemo/code/weight_of_evidence.py'>

In [None]:
# Load the raw application table from the user's Downloads folder.
data = pd.read_csv(
    filepath_or_buffer="~/Downloads/application_train.csv",
)

In [None]:
# Columns that are dropped before building the feature matrix.
# TARGET is the label (it becomes y); the others are deliberately left out
# of the features.
EXCLUDE_COLS = [
    "SK_ID_CURR",
    "TARGET",
    "CODE_GENDER",
    "ORGANIZATION_TYPE",
]

In [None]:
# Names of the object-dtype (string) columns that remain once the excluded
# columns have been dropped.
CATERORICAL_COLS = (
    data.drop(columns=EXCLUDE_COLS)
    .select_dtypes(include="O")
    .columns
)

In [None]:
# Names of every numeric feature column (integers AND floats).
# Selecting only "int64" silently removed all float64 features from the
# one-hot pipelines, because ColumnTransformer drops columns it is not given.
# It also made the median imputer pointless: int64 columns cannot hold NaN.
NUMERIC_COLS = (
    data.drop(columns=EXCLUDE_COLS)
    .select_dtypes(include="number")
    .columns
)

In [None]:
# Give missing categorical entries their own explicit "MISSING" level so the
# encoders downstream never see NaN in these columns.
filled_categoricals = data[CATERORICAL_COLS].fillna("MISSING")
data[CATERORICAL_COLS] = filled_categoricals

In [None]:
# Numeric branch: fill gaps with the column median, then standardise.
numeric_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

In [None]:
# Preprocessing for the one-hot baselines: numeric columns go through the
# impute-and-scale pipeline, categorical columns are one-hot encoded.
# Categories not seen while fitting are ignored at transform time.
numeric_branch = ("num", numeric_transformer, NUMERIC_COLS)
categorical_branch = (
    "cat",
    OneHotEncoder(handle_unknown="ignore"),
    CATERORICAL_COLS,
)
ohe_preprocessor = ColumnTransformer([numeric_branch, categorical_branch])

In [None]:
# L2-penalised logistic regression with the regularisation strength chosen by
# 3-fold cross-validation.
# max_iter has to be an integer: scikit-learn releases that validate
# parameter types reject the float 1e6 when fit() is called.
log_reg_cv = LogisticRegressionCV(
    penalty="l2",
    solver="lbfgs",
    max_iter=1_000_000,
    cv=3,
)

In [None]:
# Plain logistic regression used as the final step of the pipelines below.
# max_iter has to be an integer: scikit-learn releases that validate
# parameter types reject the float 1e6 when fit() is called.
log_reg = LogisticRegression(solver="lbfgs", max_iter=1_000_000)

In [None]:
# Baseline 1: one-hot preprocessing followed by logistic regression.
ohe_logit = Pipeline(
    [
        ("preprocessor", ohe_preprocessor),
        ("log_reg_classifier", log_reg),
    ]
)

In [None]:
# Baseline 2: one-hot preprocessing followed by a default LightGBM classifier.
ohe_lgb = Pipeline(
    [
        ("preprocessor", ohe_preprocessor),
        ("lgb_classifier", lgb.LGBMClassifier()),
    ]
)

In [None]:
# Placeholder for the per-fold CV scores of all models. It is reassigned by
# the first get_cv_results call further down, so this empty frame is never
# read.
combined_results = pd.DataFrame()

In [None]:
# Shared 5-fold stratified splitter; shuffling uses a fixed seed so the
# folds are reproducible.
cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=1234,
)

In [None]:
# Label vector and feature matrix used by every model below.
y = data["TARGET"]
X = data.drop(columns=EXCLUDE_COLS)

In [None]:
def get_cv_results(X, y, cv, clf, model_name):
    """
    Cross-validate a classifier and collect the ROC AUC of every fold.

    For each (train, test) split produced by ``cv.split(X, y)`` the classifier
    is refitted on the training rows and scored on the held-out rows using the
    predicted probability of the positive class (second column of
    ``predict_proba``). Fold number and AUC are printed as the loop progresses.

    Args:
        X (DataFrame): feature matrix, indexed positionally via ``.iloc``
        y (Series): labels, indexed positionally via ``.iloc``
        cv: splitter exposing ``split(X, y)`` yielding positional indices
        clf: estimator exposing ``fit`` (returning the fitted estimator) and
            ``predict_proba``; the same object is refitted on every fold
        model_name (str): label written to the ``model`` column

    Returns:
        DataFrame: one row per fold with columns ``auc``, ``fold`` (1-based)
        and ``model``.
    """
    fold_aucs = []
    fold_numbers = []
    for fold_no, (train_idx, test_idx) in enumerate(cv.split(X, y), start=1):
        print(f"fold {fold_no}")
        fitted = clf.fit(X.iloc[train_idx], y.iloc[train_idx])
        probabilities = fitted.predict_proba(X.iloc[test_idx])
        # Column 1 holds the probability of the positive class.
        fold_auc = roc_auc_score(y_true=y.iloc[test_idx], y_score=probabilities[:, 1])
        fold_aucs.append(fold_auc)
        fold_numbers.append(fold_no)
        print(f"auc {fold_auc}")
    scores = pd.DataFrame({"auc": fold_aucs, "fold": fold_numbers})
    scores["model"] = model_name
    return scores

In [None]:
# Score the one-hot + logistic regression baseline; this starts the results table.
combined_results = get_cv_results(
    X=X,
    y=y,
    cv=cv,
    clf=ohe_logit,
    model_name="ohe logit",
)

In [None]:
# Mean AUC across folds for each model evaluated so far.
combined_results[["model", "auc"]].groupby("model").mean()

In [None]:
# Score the one-hot + LightGBM baseline and add its folds to the results table.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and keeps the same (non-reset) index behaviour.
# Note: re-running this cell adds the "ohe lgbm" rows a second time.
combined_results = pd.concat(
    [combined_results, get_cv_results(X, y, cv, ohe_lgb, "ohe lgbm")]
)

In [None]:
# Mean AUC across folds for each model evaluated so far.
combined_results[["model", "auc"]].groupby("model").mean()

In [None]:
# Weight-of-evidence model: the project's TreeBinner and WoeScaler steps,
# then standardisation, then logistic regression.
# Note: the final step is the same log_reg instance used by ohe_logit, so
# fitting one pipeline refits the classifier held by the other.
woebin_logit = Pipeline(
    [
        ("tree_bin", weight_of_evidence.TreeBinner()),
        ("woe_scale", weight_of_evidence.WoeScaler()),
        ("standard_scale", StandardScaler()),
        ("log_reg_classifier", log_reg),
    ]
)

In [None]:
# Score the weight-of-evidence pipeline and add its folds to the results table.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and keeps the same (non-reset) index behaviour.
# Note: re-running this cell adds the "woe regression" rows a second time.
combined_results = pd.concat(
    [combined_results, get_cv_results(X, y, cv, woebin_logit, "woe regression")]
)

In [None]:
# Mean AUC across folds for each model evaluated so far.
combined_results[["model", "auc"]].groupby("model").mean()

In [None]:
# One point per fold: AUC by model, coloured by model.
sns.scatterplot(
    x="model",
    y="auc",
    hue="model",
    data=combined_results,
)

In [None]:
"""rfe = RFECV(estimator=woebin_logit, 
            step=1, min_features_to_select=1, cv=cv)"""

In [None]:
# Refit the weight-of-evidence pipeline on the full dataset so its fitted
# attributes can be inspected below.
woebin_logit.fit(X=X, y=y)

In [None]:
def plot_reg_coefs(var_names, coefficients, n=10, output_dir=None, verbose=True):
    """
    Rank logistic-regression coefficients by magnitude and optionally plot them.

    Names and coefficients are paired by position. Pairing them through
    pandas index alignment would silently produce NaN coefficients when the
    two inputs are Series carrying different indexes.

    Args:
        var_names (sequence): variable names (list, Index, Series or array)
        coefficients (sequence): regression coefficients, same length and
            order as ``var_names``
        n (integer): how many of the largest-magnitude features to plot
        output_dir: accepted for interface compatibility; currently unused
        verbose (bool): when True, draw a horizontal bar chart of the ``n``
            coefficients with the largest absolute value

    Returns:
        DataFrame: columns ``var_names``, ``coef_vals`` and ``abs_vals``,
        sorted by ``abs_vals`` ascending (most important feature last).

    Raises:
        ValueError: if ``var_names`` and ``coefficients`` differ in length.
    """
    # list(...) discards any pandas index, so rows are matched positionally.
    coef_df = pd.DataFrame(
        {"var_names": list(var_names), "coef_vals": list(coefficients)}
    )
    coef_df["abs_vals"] = np.abs(coef_df["coef_vals"])
    coef_df = coef_df.set_index("var_names").sort_values(by="abs_vals", ascending=True)
    if verbose:
        fig, ax = plt.subplots(figsize=(4, 8))
        # The frame is sorted ascending, so the tail holds the largest magnitudes.
        coef_df["coef_vals"].tail(n).plot.barh(ax=ax)
        ax.set_title(f"Top {n} features - logistic regression \n")
        plt.show()
    return coef_df.reset_index()

In [None]:
# Rank the fitted coefficients of the weight-of-evidence model and plot the
# five largest. Assumes the pipeline yields one coefficient per column of X,
# in the same order - TODO confirm against TreeBinner / WoeScaler.
var_importance = plot_reg_coefs(
    var_names=X.columns,
    coefficients=woebin_logit.named_steps["log_reg_classifier"].coef_[0],
    n=5,
)

In [None]:
# var_importance is sorted by absolute coefficient ascending, so the last
# five rows are the five most influential variables.
top_5 = var_importance["var_names"].tail(5)

In [None]:
# Plot the bins for the five most influential variables, using the split
# points stored on the fitted tree_bin step.
weight_of_evidence.plot_bins(
    X[top_5],
    y,
    woebin_logit.named_steps["tree_bin"].splits_,
)

In [None]:
# Same bin plot for the five most influential variables, passing "log-odds"
# as the fourth argument (presumably selects what is plotted per bin -
# confirm in weight_of_evidence.plot_bins).
weight_of_evidence.plot_bins(
    X[top_5],
    y,
    woebin_logit.named_steps["tree_bin"].splits_,
    "log-odds",
)