In [None]:
import os

import pandas as pd
import mlflow
import mlflow.sklearn
import scikitplot as skplt
import matplotlib.pyplot as plt
from sqlalchemy import create_engine
from sklearn import model_selection
from sklearn import linear_model, tree, ensemble
from feature_engine import imputation
from sklearn import pipeline
from sklearn import metrics

# Connection string for the Postgres database. The POSTGRES_URI environment
# variable takes precedence so real credentials never have to be committed to
# the notebook; the fallback is the original local development default.
POSTGRES_URI = os.environ.get(
    "POSTGRES_URI",
    "postgresql://postgres:postgres@localhost:5401/olist",
)
# Do not truncate rows when a DataFrame/Series is displayed.
pd.set_option('display.max_rows', None)

In [None]:
# Apply the sklearnex patch to scikit-learn.
# NOTE(review): the sklearn modules were already imported in the first cell;
# estimators below are reached through module attributes
# (e.g. ensemble.RandomForestClassifier) -- confirm the patch is picked up as
# intended with this import order.
from sklearnex import patch_sklearn
patch_sklearn()

In [None]:
# Enable MLflow's scikit-learn autologging; this runs before any estimator in
# the notebook is fitted.
mlflow.sklearn.autolog()

In [None]:
# SQLAlchemy engine built from POSTGRES_URI; it is handed to pd.read_sql as
# the connection when the training table is loaded.
conn = create_engine(POSTGRES_URI)

In [None]:
# Reference date whose rows are set aside as df_oot; every other row is df_train.
OOT_REFERENCE_DATE = '2018-01-01'

# Load the whole table and cast the reference date to text so it can be
# compared against a plain string.
df_abt = pd.read_sql("SELECT * FROM analytics.abt_olist_churn", conn).assign(
    date_reference=lambda frame: frame["date_reference"].astype(str)
)

is_oot = df_abt["date_reference"] == OOT_REFERENCE_DATE
df_oot = df_abt[is_oot]
df_train = df_abt[~is_oot]

In [None]:
target = "churn"
var_identity = ["date_reference", "date_ingestion", "seller_id"]

# Columns that must not be fed to the model: the target itself, the
# identity columns, and "qty_recency".
to_remove = ["qty_recency", target, *var_identity]

# Model inputs: every distinct df_train column that is not excluded, in sorted order.
features = sorted(
    column for column in set(df_train.columns) if column not in to_remove
)

In [None]:
# Hold out 20% of df_train as a test set; the fixed random_state keeps the
# split reproducible across runs.
# NOTE(review): the split is not stratified on the target -- consider
# `stratify=df_train[target]` if the churn rate differs noticeably between
# the two splits printed below.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    df_train[features],
    df_train[target],
    test_size=0.2,
    random_state=11,
)

# Mean of the target in each split (the churn rate, assuming a 0/1 label).
print(f"churn proportion train: {y_train.mean():.4f}")
print(f"churn proportion test: {y_test.mean():.4f}")

In [None]:
# Columns whose missing values are replaced by an arbitrary sentinel in the
# training pipeline. Despite the "minus_1" name, the pipeline currently fills
# these with -100.
missing_minus_1 = """
    avg_order_interval
    avg_delivery_time_approved
    avg_delivery_time_ordered
    avg_expected_delivery_days
    min_score
    median_score
    avg_score
    max_score
    min_product_volume
    median_product_volume
    max_product_volume
    avg_product_volume
""".split()

# Columns whose missing values are replaced by 0 in the training pipeline.
missing_0 = """
    min_installments
    median_installments
    max_installments
    avg_installments
    pct_delayed
""".split()

## Define the MLflow experiment

In [None]:
# Make "olist_churn" the active MLflow experiment for the run started below.
mlflow.set_experiment("olist_churn")

In [None]:
def _auc_logged_as(metric_name, y_true, class_probabilities):
    """Score ROC AUC from the second probability column and log it to the active MLflow run.

    Returns the AUC value so the caller can keep it in a variable.
    """
    auc_value = metrics.roc_auc_score(y_true, class_probabilities[:, 1])
    mlflow.log_metric(metric_name, auc_value)
    return auc_value


with mlflow.start_run():

    # Pipeline pieces: two arbitrary-value imputers followed by a random forest.
    imputer_minus_100 = imputation.ArbitraryNumberImputer(
        arbitrary_number=-100, variables=missing_minus_1
    )
    imputer_0 = imputation.ArbitraryNumberImputer(
        arbitrary_number=0, variables=missing_0
    )

    # n_estimators and min_samples_leaf set here are both overridden by the
    # grid search below; only random_state is carried into every candidate.
    model = ensemble.RandomForestClassifier(
        n_estimators=1000,
        min_samples_leaf=50,
        random_state=435,
    )

    model_pipeline = pipeline.Pipeline(
        steps=[
            ('imputer_minus_100', imputer_minus_100),
            ('imputer_0', imputer_0),
            ('model', model),
        ]
    )

    # Exhaustive search over the forest's leaf size and tree count,
    # scored by ROC AUC with 3-fold cross-validation.
    grid_params = {
        "model__min_samples_leaf": [1, 2, 4, 5],
        "model__n_estimators": [250, 500, 750, 1000, 1500],
    }
    grid = model_selection.GridSearchCV(
        estimator=model_pipeline,
        param_grid=grid_params,
        scoring="roc_auc",
        cv=3,
        verbose=3,
    )
    grid.fit(X_train, y_train)

    # Evaluate the fitted search on train, test and the held-out reference date.
    # The probability arrays are kept as globals because later cells plot them.
    proba = grid.predict_proba(X_train)
    train_auc = _auc_logged_as("train_auc", y_train, proba)

    proba_test = grid.predict_proba(X_test)
    test_auc = _auc_logged_as("test_auc", y_test, proba_test)

    proba_oot = grid.predict_proba(df_oot[features])
    oot_auc = _auc_logged_as("oot_auc", df_oot[target], proba_oot)

    # Log the fitted search object and register it under a fixed model name.
    mlflow.sklearn.log_model(
        grid,
        "model",
        registered_model_name="olist_churn_model",
    )

In [None]:
# Show the grid-search cross-validation results as a table.
pd.DataFrame(grid.cv_results_)

In [None]:
# ROC curves on the training split; the explicit title distinguishes this
# figure from the test and OOT versions.
skplt.metrics.plot_roc(y_train, proba, title="ROC Curves - train")
plt.show()

In [None]:
# KS statistic on the training split; titled so it is not confused with the
# test-split figure.
ax = skplt.metrics.plot_ks_statistic(
    y_train, proba, title="KS Statistic Plot - train"
)
plt.show()

In [None]:
# ROC curves on the held-out test split; titled to tell it apart from the
# train and OOT figures.
ax = skplt.metrics.plot_roc(y_test, proba_test, title="ROC Curves - test")
plt.show()

In [None]:
# KS statistic on the held-out test split; titled so it is not confused with
# the training figure.
ax = skplt.metrics.plot_ks_statistic(
    y_test, proba_test, title="KS Statistic Plot - test"
)
plt.show()

In [None]:
# ROC curves on the df_oot rows (the 2018-01-01 reference date); titled to
# tell it apart from the train and test figures.
ax = skplt.metrics.plot_roc(df_oot[target], proba_oot, title="ROC Curves - OOT")
plt.show()

In [None]:
# Pair each importance from the final pipeline step of the best estimator with
# the feature names emitted by the steps before it, highest importance first.
best_pipeline = grid.best_estimator_
fs_cols = best_pipeline[:-1].get_feature_names_out()
fs_importance = best_pipeline[-1].feature_importances_

pd.Series(data=fs_importance, index=fs_cols).sort_values(ascending=False)

In [None]:
# Lift curve on the training split; titled so it is not confused with the
# test-split figure. The y-axis floor is raised to 0.9.
ax = skplt.metrics.plot_lift_curve(y_train, proba, title="Lift Curve - train")
ax.set_ylim(bottom=0.9)
plt.show()

In [None]:
# Lift curve on the held-out test split; titled so it is not confused with
# the training figure. The y-axis floor is raised to 0.9.
ax = skplt.metrics.plot_lift_curve(y_test, proba_test, title="Lift Curve - test")
ax.set_ylim(bottom=0.9)
plt.show()