In [None]:
import os

import numpy as np
import pandas as pd
import scikitplot as skplt
from feature_engine import imputation
from sklearn import linear_model, tree
from sklearn import metrics
from sklearn import model_selection
from sklearn import pipeline
from sqlalchemy import create_engine

# Connection string for the `olist` Postgres database.
# Read from the POSTGRES_URI environment variable when it is set, so real
# credentials do not have to be committed with the notebook; the fallback is
# the local development default that was previously hard-coded here.
POSTGRES_URI = os.environ.get(
    "POSTGRES_URI",
    "postgresql://postgres:postgres@localhost:5401/olist",
)

# Disable row truncation so long frames/series are displayed in full.
pd.set_option("display.max_rows", None)

In [None]:
# Build the SQLAlchemy engine that pd.read_sql uses below.
# NOTE: despite the name `conn`, create_engine returns an Engine, not an
# open connection.
conn = create_engine(POSTGRES_URI)

In [None]:
# Pull the whole churn analytical base table into memory.
df_abt = pd.read_sql("SELECT * FROM analytics.abt_olist_churn", conn)
# Cast the reference date to string so the comparison below is a plain
# string match regardless of the dtype returned by the database driver.
df_abt["date_reference"] = df_abt["date_reference"].astype(str)

# Rows from the 2018-01-01 snapshot are held out as `df_oot` (presumably the
# out-of-time validation set); every other snapshot goes to `df_train`.
is_oot = df_abt["date_reference"] == '2018-01-01'
df_oot = df_abt[is_oot]
df_train = df_abt[~is_oot]

In [None]:
# Columns that identify a row (snapshot dates and seller key).
var_identity = ["date_reference", "date_ingestion", "seller_id"]
# Name of the label column.
target = "churn"
# Columns that must not be used as model inputs: the label, the identifiers,
# and qty_recency (presumably excluded because it leaks the label - confirm).
to_remove = ["qty_recency", target] + var_identity
# Every remaining column is a feature; sorting gives a stable column order.
features = sorted(set(df_train.columns).difference(to_remove))

In [None]:
# Random 80/20 hold-out split of the training snapshots; random_state makes
# the partition reproducible.
# NOTE(review): the split is not stratified on the target; the printout below
# is the only check that both partitions have a similar churn rate. Consider
# stratify=df_train[target] if the two proportions drift apart.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    df_train[features],
    df_train[target],
    test_size=0.2,
    random_state=11,
)

# Mean of the label in each partition (its churn rate if the label is 0/1).
print(f"churn proportion train: {y_train.mean():.4f}")
print(f"churn proportion test: {y_test.mean():.4f}")

In [None]:
# Columns whose missing values are filled with a negative sentinel.
# NOTE(review): despite the `_minus_1` suffix, the imputer built from this
# list in the next cell fills with -100, not -1.
missing_minus_1 = [
    "avg_order_interval",
    "avg_delivery_time_approved",
    "avg_delivery_time_ordered",
    "avg_expected_delivery_days",
    # Review-score aggregates.
    *(f"{stat}_score" for stat in ("min", "median", "avg", "max")),
    # Product-volume aggregates.
    *(f"{stat}_product_volume" for stat in ("min", "median", "max", "avg")),
]

# Columns whose missing values are filled with 0.
missing_0 = [
    # Installment aggregates.
    *(f"{stat}_installments" for stat in ("min", "median", "max", "avg")),
    "pct_delayed",
]

In [None]:
# Replace NaNs in the `missing_minus_1` columns with the sentinel -100.
imputer_minus_100 = imputation.ArbitraryNumberImputer(
    arbitrary_number=-100,
    variables=missing_minus_1,
)
# Replace NaNs in the `missing_0` columns with 0.
imputer_0 = imputation.ArbitraryNumberImputer(
    arbitrary_number=0,
    variables=missing_0,
)

In [None]:
# Decision tree requiring at least 50 training rows in every leaf.
# random_state is fixed because scikit-learn permutes the candidate features
# at every split; without a seed, ties between equally good splits can be
# broken differently from run to run, so the fitted tree (and every metric
# below) would not be reproducible.
model = tree.DecisionTreeClassifier(min_samples_leaf=50, random_state=11)

In [None]:
# Ordered steps: both imputers run first, then the classifier is fitted on
# the imputed frame.
pipeline_steps = [
    ('imputer_minus_100', imputer_minus_100),
    ('imputer_0', imputer_0),
    ('model', model),
]
model_pipeline = pipeline.Pipeline(steps=pipeline_steps)

# Fit on the training split only; the fitted pipeline is the cell's output.
model_pipeline.fit(X_train, y_train)

In [None]:
# Class probabilities on the data the model was fitted on.
proba = model_pipeline.predict_proba(X_train)
# roc_auc_score is given column 1 of the probability matrix as the score.
train_auc = metrics.roc_auc_score(y_train, proba[:, 1])
print(f"train ROC AUC: {train_auc:.4f}")

In [None]:
# ROC curves on the training split; titled explicitly so this figure can be
# told apart from the other ROC plots in the notebook.
ax = skplt.metrics.plot_roc(y_train, proba, title="ROC Curves (train)")

In [None]:
# KS statistic plot on the training split; titled explicitly so this figure
# can be told apart from the other KS plot in the notebook.
ax = skplt.metrics.plot_ks_statistic(
    y_train, proba, title="KS Statistic Plot (train)"
)

In [None]:
# Class probabilities for the held-out test split produced by
# train_test_split; used by the test-set plots below.
proba_test = model_pipeline.predict_proba(X_test)

In [None]:
# ROC curves on the held-out test split; titled explicitly so this figure
# can be told apart from the other ROC plots in the notebook.
ax = skplt.metrics.plot_roc(y_test, proba_test, title="ROC Curves (test)")

In [None]:
# KS statistic plot on the held-out test split; titled explicitly so this
# figure can be told apart from the other KS plot in the notebook.
ax = skplt.metrics.plot_ks_statistic(
    y_test, proba_test, title="KS Statistic Plot (test)"
)

In [None]:
# Score the 2018-01-01 hold-out snapshot using the same feature columns the
# pipeline was trained on.
X_oot = df_oot[features]
proba_oot = model_pipeline.predict_proba(X_oot)

In [None]:
# ROC curves on the 2018-01-01 hold-out snapshot; titled explicitly so this
# figure can be told apart from the other ROC plots in the notebook.
ax = skplt.metrics.plot_roc(
    df_oot[target], proba_oot, title="ROC Curves (out-of-time)"
)

In [None]:
# Column names as emitted by the preprocessing steps (everything except the
# final estimator) - presumably the same order the tree was trained on.
fs_cols = model_pipeline[:-1].get_feature_names_out()
# Importances reported by the fitted tree, which is the pipeline's last step.
fs_importance = model_pipeline[-1].feature_importances_

# Display the features ranked from most to least important.
pd.Series(data=fs_importance, index=fs_cols).sort_values(ascending=False)

In [None]:
# Lift curve on the training split; titled explicitly so this figure can be
# told apart from the other lift curve in the notebook.
ax = skplt.metrics.plot_lift_curve(y_train, proba, title="Lift Curve (train)")
# Start the y-axis at 0.9; the trailing semicolon keeps the notebook from
# echoing the returned limits tuple.
ax.set_ylim(bottom=0.9);

In [None]:
# Lift curve on the held-out test split; titled explicitly so this figure
# can be told apart from the other lift curve in the notebook.
ax = skplt.metrics.plot_lift_curve(
    y_test, proba_test, title="Lift Curve (test)"
)
# Start the y-axis at 0.9; the trailing semicolon keeps the notebook from
# echoing the returned limits tuple.
ax.set_ylim(bottom=0.9);