# First test of the modeling part of the pipeline: responsible for creating the model
Notebook structure (recommended)
2. Data loading: load preprocessed datasets and metadata.
4. Model: define, train, persist.
5. Evaluation: compute and save metrics and plots.
6. Save artifacts: model, transformers, metrics, config.

In [1]:
import pandas as pd
import numpy as np

df_train = pd.read_csv('../data/churn/train.csv')

In [2]:
# Define the specific dates range
start_date = "2023-06-01"
end_date = "2023-09-30"

safra = []
# Generate a random date within the specified range
for i in range(df_train.shape[0]):
    safra.append(
        pd.to_datetime(
            np.random.choice(pd.date_range(start=start_date, end=end_date))
        ).strftime("%Y%m")
    )

df_train["safra"] = safra

rng = np.random.RandomState(42)
n_ones = int(round(0.2 * len(df_train)))

df_train['no_action'] = 0
df_train.loc[rng.choice(df_train.index, size=n_ones, replace=False), 'no_action'] = 1


In [3]:
df_oot = df_train[df_train['safra'] == '202309']
df_train = df_train[df_train['safra'] != '202309']


### 1. Mdl Training

In [4]:
pipeline_parameters = {
    'description_general': '''

    <div style="display:grid; grid-template-columns:300px 1fr; gap:8px 16px; align-items:start;">
        <div class="txt-hg-blue">Projeto:</div>
        <div class="txt-hg-bold">Ariel - Concessão de Crédito Rotativo</div>

        <div class="txt-hg-blue">Objetivo:</div>
        <div>Desenvolver modelo auxiliar target na tomada de decisão.</div>

        <div class="txt-hg-blue">Target:</div>
        <div><span style="background:#ffe8e0; color:#7a2b15; padding:3px 8px; border-radius:12px; font-weight:600;">Over 60 Mob 6</span></div>

        <div class="txt-hg-blue">Período de Treinamento:</div>
        <div>2024-06 a 2024-08</div>

        <div class="txt-hg-blue">Período de OOT:</div>
        <div>2024-09 a 2024-10</div>

        <div class="txt-hg-blue">Métricas de Avaliação:</div>
        <div>Gini Coefficient, % Alto Risco Target, % Baixo Risco Não Target</div>
      </div>

    ''',
    'target_obs': 'no_action',
    'target_obs_inf': 'no_action',
    'no_action_var': 'no_action',
    'date_var': 'safra',
    'date_oot': ['202406', '202407', '202408'],
    'features_excluded': ['ID', 'safra', 'no_action'],
    'mdls': [],
    'target_mdl': '',
    'categorical_features': [],
    'description_specific': '',
    'model_name': '',
    'tbl_version': '',
    'inf_version': '',
    'mdl_version': '',
}


pipeline_parameters['description_specific'] = 'Modelo inicial sem seleção de features e sem otimizacao de parametros.'

pipeline_parameters['tbl_version'] = 'ariel_tbl_0_0_1'
pipeline_parameters['inf_version'] = 'ariel_inf_0_0_1'
pipeline_parameters['mdl_version'] = 'sand_0_0_1'

pipeline_parameters['model_name'] = 'ariel_mdl_0_0_1'


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report
import lightgbm as lgb

# Train a LightGBM classifier for churn prediction (new notebook cell)


# feature setup: drop identifiers and text surname
drop_cols = ['id', 'CustomerId', 'Surname', 'safra', 'no_action']
target = 'Exited'

X = df_train.drop(columns=drop_cols + [target])
y = df_train[target]

# mark categorical features so LightGBM can handle them natively
for c in ['Geography', 'Gender']:
    if c in X.columns:
        X[c] = X[c].astype('category')
    if c in df_train.columns:
        df_train[c] = df_train[c].astype('category')

# train/validation split
X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# model
model = lgb.LGBMClassifier(random_state=42)

# fit with early stopping
model.fit(
    X_tr, y_tr,
    eval_set=[(X_val, y_val)],
    eval_metric='auc',
    categorical_feature=['Geography', 'Gender']
)


# validation metrics
val_probs = model.predict_proba(X_val)[:, 1]
val_preds = (val_probs >= 0.5).astype(int)
print("Validation AUC:", round(roc_auc_score(y_val, val_probs), 4))
print("Validation Accuracy:", round(accuracy_score(y_val, val_preds), 4))
print(classification_report(y_val, val_preds, digits=4))

[LightGBM] [Info] Number of positive: 21088, number of negative: 78463
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001056 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 859
[LightGBM] [Info] Number of data points in the train set: 99551, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.211831 -> initscore=-1.313923
[LightGBM] [Info] Start training from score -1.313923
Validation AUC: 0.8895
Validation Accuracy: 0.8664
              precision    recall  f1-score   support

           0     0.8903    0.9471    0.9178     19616
           1     0.7420    0.5658    0.6421      5272

    accuracy                         0.8664     24888
   macro avg     0.8162    0.7565    0.7800     24888
weighted avg     0.8589    0.8664    0.8594     24888



In [58]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report
import lightgbm as lgb

# Train a LightGBM classifier for churn prediction (new notebook cell)


# feature setup: drop identifiers and text surname
drop_cols = ['id', 'CustomerId', 'Surname', 'safra', 'no_action']
target = 'Exited'

X = df_train.drop(columns=drop_cols + [target])
y = df_train[target]

# mark categorical features so LightGBM can handle them natively
for c in ['Geography', 'Gender']:
    if c in X.columns:
        X[c] = X[c].astype('category')
    if c in df_train.columns:
        df_train[c] = df_train[c].astype('category')

# train/validation split
X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2, random_state=32, stratify=y)

# model
model2 = lgb.LGBMClassifier(random_state=42)

# fit with early stopping
model2.fit(
    X_tr, y_tr,
    eval_set=[(X_val, y_val)],
    eval_metric='auc',
    categorical_feature=['Geography', 'Gender']
)


# validation metrics
val_probs = model2.predict_proba(X_val)[:, 1]
val_preds = (val_probs >= 0.5).astype(int)
print("Validation AUC:", round(roc_auc_score(y_val, val_probs), 4))
print("Validation Accuracy:", round(accuracy_score(y_val, val_preds), 4))
print(classification_report(y_val, val_preds, digits=4))

[LightGBM] [Info] Number of positive: 21088, number of negative: 78463
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000725 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1114
[LightGBM] [Info] Number of data points in the train set: 99551, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.211831 -> initscore=-1.313923
[LightGBM] [Info] Start training from score -1.313923
Validation AUC: 0.894
Validation Accuracy: 0.868
              precision    recall  f1-score   support

           0     0.8912    0.9483    0.9188     19616
           1     0.7474    0.5690    0.6461      5272

    accuracy                         0.8680     24888
   macro avg     0.8193    0.7587    0.7825     24888
weighted avg     0.8607    0.8680    0.8611     24888



In [6]:
df_train['model_score'] = model.predict_proba(X)[:, 1]

In [59]:
df_train['model_score2'] = model2.predict_proba(X)[:, 1]

In [68]:
def gini_from_scores(y_true, y_score):
    try:
        auc = roc_auc_score(y_true, y_score)
    except Exception:
        return np.nan
    return 2 * auc - 1

mdls = ['model_score']

# Gini per safra

gini_per_safra = pd.DataFrame()

for mdl in mdls:
    gini_safra = (
        df_train.groupby("safra")
        .apply(lambda g: gini_from_scores(g[target], g[mdl]))
        .reset_index()
        .rename(columns={0: f"gini_{mdl}"})
    )
    if gini_per_safra.empty:
        gini_per_safra = gini_safra
    else:
        gini_per_safra = gini_per_safra.merge(gini_safra, on="safra")

        
    gini_per_safra[f"gini_{mdl}"] = (gini_per_safra[f"gini_{mdl}"] * 100).round(2)


gini_per_safra


  .apply(lambda g: gini_from_scores(g[target], g[mdl]))


Unnamed: 0,safra,gini_model_score
0,202306,79.57
1,202307,79.2
2,202308,79.42


In [70]:
gini_cols

['gini_model_score']

In [73]:
import altair as alt

gini_cols = [c for c in gini_per_safra.columns if c.startswith("gini_")]
y_max = gini_per_safra[gini_cols].max(axis=0).values.max()
y_min = gini_per_safra[gini_cols].min(axis=0).values.min()

y_scale = alt.Scale(domain=[max(0, y_min - 5), y_max + 5], nice=True)

line = (
    alt.Chart(gini_per_safra)
    .mark_line(point=True)
    .encode(
        x=alt.X("safra:N", title="Safra", sort="ascending", axis=alt.Axis(labelAngle=-45)),
        y=alt.Y("gini_model_score:Q", title="Gini", scale=y_scale, axis=alt.Axis(labels=False)),
    )
)

labels = (
    alt.Chart(gini_per_safra)
    .mark_text(align="center", dy=-10, color="black")
    .encode(
        x=alt.X("safra:N", sort="ascending"),
        y=alt.Y("gini_model_score:Q", scale=y_scale),
        text=alt.Text("gini_model_score:Q", format=".1f"),
    )
)

chart_line = (line + labels).properties(width=200, height=250)

chart_line


In [103]:
# build gini per safra for both models and plot them on the same chart
mdls = ['model_score', 'model_score2']

def create_gini_chart(df_train, target, mdls):
        
    # compute gini per safra for each model
    gini_per_safra = (
        df_train.groupby("safra")
        .apply(lambda g: pd.Series({f"gini_{m}": gini_from_scores(g[target], g[m]) for m in mdls}))
        .reset_index()
    )

    for col in [f"gini_{m}" for m in mdls]:
        gini_per_safra[col] = (gini_per_safra[col] * 100).round(2)

    # long format for Altair
    gini_long = gini_per_safra.melt(id_vars="safra", value_vars=[f"gini_{m}" for m in mdls],
                                    var_name="model", value_name="gini")

    # y scale domain
    y_max = gini_long["gini"].max()
    y_min = gini_long["gini"].min()
    y_scale = alt.Scale(domain=[max(0, y_min - 5), y_max + 5], nice=True)
    # line + points per model with legend
    line = (
        alt.Chart(gini_long)
        .mark_line(point=True)
        .encode(
            x=alt.X("safra:N", title="Safra", sort="ascending", axis=alt.Axis(labelAngle=-45)),
            y=alt.Y("gini:Q", title="Gini", scale=y_scale),
            color=alt.Color("model:N", title="Model", legend=alt.Legend(orient="bottom")),
            tooltip=["safra", "model", alt.Tooltip("gini:Q", format=".2f")],
        )
    )

    # labels on points (no legend duplication) with smaller font
    labels = (
        alt.Chart(gini_long)
        .mark_text(align="center", dy=-10, fontSize=9)
        .encode(
            x=alt.X("safra:N", sort="ascending"),
            y=alt.Y("gini:Q", scale=y_scale),
            text=alt.Text("gini:Q", format=".1f"),
            color=alt.Color("model:N", legend=None),
        )
    )

    (chart := (line + labels).properties(width=200, height=150))
    return chart

gini_obs_graph = create_gini_chart(df_train, target='Exited', mdls=mdls)
gini_obs_inf_graph = create_gini_chart(df_train, target='Exited', mdls=mdls)

temp = df_train[df_train['no_action'] == 1]

gini_no_action_graph = create_gini_chart(temp, target='Exited', mdls=mdls)

  .apply(lambda g: pd.Series({f"gini_{m}": gini_from_scores(g[target], g[m]) for m in mdls}))
  .apply(lambda g: pd.Series({f"gini_{m}": gini_from_scores(g[target], g[m]) for m in mdls}))
  .apply(lambda g: pd.Series({f"gini_{m}": gini_from_scores(g[target], g[m]) for m in mdls}))


In [104]:
gini_obs_graph_json = gini_obs_graph.to_json()
gini_obs_inf_graph_json = gini_obs_inf_graph.to_json()
gini_no_action_graph_json = gini_no_action_graph.to_json()

In [96]:
gini_train = round(gini_from_scores(df_train['Exited'], df_train['model_score']) * 100, 2)
df_val = df_train[df_train.index.isin(X_val.index)]
gini_test = round(gini_from_scores(df_val['Exited'], model.predict_proba(df_val[model.feature_names_in_])[:, 1]) * 100, 2)


# mark categorical features so LightGBM can handle them natively
for c in ['Geography', 'Gender']:
    if c in X.columns:
        X[c] = X[c].astype('category')
    if c in df_oot.columns:
        df_oot[c] = df_oot[c].astype('category')

gini_oot = round(gini_from_scores(df_oot['Exited'], model.predict_proba(df_oot[model.feature_names_in_])[:, 1]) * 100, 2)
gini_no_action = round(gini_from_scores(df_train[df_train['no_action'] == 1]['Exited'], df_train[df_train['no_action'] == 1]['model_score']) * 100, 2)


#### 2.1 Description

In [106]:
### JINJA TEMPLATE
import os
from jinja2 import Template

# Read the template from the template sheet
template_path = "../sandMdlLog/MdlLog_template.html"
with open(template_path, "r") as file:
    sanEda_mdl_template = file.read()

# Create a Jinja2 template object

template = Template(sanEda_mdl_template)

# Render the template with the data
rendered_html = template.render(
    title=pipeline_parameters['model_name'],
    description_general = pipeline_parameters['description_general'],
    description_specific = pipeline_parameters['description_specific'],
    tbl_version = pipeline_parameters['tbl_version'],
    inf_version = pipeline_parameters['inf_version'],
    mdl_version = pipeline_parameters['mdl_version'],
    init_params_txt = str(pipeline_parameters),
    tgt_neg_criteria = '10',
    ntgt_aprv_criteria = '2',
    gini_obs_graph_json = gini_obs_graph_json,
    gini_obs_inf_graph_json = gini_obs_inf_graph_json,
    gini_no_action_graph_json = gini_no_action_graph_json,
    gini_train = gini_train,
    gini_test = gini_test,
    gini_oot = gini_oot,
    gini_noaction = gini_no_action
)

# Save the rendered HTML to a file
with open(f"report_{pipeline_parameters['model_name']}.html", "w") as file:
    file.write(rendered_html)

In [None]:
def report(self, report_version: str):
    ### OVERVIEW

    sandeda = SandEDA(
        self.df, self.target_name, self.time_name, self.id_name, self.top_n
    )
    res_general = sandeda.calc_general()

    overview_target_metric_time = res_general["target_general"][
        "target_metric_time"
    ]

    # Create the bar plot using Altair
    chart = (
        alt.Chart(
            pd.DataFrame(
                {
                    "Date": list(overview_target_metric_time.keys()),
                    "%": list(overview_target_metric_time.values()),
                }
            )
        )
        .mark_bar()
        .encode(x=alt.X("Date", sort="ascending"), y=alt.Y("%"))
        .properties(width=100, height=100)
        .configure_axis(labelAngle=45)
        .configure_title(fontSize=10)
    )

    overview_tab_general = pd.DataFrame(
        list(res_general["dataset_general"].items()),
        columns=["Description", "Value"],
        index=None,
    )
    overview_tab_general["Value"] = overview_tab_general["Value"].apply(
        lambda x: f"{x:,.0f}")
    overview_tab_general = overview_tab_general.to_html(index=False, border=0)

    overview_tab_full = pd.DataFrame(res_general["missing_zero"]).to_html(
        index=False, border=0
    )

    overview_target_name = res_general["target_general"]["target_name"]

    overview_target_metric = round((
        res_general["target_general"]["number_of_one"]
        / (
            res_general["target_general"]["number_of_zero"]
            + res_general["target_general"]["number_of_one"]
        )
    ) * 100, 2)

    overview_tgt_graph_json = chart.to_json()

    ### VARIABLES

    iv_, mi_ = sandeda.promising_features()
    psi_, ks_ = sandeda.variables_estability()
    miss_, zero_ = sandeda.variables_fillment()

    var_tab_ks = pd.DataFrame(
        {
        "Variable": [var for var, _ in ks_],
        "KS": [round(max(value.values()), 3) for _, value in ks_],
        }
    ).to_html(index=False, border=0)

    var_tab_psi = pd.DataFrame(
        {
            "Variable": [var for var, _ in psi_],
            "PSI": [round(max(value.values()), 3) for _, value in psi_],
        }
    ).to_html(index=False, border=0)

    var_tab_iv = pd.DataFrame(
        {"Variable": [var for var, _ in iv_], "IV": [round(value, 3) for _, value in iv_]}
    ).to_html(index=False, border=0)

    var_tab_mi = pd.DataFrame(
        {"Variable": [var for var, _ in mi_], "MI": [round(value, 3) for _, value in mi_]}
    ).to_html(index=False, border=0)

    var_tab_miss = pd.DataFrame(miss_).to_html(index=False, border=0)
    var_tab_zero = pd.DataFrame(zero_).to_html(index=False, border=0)

    ### ESPECIFIC VARIABLES

    variables_espec = sandeda.variables_espec()

    variables_espec_time = sandeda.variables_espec_time()

    var_espec_content = {}

    vars_keys = variables_espec.keys() - {
        self.id_name,
        self.target_name,
        self.time_name,
    }

    for var_espec in vars_keys:
        hist_var = variables_espec[var_espec]["histogram"]

        decil_var = variables_espec[var_espec]["decil"]

        del (
            variables_espec[var_espec]["histogram"],
            variables_espec[var_espec]["decil"],
        )

        var_spec_tab_desc = pd.DataFrame(
            {
                "Description": variables_espec[var_espec][
                    "descriptive_statistics"
                ].keys(),
                "Value": [
                    f"{value:,.0f}" if isinstance(value, (int, float)) else str(value) for value in variables_espec[var_espec][
                        "descriptive_statistics"
                    ].values()
                ],
            }
        ).to_html(index=False, border=0)

        var_spec_tab_quant = pd.DataFrame(
            {
                "Description": variables_espec[var_espec][
                    "quantile_statistics"
                ].keys(),
                "Value": [
                    f"{value:,.0f}" if isinstance(value, (int, float)) else str(value) for value in variables_espec[var_espec][
                        "quantile_statistics"
                    ].values()
                ],
            }
        ).to_html(index=False, border=0)

        if variables_espec[var_espec]["descriptive_statistics"][
            "number_of_unique_values"
        ] <= 50 or variables_espec[var_espec]["descriptive_statistics"][
            "variable_type"
        ] not in ["int64", "float64"]:
            # Convert the histogram data to a dataframe
            hist_data = pd.DataFrame(
                {
                    "Interval": list(hist_var.keys()),
                    "Count": list(hist_var.values()),
                }
            )
        else:
            hist_data = pd.DataFrame(
                {
                    "Interval": [
                        f"{interval.left:.1f}" for interval in hist_var.keys()
                    ],
                    "Count": list(hist_var.values()),
                }
            )

        # Create the bar plot using Altair
        hist_chart = (
            alt.Chart(hist_data)
            .mark_bar()
            .encode(
                x=alt.X(
                    "Interval",
                    title="",
                    sort=None,
                    axis=alt.Axis(labels=True, labelOverlap=True, labelFontSize=9),
                ),
                y=alt.Y("Count", title="Qty", axis=alt.Axis(labels=False)),
            )
            .properties(width=350, height=200, title="Histogram")
            .configure_axis(labelAngle=45)
            .configure_title(fontSize=14)
            .interactive(False)
        )  # Disable interactive features

        var_spec_hist_graph_json = hist_chart.to_json()

        decil_data = pd.DataFrame(
            {"Decil": list(decil_var.index), "Target": list(np.round(decil_var.values, 2))}
        )

        # Create the bar plot using Altair
        decil_chart = (
            alt.Chart(decil_data)
            .mark_bar()
            .encode(
                x=alt.X(
                    "Decil",
                    title="Decil",
                    sort=None,
                    axis=alt.Axis(labels=True, labelOverlap=True, labelFontSize=9),
                ),
                y=alt.Y("Target", title="% Target", axis=alt.Axis(labels=False)),
            )
            .properties(width=350, height=200, title="Target mean per Decil")
            .configure_axis(labelAngle=45)
            .configure_title(fontSize=12)
            .interactive(False)
        )  # Disable interactive features

        var_spec_decil_graph_json = decil_chart.to_json()

        # Create a dataframe with the number_per_quintile values for each time period
        df_quintile_time = pd.DataFrame(
            {
                "Date": list(variables_espec_time[var_espec].keys()),
                "number_per_quintile": [
                    variables_espec_time[var_espec][date]["number_per_quintile"]
                    for date in variables_espec_time[var_espec].keys()
                ],
            }
        )

        # Expand the number_per_quintile dictionary into separate columns
        df_quintile_time = df_quintile_time.join(
            pd.DataFrame(
                df_quintile_time.pop("number_per_quintile").tolist(),
                index=df_quintile_time.index,
            )
        )

        # Melt the dataframe to have a long format suitable for Altair
        df_melted = df_quintile_time.melt(
            id_vars="Date", var_name="Quintile", value_name="Count"
        )

        # Calculate the percentage for each quintile
        df_melted["Percentage"] = df_melted.groupby("Date")["Count"].transform(
            lambda x: x / x.sum() * 100
        )

        # Create the 100% stacked column chart using Altair
        stacked_chart = (
            alt.Chart(df_melted)
            .mark_bar()
            .encode(
                x=alt.X("Date", title="Date"),
                y=alt.Y("Percentage", title="Percentage", stack="normalize"),
                color=alt.Color("Quintile", title="Legend"),
            )
            .properties(width=800, height=200, title="Distribution Over Time")
            .configure_axis(labelAngle=45)
            .configure_title(fontSize=14)
            .configure_legend(orient="top")
            .interactive(False)
        )  # Disable interactive features

        var_spec_stacked_graph_json = stacked_chart.to_json()

        var_espec_content[var_espec] = {
            "tab_desc": var_spec_tab_desc,
            "tab_quant": var_spec_tab_quant,
            "hist": var_spec_hist_graph_json,
            "decil": var_spec_decil_graph_json,
            "hist_time": var_spec_stacked_graph_json,
        }

    ### JINJA TEMPLATE

    # Read the template from the template sheet
    template_path = os.path.join(os.path.dirname(__file__), "./template/sandEda_template.html")
    with open(template_path, "r") as file:
        sanEda_template = file.read()

    # Create a Jinja2 template object

    template = Template(sanEda_template)

    # Render the template with the data
    rendered_html = template.render(
        title=report_version,
        overview_tab_general=overview_tab_general,
        overview_tab_full=overview_tab_full,
        overview_target_metric=overview_target_metric,
        overview_target_name=overview_target_name,
        overview_tgt_graph_json=overview_tgt_graph_json,
        var_tab_iv=var_tab_iv,
        var_tab_mi=var_tab_mi,
        var_tab_miss=var_tab_miss,
        var_tab_zero=var_tab_zero,
        var_tab_psi=var_tab_psi,
        var_tab_ks=var_tab_ks,
        specific_variables=var_espec_content,
    )

    # Save the rendered HTML to a file
    with open(f"SandEDA_{report_version}.html", "w") as file:
        file.write(rendered_html)