# Home Credit Functions

## Libraries

In [3]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

## Plotting Functions

In [5]:
def create_countplot(
    df, ax, col, colors, plot_xlabel="", plot_ylabel="", percent=False
):
    """
    Draw a vertical count plot of ``df[col]`` on ``ax``.

    One bar is drawn per distinct value, in ``value_counts()`` order, with the
    category converted to ``str`` for the tick label. When ``percent`` is true
    each bar is annotated with its share of the total count.
    """
    value_counts = df[col].value_counts()
    tick_labels = list(map(str, value_counts.index))
    rects = ax.bar(tick_labels, value_counts.values, color=colors)

    # Axis cosmetics: labels, no tick marks, no grid, open top/right frame.
    ax.set_xlabel(plot_xlabel)
    ax.set_ylabel(plot_ylabel)
    ax.tick_params(axis="both", which="both", length=0)
    for side in ("top", "right"):
        ax.spines[side].set_visible(False)

    ax.grid(False)

    for frame_line in ax.spines.values():
        frame_line.set_color("black")

    if percent:
        n_total = value_counts.sum()
        for rect in rects:
            bar_top = rect.get_height()
            x_center = rect.get_x() + rect.get_width() / 2
            ax.annotate(
                f"{bar_top/n_total:.1%}",
                (x_center, bar_top),
                ha="center",
                va="bottom",
                fontsize=10,
                color="black",
                xytext=(0, 2),
                textcoords="offset points",
            )

    # Re-apply whatever title the caller already set, at y=0.5 in axes coordinates.
    ax.set_title(ax.get_title(), y=0.5)

In [6]:
def create_proportion_barchart(
    df, ax, col_x, col_y, plot_title="", plot_xlabel="", plot_ylabel=""
):
    """
    Creates a bar plot of the mean of ``col_y`` for every category of ``col_x``.

    Bars are sorted from the highest mean to the lowest and each one is
    labelled with its value formatted as a percentage. Bar colours come from
    the module-level ``custom_palette`` (first entry skipped), which must be
    defined before this function is called.
    """
    proportions = (
        df.groupby(col_x, observed=True)[col_y]
        .mean()
        .reset_index()
        .sort_values(by=col_y, ascending=False)
    )

    # pandas returns the Axes it drew on, not the bar container.
    plotted_ax = proportions.plot(
        kind="bar", x=col_x, y=col_y, ax=ax, color=custom_palette[1:]
    )

    ax.set_xlabel(plot_xlabel)
    ax.set_ylabel(plot_ylabel)
    ax.tick_params(axis="x", which="both", length=0)
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)

    ax.grid(False)
    # Drop the legend pandas attaches for the single plotted column.
    ax.legend_ = None

    for spine in ax.spines.values():
        spine.set_color("black")

    for bar in plotted_ax.patches:
        height = bar.get_height()
        ax.annotate(
            f"{height:.1%}",
            (bar.get_x() + bar.get_width() / 2, height),
            ha="center",
            va="bottom",
            fontsize=10,
            color="black",
            xytext=(0, 2),
            textcoords="offset points",
        )

    if plot_title:
        ax.set_title(plot_title)

In [7]:
def create_stacked_barchart(
    df, ax, col_x, col_hue, colors, plot_xlabel="", plot_ylabel="", percent=False
):
    """
    Draw a 100%-stacked bar chart of ``col_hue`` shares within each ``col_x`` value.

    Shares are row-normalised with ``pd.crosstab(..., normalize="index")``.
    When ``percent`` is true every segment is labelled, in white, with its
    share at the segment's centre.
    """
    shares = pd.crosstab(df[col_x], df[col_hue], normalize="index")
    # pandas hands back the Axes it drew on; its patches are the bar segments.
    drawn_ax = shares.plot(kind="bar", stacked=True, color=colors, ax=ax)

    ax.set_xlabel(plot_xlabel)
    ax.set_ylabel(plot_ylabel)
    ax.tick_params(axis="x", which="both", length=0)
    for side in ("top", "right"):
        ax.spines[side].set_visible(False)

    ax.grid(False)
    # Discard the legend pandas created for the hue levels.
    ax.legend_ = None

    for frame_line in ax.spines.values():
        frame_line.set_color("black")

    if percent:
        for segment in drawn_ax.patches:
            seg_width = segment.get_width()
            seg_height = segment.get_height()
            left, bottom = segment.get_xy()
            ax.annotate(
                f"{seg_height:.1%}",
                (left + seg_width / 2, bottom + seg_height / 2),
                ha="center",
                va="center",
                fontsize=10,
                color="white",
                xytext=(0, 2),
                textcoords="offset points",
            )

    # Re-apply whatever title the caller already set, at y=0.5 in axes coordinates.
    ax.set_title(ax.get_title(), y=0.5)

In [8]:
def create_barplot(
    df,
    ax,
    col,
    colors,
    plot_xlabel="",
    plot_ylabel="",
    show_values=False,
    value_col="Correlation",
):
    """
    Creates a horizontal barplot of ``df[value_col]`` against ``df[col]``.

    Parameters:
    df (pd.DataFrame): Data holding the category and value columns.
    ax: Matplotlib axes to draw on.
    col (str): Column providing the bar labels (y axis).
    colors: Colour or sequence of colours forwarded to ``ax.barh``.
    plot_xlabel (str): Label for the x axis.
    plot_ylabel (str): Label for the y axis.
    show_values (bool): When true, print each bar's value (2 decimals) next
        to the end of the bar.
    value_col (str): Column providing the bar lengths. Defaults to
        ``"Correlation"``, the column this function originally hard-coded.
    """
    bars = ax.barh(df[col], df[value_col], color=colors)

    ax.set_xlabel(plot_xlabel)
    ax.set_ylabel(plot_ylabel)
    ax.tick_params(axis="both", which="both", length=0)
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)
    ax.grid(False)

    for spine in ax.spines.values():
        spine.set_color("black")

    if show_values:
        for bar in bars:
            width = bar.get_width()
            # Put the label beyond the bar tip: to the right of positive bars,
            # to the left of negative ones, so it never overlaps the bar.
            points_right = width >= 0
            ax.annotate(
                f"{width:.2f}",
                (width, bar.get_y() + bar.get_height() / 2),
                ha="left" if points_right else "right",
                va="center",
                fontsize=10,
                color="black",
                xytext=(5 if points_right else -5, 0),
                textcoords="offset points",
            )

    # Re-apply whatever title the caller already set, at y=0.5 in axes coordinates.
    ax.set_title(ax.get_title(), y=0.5)

## Cleaning Functions

In [10]:
def clean_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """
    Cleans the application DataFrame in place and returns it.

    Steps, in order:
    - replace the placeholder strings "XNA" and "XAP" with NaN everywhere;
    - replace 365243 in DAYS_EMPLOYED and 0 in DAYS_LAST_PHONE_CHANGE with NaN;
    - derive AGE_YEARS and YEARS_EMPLOYED as absolute day counts / 365;
    - drop every column whose name ends in "MEDI" or "MODE", plus the raw
      DAYS_EMPLOYED and DAYS_BIRTH columns;
    - shorten "Secondary / secondary special" to "Secondary" in
      NAME_EDUCATION_TYPE.

    Parameters:
    df (pd.DataFrame): The DataFrame to clean; it is modified in place.

    Returns:
    pd.DataFrame: The same DataFrame object, cleaned.
    """
    df.replace({"XNA": np.nan, "XAP": np.nan}, inplace=True)

    # Assign the result back instead of calling replace(inplace=True) on
    # df[col]: the chained in-place form operates on a temporary object and
    # leaves df untouched under pandas copy-on-write.
    df["DAYS_EMPLOYED"] = df["DAYS_EMPLOYED"].replace(365243, np.nan)
    df["DAYS_LAST_PHONE_CHANGE"] = df["DAYS_LAST_PHONE_CHANGE"].replace(0, np.nan)

    df["AGE_YEARS"] = df["DAYS_BIRTH"].abs() / 365
    df["YEARS_EMPLOYED"] = df["DAYS_EMPLOYED"].abs() / 365

    house_cols_drop = [col for col in df.columns if col.endswith(("MEDI", "MODE"))]
    df.drop(columns=house_cols_drop, inplace=True)

    df.drop(columns=["DAYS_EMPLOYED", "DAYS_BIRTH"], inplace=True)

    df["NAME_EDUCATION_TYPE"] = df["NAME_EDUCATION_TYPE"].replace(
        "Secondary / secondary special", "Secondary"
    )

    return df

In [11]:
def downcast(df):
    """
    Shrinks the memory footprint of a DataFrame by narrowing column dtypes.

    Columns selected as "int" are passed through ``pd.to_numeric`` with
    ``downcast="integer"``, columns selected as "float" with
    ``downcast="float"``, and "object" columns are converted to categorical.

    Parameters:
    df (pd.DataFrame): The DataFrame to optimize. Its columns are replaced
        in place.

    Returns:
    pd.DataFrame: The same DataFrame object that was passed in.
    """
    # Resolve all three column groups up front, before any dtype changes.
    int_cols = df.select_dtypes(include=["int"]).columns
    float_cols = df.select_dtypes(include=["float"]).columns
    object_cols = df.select_dtypes(include=["object"]).columns

    for name in int_cols:
        df[name] = pd.to_numeric(df[name], downcast="integer")

    for name in float_cols:
        df[name] = pd.to_numeric(df[name], downcast="float")

    for name in object_cols:
        df[name] = df[name].astype("category")

    return df

In [12]:
def replace_unknown_values(column: pd.Series) -> pd.Series:
    """
    Replaces the placeholder values "XNA", "Unknown" and "XAP" with np.nan.

    For categorical Series, categories left without any rows afterwards are
    removed as well.

    Parameters:
    column (pd.Series): The Series to clean; it is not modified.

    Returns:
    pd.Series: A new Series with the placeholders replaced by NaN.
    """
    unknown_values = ["XNA", "Unknown", "XAP"]

    column = column.replace(unknown_values, np.nan)

    # isinstance on the dtype is the supported replacement for the deprecated
    # pd.api.types.is_categorical_dtype helper.
    if isinstance(column.dtype, pd.CategoricalDtype):
        column = column.cat.remove_unused_categories()

    return column

In [13]:
def years_convert(column: pd.Series) -> pd.Series:
    """
    Turns a numeric Series of day counts into years (absolute value / 365).

    The sign is discarded, so negative day offsets become positive years;
    NaN entries stay NaN.

    Raises:
    ValueError: If the Series does not have a numeric dtype.
    """
    is_numeric = pd.api.types.is_numeric_dtype(column)
    if is_numeric:
        return abs(column) / 365

    raise ValueError("Input column must be numeric.")

In [14]:
def clean_feature_names(features):
    """
    Returns a list of the given feature names with "/" and " " turned into "_".
    """
    # One translation table handles both characters in a single pass.
    to_underscore = str.maketrans({"/": "_", " ": "_"})
    return [name.translate(to_underscore) for name in features]

## Modeling Functions

In [16]:
def train_lgbm_model(model, X_train, y_train, X_val, y_val):
    """
    Fit ``model`` on the training split and print its PR AUC and ROC AUC.

    Columns of ``X_train`` with "category" or "object" dtype are passed to
    ``fit`` as ``categorical_feature``. Scores are computed from the
    second column of ``predict_proba`` on both the training and validation
    splits and printed; nothing is returned.
    """
    cat_cols = X_train.select_dtypes(include=["category", "object"]).columns.tolist()
    model.fit(X_train, y_train, categorical_feature=cat_cols)

    def positive_proba(features):
        # Second predict_proba column.
        return model.predict_proba(features)[:, 1]

    train_proba = positive_proba(X_train)
    val_proba = positive_proba(X_val)

    train_pr_auc = average_precision_score(y_train, train_proba)
    val_pr_auc = average_precision_score(y_val, val_proba)
    train_roc_auc = roc_auc_score(y_train, train_proba)
    val_roc_auc = roc_auc_score(y_val, val_proba)

    print(f"Base LGBM Train PR AUC Score: {train_pr_auc:.4f}")
    print(f"Base LGBM Validation PR AUC Score: {val_pr_auc:.4f}")
    print(f"\nBase LGBM Train AUC Score: {train_roc_auc:.4f}")
    print(f"Base LGBM Validation AUC Score: {val_roc_auc:.4f}")

In [17]:
def objective(trial):
    """
    Optuna objective: train one LightGBM classifier and score it by PR AUC.

    Hyper-parameters are drawn from ``trial``; the model is fitted on the
    module-level ``X_train`` / ``y_train`` with early stopping (30 rounds)
    against ``X_val`` / ``y_val``, with stdout silenced during fitting.

    Returns the average precision of the validation predictions.
    """
    # Suggestions are requested in the same fixed order on every call.
    tuned_params = {
        "num_leaves": trial.suggest_int("num_leaves", 16, 160),
        "min_child_samples": trial.suggest_int("min_child_samples", 10, 100),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 50),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.1, 0.5),
        # NOTE(review): LightGBM documents that bagging (subsample) only takes
        # effect when bagging_freq / subsample_freq is non-zero, and it is not
        # set here — confirm whether this parameter is actually being tuned.
        "subsample": trial.suggest_float("subsample", 0.7, 1),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 10),
        "n_estimators": trial.suggest_int("n_estimators", 100, 10000),
        "reg_alpha": trial.suggest_float("reg_alpha", 0, 10),
    }

    classifier = LGBMClassifier(
        learning_rate=0.1,
        n_jobs=16,
        is_unbalance=True,
        verbose=-1,
        **tuned_params,
    )

    # Send anything printed during fitting to the null device.
    with open(os.devnull, "w") as sink, contextlib.redirect_stdout(sink):
        classifier.fit(
            X_train,
            y_train,
            eval_set=[(X_val, y_val)],
            callbacks=[log_evaluation(period=100), early_stopping(30)],
        )

    val_proba = classifier.predict_proba(X_val)[:, 1]
    return average_precision_score(y_val, val_proba)

In [18]:
def plot_confusion_matrices(y_test, y_pred_dummy, final_pred, y_pred_adj):
    """
    Show three row-normalized confusion matrices side by side.

    Panels, left to right: the dummy classifier, LightGBM, and the adjusted
    LightGBM predictions. Every matrix is divided by its row sums, so each
    row of actual labels adds up to 100%.
    """
    # (panel title, predictions, x label, y label) for each heatmap.
    panels = (
        ("Dummy Classifier", y_pred_dummy, "", "Actual"),
        ("LightGBM", final_pred, "Predicted", ""),
        ("LightGBM - Adjusted", y_pred_adj, "", ""),
    )

    # All matrices are computed before the figure is created.
    raw_matrices = [confusion_matrix(y_test, preds) for _, preds, _, _ in panels]
    normalized = [
        matrix / matrix.sum(axis=1, keepdims=True) for matrix in raw_matrices
    ]

    _, axes = plt.subplots(1, 3, figsize=(13, 5))

    for axis, matrix, (title, _, x_label, y_label) in zip(axes, normalized, panels):
        sns.heatmap(
            matrix,
            annot=True,
            cmap="Blues",
            fmt=".2%",
            ax=axis,
            cbar=False,
            xticklabels=["Predicted Negative", "Predicted Positive"],
            yticklabels=["Actual Negative", "Actual Positive"],
        )
        axis.set_title(title)
        axis.set_xlabel(x_label)
        axis.set_ylabel(y_label)

    plt.tight_layout()
    plt.show()

In [54]:
def remove_highly_correlated_features(
    df: pd.DataFrame, cat_features: list, threshold: float = 0.9
) -> list:
    """
    List the numerical features that are highly correlated with an earlier one.

    The columns in ``cat_features`` are excluded, the absolute correlation
    matrix of the rest is computed, and a column is reported when its
    absolute correlation with any column positioned before it exceeds
    ``threshold``. The number of reported columns is printed. Nothing is
    dropped here; the caller receives the list of names.
    """
    abs_corr = df.drop(columns=cat_features).corr().abs()

    # Keep only the entries strictly above the diagonal, so each pair is
    # looked at once and a column is never compared with itself.
    above_diagonal = np.triu(np.ones(abs_corr.shape, dtype=bool), k=1)
    upper = abs_corr.where(above_diagonal)

    to_drop = []
    for name in upper.columns:
        if (upper[name] > threshold).any():
            to_drop.append(name)

    print(
        f"Number of highly correlated numerical features to be removed: {len(to_drop)}"
    )
    return to_drop


In [57]:
def target_encode_auto(
    train: pd.DataFrame, val: pd.DataFrame, test: pd.DataFrame, target: pd.Series
) -> tuple:
    """
    Target-encode every object/category column found in ``train``.

    The encoder is fitted on the training frame and ``target`` only, then
    applied to the validation and test frames. The encoded values are
    written back into the three input frames, which are returned as
    ``(train, val, test)``.
    """
    cat_cols = train.select_dtypes(include=["object", "category"]).columns.tolist()

    encoder = TargetEncoder(cols=cat_cols)

    # Fit on train only; val and test are transformed with the fitted encoder.
    encoded_train = encoder.fit_transform(train[cat_cols], target)
    encoded_val = encoder.transform(val[cat_cols])
    encoded_test = encoder.transform(test[cat_cols])

    for frame, encoded in (
        (train, encoded_train),
        (val, encoded_val),
        (test, encoded_test),
    ):
        frame[cat_cols] = encoded

    return train, val, test

## Statistical Functions

In [20]:
def cramers_v(x, y):
    """
    Computes Cramér's V between two categorical variables.

    The statistic is ``sqrt(chi2 / (n * min(r - 1, k - 1)))`` where ``chi2``
    comes from ``scipy.stats.chi2_contingency`` on the cross-tabulation of
    ``x`` and ``y``, ``n`` is the number of observations and ``r``/``k`` are
    the table's row/column counts. The chi-square value is taken as SciPy
    returns it with its default settings.

    Parameters:
    x, y: Equal-length array-likes accepted by ``pd.crosstab``.

    Returns:
    float: Cramér's V, or NaN when either variable has fewer than two
    observed levels (the statistic is undefined there).
    """
    from scipy.stats import chi2_contingency

    contingency = pd.crosstab(x, y)
    n_rows, n_cols = contingency.shape
    min_dim = min(n_rows - 1, n_cols - 1)

    # With a single level on either side the denominator would be zero.
    if min_dim <= 0:
        return np.nan

    chi2 = chi2_contingency(contingency)[0]
    n_obs = contingency.to_numpy().sum()
    return np.sqrt(chi2 / (n_obs * min_dim))

In [21]:
def calculate_confidence_interval(data, z_score=1.96):
    """
    Computes the mean of ``data`` and a normal-approximation confidence interval.

    The interval is ``mean ± z_score * SEM`` where SEM is
    ``scipy.stats.sem(data)``.

    Parameters:
    data: Numeric data exposing ``.mean()`` (e.g. a pandas Series or NumPy array).
    z_score (float): Multiplier applied to the standard error. The default
        1.96 corresponds to a 95% interval under a normal approximation.

    Returns:
    tuple: ``(mean, lower_bound, upper_bound)``.
    """
    from scipy import stats

    mean = data.mean()
    margin_of_error = z_score * stats.sem(data)
    return mean, mean - margin_of_error, mean + margin_of_error

## Feature Functions

In [23]:
def calculate_application_features(df):
    """
    Adds engineered ratio, external-source, employment and family features.

    All new columns are written onto ``df`` in place and nothing is returned.
    Every ratio treats a zero denominator as missing, so the result is NaN
    rather than ±inf.

    Args:
        df (pd.DataFrame): The DataFrame to which the calculated features
            will be added. It must already contain AGE_YEARS and
            YEARS_EMPLOYED in addition to the raw amount, count,
            EXT_SOURCE_* and OCCUPATION_TYPE columns read below.
    """
    # Zero-safe denominators, computed once and reused by every ratio below.
    income = df["AMT_INCOME_TOTAL"].replace(0, np.nan)
    family_size = df["CNT_FAM_MEMBERS"].replace(0, np.nan)
    credit = df["AMT_CREDIT"].replace(0, np.nan)
    annuity = df["AMT_ANNUITY"].replace(0, np.nan)
    goods_price = df["AMT_GOODS_PRICE"].replace(0, np.nan)
    age = df["AGE_YEARS"].replace(0, np.nan)
    # NOTE(review): the "+ 1" makes this (adults + 1) rather than the number
    # of adults — kept as in the original, confirm it is intentional.
    adults_plus_one = (df["CNT_FAM_MEMBERS"] - df["CNT_CHILDREN"] + 1).replace(
        0, np.nan
    )

    # Income and Credit Ratios
    df["ANNUITY_INCOME_RATIO"] = df["AMT_ANNUITY"] / income
    df["INCOME_PER_FAMILY_MEMBER"] = df["AMT_INCOME_TOTAL"] / family_size
    df["INCOME_PER_ADULT"] = df["AMT_INCOME_TOTAL"] / adults_plus_one
    df["CREDIT_PER_FAMILY_MEMBER"] = df["AMT_CREDIT"] / family_size
    df["LOAN_TO_INCOME_RATIO"] = df["AMT_CREDIT"] / income

    # Credit/Loan Features
    df["LOAN_TO_VALUE"] = df["AMT_CREDIT"] / goods_price
    df["ANNUITY_TO_CREDIT"] = df["AMT_ANNUITY"] / credit
    df["AMT_DIFF_CREDIT_GOODS"] = df["AMT_CREDIT"] - df["AMT_GOODS_PRICE"]
    # Uses the zero-safe credit amount like the other ratios; dividing by the
    # raw column produced ±inf for zero-credit rows.
    df["LOAN_TO_GOODS_DIFFERENCE_RATIO"] = df["AMT_DIFF_CREDIT_GOODS"] / credit
    df["CREDIT_TO_ANNUITY_RATIO"] = df["AMT_CREDIT"] / annuity

    # External Sources
    ext_sources = df[["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]]
    df["EXT_SOURCE_MEAN"] = ext_sources.mean(axis=1)
    df["EXT_SOURCE_PROD"] = df["EXT_SOURCE_1"] * df["EXT_SOURCE_2"] * df["EXT_SOURCE_3"]
    df["EXT_SOURCE_WEIGHTED_AVG"] = (
        0.5 * df["EXT_SOURCE_3"] + 0.3 * df["EXT_SOURCE_2"] + 0.2 * df["EXT_SOURCE_1"]
    )
    df["EXT_SOURCE_STD"] = ext_sources.std(axis=1)

    # Employment & Age Ratios
    df["EMPLOYMENT_AGE_RATIO"] = df["YEARS_EMPLOYED"] / age
    # ">= 10" evaluates to False for NaN, so a missing employment length is
    # flagged 0; the previous "< 10 -> 0 else 1" form flagged NaN as 1.
    df["LONG_EMPLOYMENT"] = (df["YEARS_EMPLOYED"] >= 10).astype(int)

    # Family & Occupation
    df["HAS_CHILDREN"] = (df["CNT_CHILDREN"] > 0).astype(int)
    df["CHILDREN_RATIO"] = df["CNT_CHILDREN"] / family_size
    df["CHILDREN_TO_ADULT_RATIO"] = df["CNT_CHILDREN"] / adults_plus_one
    df["LOW_SKILL_LABORER_FLAG"] = np.where(
        df["OCCUPATION_TYPE"] == "Low-skill Laborers", 1, 0
    )

In [24]:
def calculate_bureau_features(df, active_status):
    """
    Aggregate bureau records with a given CREDIT_ACTIVE status per SK_ID_CURR.

    Rows whose CREDIT_ACTIVE equals ``active_status`` are grouped by
    SK_ID_CURR and summarised into "<column>_<stat>" features, followed by a
    record count and several ratio features. Ratios treat a zero denominator
    as missing (NaN). Returns one row per SK_ID_CURR, with SK_ID_CURR as a
    regular column.
    """
    matching = df.loc[df["CREDIT_ACTIVE"] == active_status]
    per_client = matching.groupby("SK_ID_CURR")

    stat_spec = {
        "AMT_CREDIT_SUM": ["sum", "mean", "max", "min"],
        "AMT_CREDIT_SUM_DEBT": ["sum", "mean", "max", "min"],
        "AMT_CREDIT_SUM_OVERDUE": ["sum", "max"],
        "AMT_CREDIT_MAX_OVERDUE": ["max"],
        "CREDIT_DAY_OVERDUE": ["mean"],
        "DAYS_CREDIT_ENDDATE": ["min", "max"],
        "DAYS_ENDDATE_FACT": ["mean"],
        "CNT_CREDIT_PROLONG": ["sum"],
        "DAYS_CREDIT_UPDATE": ["mean"],
        "AMT_ANNUITY": ["sum", "mean"],
    }

    df_agg = per_client.agg(stat_spec).reset_index()

    # Flatten the (column, stat) pairs; the key column keeps its plain name.
    flat_names = []
    for source, stat in df_agg.columns:
        flat_names.append(source if source == "SK_ID_CURR" else f"{source}_{stat}")
    df_agg.columns = flat_names

    # Assigned positionally: both results come from the same grouping.
    df_agg["CREDIT_ACTIVE_COUNT"] = per_client["CREDIT_ACTIVE"].count().values

    # Zero-safe denominators shared by the ratio features.
    credit_total = df_agg["AMT_CREDIT_SUM_sum"].replace(0, np.nan)
    record_count = df_agg["CREDIT_ACTIVE_COUNT"].replace(0, np.nan)
    debt_total = df_agg["AMT_CREDIT_SUM_DEBT_sum"]

    # NOTE(review): the next two features are computed identically — confirm
    # whether CREDIT_UTILIZATION_RATIO was meant to use a different formula.
    df_agg["DEBT_TO_CREDIT_RATIO"] = debt_total / credit_total
    df_agg["CREDIT_UTILIZATION_RATIO"] = debt_total / credit_total
    df_agg["OVERDUE_RATIO"] = df_agg["AMT_CREDIT_SUM_OVERDUE_sum"] / credit_total
    df_agg["AVERAGE_LOAN_AMOUNT"] = df_agg["AMT_CREDIT_SUM_sum"] / record_count
    df_agg["PROLONGED_CREDIT_RATIO"] = df_agg["CNT_CREDIT_PROLONG_sum"] / record_count
    df_agg["ANNUITY_TO_CREDIT_RATIO"] = df_agg["AMT_ANNUITY_sum"] / credit_total

    return df_agg

In [25]:
def calculate_poscash_features(df):
    """
    Summarise POS/cash balance records per SK_ID_CURR.

    Returns a DataFrame indexed by SK_ID_CURR (the key is not turned into a
    column) whose columns are named "<source column>_<stat>".
    """
    stat_spec = {
        "SK_DPD": ["mean", "max", "min"],
        "CNT_INSTALMENT_FUTURE": ["sum", "max", "min", "mean"],
        "CNT_INSTALMENT": "mean",
        "SK_ID_PREV": "nunique",
    }
    pos_cash_agg = df.groupby("SK_ID_CURR").agg(stat_spec)

    # The grouping key stays in the index, so every column is a
    # (source column, stat) pair taken from stat_spec and is simply joined.
    pos_cash_agg.columns = [
        f"{source}_{stat}" for source, stat in pos_cash_agg.columns
    ]

    return pos_cash_agg

In [26]:
def calculate_cc_balance_features(df):

    df_agg = df.groupby("SK_ID_CURR").agg(
        {
            "SK_ID_PREV": ["count", "nunique"],
            "MONTHS_BALANCE": ["mean", "max", "min"],
            "AMT_BALANCE": ["mean", "sum"],
            "AMT_CREDIT_LIMIT_ACTUAL": ["mean", "max"],
            "AMT_DRAWINGS_ATM_CURRENT": ["sum"],
            "AMT_DRAWINGS_CURRENT": ["sum"],
            "AMT_DRAWINGS_OTHER_CURRENT": ["sum"],
            "AMT_DRAWINGS_POS_CURRENT": ["sum"],
            "AMT_INST_MIN_REGULARITY": ["mean"],
            "AMT_PAYMENT_CURRENT": ["sum"],
            "AMT_PAYMENT_TOTAL_CURRENT": ["sum"],
            "AMT_RECEIVABLE_PRINCIPAL": ["sum"],
            "AMT_RECIVABLE": ["sum"],
            "AMT_TOTAL_RECEIVABLE": ["sum"],
            "CNT_DRAWINGS_ATM_CURRENT": ["sum"],
            "CNT_DRAWINGS_CURRENT": ["sum"],
            "CNT_DRAWINGS_OTHER_CURRENT": ["sum"],
            "CNT_DRAWINGS_POS_CURRENT": ["sum"],
            "CNT_INSTALMENT_MATURE_CUM": ["sum"],
            "SK_DPD": ["mean", "max"],
            "SK_DPD_DEF": ["mean", "max"],
        }
    )

    df_agg.columns = ["_".join(col).strip().upper() for col in df_agg.columns]
    df_agg.rename(columns={"SK_ID_CURR_": "SK_ID_CURR"}, inplace=True)

    df_agg["AVG_PAYMENT_TO_BALANCE_RATIO"] = df_agg[
        "AMT_PAYMENT_TOTAL_CURRENT_SUM"
    ] / df_agg["AMT_BALANCE_MEAN"].replace(0, np.nan)
    df_agg["TOTAL_DRAWINGS_RATIO"] = (
        df_agg["AMT_DRAWINGS_ATM_CURRENT_SUM"]
        + df_agg["AMT_DRAWINGS_CURRENT_SUM"]
        + df_agg["AMT_DRAWINGS_OTHER_CURRENT_SUM"]
        + df_agg["AMT_DRAWINGS_POS_CURRENT_SUM"]
    ) / df_agg["AMT_CREDIT_LIMIT_ACTUAL_MEAN"].replace(0, np.nan)
    df_agg["OVERDUE_RATIO"] = df_agg["SK_DPD_DEF_MAX"] / (
        df_agg["SK_DPD_DEF_MAX"] + df_agg["SK_DPD_MAX"]
    ).replace(0, np.nan)

    return df_agg

In [27]:
def calculate_installments_features(df):
    """
    Summarise instalment payment behaviour per SK_ID_CURR.

    Two helper columns, DAYS_LATE and PAYMENT_RATIO, are added to ``df``
    itself before aggregating. Returns one row per SK_ID_CURR, with
    SK_ID_CURR as a regular column.
    """
    # Positive values mean the payment was entered after the instalment date.
    df["DAYS_LATE"] = df["DAYS_ENTRY_PAYMENT"].sub(df["DAYS_INSTALMENT"])
    # A zero instalment amount is swapped for 1 before dividing.
    instalment_divisor = df["AMT_INSTALMENT"].replace(0, 1)
    df["PAYMENT_RATIO"] = df["AMT_PAYMENT"].div(instalment_divisor)

    def count_late(days_late):
        # Number of records with a strictly positive delay.
        return (days_late > 0).sum()

    named_stats = {
        "Average_Installment": ("AMT_INSTALMENT", "mean"),
        "Total_Payments": ("AMT_PAYMENT", "sum"),
        "Payment_Ratio": ("PAYMENT_RATIO", "mean"),
        "Average_Days_Late": ("DAYS_LATE", "mean"),
        "Total_Installments": ("NUM_INSTALMENT_NUMBER", "count"),
        "Max_Payment": ("AMT_PAYMENT", "max"),
        "Min_Payment": ("AMT_PAYMENT", "min"),
        "Late_Payment_Count": ("DAYS_LATE", count_late),
    }

    per_client = df.groupby("SK_ID_CURR")
    return per_client.agg(**named_stats).reset_index()

In [28]:
def calculate_prev_features(df):
    """
    Summarise previous applications per SK_ID_CURR.

    Several helper columns (ratios, differences and day spans) are first
    added to ``df`` itself; a subset of them, together with raw amount and
    day columns, is then aggregated per SK_ID_CURR into "<column>_<stat>"
    features. All ratios treat a zero denominator as missing, so the
    aggregates never contain ±inf.

    Parameters:
    df (pd.DataFrame): Previous-application records; modified in place.

    Returns:
    pd.DataFrame: One row per SK_ID_CURR, with SK_ID_CURR as a regular column.
    """
    # Zero-safe credit amount: dividing by the raw column yields ±inf for
    # zero-credit rows, which then propagates into the max/mean aggregates.
    df["APP_CREDIT_PERC"] = df["AMT_APPLICATION"] / df["AMT_CREDIT"].replace(
        0, np.nan
    )
    df["AMT_DIFF_CREAPP"] = df["AMT_APPLICATION"] - df["AMT_CREDIT"]
    df["AMT_DIFF_CREDIT_GOODS"] = df["AMT_CREDIT"] - df["AMT_GOODS_PRICE"]
    df["AMT_CREDIT_GOODS_PERC"] = df["AMT_CREDIT"] / df["AMT_GOODS_PRICE"].replace(
        0, np.nan
    )
    df["AMT_PAY_YEAR"] = df["AMT_CREDIT"] / df["AMT_ANNUITY"].replace(0, np.nan)
    # NOTE(review): the three day-span columns below are added to df but are
    # not part of the aggregation — confirm whether they should be.
    df["DAYS_TOTAL"] = df["DAYS_LAST_DUE"] - df["DAYS_FIRST_DUE"]
    df["DAYS_TOTAL2"] = df["DAYS_LAST_DUE_1ST_VERSION"] - df["DAYS_FIRST_DUE"]
    df["DAYS_END_DIFF"] = df["DAYS_LAST_DUE_1ST_VERSION"] - df["DAYS_LAST_DUE"]

    num_aggregations = {
        "SK_ID_PREV": ["count"],
        "AMT_ANNUITY": ["max", "mean"],
        "AMT_APPLICATION": ["max", "mean"],
        "AMT_CREDIT": ["mean", "sum"],
        "APP_CREDIT_PERC": ["max", "mean"],
        "AMT_DIFF_CREAPP": ["max", "mean"],
        "AMT_DIFF_CREDIT_GOODS": ["max", "mean"],
        "AMT_CREDIT_GOODS_PERC": ["max", "mean"],
        "AMT_PAY_YEAR": ["max", "mean"],
        "AMT_DOWN_PAYMENT": ["max", "mean"],
        "RATE_DOWN_PAYMENT": ["max", "mean"],
        "DAYS_DECISION": ["max", "mean", "min"],
        "CNT_PAYMENT": ["mean", "sum"],
    }

    df_agg = df.groupby("SK_ID_CURR").agg(num_aggregations).reset_index()

    # Flatten the (column, stat) pairs; the key column keeps its plain name.
    df_agg.columns = [
        "SK_ID_CURR" if col[0] == "SK_ID_CURR" else f"{col[0]}_{col[1]}"
        for col in df_agg.columns.values
    ]

    return df_agg