In [None]:
import random
from collections import OrderedDict
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

sns.set_theme()

data_reduction = OrderedDict()

SEED = 1337

def seed_everything(seed):
    """Seed the NumPy and ``random`` global generators with the same value.

    Parameters:
    - seed: value passed unchanged to ``np.random.seed`` and ``random.seed``.
    """
    for seed_generator in (np.random.seed, random.seed):
        seed_generator(seed)
    
seed_everything(SEED)

# Data Import & Wrangling

## Helper Functions

In [None]:
def remap_values(df, column, mapping):
    """
    Replaces the values of one column according to a mapping, in place.

    Missing values (NaN/None) are left untouched. Every other value must be a
    key of the mapping; otherwise nothing is modified and an error is raised,
    so that an unexpected category cannot silently turn into NaN.

    Parameters:
    - df: pandas.DataFrame holding the column; it is modified in place.
    - column: string, name of the column to remap.
    - mapping: dict translating each original value to its replacement.

    Returns:
    - the same DataFrame object, so ``df = remap_values(df, ...)`` works.

    Raises:
    - ValueError: if the column holds a non-missing value that is not a key
      of the mapping.
    """
    present = df[column].dropna()
    unmapped = present[~present.isin(list(mapping))].unique().tolist()
    if unmapped:
        # an explicit raise (instead of assert) also survives `python -O`
        raise ValueError(
            f"Column {column!r} contains values missing from the mapping: {unmapped}"
        )

    df[column] = df[column].map(mapping, na_action="ignore")
    return df


def map_empty_to_nan(df, column):
    """
    Turns empty or whitespace-only strings of an object column into NaN, in place.

    Columns whose dtype is not "object" are left as they are.

    Parameters:
    - df: pandas.DataFrame holding the column; it is modified in place.
    - column: name of the column to clean.

    Returns:
    - the same DataFrame object.
    """
    values = df[column]
    if values.dtype == "object":
        # the pattern matches strings made up of zero or more whitespace characters
        df[column] = values.replace(r"^\s*$", np.nan, regex=True)
    return df


def read_csv(file_path, sep=";", dtypes=None):
    """
    Reads a CSV file and converts blank strings to NaN in every column.

    Parameters:
    - file_path: location of the CSV file, passed on to pandas.read_csv.
    - sep: field separator, ";" by default.
    - dtypes: optional dtype specification passed on as pandas' `dtype` argument.

    Returns:
    - pandas.DataFrame with the cleaned content of the file.
    """
    frame = pd.read_csv(file_path, dtype=dtypes, sep=sep)

    # map_empty_to_nan itself skips columns that are not of dtype "object"
    for column_name in frame.columns:
        frame = map_empty_to_nan(frame, column_name)

    return frame

In [None]:
def plot_categorical_variables(df, categorical_columns, fill_na_value="NA", rotate_x=False):
    """
    Plots count plots for categorical variables in a DataFrame, filling NA values with a specified string.

    One subplot is drawn per column, with the bars ordered by descending count
    and each bar annotated with its share of all rows in percent.

    NOTE(review): the NA fill is written back into `df`, so after the call the
    caller's DataFrame holds `fill_na_value` instead of NaN in the plotted
    columns. Cells that run later see that change - confirm whether they rely on it.

    Parameters:
    - df: pandas.DataFrame containing the data (modified in place, see note above).
    - categorical_columns: list of strings, names of the categorical variables in df to plot.
    - fill_na_value: string, the value to use for filling NA values in the categorical variables.
    - rotate_x: bool, if True the x tick labels are rotated by 45 degrees even
      when a variable has five or fewer distinct values.
    """
    # Fill NA values in the specified categorical variables
    for var in categorical_columns:
        if df[var].isna().any():
            df[var] = df[var].fillna(fill_na_value)

    # denominator for the percentage labels: all rows of the DataFrame
    total = float(len(df))
    fig, axes = plt.subplots(
        nrows=len(categorical_columns), figsize=(8, len(categorical_columns) * 5)
    )

    if len(categorical_columns) == 1:  # If there's only one categorical variable, wrap axes in a list
        axes = [axes]

    for i, var in enumerate(categorical_columns):
        # order the bars from the most to the least frequent value
        ax = sns.countplot(
            x=var, data=df, ax=axes[i], order=df[var].value_counts().index
        )

        axes[i].set_title(f"Distribution of {var}")
        axes[i].set_ylabel("Count")
        axes[i].set_xlabel(var)

        # if the number is more than 5 rotate the x labels or if specified
        if len(df[var].value_counts()) > 5 or rotate_x:
            ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")

        # write the percentage slightly above each bar, centred horizontally
        for p in ax.patches:
            height = p.get_height()
            ax.text(
                p.get_x() + p.get_width() / 2.0,
                height + 3,
                "{:1.2f}%".format((height / total) * 100),
                ha="center",
            )

    plt.tight_layout()
    plt.show()


def plot_numerical_distributions(df, numerical_columns, kde=True, bins=30):
    """
    Draws one histogram per requested column, stacked vertically in one figure.

    Parameters:
    - df: pandas.DataFrame containing the data.
    - numerical_columns: list of strings, names of the columns in df to plot.
    - kde: passed on to seaborn's histplot as `kde`.
    - bins: passed on to seaborn's histplot as `bins`.
    """
    # one subplot row per requested column
    n_plots = len(numerical_columns)
    _, axes = plt.subplots(nrows=n_plots, ncols=1, figsize=(8, 5 * n_plots))

    # with a single row plt.subplots hands back one Axes, not a sequence
    axes_per_column = [axes] if n_plots == 1 else axes

    for column, axis in zip(numerical_columns, axes_per_column):
        sns.histplot(df[column], ax=axis, kde=kde, bins=bins)
        axis.set_title(f"Distribution of {column}")
        axis.set_xlabel(column)
        axis.set_ylabel("Frequency")

    plt.tight_layout()
    plt.show()


def plot_date_monthly_counts(df, date_column, title):
    """
    Plots the monthly counts of a date column in a DataFrame.

    The DataFrame is only read: neither the date column is converted in place
    nor is a helper "month" column added to it.

    Parameters:
    - df: pandas.DataFrame containing the data.
    - date_column: string, name of the date column in df to plot.
    - title: string, title of the plot.
    """
    # derive the calendar month on a separate Series so that the caller's
    # DataFrame (which may carry its own "month" column) stays untouched
    months = pd.to_datetime(df[date_column]).dt.to_period("M")

    monthly_counts = months.value_counts().sort_index()
    monthly_counts.plot(kind="bar")
    plt.title(title)
    plt.xlabel("Month")
    plt.ylabel("Count")
    plt.show()


def add_percentage_labels(ax, hue_order):
    """
    Writes the height of every bar, formatted as a percentage, at the bar's centre.

    Parameters:
    - ax: axes object exposing `patches` (the bars) and a `text` method.
    - hue_order: accepted for the callers' sake; not used inside this function.
    """
    label_style = {
        "ha": "center",
        "va": "center",
        "fontsize": 9,
        "color": "white",
        "weight": "bold",
    }

    for bar in ax.patches:
        bar_height = bar.get_height()
        bar_width = bar.get_width()
        left = bar.get_x()
        bottom = bar.get_y()
        # anchor the label at the geometric centre of the bar
        ax.text(
            left + bar_width / 2,
            bottom + bar_height / 2,
            f"{bar_height:.1f}%",
            **label_style,
        )

## Entities
The following section will describe the individual entities in the dataset.

Accordingly there is some simple remapping done to allow for easier understanding of the data. This can include translating the values from Czech to English or remapping the values to a more understandable format.

See code comments for more details of how the data is transformed.

### Accounts

In [None]:
# load the accounts and translate the statement frequency from Czech to English
# according to https://sorry.vse.cz/~berka/challenge/PAST/index.html
accounts_df = remap_values(
    read_csv("data/account.csv"),
    "frequency",
    {
        "POPLATEK MESICNE": "MONTHLY_ISSUANCE",
        "POPLATEK TYDNE": "WEEKLY_ISSUANCE",
        "POPLATEK PO OBRATU": "ISSUANCE_AFTER_TRANSACTION",
    },
)

# the creation date is stored as YYMMDD
accounts_df["date"] = pd.to_datetime(accounts_df["date"], format="%y%m%d")

# give both columns an account-specific name
accounts_df = accounts_df.rename(
    columns={"frequency": "account_frequency", "date": "account_created"}
)

data_reduction["Total number of accounts"] = len(accounts_df)
accounts_df.info()

In [None]:
accounts_df.head()

In [None]:
accounts_df.nunique()

In [None]:
plot_categorical_variables(accounts_df, ["account_frequency"])

In [None]:
plot_numerical_distributions(accounts_df, ["account_created"])

### Clients

In [None]:
clients_df = read_csv("data/client.csv")


def parse_birth_number(birth_number):
    """
    Decodes a birth number into the client's sex and birth date.

    The number is read as YYMMDD; for female clients the month is stored as
    MM + 50 (according to https://sorry.vse.cz/~berka/challenge/PAST/index.html).

    Parameters:
    - birth_number: int or string. Values shorter than six characters are
      left-padded with zeros, because an integer such as 50101 has lost the
      leading zero of the year "05".

    Returns:
    - pandas.Series holding [sex, birth_date], where sex is "Female" or "Male"
      and birth_date is a datetime in the 1900s.

    Raises:
    - ValueError: if the number does not consist of digits or does not encode
      a valid calendar date.
    """
    # restore leading zeros dropped by an integer representation
    birth_number_str = str(birth_number).zfill(6)

    # extract year, month, and day from birth number from string
    year = int(birth_number_str[:2])
    month = int(birth_number_str[2:4])
    day = int(birth_number_str[4:6])

    # determine sex based on month and adjust month for female clients
    if month > 50:
        sex = "Female"
        month -= 50
    else:
        sex = "Male"

    # datetime() itself rejects impossible months and days (including
    # 29 February in a non-leap year), so no separate range checks are needed;
    # assuming all dates are in the 1900s
    try:
        birth_date = datetime(1900 + year, month, day)
    except ValueError as err:
        raise ValueError(f"Invalid birth number: {birth_number!r}") from err

    return pd.Series([sex, birth_date])


# decode every birth number into the client's sex and birth date
clients_df[["sex", "birth_date"]] = clients_df["birth_number"].apply(parse_birth_number)

# calculate 'age' assuming the reference year is 1999
clients_df["age"] = [1999 - born.year for born in clients_df["birth_date"]]

# 'birth_number' has been fully decoded above and is no longer needed
clients_df = clients_df.drop("birth_number", axis=1)

clients_df.info()

In [None]:
clients_df.head()

In [None]:
clients_df.describe()

In [None]:
plot_numerical_distributions(clients_df, ["birth_date", "age"], bins=40)

Here we can see the distribution of birth dates and the age of the clients. We see that the majority of clients are between 20 and 60 years old. There are a few noticeable drops at around age 30, 50 and 70 years, which is not really of any concern.

We also notice that there are a few underage clients.

### Dispositions

In [None]:
dispositions_df = read_csv("data/disp.csv")
dispositions_df.info()

In [None]:
dispositions_df.head()

In [None]:
dispositions_df.describe()

In [None]:
plot_categorical_variables(dispositions_df, ["type"])

The plot above shows the distribution of account types. We can see that roughly 4 out of 5 accounts are categorized as "OWNER" accounts.

As the goal of this model is to address accounts and not clients directly, we will focus solely on the clients who own an account.

In [None]:
dispositions_df = dispositions_df[dispositions_df["type"] == "OWNER"]

### Orders

In [None]:
# load the orders and translate the payment characterisation from Czech to English
# according to https://sorry.vse.cz/~berka/challenge/PAST/index.html
orders_df = remap_values(
    read_csv("data/order.csv"),
    "k_symbol",
    {
        "POJISTNE": "Insurance_Payment",
        "SIPO": "Household",
        "LEASING": "Leasing",
        "UVER": "Loan_Payment",
    },
)

# store the recipient account as a category and name the amount by its meaning
orders_df = orders_df.astype({"account_to": "category"}).rename(
    columns={"amount": "debited_amount"}
)

orders_df.info()

In [None]:
orders_df.head()

In [None]:
orders_df.describe()

In [None]:
orders_df.nunique()

There appear to be as many order ids as there are rows. This implies that each order is unique which makes sense.

In [None]:
plot_categorical_variables(orders_df, ["k_symbol", "bank_to"])

Taking a closer look at the distribution of type of order (k_symbol) and receiving bank (bank_to) we can see that the majority of orders are Household orders and NA. 

We also see that the receiving bank is somewhat equally distributed, meaning that the orders are not concentrated on a single bank.

In [None]:
plot_numerical_distributions(orders_df, ["debited_amount"], bins=80)

The plot above shows that the debited amounts are primarily between 0 and 4000. From there on we can see a steady decline up to around 15000.

### Transactions

In [None]:
# column 8 is the 'bank' column which contains NaNs and must be read as string
transactions_df = read_csv("data/trans.csv", dtypes={8: str})

# the transaction date is stored as YYMMDD
transactions_df["date"] = pd.to_datetime(transactions_df["date"], format="%y%m%d")

# Translated type, operations and characteristics from Czech to English
# according to https://sorry.vse.cz/~berka/challenge/PAST/index.html
transactions_df = remap_values(
    transactions_df,
    "type",
    {
        "PRIJEM": "Credit",
        "VYDAJ": "Withdrawal",
        "VYBER": "Withdrawal",  # Also withdrawal as it is against the documentation present in the dataset
    },
)
transactions_df = remap_values(
    transactions_df,
    "operation",
    {
        "VYBER KARTOU": "Credit Card Withdrawal",
        "VKLAD": "Credit in Cash",
        "PREVOD Z UCTU": "Collection from Another Bank",
        "VYBER": "Withdrawal in Cash",
        "PREVOD NA UCET": "Remittance to Another Bank",
    },
)
transactions_df = remap_values(
    transactions_df,
    "k_symbol",
    {
        "POJISTNE": "Insurance Payment",
        "SLUZBY": "Payment on Statement",
        "UROK": "Interest Credited",
        "SANKC. UROK": "Sanction Interest",
        "SIPO": "Household",
        "DUCHOD": "Old-age Pension",
        "UVER": "Loan Payment",
    },
)

# keep the amount as is for credits and flip its sign for everything else
transactions_df["amount"] = transactions_df["amount"].where(
    transactions_df["type"] == "Credit", -transactions_df["amount"]
)

transactions_df = transactions_df.rename(columns={"type": "transaction_type"})

transactions_df.info()

In [None]:
transactions_df.head()

In [None]:
transactions_df.describe()

In [None]:
plot_categorical_variables(
    transactions_df, ["transaction_type", "operation", "k_symbol"]
)

In [None]:
plot_numerical_distributions(transactions_df, ["date", "amount", "balance"])

Looking at the distributions of the transaction table we can see that the count of transactions per year increases over time. So we can conclude that the bank has a growing client base. It may, however, also show that there are more transactions per client.

However, the other plots are not very useful. For one the transaction amount seems to be very sparse, ranging from values between -80000 and 80000.

The balance distribution also showcases that there are accounts with a negative balance after a transaction, which would only make sense if debt is also included in this value.

According to description of the field balance: "balance after transaction"

#### {{{{POTENTIAL ADD A PLOT SHOWCASING CLIENT BASE GROWTH}}}}

#### Transaction Amounts and Counts by Month

In [None]:
# Derive calendar year and month columns from the transaction date; they are
# used below to slice the transactions per year and per month.
transactions_df["year"] = transactions_df["date"].dt.year
transactions_df["month"] = transactions_df["date"].dt.month

# tick labels for the shared x-axis (months 1..12)
months = [
    "Jan",
    "Feb",
    "Mar",
    "Apr",
    "May",
    "Jun",
    "Jul",
    "Aug",
    "Sep",
    "Oct",
    "Nov",
    "Dec",
]
years = sorted(transactions_df["year"].unique())

# two stacked subplots per year: a tall box plot (height ratio 3) followed by
# a short bar chart (height ratio 1), all sharing one x-axis
fig, axs = plt.subplots(
    len(years) * 2,
    1,
    figsize=(8, 6 * len(years)),
    sharex=True,
    gridspec_kw={"height_ratios": [3, 1] * len(years)},
)

for i, year in enumerate(years):
    # filter transactions for the current year
    yearly_transactions = transactions_df[transactions_df["year"] == year]

    # preparing data for the box plot: a list of amounts for each month for the current year
    amounts_per_month_yearly = [
        yearly_transactions[yearly_transactions["month"] == month]["amount"]
        for month in range(1, 13)
    ]

    # preparing data for the bar chart for the current year
    monthly_summary_yearly = (
        yearly_transactions.groupby("month")
        .agg(TotalAmount=("amount", "sum"), TransactionCount=("amount", "count"))
        .reset_index()
    )

    # box plot for transaction amounts by month for the current year
    # (symlog scale, so that negative and positive amounts share one axis)
    axs[i * 2].boxplot(amounts_per_month_yearly, patch_artist=True)
    axs[i * 2].set_title(f"Transaction Amounts Per Month in {year} (Box Plot)")
    axs[i * 2].set_yscale("symlog")
    axs[i * 2].set_ylabel("Transaction Amounts (log scale)")
    axs[i * 2].grid(True, which="both")

    # Bar chart for transaction count by month for the current year
    axs[i * 2 + 1].bar(
        monthly_summary_yearly["month"],
        monthly_summary_yearly["TransactionCount"],
        color="tab:red",
        alpha=0.6,
    )
    axs[i * 2 + 1].set_ylabel("Transaction Count")
    axs[i * 2 + 1].grid(True, which="both")

# Setting x-ticks and labels for the last bar chart (shared x-axis for all)
axs[-1].set_xticks(range(1, 13))
axs[-1].set_xticklabels(months)
axs[-1].set_xlabel("Month")

plt.tight_layout()
plt.show()

#### Negative Balances

In [None]:
# transactions that left the account with a balance below zero
negative_balances = transactions_df.loc[transactions_df["balance"].lt(0)]
plot_numerical_distributions(negative_balances, ["balance", "amount"])
print(f"Number of transactions with negative balance: {negative_balances.shape[0]}")

There appear to be 2999 transactions which have a negative balance, therefore after the transaction the account balance was negative. This implies that these accounts are in some kind of debt.

### Loans

In [None]:
loans_df = read_csv("data/loan.csv")

# the granting date is stored as YYMMDD
loans_df["date"] = pd.to_datetime(loans_df["date"], format="%y%m%d")

# decode the status letters; remap_values is used (as for the other entities)
# because it refuses unknown codes instead of silently turning them into NaN
loans_df = remap_values(
    loans_df,
    "status",
    {
        "A": "Contract finished, no problems",
        "B": "Contract finished, loan not paid",
        "C": "Contract running, OK thus-far",
        "D": "Contract running, client in debt",
    },
)

# only the columns whose name actually changes are listed;
# "amount", "duration" and "status" keep their names
loans_df = loans_df.rename(
    columns={
        "date": "granted_date",
        "payments": "monthly_payments",
    }
)

loans_df.info()

In [None]:
loans_df.head()

In [None]:
loans_df.describe()

In [None]:
loans_df.nunique()

It seems that one account can have at most one loan.

In [None]:
plot_categorical_variables(loans_df, ["duration", "status"], rotate_x=True)

The distribution of durations seems to be even. Most of the loans are still running and the majority of loans are in good standing. Around 200 loans are finished without problems. Around 11% of loans are in a potentially problematic state.

In [None]:
plot_numerical_distributions(loans_df, ["granted_date"])

The distribution of granted dates shows a steady increase with a drop at 1995-1996 before reaching its peak at 1998.

### Credit Cards

In [None]:
# load the credit cards
cards_df = read_csv("data/card.csv")

# parse the issue timestamp ("YYMMDD HH:MM:SS") and keep only its calendar date
# NOTE(review): `.dt.date` yields Python date objects, whereas the other date
# columns in this file are kept as pandas datetimes - confirm this is intended.
cards_df["issued"] = pd.to_datetime(
    cards_df["issued"], format="%y%m%d %H:%M:%S"
).dt.date

cards_df.info()

In [None]:
cards_df.head()

In [None]:
cards_df.describe()

In [None]:
plot_categorical_variables(cards_df, ["type"])

We can see that a majority of the credit cards issued are classic. A small fraction of 16.26% of the credit cards are junior cards and 9.87% are gold cards. 

In [None]:
plot_numerical_distributions(cards_df, ["issued"])

The plot above shows that over time there appear to be more credit cards issued which aligns with assumed growth of the bank (as shown in the previous plots with increasing number of transactions).

### Demographic data

In [None]:
# load the demographic data and give the generic A1..A16 columns readable names
# according to https://sorry.vse.cz/~berka/challenge/PAST/index.html
districts_df = read_csv("data/district.csv").rename(
    columns={
        "A1": "district_id",
        "A2": "district_name",
        "A3": "region",
        "A4": "inhabitants",
        "A5": "small_municipalities",
        "A6": "medium_municipalities",
        "A7": "large_municipalities",
        "A8": "huge_municipalities",
        "A9": "cities",
        "A10": "ratio_urban_inhabitants",
        "A11": "average_salary",
        "A12": "unemployment_rate_1995",
        "A13": "unemployment_rate_1996",
        "A14": "entrepreneurs_per_1000_inhabitants",
        "A15": "crimes_committed_1995",
        "A16": "crimes_committed_1996",
    }
)

# force the yearly figures to numeric; entries that cannot be parsed become NaN
for col in (
    "unemployment_rate_1995",
    "unemployment_rate_1996",
    "crimes_committed_1995",
    "crimes_committed_1996",
):
    districts_df[col] = pd.to_numeric(districts_df[col], errors="coerce")

districts_df.info()

It appears as if there is 1 null value for unemployment rate in 1995 and crimes committed in 1995.

In [None]:
districts_df["crimes_committed_1995"].corr(districts_df["crimes_committed_1996"])

The correlation between crimes committed in 1995 and 1996 is 0.99, which is very high. This implies that the number of crimes committed in 1996 is a good predictor for the number of crimes committed in 1995. Since 1995 is missing a data point, we will impute the missing value from the number of crimes committed in 1996.

In [None]:
districts_df["unemployment_rate_1995"].corr(districts_df["unemployment_rate_1996"])

The same goes for the unemployment rate in 1995 and 1996. 

We can impute the values therefore using a linear regression model to approximate the missing values.

In [None]:
from sklearn.linear_model import LinearRegression


def _impute_from_linear_fit(df, target, predictor):
    """
    Fills the missing values of one column, in place, from a linear fit on another column.

    The regression is fitted on the rows where `target` is present and then
    evaluated on the rows where it is missing.

    Parameters:
    - df: pandas.DataFrame to modify.
    - target: name of the column holding the missing values.
    - predictor: name of the column used as the single regressor.

    Returns:
    - the same DataFrame object.
    """
    missing = df[target].isna()
    if not missing.any():
        # nothing to impute; predicting on zero rows would raise
        return df

    model = LinearRegression()
    model.fit(
        df.loc[~missing, [predictor]].to_numpy(),
        df.loc[~missing, target].to_numpy(),
    )
    df.loc[missing, target] = model.predict(df.loc[missing, [predictor]].to_numpy())
    return df


# impute on a copy, using the 1996 figure of each measure as the regressor
districts_df_imputed = districts_df.copy()
districts_df_imputed = _impute_from_linear_fit(
    districts_df_imputed, "crimes_committed_1995", "crimes_committed_1996"
)
districts_df_imputed = _impute_from_linear_fit(
    districts_df_imputed, "unemployment_rate_1995", "unemployment_rate_1996"
)

districts_df = districts_df_imputed.copy()
# drop any row that still contains a missing value after the imputation
districts_df.dropna(inplace=True)

In [None]:
districts_df.isnull().sum()

Now there are no missing values in the dataset and the integrity of the data is preserved.

In [None]:
districts_df.head()

In [None]:
districts_df.describe()

In [None]:
districts_df.nunique()

In [None]:
plot_numerical_distributions(districts_df, ["average_salary", "crimes_committed_1995"])

In [None]:
# bar chart of the total number of inhabitants per region (districts summed up)
plt.figure(figsize=(10, 6))
sum_inhabitants_per_region = districts_df.groupby("region")["inhabitants"].sum()
sum_inhabitants_per_region.plot(kind="bar")
plt.xlabel("Region")
plt.ylabel("Total Number of Inhabitants")
plt.title("Total Number of Inhabitants per Region")

# write the exact total slightly above each bar
for i, v in enumerate(sum_inhabitants_per_region.tolist()):
    plt.text(i, v + 10000, str(v), ha="center")

plt.show()

Looking at the distribution of inhabitants per region we can see that the majority of inhabitants are in the region of Prague. This is not surprising as Prague is the capital of the Czech Republic and is the most populous region.

A quick Google search seems to show that the inhabitants per region are approximately correct. Here is an example for [Prague](https://www.google.com/search?q=prague+city+population+1999&sourceid=chrome&ie=UTF-8)

This is not the case for south Moravia and north Moravia, which appear to be overestimated, however we couldn't find a very reliable source for this information.

## Data Relationships

Following the documentation of the dataset, there are multiple relationships that need to be validated. https://sorry.vse.cz/\~berka/challenge/PAST/index.html

The ERD according to the descriptions on https://sorry.vse.cz/\~berka/challenge/PAST/index.html

[![](https://mermaid.ink/img/pako:eNqtV1Fv4jgQ_itWXu6l7SZ0gQatTsqGdhddCxVQrXSqFJnEgLWJnbOd7bGl__3GTgA3JGyvWh7ajPPNZ898nrHz7MQ8Ic7AIWJI8Urg7JEh-AVhOHkYz9FzaeqfVIKyFcJxzAumIpqg-7_QoxOUNhoNH50jcEL1Q2zQNxo9rAZq8AQrUv45EIaCYEU5Q0MYb-BeCvJPQVi8AZ-RlAVmMUE3u8Gdw0v5L7wdXTeGE6eUWNGExmxa3YIKtY5YkS2IAOBnbbYt7S1hVwsbjmb3k9loPpqMG1YHRPl-bUCSc0lNThqzfYjlpjWWYxVvfqmi2uRamMkTI-LDgySiFsNkOryeNqyei4SI_fIn2voNq1lg9j1SHGBTEtPcxPgZxk7w1tAVve2wTDlWCGdmWkg1WVBFEhRkdWDF_D2Sm2zBU8De402mWcM1FjhWRNCfZt_WkjSfBuNZELYIrQRmcp-qubaAq1XptyXMriqbsmXXVjJD3SVUffhG1ToR-AmnDVCeE1EW52vmO2gmJ9JqQ49TW4IXODWVDCVWPQVLSCqyfE_rYU_SpkltOxkVhYLt_YudZAFrm6hS-XYSNMkLkbG9urdg_C5ZDdcXiFjv1rqutewb7HHaKbxMir2cBjWs7GO2vNztEpB3nKl1ukFVAciGgKTCqpA71pmx6p05mA6b-jIWyaErg9F6wOR2lz3dIHc7XNPN4fkoqxTOEUjkbsbSstN66Nrz6SicN7fsfe-_P3HkVXCGM2KDxmA3wARZlfJMzUNdP8YjvowoW2PoW7iUZ8wvEF-i0WGw2SsrGIXOiFNIHJHRE9S9zRSlKuq6rtb7FRB9QjD8PkpwhJ4ceb7vH_PCy3P95n3UHbfk9hu59dtz_93kKxV5wNCQjD_NeDNrXEL2ioTGPi4tU3LaoRAL3S5eqfmgx2wx0VTDG8r9B7TmFYkkdE-h70ZBOYBmZuDYoWAky1NuajiCNZDI7-oJrWE9F0F_-N03efdavHvN6QGEILkgjBRCRnCwmBzXwr-2QQhAyGvPN88yqqAhRrGgGeho4gnNcz2IU14926tndYBaD9huz8_58_7KPDDtLsa6I4_2lVpHVxfSgSlpSROY5ADeUVVY-46oHQIpeUzNBPqU3nlVlP_PyYZpz-227Mja5SuWLSsqb3wa8wUk0bK3Ie1rTxlsDNfCZvS2PD4Hpv8y9BW2ssY5Z05GRIZpAt8ppuE-OmpNdJ_U0IQscZGaI-0FoLhQfLZhsTNQoiBnTpHrzl593DiDJU4ljOaY_c15tgPpGw8Xd-WnkPkiMhBn8Oz86wyuOhf9bsf96Lndvuu5ff_M2TgDz-1ddK763qXvel237131X86cn4bUvbi67PU6Xt_z3Y--e9ntv_wHEyE3kA?type=png)](https://mermaid.live/edit#pako:eNqtV1Fv4jgQ_itWXu6l7SZ0gQatTsqGdhddCxVQrXSqFJnEgLWJnbOd7bGl__3GTgA3JGyvWh7ajPPNZ898nrHz7MQ8Ic7AIWJI8Urg7JEh-AVhOHkYz9FzaeqfVIKyFcJxzAumIpqg-7_QoxOUNhoNH50jcEL1Q2zQNxo9rAZq8AQrUv45EIaCYEU5Q0MYb-BeCvJPQVi8AZ-RlAVmMUE3u8Gdw0v5L7wdXTeGE6eUWNGExmxa3YIKtY5YkS2IAOBnbbYt7S1hVwsbjmb3k9loPpqMG1YHRPl-bUCSc0lNThqzfYjlpjWWYxVvfqmi2uRamMkTI-LDgySiFsNkOryeNqyei4SI_fIn2voNq1lg9j1SHGBTEtPcxPgZxk7w1tAVve2wTDlWCGdmWkg1WVBFEhRkdWDF_D2Sm2zBU8De402mWcM1FjhWRNCfZt_WkjSfBuNZELYIrQRmcp-qubaAq1XptyXMriqbsmXXVjJD3SVUffhG1ToR-AmnDVCeE1EW52vmO2gmJ9JqQ49TW
4IXODWVDCVWPQVLSCqyfE_rYU_SpkltOxkVhYLt_YudZAFrm6hS-XYSNMkLkbG9urdg_C5ZDdcXiFjv1rqutewb7HHaKbxMir2cBjWs7GO2vNztEpB3nKl1ukFVAciGgKTCqpA71pmx6p05mA6b-jIWyaErg9F6wOR2lz3dIHc7XNPN4fkoqxTOEUjkbsbSstN66Nrz6SicN7fsfe-_P3HkVXCGM2KDxmA3wARZlfJMzUNdP8YjvowoW2PoW7iUZ8wvEF-i0WGw2SsrGIXOiFNIHJHRE9S9zRSlKuq6rtb7FRB9QjD8PkpwhJ4ceb7vH_PCy3P95n3UHbfk9hu59dtz_93kKxV5wNCQjD_NeDNrXEL2ioTGPi4tU3LaoRAL3S5eqfmgx2wx0VTDG8r9B7TmFYkkdE-h70ZBOYBmZuDYoWAky1NuajiCNZDI7-oJrWE9F0F_-N03efdavHvN6QGEILkgjBRCRnCwmBzXwr-2QQhAyGvPN88yqqAhRrGgGeho4gnNcz2IU14926tndYBaD9huz8_58_7KPDDtLsa6I4_2lVpHVxfSgSlpSROY5ADeUVVY-46oHQIpeUzNBPqU3nlVlP_PyYZpz-227Mja5SuWLSsqb3wa8wUk0bK3Ie1rTxlsDNfCZvS2PD4Hpv8y9BW2ssY5Z05GRIZpAt8ppuE-OmpNdJ_U0IQscZGaI-0FoLhQfLZhsTNQoiBnTpHrzl593DiDJU4ljOaY_c15tgPpGw8Xd-WnkPkiMhBn8Oz86wyuOhf9bsf96Lndvuu5ff_M2TgDz-1ddK763qXvel237131X86cn4bUvbi67PU6Xt_z3Y--e9ntv_wHEyE3kA)

This ERD shows how the data appears in the dataset:

[![](https://mermaid.ink/img/pako:eNqtV99P2zAQ_lesvOyFbjCJSq2mSSHlRzRoUVq0F6TITdzWIrEz2xnqgP99ZydNTeIUhOgD5JzvPvvufJ-dJy_hKfHGHhETitcC5_cMwc8PgtnddIGeKlP_pBKUrRFOEl4yFdMU3f5C955f2Sic3HsdcEr1Q2LQFxo9qQda8BQrUv3ZEwaCYEU5QxMYd3CvBPlTEpZswSeUssQsIehiN7hzeKn-BdfhuTOcJKPEiiYwpmt1SyrUJmZlviQCgGfa7Fvae8KuFzYJ57ezebgIZ1PH6oCoaNYGJAWX1OTEme19LBe9sXSrePFmFdW20IWZPTIivt1JIloxzKLJeeRYPRcpEc3yZ9r6hNUsMXuIFQdYRBJamBjPYOwAbwtd09sOq4xjhXBupoVUkyVVJEV-3gbWzA-x3OZLngH2Fm9zzRpssMCJIoL-M_u2laRF5E_nftBTaCUwk02qFtoCrt5Kvy9hdlfZlD27ti4z9F1K1bffVG1SgR9x5oDygoiqOV8z34CYHEirDe2mtgIvcWY6GVqsfvJXkFRk-R6uhz1JX01a28lUUSjY3m_sJAvY2kR1la9nvqu8EBlrqnsNxmeV1XBdQsR6t7br2sq-wXbTTuFlWjblNKhJbXfZimq3S0DecKY22RbVDSAdAUmFVSl3rHNjtZXZjyYuXcYi3asyGL0HTGGr7GGB3O1wTbeA505WKZwjkMjdjJVlp3Wv2osoDBZuyW60__bAkVfDGc6JDZqC7YAJsq7KE5mHdv0Yj_kqpmyDQbdwVZ4p_4r4CoX7QbdXXjIKyogzSByR8SP0vc0UZyo-PT7W9X4FRD8QDH-MEhxBk-OT0WjU5YWXA_3mY9TfjyvukZNbvx2MPky-VvEJMDiS8dOMu1mTCtJUJDB2t7VMy2mHUiy1XLyq5p0es4uJIg13tPtfkOY1iSWop9B3I78aQHMz0HUoGcmLjJsejmENJB6d6gmtYT0XQV9Gp-_yHvZ4D93pAYQghSCMlELGcLCYHLfCP7dBCEDopD_fPM-pAkGME0FzqKOJJzDP7SAOeQ1tr6GlAC0NeH4eDPhTc2UeG7lLsFbksOnUNrq-kI5NS0uawiR78I6qxtp3RO3gS8kTaibQp_TOq6bUTs_P73WyYbWnUWTtcoVlz4qqG5_GXEJJdNn7kPa1pwo2gWuhEw1Tm-NzbPSXoSvYyhrnHXk5ETmmKXynGMG999SGaJ3U0JSscJmZI-0FoLhUfL5liTdWoiRHXlloZa8_brzxCmcSRvUVh4ub6tsn4WxF197LfwBRISI?type=png)](https://mermaid.live/edit#pako:eNqtV99P2zAQ_lesvOyFbjCJSq2mSSHlRzRoUVq0F6TITdzWIrEz2xnqgP99ZydNTeIUhOgD5JzvPvvufJ-dJy_hKfHGHhETitcC5_cMwc8PgtnddIGeKlP_pBKUrRFOEl4yFdMU3f5C955f2Sic3HsdcEr1Q2LQFxo9qQda8BQrUv3ZEwaCYEU5QxMYd3CvBPlTEpZswSeUssQsIehiN7hzeKn-BdfhuTOcJKPEiiYwpmt1SyrUJmZlviQCgGfa7Fvae8KuFzYJ57ezebgIZ1PH6oCoaNYGJAWX1OTEme19LBe9sXSrePFmFdW20IWZPTIivt1JIloxzKLJeeRYPRcpEc3yZ9r6hNUsMXuIFQdYRBJamBjPYOwAbwtd09sOq4xjhXBupoVUkyVVJEV-3gbWzA-x3OZLngH2Fm9zzRpssMCJIoL-M_u2laRF5E_nftBTaCUwk02qFtoCrt5Kvy9hdlfZlD27ti4z9F1K1bffVG1SgR9x5oDygoiqOV8z34CYHEirDe2mtgIvcWY6GVqsfvJXkFRk-R6uhz1JX01a28lUUSjY3m_sJAvY2kR1la9nvqu8EBlrqnsNxmeV1XBdQsR6t7br2sq-wXbTTuFlWjblNKhJbXfZ
imq3S0DecKY22RbVDSAdAUmFVSl3rHNjtZXZjyYuXcYi3asyGL0HTGGr7GGB3O1wTbeA505WKZwjkMjdjJVlp3Wv2osoDBZuyW60__bAkVfDGc6JDZqC7YAJsq7KE5mHdv0Yj_kqpmyDQbdwVZ4p_4r4CoX7QbdXXjIKyogzSByR8SP0vc0UZyo-PT7W9X4FRD8QDH-MEhxBk-OT0WjU5YWXA_3mY9TfjyvukZNbvx2MPky-VvEJMDiS8dOMu1mTCtJUJDB2t7VMy2mHUiy1XLyq5p0es4uJIg13tPtfkOY1iSWop9B3I78aQHMz0HUoGcmLjJsejmENJB6d6gmtYT0XQV9Gp-_yHvZ4D93pAYQghSCMlELGcLCYHLfCP7dBCEDopD_fPM-pAkGME0FzqKOJJzDP7SAOeQ1tr6GlAC0NeH4eDPhTc2UeG7lLsFbksOnUNrq-kI5NS0uawiR78I6qxtp3RO3gS8kTaibQp_TOq6bUTs_P73WyYbWnUWTtcoVlz4qqG5_GXEJJdNn7kPa1pwo2gWuhEw1Tm-NzbPSXoSvYyhrnHXk5ETmmKXynGMG999SGaJ3U0JSscJmZI-0FoLhUfL5liTdWoiRHXlloZa8_brzxCmcSRvUVh4ub6tsn4WxF197LfwBRISI)

In order to also validate the relationships from an algorithmic perspective, we can use the following code:

In [None]:
# uniqueness checks: every client_id / account_id may occur only once in
# DISPOSITION, and every account_id only once in LOAN
assert dispositions_df["client_id"].is_unique, (
    "Each client_id should appear exactly once in the DISPOSITION DataFrame."
)
assert loans_df["account_id"].is_unique, (
    "Each account_id should appear exactly once in the LOAN DataFrame."
)
assert dispositions_df["account_id"].is_unique, (
    "An account_id should appear once in the DISPOSITION DataFrame."
)

# reference key sets, built once and reused by the membership checks below
known_district_ids = set(districts_df["district_id"])
known_account_ids = set(accounts_df["account_id"])
known_client_ids = set(clients_df["client_id"])
known_disp_ids = set(dispositions_df["disp_id"])

# verify each district_id in ACCOUNT and CLIENT exists in DISTRICT
assert set(accounts_df["district_id"]) <= known_district_ids, (
    "All district_ids in ACCOUNT should exist in DISTRICT."
)
assert set(clients_df["district_id"]) <= known_district_ids, (
    "All district_ids in CLIENT should exist in DISTRICT."
)

# verify each account_id in DISPOSITION, ORDER, TRANSACTION, and LOAN exists in ACCOUNT
assert set(dispositions_df["account_id"]) <= known_account_ids, (
    "All account_ids in DISPOSITION should exist in ACCOUNT."
)
assert set(orders_df["account_id"]) <= known_account_ids, (
    "All account_ids in ORDER should exist in ACCOUNT."
)
assert set(transactions_df["account_id"]) <= known_account_ids, (
    "All account_ids in TRANSACTION should exist in ACCOUNT."
)
assert set(loans_df["account_id"]) <= known_account_ids, (
    "All account_ids in LOAN should exist in ACCOUNT."
)

# verify each client_id in DISPOSITION exists in CLIENT
assert set(dispositions_df["client_id"]) <= known_client_ids, (
    "All client_ids in DISPOSITION should exist in CLIENT."
)

# verify each disp_id in CARD exists in DISPOSITION
assert set(cards_df["disp_id"]) <= known_disp_ids, (
    "All disp_ids in CARD should exist in DISPOSITION."
)

# Data Preparation: Non-Transactional Data

This section covers how we prepare the non-transactional data in order to create the golden record.

In [None]:
# one row per account, one column per order type, holding the summed debited amount
# NOTE(review): the earlier plot_categorical_variables(orders_df, ["k_symbol", "bank_to"])
# call replaced missing k_symbol values by "NA" in place, so a "..._na" column
# is produced here - confirm that this is intended.
orders_pivot_df = pd.pivot_table(
    orders_df,
    values="debited_amount",
    index="account_id",
    columns="k_symbol",
    aggfunc="sum",
    fill_value=0,
)

# prefix the lower-cased order type so the origin of each column stays visible
orders_pivot_df.columns = [
    "k_symbol_debited_sum_" + symbol.lower() for symbol in orders_pivot_df.columns
]
orders_pivot_df = orders_pivot_df.reset_index()
orders_pivot_df.head()

In [None]:
def merge_non_transactional_data(
    clients, districts, dispositions, accounts, orders, loans, cards
):
    """
    Join all non-transactional tables into one wide DataFrame anchored on accounts.

    Every join is a left join, so each account row is kept even when it has no
    matching disposition, card, order or loan. None of the inputs is modified.

    Parameters:
    - clients (pd.DataFrame): needs 'client_id' and 'district_id'.
    - districts (pd.DataFrame): needs 'district_id'; attached twice, once with a
      'client_' prefix and once with an 'account_' prefix.
    - dispositions (pd.DataFrame): needs 'disp_id', 'client_id' and 'account_id'.
    - accounts (pd.DataFrame): needs 'account_id', 'district_id' and 'account_created'.
    - orders (pd.DataFrame): needs 'account_id'; all columns get an 'order_' prefix.
    - loans (pd.DataFrame): needs 'account_id'; all columns get a 'loan_' prefix.
    - cards (pd.DataFrame): needs 'disp_id' and 'issued'; all columns get a 'card_' prefix.

    Returns:
    - pd.DataFrame: the merged data with 'account_created' and 'card_issued' parsed
      as datetimes and a boolean 'has_card' column (True where 'card_issued' is set).
    """
    # Attach district attributes to clients and to accounts; the prefixes keep the
    # two copies of the district columns apart after everything is joined.
    clients_with_districts = clients.rename(
        columns={"district_id": "client_district_id"}
    ).merge(districts.add_prefix("client_"), on="client_district_id", how="left")

    accounts_with_districts = accounts.rename(
        columns={"district_id": "account_district_id"}
    ).merge(districts.add_prefix("account_"), on="account_district_id", how="left")

    # Cards hang off dispositions; a disposition without a card keeps empty card_* columns.
    dispositions_with_cards = dispositions.merge(
        cards.add_prefix("card_"),
        left_on="disp_id",
        right_on="card_disp_id",
        how="left",
    )

    # disposition (+card) -> client (+district)
    clients_dispositions_cards = dispositions_with_cards.merge(
        clients_with_districts, on="client_id", how="left"
    )

    # account (+district) -> disposition/client/card -> orders -> loans
    merged = (
        accounts_with_districts.merge(
            clients_dispositions_cards, on="account_id", how="left"
        )
        .merge(
            orders.add_prefix("order_"),
            left_on="account_id",
            right_on="order_account_id",
            how="left",
        )
        .merge(
            loans.add_prefix("loan_"),
            left_on="account_id",
            right_on="loan_account_id",
            how="left",
        )
    )

    for date_column in ("account_created", "card_issued"):
        merged[date_column] = pd.to_datetime(merged[date_column])

    merged["has_card"] = merged["card_issued"].notna()
    return merged


# Build the merged non-transactional table. Orders are passed in their pivoted form
# (orders_pivot_df), not as the raw orders table.
non_transactional_df = merge_non_transactional_data(
    clients=clients_df,
    districts=districts_df,
    dispositions=dispositions_df,
    accounts=accounts_df,
    orders=orders_pivot_df,
    loans=loans_df,
    cards=cards_df,
)

# Persist the merged table and print its schema.
non_transactional_df.to_csv("data/non_transactional.csv", index=False)
non_transactional_df.info()

# Exploratory Data Analysis

## Non-transactional Data

### Card Holders

In [None]:
# Count every card category once and reuse the numbers for the bars and their labels.
card_types = non_transactional_df["card_type"]
card_holder_counts = {
    "No Card": card_types.isna().sum(),
    "Classic/Gold Card Holders": card_types.isin(["gold", "classic"]).sum(),
    "Junior Card Holders": card_types.eq("junior").sum(),
}

plt.figure()
plt.title("Number of Clients by Card Type")
sns.barplot(x=list(card_holder_counts), y=list(card_holder_counts.values()))

# write the number of clients slightly above each bar
for bar_position, client_count in enumerate(card_holder_counts.values()):
    plt.text(bar_position, client_count + 10, str(client_count), ha="center", va="bottom")

plt.show()

Looking at the overall distribution of card holders, we can see that most clients do not possess a credit card.

In [None]:
# Restrict to junior card holders once; the subset feeds both the title and the plot.
junior_card_holders_df = non_transactional_df[
    non_transactional_df["card_type"] == "junior"
]

plt.figure()
plt.title(
    f"Distribution of Age for Junior Card Holders\n total count = {len(junior_card_holders_df)}"
)
sns.histplot(junior_card_holders_df["age"], kde=True, bins=30)
plt.xlabel("Age of Client (presumably in 1999)")
plt.show()

The age distribution of Junior Card holders gives a first impression of this group. However, looking only at the current age may be misleading: we need to know how old the clients were when the card was issued to determine whether they could have been eligible for a Classic/Gold card (at least 18 years old at issuance).

In [None]:
# Make sure the issue date is a datetime before doing date arithmetic on it.
non_transactional_df["card_issued"] = pd.to_datetime(
    non_transactional_df["card_issued"]
)

# Age at issuance in whole years, approximated with 365-day years.
days_from_birth_to_issuance = (
    non_transactional_df["card_issued"] - non_transactional_df["birth_date"]
).dt.days
non_transactional_df["age_at_card_issuance"] = days_from_birth_to_issuance // 365

junior_card_holders_df = non_transactional_df[
    non_transactional_df["card_type"] == "junior"
]

plt.figure()
plt.title(
    f"Distribution of Age at Card Issuance for Junior Card Holders\n total count = {len(junior_card_holders_df)}"
)
sns.histplot(junior_card_holders_df["age_at_card_issuance"], kde=True, bins=30)
plt.xlabel("Age at Card Issuance")
plt.show()

Here we can see that roughly 1/3 of the Junior Card holders were not of legal age (assuming legal age is 18) when receiving their Junior Card. 

In [None]:
# Only rows with a card can have an age at card issuance. Restrict to them so the
# "total count" in the title matches what is actually drawn; the previous title
# reported the size of the whole table, including clients without a card.
card_holders_df = non_transactional_df[non_transactional_df["card_type"].notna()]
is_junior_card = card_holders_df["card_type"] == "junior"

plt.figure()
plt.title(
    f"Distribution of Age at Card Issuance for All Card Types\n total count = {len(card_holders_df)}"
)
sns.histplot(
    card_holders_df.loc[is_junior_card, "age_at_card_issuance"],
    kde=True,
    bins=10,
    color="blue",
    label="Junior Card Holders",
)
sns.histplot(
    card_holders_df.loc[~is_junior_card, "age_at_card_issuance"],
    kde=True,
    bins=30,
    color="red",
    label="Non-Junior Card Holders",
)
plt.legend()
plt.xlabel("Age at Card Issuance")
plt.show()

Comparing the age at the issue date between Junior and non-Junior (Classic/Gold) card holders shows that there is no overlap between the two groups, which makes sense intuitively.

Removing the subset of Junior Card holders therefore seems valid: there is no reason to believe that Junior Cards were issued wrongly, the subset is relatively small compared to the remaining issued cards, and our target is specifically Classic/Gold Card owners.

In [None]:
before_len = len(non_transactional_df)

# Drop junior card holders. The explicit .copy() makes the result an independent
# DataFrame, so the columns added to it in later cells are not written to a slice
# of the original frame (which triggers pandas' SettingWithCopyWarning).
non_transactional_df = non_transactional_df[
    non_transactional_df["card_type"] != "junior"
].copy()

# Record the number of removed rows as a negative delta.
data_reduction["Junior Card Holders"] = -(before_len - len(non_transactional_df))
del before_len

### Time factors on Card Status

The time between creating an account and issuing a card may also be important when filtering customers based on their history. We should avoid filtering out potentially interesting periods and understand how the timespans between account creation and card issuance are distributed.

In [None]:
# Keep only rows where both dates are known. The .copy() gives us an independent
# frame, so adding "duration_days" below does not write to a slice of
# non_transactional_df (which triggers pandas' SettingWithCopyWarning).
non_transactional_w_cards_df = non_transactional_df[
    non_transactional_df["card_issued"].notna()
    & non_transactional_df["account_created"].notna()
].copy()

# Days elapsed between opening the account and receiving the card.
non_transactional_w_cards_df["duration_days"] = (
    non_transactional_w_cards_df["card_issued"]
    - non_transactional_w_cards_df["account_created"]
).dt.days

plt.figure(figsize=(8, 6))
sns.histplot(
    non_transactional_w_cards_df["duration_days"], bins=50, edgecolor="black", kde=True
)
plt.title("Distribution of Duration Between Account Creation and Card Issuance")
plt.xlabel("Duration in Days")
plt.ylabel("Frequency")
plt.tight_layout()
plt.show()

The histogram displays a distribution with multiple peaks, indicating that there are several typical time frames for card issuance after account creation. The highest peak occurs within the first 250 days, suggesting that a significant number of cards are issued during this period. The frequency decreases as duration increases, with noticeable peaks that may correspond to specific processing batch cycles or policy changes over time. The distribution also has a long tail, suggesting that in some cases, card issuance can take a very long time.

Analyzing the length of time a client has been with the bank in relation to their account creation date and card ownership can provide valuable insights for a bank's customer relationship management and product targeting strategies. Long-standing clients may exhibit different banking behaviors, such as product adoption and loyalty patterns, compared to newer clients.

In [None]:
# NOTE: despite its name, this variable holds the most recent card issuance date;
# it is the reference point against which tenure is measured.
max_account_creation_date = non_transactional_df["card_issued"].max()

# Tenure = time from account creation to the reference date, expressed in years.
tenure_days = (
    max_account_creation_date - non_transactional_df["account_created"]
).dt.days
non_transactional_df["client_tenure_years_relative"] = tenure_days / 365.25

plt.figure()
# One-year bins, stacked by card ownership, scaled to percent of all rows.
ax = sns.histplot(
    non_transactional_df,
    x="client_tenure_years_relative",
    hue="has_card",
    binwidth=1,
    multiple="stack",
    stat="percent",
)

# Annotate the bars via the notebook's labelling helper
add_percentage_labels(ax, non_transactional_df["has_card"].unique())

# Titles and axis labels
plt.title("Client Tenure Relative to Latest Card Issued Date and Card Ownership")
plt.xlabel("Client Tenure (Years, Relative to Latest Card Issuance)")
plt.ylabel("Percentage of Clients")

plt.show()

The bar chart shows the tenure of clients in years, categorized by whether they own a credit card (True) or not (False). Each bar represents the percentage of clients within a specific tenure range, allowing for comparison of the distribution of card ownership among clients with different lengths of association with the bank.

### Demographics

Using the available demographic data, we can investigate the potential correlation between demographic data and card status. The average salary may indicate a difference between cardholders and non-cardholders, as it is reasonable to assume that cardholders have a higher average salary than non-cardholders.

In [None]:
plt.figure()

# client_average_salary is a district-level figure attached to each client row.
salary_ax = sns.boxplot(
    data=non_transactional_df, x="has_card", y="client_average_salary"
)
salary_ax.set_title("Average Salary in Client's Region by Card Ownership")
salary_ax.set_xlabel("Has Card")
salary_ax.set_ylabel("Average Salary")

# Replace the raw False/True tick labels with readable ones.
salary_ax.set_xticks([0, 1])
salary_ax.set_xticklabels(["No Card Owner", "Card Owner"])

plt.show()

The box plot compares the average salaries of clients who own a credit card with those who do not. Both groups have a substantial overlap in salary ranges, suggesting that while there might be a trend for card owners to have higher salaries, the difference is not significant. The median salary for card owners is slightly higher than that for non-card owners, as indicated by the median line within the respective boxes.

Both distributions have outliers on the higher end, indicating that some individuals have salaries significantly above the average in both groups. However, these outliers do not dominate the general trend.

It should also be noted that this plot assumes that the average salary of the region's clients remained constant over the years, which is unlikely to be true.

The group of bar charts represents the distribution of credit card ownership across various demographics, showing the percentage of clients with and without cards within different age groups, sexes, and regions.

In [None]:
# Bucket clients into age bands; pd.cut uses right-closed intervals, e.g. (0, 25].
non_transactional_df["age_group"] = pd.cut(
    non_transactional_df["age"],
    bins=[0, 25, 40, 55, 70, 100],
    labels=["<25", "25-40", "40-55", "55-70", ">70"],
)

plt.figure(figsize=(8, 12))

# --- Age group: share of card owners / non-owners within each age band ---
age_group_counts = (
    non_transactional_df.groupby(["age_group", "has_card"]).size().unstack(fill_value=0)
)
# Row-wise normalisation: each group's counts divided by that group's total.
age_group_percentages = age_group_counts.div(age_group_counts.sum(axis=1), axis=0) * 100
age_group_plot = age_group_percentages.plot(
    kind="bar", stacked=True, ax=plt.subplot(3, 1, 1)
)
age_group_plot.set_title("Card Ownership by Age Group")
age_group_plot.set_ylabel("Percentage")
add_percentage_labels(age_group_plot, non_transactional_df["has_card"].unique())

# --- Sex ---
sex_counts = (
    non_transactional_df.groupby(["sex", "has_card"]).size().unstack(fill_value=0)
)
sex_percentages = sex_counts.div(sex_counts.sum(axis=1), axis=0) * 100
sex_plot = sex_percentages.plot(kind="bar", stacked=True, ax=plt.subplot(3, 1, 2))
sex_plot.set_title("Card Ownership by Sex")
sex_plot.set_ylabel("Percentage")
add_percentage_labels(sex_plot, non_transactional_df["has_card"].unique())

# --- Client region ---
region_counts = (
    non_transactional_df.groupby(["client_region", "has_card"])
    .size()
    .unstack(fill_value=0)
)
region_percentages = region_counts.div(region_counts.sum(axis=1), axis=0) * 100
region_plot = region_percentages.plot(
    kind="bar", stacked=True, ax=plt.subplot(3, 1, 3)
)
region_plot.set_title("Card Ownership by Client Region")
region_plot.set_ylabel("Percentage")
region_plot.tick_params(axis="x", rotation=45)
add_percentage_labels(region_plot, non_transactional_df["has_card"].unique())

plt.tight_layout()
plt.show()

**Card Ownership by Age Group:** The bar chart displays the proportion of cardholders in different age groups. The percentage of cardholders is lowest in the age group of over 70, followed by the age group of 55-70, indicating that card ownership is more prevalent among younger demographics.

**Card Ownership by Sex:** The bar chart shows the breakdown of card ownership by sex. The data reveals that the percentage of cardholders is comparable between both sexes, and no significant difference is present.

**Card Ownership by Region:** The bar chart at the bottom illustrates card ownership across different regions, showing a relatively consistent pattern among most regions.

### Impact of Loans / Debt

In [None]:
# Short labels for the verbose loan status descriptions (used on the x-axis).
simplified_loan_status_mapping = {
    "Contract finished, no problems": "Finished",
    "Contract finished, loan not paid": "Not Paid",
    "Contract running, OK thus-far": "Running",
    "Contract running, client in debt": "In Debt",
    "No Loan": "No Loan",
}

# Statuses that are not keys of the mapping become NaN.
non_transactional_df["loan_status_simplified"] = non_transactional_df[
    "loan_status"
].map(simplified_loan_status_mapping)

# Rows: simplified loan status, columns: has_card (False/True), values: row counts.
loan_status_simplified_card_ownership_counts = (
    non_transactional_df.groupby(["loan_status_simplified", "has_card"])
    .size()
    .unstack(fill_value=0)
)

# Normalise each loan status row so that its two shares add up to 100 %.
loan_status_simplified_card_ownership_percentages = (
    loan_status_simplified_card_ownership_counts.div(
        loan_status_simplified_card_ownership_counts.sum(axis=1), axis=0
    )
    * 100
)

loan_status_simplified_card_ownership_percentages.plot(
    kind="bar", stacked=True, figsize=(8, 6)
)
plt.title("Interaction Between Simplified Loan Status and Card Ownership")
plt.xlabel("Simplified Loan Status")
plt.ylabel("Percentage of Clients")
plt.xticks(rotation=45)
plt.legend(title="Has Card", labels=["No Card", "Has Card"])
plt.tight_layout()
plt.show()

## Transactional Data

In [None]:
# All transactions that moved no money.
zero_amount_transactions_df = transactions_df.loc[transactions_df["amount"].eq(0)]

# Headline numbers first ...
zero_amount_transactions_info = {
    "total_zero_amount_transactions": len(zero_amount_transactions_df),
    "unique_accounts_with_zero_amount": zero_amount_transactions_df[
        "account_id"
    ].nunique(),
}
# ... followed by the relative frequency of each value in the descriptive columns.
zero_amount_transactions_info.update(
    {
        f"{column}_distribution": zero_amount_transactions_df[column].value_counts(
            normalize=True
        )
        for column in ("transaction_type", "operation", "k_symbol")
    }
)

# Cell output: the summary dict and the number of entries it contains.
zero_amount_transactions_info, len(zero_amount_transactions_info)

In [None]:
# Accounts that own at least one zero-amount transaction.
accounts_with_zero_amount_transactions = accounts_df.loc[
    accounts_df["account_id"].isin(zero_amount_transactions_df["account_id"].unique())
]
accounts_with_zero_amount_transactions

In [None]:
# The zero-amount exploration objects are not needed below; drop them from the namespace.
del (
    accounts_with_zero_amount_transactions,
    zero_amount_transactions_df,
    zero_amount_transactions_info,
)

Validating first transactions where the amount equals the balance is essential for the integrity of our aggregated data analysis. This specific assertion underpins the reliability of our subsequent aggregation operations by ensuring each account's financial history starts from a verifiable point.

In [None]:
def validate_first_transactions(transactions):
    """
    Check that every account opens with a transaction whose amount equals the balance.

    For each account the earliest transaction date is determined. The account passes
    when at least one of its transactions on that date has amount == balance.

    Parameters:
    - transactions (pd.DataFrame): transaction data with the columns
      'account_id', 'date', 'amount' and 'balance'.

    Returns:
    - str: a success message when all accounts pass.

    Raises:
    - AssertionError: if at least one account has no such transaction on its first date.
    """
    # Broadcast each account's earliest date onto all rows of that account.
    earliest_date = transactions.groupby("account_id")["date"].transform("min")

    # Rows that happen on the account's first date AND have amount == balance.
    is_opening_row = transactions["date"].eq(earliest_date) & transactions[
        "amount"
    ].eq(transactions["balance"])

    validated_accounts = transactions.loc[is_opening_row, "account_id"].nunique()
    all_accounts = transactions["account_id"].nunique()

    assert (
        all_accounts == validated_accounts
    ), "Not every account has a first transaction where the amount equals the balance."

    return "Validation successful: Each account has a first transaction where the amount equals the balance."


# Run the check on the full transaction table; an AssertionError here means at least
# one account does not start with a transaction whose amount equals its balance.
validate_first_transactions(transactions_df)

We can confirm the truth of the assertions made. It is certain that there is a transaction with an amount equal to the balance in the transaction history of every account on the first date.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from sklearn.metrics import pairwise_distances
from tqdm import tqdm
import seaborn as sns
import plotly.graph_objects as go
import json


# Reload the intermediate artefacts from the temp/ directory.
non_transactional_df = pd.read_parquet("temp/non_transactional.parquet")
accounts_df = pd.read_parquet("temp/accounts.parquet")
transactions_df = pd.read_parquet("temp/transactions.parquet")

# The data-reduction bookkeeping is stored as JSON.
with open("temp/data_reduction.json") as f:
    data_reduction = json.load(f)

# Data Preparation: Transactional Data

As we already prepared the non-transactional data in a previous section, we now focus on the transactional data. As the end goal is to have a single record per customer, we need to aggregate the transactional data. In addition, absolute temporal data such as transaction dates needs to be transformed into relative data, such as the number of months before the event of card issuance.

## Set artificial issue date for non-card holders

One crucial step in the data preparation process is to set an artificial card issue date for non-card holders. This date is necessary to align the transactional data of non-card holders with that of card holders. By setting an artificial card issue date, we can create a unified timeline for all customers, enabling a more accurate comparison of transactional behaviors across different groups. 

First we will explore the distribution of the months between account creation and card issuance to understand the typical timeline for card issuance after account creation.


In [None]:
def add_months_since_account_to_card(df):
    """
    Add the number of calendar months between account creation and card issuance.

    The difference is computed on year-month level (days are ignored), which is what
    subtracting two monthly periods yields:
    (issue year - creation year) * 12 + (issue month - creation month).

    The computation is vectorised instead of using a row-wise ``apply``. This is
    faster and also works on an empty DataFrame, where ``apply(axis=1)`` returns a
    DataFrame rather than a Series.

    Parameters:
    - df (pd.DataFrame): needs the date columns 'card_issued' and 'account_created'.

    Returns:
    - pd.DataFrame: the same object that was passed in, extended in place by the
      column 'months_since_account_to_card'. The value is NaN wherever one of the
      two dates is missing.
    """
    # to_datetime is a no-op for datetime columns and converts object columns
    # holding timestamps, so both kinds of input are accepted.
    issued = pd.to_datetime(df["card_issued"])
    created = pd.to_datetime(df["account_created"])

    # Missing dates propagate as NaN through the arithmetic.
    df["months_since_account_to_card"] = (issued.dt.year - created.dt.year) * 12 + (
        issued.dt.month - created.dt.month
    )
    return df

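As we need enough history to make a valid comparison between card holders and non-card holders, we filter out card holders whose card was issued less than 25 months after account creation. Clients without a card have no issue date yet and are therefore kept at this stage; their history requirement is enforced later during matching. This ensures that we have a sufficient amount of data to analyze and compare the transactional behaviors of both groups. Here is the reasoning behind this threshold: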

- **New Customer Period (12 months)**: We need at least one year of transactional history to capture the typical spending patterns of customers.
- **One Year of History (12 months)**: An additional year of data provides a more comprehensive view of transactional behaviors, allowing us to identify trends and patterns more accurately across seasons and economic cycles.
- **Lag Period (1 month)**: The month immediately preceding card issuance is excluded to avoid any potential bias caused by transactional changes due to the impending card issuance.

In [None]:
def filter_clients_without_sufficient_history(
    non_transactional_df, min_history_months=25
):
    """
    Drop rows whose card was issued too soon after the account was created.

    Rows with a missing 'months_since_account_to_card' value are always kept; rows
    with a value are kept only if it is at least `min_history_months`. A short
    summary of the number and share of removed rows is printed.

    Parameters:
    - non_transactional_df (pd.DataFrame): the data to filter. If the column
      'months_since_account_to_card' is missing it is added first, which modifies
      the passed DataFrame in place.
    - min_history_months (int): minimum number of months between account creation
      and card issuance. Default is 25.

    Returns:
    - pd.DataFrame: the rows that satisfy the history requirement.
    """
    if "months_since_account_to_card" not in non_transactional_df.columns:
        non_transactional_df = add_months_since_account_to_card(non_transactional_df)

    count_before = len(non_transactional_df)
    months_to_card = non_transactional_df["months_since_account_to_card"]
    filtered_df = non_transactional_df[
        months_to_card.isnull() | (months_to_card >= min_history_months)
    ]

    removed = count_before - len(filtered_df)
    # Guard against an empty input, which would otherwise divide by zero.
    removed_percentage = removed / count_before * 100 if count_before else 0.0
    print(
        f"Filtered out {removed} records with less than {min_history_months} months of history. "
        f"Percentage: {removed_percentage:.2f}%."
    )
    return filtered_df


before_len = len(non_transactional_df)

non_transactional_w_sufficient_history_df = filter_clients_without_sufficient_history(
    non_transactional_df
)

# Book the removed rows as a negative delta (rows after minus rows before).
data_reduction["Clients without sufficient history"] = (
    len(non_transactional_w_sufficient_history_df) - before_len
)
del before_len

In this case roughly 10% of the records were filtered out due to insufficient history. All of them belong to card holders, since rows without a card issue date are kept by the filter. This is a reasonable amount and will not impact the analysis significantly.

Next, we will explore the distribution of the months between account creation and card issuance for card holders to understand the typical timeline for card issuance after account creation.

In [None]:
# Card holders only: rows that have an issue date. The copy gives an independent frame.
non_transactional_w_card_df = non_transactional_w_sufficient_history_df[
    non_transactional_w_sufficient_history_df["card_issued"].notna()
].copy()

plt.figure(figsize=(8, 6))
months_ax = sns.histplot(
    non_transactional_w_card_df["months_since_account_to_card"], bins=30, kde=True
)
months_ax.set_title(
    "Distribution of Months from Account Creation to Card Issuance (for Card Holders)"
)
months_ax.set_xlabel("Months")
months_ax.set_ylabel("Count")
months_ax.grid(True)
plt.tight_layout()
plt.show()

The plot shows a right-skewed distribution, with the majority of card holders receiving their cards roughly 25 to 30 months after account creation. On the long-tail end, some clients receive their cards after 60 months or more.

After briefly exploring the distribution of the months between account creation and card issuance for card holders, we will now set the artificial card issuance date for non-card holders. This date will be used to align the transactional data of non-card holders with that of card holders, enabling a more accurate comparison of transactional behaviors across different groups.

The following approaches were considered to match non-card holders with card holders:

1. Looking at the distributions above, extract the amount of history a buyer most likely has at the issue date of the card.
2. For each non buyer, find a buyer which was active in a similar time window (Jaccard similarity on the Year-Month sets). Instead of looking at the full activity of a buyer, we only look at the pre-purchase activity as there is reason to believe that clients may change their patterns after purchasing date and therefore add unwanted bias.

The second approach is chosen because it provides an intuitive and explainable way to match clients based on their activity. It does not aim for a perfect match, but for a match that is good enough to let the analysis focus on the discriminative features of the data.

Both approaches have their advantages and disadvantages. The first approach is more straightforward and less computationally intensive, but it may not capture the nuances of client behavior. The second approach is more complex and computationally intensive but offers a more nuanced view of client activity, potentially leading to better matches.

## Match by similar transaction activity

The process emphasizes matching based on the timing of activity, rather than a wide array of characteristics. By identifying when both existing cardholders and non-cardholders interacted with the bank, we can infer a level of behavioral alignment that extends beyond mere transactional data. This alignment suggests a shared response to external conditions. Intuitively we are constructing tuples of non-card holders and card holders based on the similarity of their activity patterns but one of them is not a card holder yet.

**Assumption**: This assumes that clients active during similar periods might be influenced by the same economic and societal conditions, providing a more nuanced foundation for establishing connections between current cardholders and potential new ones.

### Construction of the Activity Matrix

To hold the needed information about every customer, we create a so-called activity matrix. It is a binary matrix with monthly resolution: each row represents a client and each column represents a month. A value of 1 indicates activity in a given month, while 0 indicates inactivity. We therefore concentrate on the periods during which clients engage with the bank in the form of transactions rather than on the specifics of those transactions. Here is a step-by-step breakdown of the construction process:

1.  **Data Aggregation**: We start with transaction data, which records each client's interactions across various months. This data includes every transaction made by both current cardholders and non-cardholders.

2.  **Temporal Transformation**: Each transaction is associated with a specific date. These dates are then transformed into monthly periods, consolidating daily transactions into a monthly view of activity. This step simplifies the data, focusing on the presence of activity within each month rather than the specific dates or frequencies of transactions.

3.  **Matrix Structure**: The transformed data is arranged into a matrix format. Rows represent individual clients, identified by their account IDs. Columns correspond to monthly periods, spanning the entire range of months covered by the transaction data.

4.  **Activity Indication**: In the matrix, a cell value is set to indicate the presence of activity for a given client in a given month. If a client made one or more transactions in a month, the corresponding cell is marked to reflect this activity. The absence of transactions for a client in a month leaves the cell unmarked. Active months are represented by a '1', indicating the presence of transactions, while inactive months are denoted by a '0', indicating no transactions.

In [None]:
def prepare_activity_matrix(transactions):
    """
    Build the account-by-month activity matrix from raw transactions.

    A cell is non-zero when the account has at least one transaction in that month
    and 0 otherwise.

    Side effect: the helper columns 'month_year' (monthly period of 'date') and
    'active' (constant 1) are added to `transactions` in place.

    Parameters:
    - transactions (pd.DataFrame): transaction data with the columns 'account_id'
      and 'date' (datetime).

    Returns:
    - pd.DataFrame: one row per account_id and one column per month, named
      'active_<YYYY-MM>'.
    """
    # Collapse the exact dates to monthly periods and flag every row as activity.
    transactions["active"] = 1
    transactions["month_year"] = transactions["date"].dt.to_period("M")

    # The default aggregation (mean) of the constant 1 stays 1 however many
    # transactions a month has; account/month pairs without rows are filled with 0.
    monthly_activity = pd.pivot_table(
        transactions,
        values="active",
        index="account_id",
        columns="month_year",
        fill_value=0,
    )

    monthly_activity.columns = [
        "active_" + str(month) for month in monthly_activity.columns
    ]
    return monthly_activity


def plot_activity_matrix(activity_matrix):
    """
    Draw the activity matrix as a black/white heatmap.

    Rows (accounts) are ordered by their number of active months, most active at the
    top. Columns are the months, with the 'active_' prefix removed for the tick labels.

    Parameters:
    - activity_matrix (pd.DataFrame): accounts as rows, 'active_<YYYY-MM>' columns
      holding 0/1 activity flags.

    Returns:
    - None: the figure is shown via plt.show().
    """
    # sort by activity across time
    activity_matrix = activity_matrix.reindex(
        activity_matrix.sum(axis=1).sort_values(ascending=False).index
    )

    activity_matrix.columns = activity_matrix.columns.str.replace("active_", "")
    sparse_matrix = activity_matrix.astype(bool)
    plt.figure(figsize=(8, 8))
    sns.heatmap(sparse_matrix, cmap="binary", yticklabels=False, cbar=False)
    # The title now names the actual sort key: rows are ordered by total activity,
    # not by account creation date.
    plt.title("Activity Matrix across all clients sorted by total activity")
    plt.xlabel("Month-Year")
    plt.ylabel("Accounts")
    plt.tight_layout()

    # With cmap="binary" True is drawn black and False white; build a manual legend.
    active_patch = mpatches.Patch(color="black", label="Active")
    inactive_patch = mpatches.Patch(color="white", label="Not Active")
    plt.legend(handles=[active_patch, inactive_patch], loc="upper right")

    plt.show()


# Build the account x month activity matrix from all transactions and plot a copy of
# it, keeping `activity_matrix` itself untouched by the plotting code.
activity_matrix = prepare_activity_matrix(transactions_df)
plot_activity_matrix(activity_matrix.copy())

The heatmap provided offers a visual representation of the activity matrix for clients, depicting the levels of engagement over various periods. Clients are sorted by activity, with the most active clients at the top and the least active at the bottom.

There is a distinct diagonal pattern, indicating that newer accounts (those created more recently) perhaps have fewer periods of activity. This makes sense as these accounts have not had the opportunity to transact over the earlier periods displayed on the heatmap.

Also interesting are some gaps within the activity of some clients. This could be due to a variety of reasons, such as seasonal spending patterns or changes in financial circumstances.

### Eligibility Criteria

After constructing the activity matrix, we check for eligibility of non-cardholders to be matched with cardholders. This ensures alignment for later model construction. The eligibility criteria are as follows:

1.  **Account History**: Non-cardholders must have an established history of interaction, with at least 25 months of history between account creation and card issuance (12 months (= new customer period) + 12 months (= one year of history) + 1 month (= lag period)) as described above.
2.  **Account Creation Date**: The account creation date of a non-cardholder must precede the card issuance date of the cardholder as this is a prerequisite for the matching process to work correctly when we set the issue date for non-card holders following the intuition that nobody can have a card before the account is created.

In [None]:
# Minimum number of calendar months that must lie between the non-cardholder's
# account creation and the cardholder's card issuance.
ELIGIBILITY_THRESHOLD_HIST_MONTHS = 25


def check_eligibility_for_matching(non_cardholder, cardholder, verbose=False):
    """
    Decide whether a non-cardholder may be matched with a given cardholder.

    The pair is eligible when the cardholder's card was issued after the
    non-cardholder's account was created and the two dates are at least
    ELIGIBILITY_THRESHOLD_HIST_MONTHS calendar months apart (compared on
    year-month level).

    Parameters:
    - non_cardholder (pd.Series): non-cardholder record providing 'account_created'.
    - cardholder (pd.Series): cardholder record providing 'card_issued'.
    - verbose (bool): If True, print the dates, the month difference and the verdict.
      Default is False.

    Returns:
    - bool: True if the non-cardholder is eligible for matching, False otherwise.
    """
    issued_on = cardholder["card_issued"]
    opened_on = non_cardholder["account_created"]

    # A card issued on or before the account opening can never be a valid anchor.
    if issued_on <= opened_on:
        return False

    # Subtracting monthly periods yields an offset whose .n is the number of months.
    period_diff = (issued_on.to_period("M") - opened_on.to_period("M")).n
    is_eligible = period_diff >= ELIGIBILITY_THRESHOLD_HIST_MONTHS

    if verbose:
        print(
            f"Card issued: {cardholder['card_issued']}, Account created: {non_cardholder['account_created']}, Period diff: {period_diff}, Eligible: {period_diff >= ELIGIBILITY_THRESHOLD_HIST_MONTHS}"
        )

    return is_eligible

### Matching Process

Next up we will implement the matching process. Our matching utilizes the Jaccard similarity index to compare activity patterns: We compare a vector representing an existing cardholder's monthly activity against a matrix of non-cardholders' activity patterns. Here we only consider the activity from the first transaction period across all customers to the card issue date.

The Jaccard similarity index is calculated as the intersection of active months divided by the union of active months between the two clients. This index ranges from 0 to 1, with higher values indicating greater overlap in activity patterns.

$$J(A, B) = \frac{|A \cap B|}{|A \cup B|}$$

The function `select_non_cardholders` randomly selects a non-cardholder match for a cardholder from the top N eligible candidates. The selection process is based on the Jaccard similarity scores calculated between the cardholder and non-cardholders. The function performs the following steps:

1. **Sorting by Similarity**: The function sorts the Jaccard distances between a cardholder and non-cardholders to identify the top N similar non-cardholders.
2. **Random Selection**: From the top N similar non-cardholders, the function randomly selects one non-cardholder match for the cardholder. This random selection helps avoid bias and ensures a fair matching process.

In [None]:
def select_non_cardholders(
    distances,
    eligible_non_cardholders,
    matches,
    matched_applicants,
    cardholder,
    without_card_activity,
    top_n,
):
    """
    Draw one non-cardholder at random from the closest eligible candidates and record the match.

    Parameters:
    - distances (np.array): Jaccard distances between the cardholder and the non-cardholders.
    - eligible_non_cardholders (list): Candidates that may be matched. Every entry is used both
      to index `distances` and as an `.iloc` position into `without_card_activity`.
    - matches (list): Receives a `(cardholder client_id, non-cardholder client_id, similarity)` tuple.
    - matched_applicants (set): Candidates that were already matched; the chosen one is added.
    - cardholder (pd.Series): The current cardholder; only `client_id` is read.
    - without_card_activity (pd.DataFrame): Non-cardholder rows; only `client_id` is read.
    - top_n (int): How many of the closest candidates take part in the random draw.

    Returns:
    - None: `matches` and `matched_applicants` are updated in place.
    """
    candidate_distances = distances[eligible_non_cardholders]
    closest_candidates = np.argsort(candidate_distances)[:top_n]

    # Nothing to draw from
    if closest_candidates.size == 0:
        return

    drawn = np.random.choice(closest_candidates)
    chosen_non_cardholder = eligible_non_cardholders[drawn]

    # Never hand out the same non-cardholder twice
    if chosen_non_cardholder in matched_applicants:
        return

    matched_applicants.add(chosen_non_cardholder)
    chosen_row = without_card_activity.iloc[chosen_non_cardholder]
    # Jaccard similarity is the complement of the Jaccard distance
    jaccard_similarity = 1 - candidate_distances[drawn]

    matches.append(
        (cardholder["client_id"], chosen_row["client_id"], jaccard_similarity)
    )

The function `match_cardholders_with_non_cardholders` brings together the data preparation, matching process, and match selection steps. It performs the following operations:

1. **Data Preparation**: The function prepares the activity matrix and splits the clients into two groups: those with a card (cardholders) and those without a card (non-cardholders).
2. **Matching Process**: For each cardholder, the function calculates the Jaccard similarity between their activity pattern and those of eligible non-cardholders. It then selects the top N similar non-cardholders and randomly assigns one match per cardholder.
3. **Match Selection**: The function selects a non-cardholder match for each cardholder based on the Jaccard similarity scores. It ensures that each non-cardholder is matched only once and that the top N similar non-cardholders are considered for matching.
   1. The selection among the top N similar non-cardholders is done randomly to avoid bias. This process is defined in the `select_non_cardholders` function.
   2. The function also checks for the eligibility as defined above.
   3. If no eligible non-cardholders are found, the function prints a warning message.
4. **Output**: The function returns a list of tuples containing the matched cardholder and non-cardholder client IDs along with their similarity scores.

In [None]:
def match_cardholders_with_non_cardholders(non_transactional, transactions, top_n=5):
    """
    Match cardholders with non-cardholders based on the similarity of their activity patterns.

    The function creates an activity matrix, identifies eligible non-cardholders, calculates
    the Jaccard distance to every non-cardholder, and randomly selects one match per cardholder
    from the top N most similar eligible non-cardholders.

    Parameters:
    - non_transactional (pd.DataFrame): Client data for cardholders and non-cardholders alike.
      Rows with a non-null `card_issued` are treated as cardholders, all others as non-cardholders.
    - transactions (pd.DataFrame): Transactional data handed to `prepare_activity_matrix`.
    - top_n (int): The number of top similar non-cardholders to consider for matching.

    Returns:
    - list: A list of tuples with the cardholder and matched non-cardholder client IDs and similarity scores.
    """
    with_card = non_transactional[non_transactional["card_issued"].notna()]
    without_card = non_transactional[non_transactional["card_issued"].isna()]

    activity_matrix = prepare_activity_matrix(transactions)

    with_card_activity = with_card.join(activity_matrix, on="account_id", how="left")
    without_card_activity = without_card.join(
        activity_matrix, on="account_id", how="left"
    )

    # The period encoded in each activity column name ("active_<period>") does not depend on
    # the cardholder, so it is parsed once instead of once per cardholder.
    activity_col_periods = [
        (col, pd.Period(col.split("_")[1]))
        for col in activity_matrix
        if col.startswith("active")
    ]

    # Holds *positions* (not index labels) of non-cardholders that were already matched
    matched_non_cardholders = set()
    matches = []

    for _, cardholder in tqdm(
        with_card_activity.iterrows(),
        total=len(with_card_activity),
        desc="Matching cardholders",
    ):
        issue_period = cardholder["card_issued"].to_period("M")
        # Only activity up to (and including) the card issue month is compared
        eligible_cols = [
            col for col, period in activity_col_periods if period <= issue_period
        ]

        if not eligible_cols:
            print(
                f"No eligible months found for cardholder client_id {cardholder['client_id']}."
            )
            continue

        cardholder_vector = cardholder[eligible_cols].values.reshape(1, -1)
        non_cardholder_matrix = without_card_activity[eligible_cols].values

        # Binarise: a month counts as active if its value is > 0 (NaN compares False, i.e. inactive)
        cardholder_vector = np.where(cardholder_vector > 0, 1, 0).astype(bool)
        non_cardholder_matrix = np.where(non_cardholder_matrix > 0, 1, 0).astype(bool)

        assert (
            cardholder_vector.shape[1] == non_cardholder_matrix.shape[1]
        ), "Dimension mismatch between cardholder and applicant activity matrix."

        # distances[k] belongs to the k-th ROW of without_card_activity
        distances = pairwise_distances(
            cardholder_vector, non_cardholder_matrix,
            metric="jaccard", n_jobs=-1
        ).flatten()

        # Candidates are collected by row position because `distances` and the `.iloc`
        # lookup in `select_non_cardholders` are positional. `without_card_activity` is a
        # filtered frame, so its index labels generally do not coincide with row positions.
        # The cheap set lookup runs first to skip eligibility checks for matched candidates.
        eligible_non_cardholders = [
            position
            for position, (_, applicant) in enumerate(without_card_activity.iterrows())
            if position not in matched_non_cardholders
            and check_eligibility_for_matching(applicant, cardholder)
        ]

        if eligible_non_cardholders:
            select_non_cardholders(
                distances,
                eligible_non_cardholders,
                matches,
                matched_non_cardholders,
                cardholder,
                without_card_activity,
                top_n,
            )
        else:
            print(
                f"No eligible non-cardholders found for cardholder client_id {cardholder['client_id']}."
            )

    return matches

The matching process is executed, and the resulting list of `(cardholder client_id, non-cardholder client_id, similarity)` tuples is stored in `matched_non_card_holders_df` (despite its name, this is a plain list rather than a DataFrame). The percentage of clients with a card issued is calculated before and after matching to assess the impact of the matching process. We expect this percentage to double after matching, as each cardholder should be matched with exactly one non-cardholder, who then receives an artificial card issue date.

In [None]:
# Despite the `_df` suffix this holds the list of match tuples returned by the matcher
matched_non_card_holders_df = match_cardholders_with_non_cardholders(
    non_transactional=non_transactional_w_sufficient_history_df,
    transactions=transactions_df,
)

# Share of clients that have a card issue date before any artificial dates are assigned
percentage_before_matching = (
    100 * non_transactional_w_sufficient_history_df["card_issued"].notna().mean()
)
print(f"Percentage of clients with card issued: {percentage_before_matching:.2f}%")

Last but not least we set the artificial card issue date for each non-cardholder based on the matching results.

In [None]:
def set_artificial_issue_dates(non_transactional_df, matches):
    """
    Augment the non-transactional DataFrame with artificial card issue dates based on matching results.

    Each matched non-cardholder is assigned the card issue date of their matched cardholder.
    A `has_card` column is added that is True only for clients who had a card issue date
    before the artificial dates were assigned. Unmatched non-cardholders keep a missing
    `card_issued` and `has_card == False`.

    Parameters:
    - non_transactional_df (pd.DataFrame): Client data with `client_id` and `card_issued` columns.
      The input is not modified; a copy is returned.
    - matches (list): A list of tuples containing the matched cardholder and non-cardholder IDs and similarity scores.

    Returns:
    - pd.DataFrame: The augmented DataFrame with artificial card issue dates.

    Raises:
    - IndexError: If a cardholder ID from `matches` is not present in `client_id`.
    """
    augmented_df = non_transactional_df.copy()
    # Derive the flag from the real issue dates *before* any artificial date is written.
    # Initialising it to True for everyone would flag unmatched non-cardholders as cardholders.
    augmented_df["has_card"] = augmented_df["card_issued"].notna()

    for cardholder_id, non_cardholder_id, _ in matches:
        card_issue_date = augmented_df.loc[
            augmented_df["client_id"] == cardholder_id, "card_issued"
        ].iloc[0]

        is_matched_non_cardholder = augmented_df["client_id"] == non_cardholder_id
        # Assign the columns separately so a datetime and a bool are never mixed in one array
        augmented_df.loc[is_matched_non_cardholder, "card_issued"] = card_issue_date
        augmented_df.loc[is_matched_non_cardholder, "has_card"] = False

    return augmented_df

matched_non_card_holders_w_issue_date_df = set_artificial_issue_dates(
    non_transactional_df=non_transactional_w_sufficient_history_df,
    matches=matched_non_card_holders_df,
)

# Every cardholder is expected to contribute exactly one matched non-cardholder,
# so the share of rows with an issue date should be twice the share before matching.
percentage_after_matching = (
    100 * matched_non_card_holders_w_issue_date_df["card_issued"].notna().mean()
)
assert np.isclose(
    percentage_after_matching, 2 * percentage_before_matching
), "Percentage of clients with card issued after matching should be double the initial percentage."
print(f"Percentage of clients with card issued after matching: {percentage_after_matching:.2f}%")
print(f"Percentage without card issued after matching: {(1 - percentage_after_matching / 100) * 100:.2f}%")

After each matched non-cardholder has been assigned an artificial card issue date, we drop the remaining non-cardholders without a match.

In [None]:
before_len = len(matched_non_card_holders_w_issue_date_df)

# Rows without an issue date are the non-cardholders that received no match
matched_non_card_holders_w_issue_date_df = matched_non_card_holders_w_issue_date_df[
    matched_non_card_holders_w_issue_date_df["card_issued"].notna()
]

# Recorded as a negative amount: this step removes rows
data_reduction["Non-cardholders without match"] = (
    len(matched_non_card_holders_w_issue_date_df) - before_len
)

print(f"Filtered out {before_len - len(matched_non_card_holders_w_issue_date_df)} non-cardholders without a match.")
del before_len

In total 83% of the non-card holders were filtered out due to ineligibility or not being matched with a card holder.

## Monthly Summary of Transactions

After matching cardholders with non-cardholders and setting artificial card issue dates, we aggregate the transactional data on a monthly basis to remove the transactional nature of the data. The monthly aggregation makes sense for multiple reasons:

- Monthly aggregation standardizes the time frame across which we analyze transactions, allowing us to compare transactional behaviors consistently across all accounts.
- Aggregating data on a monthly level illuminates patterns that daily data might obscure. It enables us to discern trends over a broader time scale, capturing cyclical behaviors, seasonal effects, and response to macroeconomic events.
- Daily transaction data can be "noisy" with random fluctuations. By considering monthly totals and averages, we reduce this noise, revealing underlying trends more clearly.

The function `aggregate_transactions_monthly` simplifies financial transactions by summarizing them every month for each account. Here's a simplified explanation of how it works:

1. **Sorting Transactions**: First, the function arranges the transactions chronologically by `account_id` and `date` within the `transactions_df` DataFrame. This helps in organizing transactions for each account by date.

2. **Monthly Grouping**: It converts each transaction's date into a monthly period. This categorizes transactions by the month and year they occurred, making it easier to group them monthly.

3. **Aggregation of Monthly Data**: The function groups these transactions by `account_id` and the `month` column. For each group, it calculates:
   - `volume`: Total transaction amount for the month.
   - `total_abs_amount`: Total of absolute values of all transactions, showing the total money movement regardless of direction.
   - `transaction_count`: Number of transactions to show how active the account was.
   - Counts of positive and negative transactions, indicating money coming in and going out.
   - Other statistics like average, median, minimum, maximum, and standard deviation of transaction amounts, which provide insights into how transaction amounts vary.
   - Counts of different transaction types, operations, and symbols, showing the variety of transactions.

4. **Cumulative Balance Calculation**: Finally, the function calculates a running total of the `volume` to track how the account balance changes over time.

This method is effective for understanding the financial behavior of accounts on a monthly basis, helping in further analyses and model building.

In [None]:
def aggregate_transactions_monthly(df):
    """
    Roll transaction rows up to one row per account and calendar month.

    Parameters:
    - df (pd.DataFrame): Transactions with the columns 'account_id', 'date' (datetime),
      'amount', 'transaction_type', 'operation' and 'k_symbol'. The input is not modified.

    Returns:
    - pd.DataFrame: One row per ('account_id', 'month') with the monthly statistics of 'amount',
      the number of distinct transaction types / operations / k-symbols, and 'balance', the
      per-account running sum of the monthly 'volume'. Rows are sorted by account and month.
    """
    # Chronological order per account is required for the running balance below.
    # sort_values/assign return new frames, so the caller's DataFrame stays untouched.
    ordered = df.sort_values(by=["account_id", "date"]).assign(
        month=lambda frame: frame["date"].dt.to_period("M")
    )

    aggregations = {
        "volume": pd.NamedAgg(column="amount", aggfunc="sum"),
        "total_abs_amount": pd.NamedAgg(
            column="amount", aggfunc=lambda x: x.abs().sum()
        ),
        "transaction_count": pd.NamedAgg(column="amount", aggfunc="count"),
        # Zero amounts count as positive so that positive + negative == total
        "positive_transaction_count": pd.NamedAgg(
            column="amount", aggfunc=lambda x: (x >= 0).sum()
        ),
        "negative_transaction_count": pd.NamedAgg(
            column="amount", aggfunc=lambda x: (x < 0).sum()
        ),
        "average_amount": pd.NamedAgg(column="amount", aggfunc="mean"),
        "median_amount": pd.NamedAgg(column="amount", aggfunc="median"),
        "min_amount": pd.NamedAgg(column="amount", aggfunc="min"),
        "max_amount": pd.NamedAgg(column="amount", aggfunc="max"),
        "std_amount": pd.NamedAgg(column="amount", aggfunc="std"),
        "type_count": pd.NamedAgg(column="transaction_type", aggfunc="nunique"),
        "operation_count": pd.NamedAgg(column="operation", aggfunc="nunique"),
        "k_symbol_count": pd.NamedAgg(column="k_symbol", aggfunc="nunique"),
    }

    per_month = ordered.groupby(["account_id", "month"]).agg(**aggregations)
    per_month = per_month.reset_index().sort_values(by=["account_id", "month"])

    # Running total of the monthly net volume, restarted for every account
    per_month["balance"] = per_month.groupby("account_id")["volume"].cumsum()

    return per_month

# One row per account and month; the summary statistics are displayed as the cell output
agg_transactions_monthly_df = aggregate_transactions_monthly(df=transactions_df)
agg_transactions_monthly_df.describe()

The `validate_monthly_aggregated_transactions` function is invoked to ensure the integrity and correctness of the aggregated data through several assertions:

1. The balance should consistently increase or decrease based on whether the total monthly transaction volume is positive or negative, respectively.
2. For each account, the balance in the first month should equal the total transaction volume of that month.
3. The sum of positive and negative transaction counts must equal the total transaction count for each month.
4. The number of unique accounts in the aggregated data should match that in the original dataset.
5. The final balances of accounts in the aggregated data should closely match their last recorded transactions in the original dataset.

In [None]:
def validate_monthly_aggregated_transactions(aggregated_data, original_df):
    """
    Validate the integrity and correctness of aggregated monthly financial transactions.

    Parameters:
    - aggregated_data (pd.DataFrame): Aggregated monthly transaction data. Rows are expected
      to be in chronological order within each account (as produced by
      `aggregate_transactions_monthly`), because the checks compare consecutive rows.
    - original_df (pd.DataFrame): Original dataset of financial transactions with the
      columns 'account_id', 'date' and 'balance'.

    Raises:
    - AssertionError: If validation conditions are not met.
    """
    # Assertion 1: Within an account the balance must move by the monthly volume, i.e. up
    # for a positive and down for a negative volume. The comparison is done row by row and
    # per account; comparing two `.all()` scalars (or diffing across account boundaries)
    # would pass for practically any data.
    balance_change = aggregated_data.groupby("account_id")["balance"].diff()
    has_previous_month = balance_change.notna()  # the first month of an account has no diff
    assert np.isclose(
        balance_change[has_previous_month],
        aggregated_data.loc[has_previous_month, "volume"],
        atol=1e-6,  # absorbs floating point noise of the cumulative sum
    ).all(), "The balance should go up by a positive and down by a negative monthly volume."

    # Assertion 2: Balance in the first month should equal the total transaction volume of that month
    first_month = aggregated_data.groupby("account_id").nth(0)
    assert (
        first_month["volume"] == first_month["balance"]
    ).all(), "The balance should equal the volume for the first month."

    # Assertion 3: The sum of positive and negative transaction counts should equal the total transaction count
    assert (
        aggregated_data["positive_transaction_count"]
        + aggregated_data["negative_transaction_count"]
        == aggregated_data["transaction_count"]
    ).all(), "The sum of positive and negative transaction counts should equal the total transaction count."

    # Assertion 4: The number of unique accounts in the aggregated data should match that in the original dataset
    assert (
        aggregated_data["account_id"].nunique() == original_df["account_id"].nunique()
    ), "The number of unique account_ids in the aggregated DataFrame should be the same as the original DataFrame."

    # Assertion 5: EVERY account's final aggregated balance should closely match a balance
    # recorded on that account's last transaction day in the original dataset.
    final_balances = (
        aggregated_data.groupby("account_id")["balance"]
        .last()
        .rename("balance_final")
        .reset_index()
    )
    on_last_day = (
        original_df.groupby("account_id")["date"].transform("max")
        == original_df["date"]
    )
    last_day_balances = original_df.loc[on_last_day, ["account_id", "balance"]].rename(
        columns={"balance": "balance_last"}
    )
    comparison = final_balances.merge(last_day_balances, on="account_id")
    comparison["matches"] = np.isclose(
        comparison["balance_final"],
        comparison["balance_last"],
        atol=5,  # tolerance carried over from the original check
    )
    # An account may have several transactions on its last day and the original data does
    # not say which one came last, so one matching row per account is sufficient.
    assert (
        comparison.groupby("account_id")["matches"].any().all()
    ), "Some accounts' final balances do not match their last transactions."


# Raises an AssertionError if the monthly aggregation is inconsistent with the raw transactions
validate_monthly_aggregated_transactions(
    aggregated_data=agg_transactions_monthly_df, original_df=transactions_df
)

# Exploratory Data Analysis: Aggregated Monthly Transactions

Further we explore the aggregated monthly transactions to gain insights into the financial behavior of accounts over time. We will visualize the monthly transaction volume, balance, and number of transactions to understand how these metrics evolve over time.

## Monthly Balance Difference and Volume

The `plot_monthly_balance_diff_and_volume` function visualizes the monthly balance difference and volume for a specific account. The balance difference is calculated as the difference between the current month's balance and the previous month's balance. This metric helps identify the impact of monthly transactions on the account balance. The volume represents the total transaction amount for each month.

In [None]:
def plot_monthly_balance_diff_and_volume(
    transactions_monthly, account_id
):
    """
    Plot the month-over-month balance change next to the monthly volume of one account.

    Parameters:
    - transactions_monthly (pd.DataFrame): Monthly aggregates with the columns
      'account_id', 'month', 'balance' and 'volume'.
    - account_id: The account whose rows are plotted.
    """
    is_requested_account = transactions_monthly["account_id"] == account_id
    history = (
        transactions_monthly[is_requested_account]
        .sort_values(by="month")
        # The first month has no predecessor, so its difference is NaN and is not drawn
        .assign(balance_diff=lambda frame: frame["balance"].diff())
    )
    month_labels = history["month"].astype(str)

    plt.figure(figsize=(9.5, 6))

    # (column, line style) for every curve, drawn in this order
    curves = (
        ("balance_diff", {"marker": "o", "label": "Balance Difference"}),
        ("volume", {"marker": "x", "linestyle": "--", "label": "Volume"}),
    )
    for column, line_style in curves:
        plt.plot(month_labels, history[column], **line_style)

    plt.title(f"Monthly Balance Difference and Volume for Account {account_id}")
    plt.xlabel("Month")
    plt.ylabel("Value")
    plt.xticks(rotation=90, fontsize=7)
    plt.yticks(fontsize=8)
    plt.legend()
    plt.grid(True)
    plt.show()


# Example account used to sanity-check the aggregation visually
plot_monthly_balance_diff_and_volume(
    transactions_monthly=agg_transactions_monthly_df, account_id=2
)

This plot gives a clear picture of how money moves in and out of an account each month and how these movements affect the overall balance. It does this by showing two things:

- **Balance Difference**: This line shows whether the account balance went up or down each month. If the line goes up, it means the account gained money that month. If it goes down, the account lost money.
- **Volume**: This line shows the net amount of money that moved through the account each month, i.e. incoming amounts minus outgoing amounts (`volume` is the signed sum of all transactions in the month).

There is a direct link between the amount of money moved (volume) and changes in the account balance. High incoming money should lead to an uptick in the balance, and lots of outgoing money should lead to a downturn. It further confirms the aggregation made in the previous step was correct.

## Monthly Balance and Volume

Instead of the difference in balance, we can also look at the monthly balance and volume directly. The `plot_monthly_transactions_balance_and_volume` function visualizes the monthly transactions and balance for a specific account.

In [None]:
def plot_monthly_transactions_balance_and_volume(agg_transactions_monthly, account_id):
    """
    Plot the monthly volume and the running balance of one account.

    Parameters:
    - agg_transactions_monthly (pd.DataFrame): Monthly aggregates with the columns
      'account_id', 'month', 'volume' and 'balance'.
    - account_id: The account whose rows are plotted.
    """
    # Sort chronologically (as `plot_monthly_balance_diff_and_volume` does): the months are
    # plotted as strings, so the x-axis follows the row order of the data.
    account_transactions = agg_transactions_monthly[
        agg_transactions_monthly["account_id"] == account_id
    ].sort_values(by="month")

    plt.figure(figsize=(9.5, 6))

    plt.plot(
        account_transactions["month"].astype(str),
        account_transactions["volume"],
        marker="o",
        label="Volume",
    )
    plt.plot(
        account_transactions["month"].astype(str),
        account_transactions["balance"],
        marker="x",
        linestyle="--",
        label="Balance",
    )

    plt.title(f"Monthly Transactions and Balance for Account {account_id}")
    plt.xlabel("Month")
    plt.ylabel("Value")
    plt.xticks(rotation=90, fontsize=7)
    plt.yticks(fontsize=8)
    plt.legend()
    plt.grid(True)
    plt.show()


# Same example account as in the balance-difference plot
plot_monthly_transactions_balance_and_volume(
    agg_transactions_monthly=agg_transactions_monthly_df, account_id=2
)

This visualization offers a snapshot of an account’s activity over time by comparing money movement each month with the overall account balance. Similarly to the previous plot, it shows two key metrics:

- **Volume**: How much money came in or went out of the account each month. Incoming money is shown as up, and outgoing money as down.
- **Balance**: The total money in the account at the end of each month, showing how it's changed over time due to the monthly transactions.

It shows how the monthly money movement impacts the account's growing or shrinking balance. For example, a few months of high income should visibly increase the balance. It further validates the aggregation made in the previous step.

## Deliverable: Closer Look at Account 14

Let's take a closer look at the monthly transactions, balance, and volume for account 14 as requested by the task.

In [None]:
# Deliverable: monthly volume and balance of account 14
plot_monthly_transactions_balance_and_volume(
    agg_transactions_monthly=agg_transactions_monthly_df, account_id=14
)

Account 14 shows a rather conservative transaction history. The spending habits all stay within a range of -10k to 10k per month. We can see little volatility, and the account shows a slight growth trend.

## Deliverable: Closer Look at Account 18

Let's also examine the monthly transactions, balance, and volume for account 18.

In [None]:
# Deliverable: monthly volume and balance of account 18
plot_monthly_transactions_balance_and_volume(
    agg_transactions_monthly=agg_transactions_monthly_df, account_id=18
)

Account 18 paints a different picture in comparison to account 14.

The volatility here is a lot higher, indicating a potential for a business account or high income household. Especially March 1994 to December 1994 show some volatile transaction habits.

Looking at the balance and volume per month for the accounts 14 and 18 we can notice different patterns. Account 14 shows a rather conservative transaction history with little volatility and a slight trend of growing. Account 18, on the other hand, shows a lot more volatility, indicating a potential business account or high-income household. This highlights the importance of understanding the financial behavior of accounts to identify patterns and trends that can inform decision-making. Ultimately, this is the job of the models we will build in the next steps.

# Pivot Transactions: Rolling Up to Monthly Aggregates

Now we pivot the aggregated transaction data to have each account as a row and the months leading up to card issuance as columns. This transformation aligns with the goal for a single record per account, summarizing transactional behavior in the months before card issuance.

The `pivot_transactions` function aggregates monthly transaction data and merges it with non-transactional account data. It focuses on the time frame leading up to the card issuance, filtering transactions based on a specified range of months before card issuance and aggregating various transaction metrics.

We are mainly interested in the time frame 2 months to 13 months before card issuance. This range allows us to capture transactional behavior in the year leading up to the card issuance, ignoring the month directly before the card issuance to avoid any potential bias from the card issuance itself.

In [None]:
def pivot_transactions(
    non_transactional, transactions_monthly, months_before_card_range=(2, 13)
):
    """
    Aggregate monthly transaction data and merge it with non-transactional account data,
    focusing on the time frame leading up to the card issuance.

    Every monthly row is tagged with the card issue date of its account and with the number
    of calendar months between that row's month and the card issue month. Rows outside the
    requested window are discarded; the rest is pivoted to one row per account.

    Parameters:
    - non_transactional (pd.DataFrame): A DataFrame containing non-transactional account data.
      Only 'account_id' and 'card_issued' are used, to map card issuance dates to transactions.
      Each 'account_id' must occur at most once.
    - transactions_monthly (pd.DataFrame): A DataFrame containing monthly transaction data
      with 'account_id', 'month' (monthly periods) and the metric columns aggregated below.
    - months_before_card_range (tuple): A tuple specifying the inclusive range of months before card
                                        issuance to filter the transactions for aggregation.

    Column names of the result are built as 'M_<months before card>_<metric>'.

    Returns:
    - pd.DataFrame: The final aggregated and pivoted dataset ready for analysis, with each row
                    representing an account and each column a specific metric in the months before
                    card issuance.

    Raises:
    - pandas.errors.MergeError: If an 'account_id' occurs more than once in `non_transactional`.
    """
    # Inner merge keeps only accounts present in `non_transactional`. `validate` fails fast
    # if one account would map to several issue dates (which would duplicate monthly rows).
    merged_df = transactions_monthly.merge(
        non_transactional[["account_id", "card_issued"]].rename(
            columns={"card_issued": "card_issued_date"}
        ),
        on="account_id",
        validate="many_to_one",
    )

    # Month difference computed column-wise instead of via a row-wise `apply`:
    # (issue year - row year) * 12 + (issue month - row month).
    # A missing issue date yields NaN here and is removed by the window filter below.
    issue_dates = pd.DatetimeIndex(merged_df["card_issued_date"])
    row_months = pd.PeriodIndex(merged_df["month"], freq="M")
    merged_df["months_before_card"] = (
        np.asarray(issue_dates.year) - np.asarray(row_months.year)
    ) * 12 + (np.asarray(issue_dates.month) - np.asarray(row_months.month))

    start_month, end_month = months_before_card_range
    in_window = merged_df["months_before_card"].between(start_month, end_month)
    filtered_df = merged_df.loc[in_window].copy()
    # Whole numbers are required so that column names read 'M_2_...' rather than 'M_2.0_...'
    filtered_df["months_before_card"] = filtered_df["months_before_card"].astype(
        "int64"
    )

    aggregated_data = (
        filtered_df.groupby(["account_id", "months_before_card"])
        .agg(
            {
                "volume": "sum",
                "total_abs_amount": "sum",
                "transaction_count": "sum",
                "positive_transaction_count": "sum",
                "negative_transaction_count": "sum",
                "average_amount": "mean",
                "median_amount": "median",
                "min_amount": "min",
                "max_amount": "max",
                # With one monthly row per account and month each group has a single
                # value; its standard deviation is NaN and would wipe out the column,
                # so the monthly standard deviation is carried over via the mean.
                "std_amount": "mean",
                "type_count": "sum",
                "operation_count": "sum",
                "k_symbol_count": "sum",
                "balance": "mean",
            }
        )
        .reset_index()
    )

    pivoted_data = aggregated_data.pivot(
        index="account_id", columns="months_before_card"
    )

    # Flatten the (metric, months_before_card) column MultiIndex to 'M_<n>_<metric>'
    pivoted_data.columns = [
        "_".join(["M", str(col[1]), col[0]]) for col in pivoted_data.columns.values
    ]
    return pivoted_data.reset_index()

# Default window: 2 to 13 months before the (real or artificial) card issue date
transactions_pivoted_df = pivot_transactions(
    non_transactional=matched_non_card_holders_w_issue_date_df,
    transactions_monthly=agg_transactions_monthly_df,
)
transactions_pivoted_df.describe()

# Merge everything together

Finally, we merge the non-transactional data with the pivoted transactional data to create the final golden record. This record contains all relevant information for each account, including the aggregated transactional data for each month leading up to the card issuance date.

The resulting DataFrame has one row per account, with columns representing various metrics for each month before card issuance along with non-transactional data like client and account IDs, card issuance dates, and other relevant information.

We can merge the non-transactional data with the pivoted transactional data using the `account_id` as the common key. As each transaction is linked to an account, this key ensures that the transactional data is correctly associated with the respective account.

In [None]:
# Left join: every client row is kept, even if the pivot holds no monthly data for its account
golden_record_df = pd.merge(
    matched_non_card_holders_w_issue_date_df,
    transactions_pivoted_df,
    how="left",
    on="account_id",
)

data_reduction["Final Golden Record"] = len(golden_record_df)
golden_record_df.head()

Looking at the first few rows of the final golden record, we can see the aggregated transactional data for each account, with columns representing various metrics for each month leading up to the card issuance date.

Additionally we can verify the uniqueness of `client_id` and `account_id` in the final DataFrame.

In [None]:
# The golden record must hold exactly one row per client and per account
assert not golden_record_df["client_id"].duplicated().any(), (
    "Each client_id should appear exactly once in the final DataFrame."
)

assert not golden_record_df["account_id"].duplicated().any(), (
    "Each account_id should appear exactly once in the final DataFrame."
)

In [None]:
# Number of golden-record rows per value of the `has_card` flag
plt.figure()
sns.countplot(data=golden_record_df, x="has_card")
plt.title("Number of Clients by Card Issuance Status")
plt.xlabel("Card Issued")
plt.ylabel("Count")
plt.show()

Looking at the card issuance status we can see that the number of clients with a card issued is equal to the number of clients without a card issued.

In [None]:
# Issue-date histogram (with KDE overlay), split by the `has_card` flag
plt.figure()
sns.histplot(
    data=golden_record_df,
    x="card_issued",
    hue="has_card",
    bins=30,
    kde=True,
    alpha=0.5,
)
plt.title("Distribution of Card Issuance Dates")
plt.xlabel("Card Issuance Date")
plt.ylabel("Count")
plt.show()

The distribution of card issuance dates shows that the card issuance process was spread out over time, with an expected identical distribution for clients with and without cards issued. This makes sense as we set the artificial card issue date for each non-cardholder based on the matching results.

# Data Reduction Summary

The following waterfall chart visualizes the data reduction process, highlighting the number of records retained or lost at each stage.

In [None]:
# One row per recorded pipeline step, in insertion order of `data_reduction`
data_reduction_df = pd.DataFrame(
    [(category, amount) for category, amount in data_reduction.items()],
    columns=["Category", "Amount"],
)
colors = [
    "skyblue" if amount >= 0 else "orange" for amount in data_reduction_df["Amount"]
]

fig = go.Figure()
fig.add_trace(
    go.Waterfall(
        name="20",
        orientation="v",
        # Every step is a relative change; only the last entry is drawn as a total
        measure=["relative"] * (len(data_reduction_df) - 1) + ["total"],
        x=data_reduction_df["Category"],
        y=data_reduction_df["Amount"],
        text=[f"{amount:,.0f}" for amount in data_reduction_df["Amount"]],
        textposition="outside",
        connector=dict(line=dict(color="black", width=2)),
        increasing=dict(marker=dict(color="skyblue")),
        decreasing=dict(marker=dict(color="orange")),
        totals=dict(marker=dict(color="skyblue")),
    )
)

fig.update_layout(
    title="Enhanced Data Reduction Waterfall Chart",
    xaxis={"title": "Category"},
    yaxis={"title": "Amount", "range": [0, 5500]},
    waterfallgap=0.3,
)
fig.show()

The waterfall chart provides a visual representation of the data reduction process, illustrating the number of records retained or lost at each stage. The chart shows the reduction in the number of records from the initial dataset to the final golden record, highlighting the impact of each step in the data preparation pipeline:

- **Initial Dataset**: The starting point with the full dataset of 4500 accounts/records.
- **Junior Accounts**: The removal of junior accounts, reducing the dataset by 145 records.
- **Clients without sufficient history**: The elimination of clients without sufficient transactional history, resulting in a reduction of 419 records.
- **Non-cardholders without match**: The filtering out of non-cardholders without a match, leading to a decrease of 3'280 records.
- **Final Golden Record**: The final dataset with 656 records, each representing a unique account with aggregated transactional data for each month leading up to the card issuance date.

# Exploratory Data Analysis: Golden Record

With the final golden record in hand, we can now perform exploratory data analysis to gain insights into the financial behavior of cardholders and non-cardholders.

## Comparing Cardholders and Non-Cardholders

We will focus on comparing the financial behavior of cardholders and non-cardholders to identify any significant differences in their transactional patterns. This analysis can help us understand how cardholders differ from non-cardholders in terms of transaction volume, balance, and other financial metrics giving us an impression of the financial behavior of cardholders and non-cardholders.

### Trends in Financial Metrics

The function `plot_trends_with_medians` generates line graphs for average monthly values and annotates medians for specified ranges. This visualization helps identify trends in financial metrics over time and highlights the median values for specific periods, providing insights into the distribution of values.

In [None]:
# Split the golden record by the `has_card` flag
golden_cardholders = golden_record_df.loc[golden_record_df["has_card"]]
golden_non_cardholders = golden_record_df.loc[~golden_record_df["has_card"]]


def plot_trends_with_medians(
    cardholders, non_cardholders, columns, title, median_ranges
):
    """
    Draws one line per group showing the mean of every monthly column and overlays,
    for each requested window, a dashed horizontal line at that group's median level.

    The x-axis is numbered 1..len(columns) in the order of `columns`, independent of
    the month numbers that appear in the column names.

    Parameters:
    - cardholders (pd.DataFrame): DataFrame containing data for cardholders.
    - non_cardholders (pd.DataFrame): DataFrame containing data for non-cardholders.
    - columns (list of str): Column names ordered by time.
    - title (str): Title for the plot.
    - median_ranges (list of tuples): (start, end) positions into `columns`, both inclusive.
      The level drawn for a window is the median of the per-column medians in that window.
    """
    # (legend name, data, colour of the mean line, colour of the median lines)
    groups = [
        ("Cardholders", cardholders, "blue", "darkblue"),
        ("Non-Cardholders", non_cardholders, "orange", "red"),
    ]

    group_means = [frame[columns].mean() for _, frame, _, _ in groups]
    months = list(range(1, 1 + len(columns)))

    plt.figure()
    for (group_name, _, line_color, _), means in zip(groups, group_means):
        plt.plot(
            months,
            means.values,
            marker="o",
            linestyle="-",
            color=line_color,
            label=group_name,
        )

    for start, end in median_ranges:
        window = columns[start : end + 1]
        window_levels = [frame[window].median().median() for _, frame, _, _ in groups]

        for (group_name, _, _, median_color), level in zip(groups, window_levels):
            plt.hlines(
                level,
                months[start],
                months[end],
                colors=median_color,
                linestyles="--",
                label=f"Median {start+1}-{end+1} ({group_name}): {level:.2f}",
            )

    plt.title(title)
    plt.xlabel("Month")
    plt.ylabel("Value")
    plt.legend()
    plt.grid(True)
    # Tick labels follow the 1-based position on the x-axis.
    plt.xticks(months, labels=[f"M_{month}" for month in months])
    plt.show()

### Monthly Balance Trends

In [None]:
# Balance columns M_2 ... M_13 in month order. The median ranges are inclusive
# positions in that list: the first three and the last three plotted months.
plot_trends_with_medians(
    golden_cardholders,
    golden_non_cardholders,
    [f"M_{i}_balance" for i in range(2, 14)],
    "Monthly Balance Trends",
    [(0, 2), (9, 11)]
)

Starting with the monthly balance trends, we can observe how the average balance changes over time for cardholders and non-cardholders. The line graph shows the average monthly balance for each group, with annotations indicating the median balance for specific periods.

It is interesting to note that the median balance for cardholders is consistently higher than that of non-cardholders, indicating a potential difference in financial stability or spending habits between the two groups over time.

### Monthly Volume Trends

In [None]:
# Volume columns M_2 ... M_13 in month order. The median ranges are inclusive
# positions in that list: the first three and the last three plotted months.
plot_trends_with_medians(
    golden_cardholders,
    golden_non_cardholders,
    [f"M_{i}_volume" for i in range(2, 14)],
    "Monthly Volume Trends",
    [(0, 2), (9, 11)]
)

The monthly volume trends show the average monthly transaction volume for cardholders and non-cardholders over time. The line graph illustrates how the transaction volume changes each month, with annotations highlighting the median volume for specific periods.

The median volume for cardholders is higher than that of non-cardholders, indicating a potential difference in transactional activity or spending patterns between the two groups. However, when looking solely at the volume, the difference is not as pronounced as with the balance, and the series is very volatile. This is plausible because the volume is the sum of all transactions in a month, which naturally fluctuates strongly.

### Monthly Transaction Count Trends

In [None]:
# Transaction count columns M_2 ... M_13 in month order. The median ranges are inclusive
# positions in that list: the first three and the last three plotted months.
plot_trends_with_medians(
    golden_cardholders,
    golden_non_cardholders,
    [f"M_{i}_transaction_count" for i in range(2, 14)],
    "Monthly Transaction Count Trends",
    [(0, 2), (9, 11)]
)

The monthly transaction count trends show the average number of transactions per month for cardholders and non-cardholders over time. The line graph displays how the transaction count changes each month, with annotations indicating the median count for specific periods.

The median transaction count for cardholders is higher than that of non-cardholders, suggesting a difference in transactional activity or spending habits between the two groups. This is in line with the volume trends, indicating that cardholders tend to have more transactions on average than non-cardholders.

### Monthly Positive and Negative Transaction Count Trends

In [None]:
# Positive transaction count columns M_2 ... M_13 in month order. The median ranges are
# inclusive positions in that list: the first three and the last three plotted months.
plot_trends_with_medians(
    golden_cardholders,
    golden_non_cardholders,
    [f"M_{i}_positive_transaction_count" for i in range(2, 14)],
    "Monthly Positive Transaction Count Trends",
    [(0, 2), (9, 11)]
)

The monthly positive transaction count trends show the average number of positive (incoming) transactions per month for cardholders and non-cardholders over time. The line graph displays how this count changes each month, with annotations indicating the median count for specific periods.

The median transaction count for cardholders is only slightly higher than that of non-cardholders, indicating that the difference in transactional activity is not as pronounced for positive transactions. This suggests that both groups have similar patterns of money coming in each month.

In [None]:
# Negative transaction count columns M_2 ... M_13 in month order. The median ranges are
# inclusive positions in that list: the first three and the last three plotted months.
plot_trends_with_medians(
    golden_cardholders,
    golden_non_cardholders,
    [f"M_{i}_negative_transaction_count" for i in range(2, 14)],
    "Monthly Negative Transaction Count Trends",
    [(0, 2), (9, 11)]
)

The monthly negative transaction count trends show the average number of negative (outgoing) transactions per month for cardholders and non-cardholders over time. The line graph displays how this count changes each month, with annotations indicating the median count for specific periods.

The picture is pretty similar to the positive transaction count trends, indicating that both groups have similar patterns of money going out each month. This suggests that the difference in transactional activity between cardholders and non-cardholders is not as pronounced for negative transactions as well as for positive transactions.

### Loan Amount

In [None]:
# Mean loan amount per group. Series.mean() skips missing values, so rows without a
# loan_amount entry do not contribute.
# NOTE(review): presumably accounts without a loan are stored as NaN rather than 0 — confirm,
# because that decides whether this is the mean over borrowers or over all accounts.
avg_loan_amount_cardholders = golden_cardholders["loan_amount"].mean()
avg_loan_amount_non_cardholders = golden_non_cardholders["loan_amount"].mean()

# One bar per group, drawn on the figure opened here.
plt.figure()
plt.title("Average Loan Amount by Card Issuance Status")
sns.barplot(
    x=["Cardholders", "Non-Cardholders"],
    y=[avg_loan_amount_cardholders, avg_loan_amount_non_cardholders],
)

plt.ylabel("Average Loan Amount")
plt.show()

The average loan amount for cardholders is higher than that of non-cardholders, indicating that cardholders tend to have higher loan amounts. Yet, this difference is not as pronounced or as significant as the differences seen in the balance, volume, and transaction count trends.

In [None]:
# Persist the golden record; the modelling part below reloads it from this path.
# NOTE(review): assumes the temp/ directory already exists — confirm it is created earlier.
golden_record_df.to_parquet("temp/golden_record.parquet")

In [None]:
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import joblib

# Single source of truth for every random seed used in the modelling part.
SEED = 1337

# Reload the golden record written at the end of the data preparation part.
golden_record_df = pd.read_parquet("temp/golden_record.parquet")
gs_cache_file = "data/grid_search_cache.pkl"
reduced_model_cache_file = "data/reduced_best_model.pkl"

# Seed the global NumPy and stdlib generators from SEED (not a repeated literal) so the
# value cannot drift apart from the one passed to the models.
np.random.seed(SEED)
random.seed(SEED)

# Data Partitioning

The data is split in a 80/20 ratio for training and testing purposes. The stratification ensures that the distribution of the target variable is maintained in both sets. When actually training the models, we will additionally use cross-validation to ensure robust evaluation.

Additionally, we will create a `DataModule` class to encapsulate the training and testing data, as well as the feature columns used in the model. This class will help us manage the data and features throughout the model training and evaluation process.


In [None]:
from sklearn.model_selection import train_test_split


class DataModule:
    """
    Container bundling a train/test split with the names of its feature columns.

    Parameters:
    - X_train, X_test: feature tables used for training and testing.
    - y_train, y_test: target values matching the two feature tables.
    - feature_columns: names of the features; defaults to the columns of X_train.
    """

    def __init__(self, X_train, X_test, y_train, y_test, feature_columns=None):
        if feature_columns is None:
            feature_columns = X_train.columns

        self.feature_columns = feature_columns
        self.X_train, self.X_test = X_train, X_test
        self.y_train, self.y_test = y_train, y_test


def create_data_module(
    df, feature_cols, target_col="has_card", test_size=0.2, random_state=None
):
    """
    Splits a DataFrame into stratified train and test sets and wraps them in a DataModule.

    Parameters:
    - df: pandas.DataFrame containing the features and the target column.
    - feature_cols: names of the columns to use as features.
    - target_col: string, name of the target column; it is also the stratification variable.
    - test_size: fraction of the rows that goes into the test set.
    - random_state: optional seed forwarded to train_test_split. With the default (None)
      the split draws from NumPy's global random state, exactly as before.

    Returns:
    - DataModule holding X_train, X_test, y_train and y_test.
    """
    # The target is dropped before selecting, so listing it in feature_cols raises a
    # KeyError instead of silently leaking the label into the features.
    X = df.drop(columns=[target_col])[feature_cols]
    y = df[target_col]

    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        stratify=y,
        shuffle=True,
        random_state=random_state,
    )

    return DataModule(X_train, X_test, y_train, y_test)


# Use every column except the target as a feature.
data_module = create_data_module(
    golden_record_df, golden_record_df.drop(columns=["has_card"]).columns
)

print(f"Train set size: {len(data_module.X_train)}")
print(f"Test set size: {len(data_module.X_test)}")

# Relative class frequencies; with stratification both sets should show the same ratio.
print(f"Train set distribution:\n{data_module.y_train.value_counts(normalize=True)}")
print(f"Test set distribution:\n{data_module.y_test.value_counts(normalize=True)}")

As we can see the distribution of the target variable is maintained in both sets after the split. The ratios are as specified in the 80/20 split.


# Model Construction

We will now construct a pipeline for training and evaluating machine learning models. The pipeline will handle preprocessing, model training, cross-validation, and evaluation. We will use this pipeline to train and evaluate multiple candidate models.

## Pipeline for Training and Evaluation

The `Trainer` class is designed to streamline the process of training and evaluating machine learning models. It performs the following steps:

1. **Preprocessing**: The function automatically handles numerical and categorical features, imputing missing values, scaling numerical features, and one-hot encoding categorical features.
2. **Model Training**: The specified model is trained on the training data.
3. **Cross-Validation**: The model is evaluated using cross-validation with specified evaluation metrics.
4. **Model Evaluation**: The model is evaluated on the test set using various metrics, including accuracy, F1 score, AUC-ROC, precision, and recall.

The pipeline is flexible and can accommodate various models and feature sets, making it a versatile tool for model development and evaluation. It returns a summary of evaluation metrics for both training and test sets, as well as the true labels and predicted probabilities for the test set.

Additionally, the pipeline supports feature selection using Recursive Feature Elimination with Cross-Validation (RFECV). This feature selection method automatically selects the most relevant features based on the model's performance during cross-validation. The selected features can be retrieved from the pipeline after training.

Last but not least, the pipeline supports hyperparameter tuning using Grid Search with Cross-Validation. This functionality allows for optimizing the model's hyperparameters to improve performance. The best hyperparameters can be retrieved after training the model.

In [None]:
from sklearn.feature_selection import RFECV
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.metrics import (
    make_scorer,
    f1_score,
    roc_auc_score,
    precision_score,
    recall_score,
)
import scikitplot as skplt
import dalex as dx


class Trainer:
    """
    Builds, fits and evaluates a preprocessing + model pipeline for a single estimator.

    The fitted pipeline has two steps: "preprocessor" (a ColumnTransformer) and
    "model_pipeline". The latter is the estimator, optionally preceded by RFECV feature
    selection and optionally wrapped in a GridSearchCV.

    Parameters:
    - data_module: object exposing X_train, X_test, y_train, y_test and feature_columns.
    - model: scikit-learn compatible classifier.
    - cv: cross-validation setting forwarded to RFECV, GridSearchCV and cross_validate.
    - select_features: bool, whether to put RFECV in front of the model.
    - param_grid: optional dict of hyperparameters for GridSearchCV; keys address the steps
      of the model pipeline (for example "model__C").
    - verbose: bool, switches the wrapped scikit-learn tools to verbose level 3.
    - n_jobs: number of parallel jobs for GridSearchCV and cross_validate.
    """

    def __init__(
        self,
        data_module,
        model,
        cv=10,
        select_features=False,
        param_grid=None,
        verbose=False,
        n_jobs=-1,
    ):
        self.data_module = data_module
        self.model = model
        self.cv = cv
        self.verbose = verbose
        self.preprocessor = self._create_preprocessor()
        self.select_features = select_features
        self.param_grid = param_grid
        self.n_jobs = n_jobs
        self.pipeline = None
        self.train_metrics_report = None
        self.test_metrics_report = None

    def _create_preprocessor(self):
        """
        Creates the ColumnTransformer used as the first pipeline step.

        int64/float64 columns are mean-imputed and standardised. Every other column is
        treated as categorical: imputed with the most frequent value and one-hot encoded.
        """
        X_train = self.data_module.X_train

        numerical_features = [
            col for col in X_train.columns if X_train[col].dtype in ["int64", "float64"]
        ]
        # Everything that is not numerical is categorical, so no column is left over.
        categorical_features = [
            col for col in X_train.columns if col not in numerical_features
        ]

        numerical_pipeline = Pipeline(
            [("imputer", SimpleImputer(strategy="mean")), ("scaler", StandardScaler())]
        )

        categorical_pipeline = Pipeline(
            [
                ("imputer", SimpleImputer(strategy="most_frequent")),
                ("onehot", OneHotEncoder(handle_unknown="ignore")),
            ]
        )

        return ColumnTransformer(
            transformers=[
                ("num", numerical_pipeline, numerical_features),
                ("cat", categorical_pipeline, categorical_features),
            ]
        )

    def fit(self):
        """
        Assembles the full pipeline and fits it on the training data.

        Returns:
        - self, to allow chaining.
        """
        model_pipeline_steps = [("model", self.model)]
        if self.select_features:
            model_pipeline_steps.insert(
                0,
                (
                    "feature_selection",
                    RFECV(self.model, verbose=3 if self.verbose else 0, cv=self.cv),
                ),
            )

        model_pipeline = Pipeline(model_pipeline_steps)

        if self.param_grid is not None:
            model_pipeline = GridSearchCV(
                model_pipeline,
                self.param_grid,
                cv=self.cv,
                verbose=3 if self.verbose else 0,
                n_jobs=self.n_jobs,
            )

        self.pipeline = Pipeline(
            [("preprocessor", self.preprocessor), ("model_pipeline", model_pipeline)]
        )

        self.pipeline.fit(self.data_module.X_train, self.data_module.y_train)
        return self

    @staticmethod
    def get_scoring_metrics():
        """Returns the metric names used as keys in the train and test metric reports."""
        return ["accuracy", "f1_macro", "roc_auc", "precision", "recall"]

    def eval_train(self):
        """
        Cross-validates the pipeline on the training data and stores the result.

        For every metric the report holds the per-fold scores, their mean and their standard
        deviation. Under "roc_data" it additionally holds, per fold, the held-out labels
        and the predicted probabilities of that fold's estimator.

        Returns:
        - self, to allow chaining.
        """
        scoring = {
            "accuracy": "accuracy",
            # Macro-averaged so the cross-validated value matches its "f1_macro" label
            # and the macro F1 reported by eval_test.
            "f1_macro": make_scorer(f1_score, average="macro"),
            "roc_auc": "roc_auc",
            "precision": make_scorer(precision_score),
            "recall": make_scorer(recall_score),
        }

        cv_results = cross_validate(
            self.pipeline,
            self.data_module.X_train,
            self.data_module.y_train,
            scoring=scoring,
            cv=self.cv,
            return_train_score=False,
            n_jobs=self.n_jobs,
            verbose=3 if self.verbose else 0,
            return_estimator=True,
            return_indices=True,
            error_score="raise",
        )

        self.train_metrics_report = {
            metric: {
                "folds": cv_results[f"test_{metric}"].tolist(),
                "mean": cv_results[f"test_{metric}"].mean(),
                "std": cv_results[f"test_{metric}"].std(),
            }
            for metric in scoring
        }

        # Iterate over what cross_validate actually returned instead of range(self.cv),
        # so the loop does not depend on cv being an integer.
        roc_data = []
        for estimator, test_indices in zip(
            cv_results["estimator"], cv_results["indices"]["test"]
        ):
            true_labels = self.data_module.y_train.iloc[test_indices]
            y_pred_proba = estimator.predict_proba(
                self.data_module.X_train.iloc[test_indices]
            )
            roc_data.append((true_labels, y_pred_proba))

        self.train_metrics_report["roc_data"] = roc_data

        return self

    def eval_test(self):
        """
        Scores the fitted pipeline on the test set and stores the result.

        roc_auc is NaN when the pipeline does not offer predict_proba.

        Returns:
        - self, to allow chaining.
        """
        X_test, y_test = self.data_module.X_test, self.data_module.y_test

        has_proba = hasattr(self.pipeline, "predict_proba")
        y_pred_proba = self.pipeline.predict_proba(X_test)[:, 1] if has_proba else np.nan
        # Predict once and reuse the labels for every label-based metric.
        y_pred = self.pipeline.predict(X_test)

        self.test_metrics_report = {
            "accuracy": self.pipeline.score(X_test, y_test),
            "f1_macro": f1_score(y_test, y_pred, average="macro"),
            "roc_auc": roc_auc_score(y_test, y_pred_proba) if has_proba else np.nan,
            "precision": precision_score(y_test, y_pred),
            "recall": recall_score(y_test, y_pred),
        }

        return self

    def get_pipeline(self):
        """Returns the full pipeline (None before fit)."""
        return self.pipeline

    def get_preprocessor(self):
        """Returns the ColumnTransformer used as preprocessing step."""
        return self.preprocessor

    def get_train_metrics_report(self):
        """Returns the cross-validation report (None before eval_train)."""
        return self.train_metrics_report

    def get_test_metrics_report(self):
        """Returns the test set report (None before eval_test)."""
        return self.test_metrics_report

    def get_best_params(self):
        """
        Returns the best hyperparameters found by the grid search.

        Only the leading step name is removed from each key ("model__C" becomes "C"), so
        nested parameter names stay intact ("model__base__max_depth" becomes
        "base__max_depth").

        Raises:
        - ValueError: if no hyperparameter grid was provided.
        """
        if self.param_grid is None:
            raise ValueError(
                "No hyperparameter grid was provided during model training."
            )

        best_params = self.pipeline["model_pipeline"].best_params_
        return {
            (key.partition("__")[2] or key): value
            for key, value in best_params.items()
        }

    def get_selected_features(self):
        """
        Returns the original feature columns that survived the RFECV feature selection.

        A column counts as selected when at least one of its transformed features was
        kept, i.e. the column itself or one of its one-hot encoded "<column>_<category>"
        features.

        Raises:
        - ValueError: if feature selection was not enabled or the pipeline is not fitted.
        """
        if not self.select_features:
            raise ValueError("Feature selection was not enabled during model training.")

        not_fitted_message = (
            "Feature selection has not been performed or the model is not fitted."
        )
        if self.pipeline is None:
            raise ValueError(not_fitted_message)

        model_pipeline = self.pipeline.named_steps["model_pipeline"]
        # With a hyperparameter grid this step is a GridSearchCV, which has no named_steps;
        # the refitted inner pipeline is found in its best_estimator_.
        model_pipeline = getattr(model_pipeline, "best_estimator_", model_pipeline)

        named_steps = getattr(model_pipeline, "named_steps", {})
        if "feature_selection" not in named_steps:
            raise ValueError(not_fitted_message)

        feature_mask = named_steps["feature_selection"].support_
        feature_names = self._get_feature_names_from_preprocessor()

        selected_features = [
            feature
            for feature, is_selected in zip(feature_names, feature_mask)
            if is_selected
        ]

        # Match on the full name or the one-hot prefix rather than on any substring;
        # otherwise "age" would be reported whenever "average_salary" is selected.
        return [
            feature
            for feature in self.data_module.feature_columns
            if any(
                name == feature or name.startswith(f"{feature}_")
                for name in selected_features
            )
        ]

    def _get_feature_names_from_preprocessor(self):
        """Returns the names of the transformed features, in preprocessor output order."""
        feature_names = []

        for _, transformer, columns in self.preprocessor.transformers_:
            # Dropped columns produce no output features.
            if isinstance(transformer, str) and transformer == "drop":
                continue
            # A transformer without columns contributes nothing and may not be fitted.
            if hasattr(columns, "__len__") and len(columns) == 0:
                continue

            if hasattr(transformer, "get_feature_names_out"):
                feature_names.extend(transformer.get_feature_names_out(columns))
            else:
                feature_names.extend(columns)

        return feature_names

Similarly to the `Trainer` class, the `Visualizer` class is designed to streamline the process of visualizing model performance and explanations. It provides a variety of visualization methods for evaluating models, including confusion matrices, classification reports, ROC curves, precision-recall curves, and feature importances.

In [None]:
from sklearn.metrics import roc_curve, classification_report, precision_recall_curve

import plotly.express as px

class Visualizer:
    """
    Plots evaluation results and model explanations for one fitted Trainer.

    Methods ending in `_test` work on the test set, methods ending in `_eval` on the
    cross-validation results stored by Trainer.eval_train. The static `compare_*`
    methods put several visualizers into one chart.

    Parameters:
    - trainer: fitted Trainer whose pipeline offers predict_proba.
    - model_name: string, name of the model used in plot titles and legends.
    """

    def __init__(self, trainer, model_name):
        self.trainer = trainer
        self.model_name = model_name

        X_test = self.trainer.data_module.X_test
        y_test = self.trainer.data_module.y_test

        self.explainer = dx.Explainer(trainer.get_pipeline(), X_test, y_test)

        self.X_test = X_test
        self.y_true = y_test
        self.y_test_pred_proba = trainer.get_pipeline().predict_proba(X_test)

    @staticmethod
    def compare_evaluation_metrics(visualizers):
        """
        Draws a grouped bar chart of the cross-validated metrics of several models.

        Bar height is the mean over the folds, the error bar the standard deviation.

        Parameters:
        - visualizers: list of Visualizer whose trainers have run eval_train.
        """
        model_names = [viz.model_name for viz in visualizers]
        metrics = Trainer.get_scoring_metrics()

        means = {metric: [] for metric in metrics}
        stds = {metric: [] for metric in metrics}
        for viz in visualizers:
            train_metrics = viz.trainer.get_train_metrics_report()
            for metric in metrics:
                means[metric].append(np.mean(train_metrics[metric]["folds"]))
                stds[metric].append(np.std(train_metrics[metric]["folds"]))

        n_groups = len(metrics)
        bar_width = 0.15
        index = np.arange(n_groups)
        opacity = 0.8

        plt.figure(figsize=(9, 7))
        colors = plt.cm.viridis(np.linspace(0, 1, len(model_names)))

        for i, model_name in enumerate(model_names):
            bar_positions = index + bar_width * i
            bar_values = [means[metric][i] for metric in metrics]
            error_values = [stds[metric][i] for metric in metrics]

            bars = plt.bar(
                bar_positions,
                bar_values,
                bar_width,
                alpha=opacity,
                color=colors[i],
                yerr=error_values,
                capsize=5,
                label=model_name,
            )

            # Write "mean ± std" just above the error bar of every bar.
            for bar, error in zip(bars, error_values):
                yval = bar.get_height()
                plt.text(
                    bar.get_x() + bar.get_width() / 2,
                    yval + error + 0.01,
                    f"{yval:.2f} ± {error:.2f}",
                    ha='center',
                    va='bottom',
                    fontsize=9,
                    rotation=90,
                )

        plt.xlabel('Metrics', fontsize=14)
        plt.ylabel('Scores', fontsize=14)
        plt.title(
            f'Cross-Validation (k={visualizers[0].trainer.cv}) Evaluation Metrics Comparison',
            fontsize=16,
        )
        plt.xticks(
            index + bar_width * (len(model_names) - 1) / 2,
            metrics,
            fontsize=12,
            rotation=90,
        )
        # Head room above 1.0 for the rotated value labels.
        plt.ylim(0, 1.21)
        plt.legend(loc="upper left", bbox_to_anchor=(1, 1))
        plt.grid(True, which='major', linestyle='--', linewidth='0.5', color='grey')
        plt.tight_layout()
        plt.show()

    @staticmethod
    def compare_roc_curves(visualizers, dataset):
        """
        Draws the ROC curves of several models into one chart.

        Parameters:
        - visualizers: list of Visualizer to compare.
        - dataset: "test" for the test set, or "eval" for the held-out predictions of all
          cross-validation folds pooled into one curve per model.

        Raises:
        - ValueError: if dataset is neither "test" nor "eval".
        """
        if dataset not in ["test", "eval"]:
            raise ValueError("Invalid dataset option. Choose 'test' or 'eval'.")

        plt.figure(figsize=(8, 8))
        colors = plt.cm.viridis(np.linspace(0, 1, len(visualizers)))

        for i, viz in enumerate(visualizers):
            if dataset == "test":
                y_true = viz.trainer.data_module.y_test
                # The fitted model is exposed through Trainer.get_pipeline().
                y_scores = viz.trainer.get_pipeline().predict_proba(
                    viz.trainer.data_module.X_test
                )[:, 1]
            elif dataset == "eval":
                y_true = []
                y_scores = []
                for fold in viz.trainer.get_train_metrics_report()["roc_data"]:
                    y_true.extend(fold[0])
                    y_scores.extend(fold[1][:, 1])

            fpr, tpr, _ = roc_curve(y_true, y_scores)
            auc_score = roc_auc_score(y_true, y_scores)
            plt.plot(
                fpr,
                tpr,
                label=f"{viz.model_name} (AUC = {auc_score:.2f})",
                color=colors[i],
            )

        plt.plot([0, 1], [0, 1], "k--")
        plt.xlabel("False Positive Rate")
        plt.ylabel("True Positive Rate")

        # NOTE(review): the fold count in the "eval" title is hard-coded to 10.
        title = None
        if dataset == "test":
            title = "ROC Curve Comparison on Test Set"
        elif dataset == "eval":
            title = "ROC Curve Comparison on Evaluation Set (Averaged over Folds with CV=10)"

        plt.title(title)
        plt.legend(loc="lower right")
        plt.show()

    def plot_validation_metrics(self):
        """Shows the per-fold metric scores as boxplots next to a bar chart of their means."""
        train_metrics = self.trainer.get_train_metrics_report()
        cv = len(train_metrics["accuracy"]["folds"])
        metrics = self.trainer.get_scoring_metrics()
        fold_scores = {metric: train_metrics[metric]["folds"] for metric in metrics}

        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))
        bp = ax1.boxplot(
            fold_scores.values(),
            labels=metrics,
            notch=True,
            patch_artist=True,
            positions=np.arange(len(metrics))+1,
        )
        for box in bp['boxes']:
            box.set(color='blue', linewidth=2)
            box.set(facecolor='lightblue')
        ax1.set_title('Boxplot of Metrics')
        ax1.set_ylabel('Scores')
        ax1.set_ylim(0, 1)
        ax1.grid(True)

        means = [np.mean(values) for values in fold_scores.values()]
        std_devs = [np.std(values) for values in fold_scores.values()]
        bar_positions = np.arange(1, len(metrics)+1)
        ax2.bar(bar_positions, means, align='center', alpha=0.7, color='green', capsize=10)
        ax2.set_xticks(bar_positions)
        ax2.set_xticklabels(metrics)
        ax2.set_title('Bar Chart of Average Metrics')
        ax2.set_ylabel('Average Score')
        ax2.set_ylim(0, 1)
        ax2.grid(True)

        for idx, (mean, std) in enumerate(zip(means, std_devs)):
            ax2.text(
                idx + 1,
                mean,
                f'{mean:.2f}±{std:.2f}',
                ha='center',
                va='bottom',
                fontsize=9,
                color='darkred',
            )

        plt.tight_layout()
        plt.suptitle(f"{self.model_name}: Validation Metrics Comparison (CV={cv})", fontsize=16)
        # Make room for the suptitle after tight_layout.
        plt.subplots_adjust(top=0.85)
        plt.show()

    def plot_test_metrics(self):
        """Shows the test set metrics as a bar chart; NaN metrics are labelled "N/A"."""
        test_metrics = self.trainer.get_test_metrics_report()
        test_values = list(test_metrics.values())
        test_names = list(test_metrics.keys())

        sns.barplot(x=test_names, y=test_values)
        plt.title(f"{self.model_name}: Test Metrics")
        plt.xlabel("Metrics")
        plt.ylabel("Score")
        for i, v in enumerate(test_values):
            if np.isnan(v):
                plt.text(i, 0.5, "N/A", ha="center", va="bottom")
            else:
                plt.text(i, v + 0.01, f"{v:.2f}", ha="center", va="bottom")
        plt.ylim(0, 1)
        plt.grid(True)
        plt.show()

    def plot_confusion_matrix_test(self):
        """Shows the confusion matrix of the most probable class on the test set."""
        preds = self.y_test_pred_proba.argmax(axis=1)
        skplt.metrics.plot_confusion_matrix(self.y_true, preds)
        plt.title(f"{self.model_name}: Confusion Matrix on Test Set")
        plt.show()

    def plot_confusion_matrix_eval(self):
        """Shows the confusion matrix of the held-out predictions of all folds combined."""
        y_true = []
        y_pred = []
        for true_labels, y_pred_proba in self.trainer.get_train_metrics_report()["roc_data"]:
            y_true.extend(true_labels)
            # argmax over the class axis of the full probability matrix, as in the
            # test set variant.
            y_pred.extend(y_pred_proba.argmax(axis=1))

        skplt.metrics.plot_confusion_matrix(y_true, y_pred)
        plt.title(f"{self.model_name}: Confusion Matrix on Evaluation Set")
        plt.show()

    def plot_classification_report_test(self):
        """Renders the test set classification report as a table figure."""
        preds = self.y_test_pred_proba.argmax(axis=1)
        report = classification_report(self.y_true, preds, output_dict=True)

        report_df = pd.DataFrame(report).transpose()
        report_df = report_df.round(2)

        table = plt.table(
            cellText=report_df.values,
            colLabels=report_df.columns,
            rowLabels=report_df.index,
            cellLoc="center",
            rowLoc="center",
            loc="center",
            fontsize=12,
        )
        table.auto_set_font_size(False)
        table.set_fontsize(12)
        table.scale(1.2, 1.2)

        plt.axis("off")
        plt.title(f"{self.model_name}: Classification Report on Test Set")
        plt.show()

    def plot_roc_curve_test(self):
        """Shows the ROC curves on the test set."""
        skplt.metrics.plot_roc(
            self.y_true, self.y_test_pred_proba, plot_micro=False, plot_macro=True
        )
        plt.title(f"{self.model_name}: ROC Curve on Test Set")
        plt.show()

    def plot_roc_curve_eval(self, show_folds=False):
        """
        Shows one ROC curve per cross-validation fold.

        Parameters:
        - show_folds: bool, whether to add the legend listing each fold's AUC.
        """
        fig, ax = plt.subplots(figsize=(8, 8))
        colors = plt.cm.viridis(np.linspace(0, 1, self.trainer.cv))

        roc_data = self.trainer.get_train_metrics_report()["roc_data"]
        for k in range(self.trainer.cv):
            true_labels, y_pred_proba = roc_data[k]
            fpr, tpr, _ = roc_curve(true_labels, y_pred_proba[:, 1])
            auc_score = roc_auc_score(true_labels, y_pred_proba[:, 1])
            ax.plot(
                fpr, tpr, color=colors[k], label=f"Fold {k + 1} (AUC = {auc_score:.2f})"
            )

        plt.title(
            f"{self.model_name}: ROC Curves for each fold (CV={self.trainer.cv}, "
            f'Mean AUROC={self.trainer.train_metrics_report["roc_auc"]["mean"]:.2f})'
        )
        if show_folds:
            plt.legend(loc="lower right")

        plt.plot([0, 1], [0, 1], color="gray", linestyle="--")

        plt.xlabel("False Positive Rate")
        plt.ylabel("True Positive Rate")
        plt.grid(True)
        plt.show()

    def plot_precision_recall_curve_test(self):
        """Shows the precision-recall curves on the test set."""
        skplt.metrics.plot_precision_recall(self.y_true, self.y_test_pred_proba)
        plt.title(f"{self.model_name}: Precision-Recall Curve on Test Set")
        plt.show()

    def plot_lift_curve_test(self):
        """Shows the lift curve on the test set."""
        skplt.metrics.plot_lift_curve(self.y_true, self.y_test_pred_proba)
        plt.title(f"{self.model_name}: Lift Curve on Test Set")
        plt.legend(loc="upper right")
        plt.show()

    def plot_cumulative_gain_curve_test(self):
        """Shows the cumulative gain curve on the test set."""
        skplt.metrics.plot_cumulative_gain(self.y_true, self.y_test_pred_proba)
        plt.title(f"{self.model_name}: Cumulative Gain Curve on Test Set")
        plt.show()

    def plot_partial_dependence_test(self, feature):
        """Plots the partial dependence profile of `feature` on the test set."""
        pdp = self.explainer.model_profile(type="partial", variables=feature)
        pdp.plot()

    def plot_accumulated_local_effects_test(self, feature):
        """Plots the accumulated local effects profile of `feature` on the test set."""
        ale = self.explainer.model_profile(type="accumulated", variables=feature)
        ale.plot()

    def plot_breakdown_test(self, observation):
        """Plots the break-down explanation of the prediction for one observation."""
        breakdown = self.explainer.predict_parts(observation, type="break_down")
        breakdown.plot()

    def plot_model_explanations_test(self):
        """Plots the variable importance computed by the explainer."""
        feature_importance = self.explainer.model_parts()
        feature_importance.plot()

    def plot_grid_search(self, log_scale_params):
        """
        Shows the grid search results as a parallel coordinates plot coloured by score.

        Parameters:
        - log_scale_params: list of parameter names (without step prefix) to plot. Values
          in int64/float64 columns are shown as log10 (0 for non-positive values); other
          columns are shown unchanged.

        Raises:
        - ValueError: if no hyperparameter grid was provided.
        """
        if self.trainer.param_grid is None:
            raise ValueError("No hyperparameter grid was provided during model training.")

        cv_results = pd.DataFrame(
            self.trainer.get_pipeline().named_steps["model_pipeline"].cv_results_
        )

        def shorten_param(param_name):
            # Keep only the part after the last "__", e.g. "param_model__C" -> "C".
            if "__" in param_name:
                return param_name.rsplit("__", 1)[1]
            return param_name

        cv_results = cv_results.rename(shorten_param, axis=1)

        params = {}
        for param in log_scale_params:
            if cv_results[param].dtype in ["int64", "float64"]:
                params[param] = lambda x: np.log10(x) if x > 0 else 0
            else:
                params[param] = lambda x: x

        fig = px.parallel_coordinates(
            cv_results.apply(
                {
                    **params,
                    'mean_test_score': lambda x: x
                }
            ),
            color="mean_test_score",
            color_continuous_scale=px.colors.sequential.Viridis,
        )
        fig.show()

    def visualize_explanations_test(self, feature_columns=None):
        """
        Runs all explanation plots in one go.

        Parameters:
        - feature_columns: feature(s) for the partial dependence and accumulated local
          effects plots; defaults to the first feature column of the data module.
        """
        self.plot_model_explanations_test()

        if not feature_columns:
            feature_columns = self.trainer.data_module.feature_columns[0]

        self.plot_partial_dependence_test(feature_columns)
        self.plot_accumulated_local_effects_test(feature_columns)

        # Explain the prediction for the first test observation.
        observation = self.trainer.data_module.X_test.iloc[0]
        self.plot_breakdown_test(observation)

        plt.show()

## Baseline Model: Logistic Regression

We will start by training a baseline logistic regression model using a subset of features. The features include the client's age, region, and aggregated balance and volume information from the transactional data. The goal is to establish a baseline performance level that we can compare against more complex models.

The logistic regression model is a simple yet effective model for binary classification tasks. It provides interpretable results and can serve as a good starting point for more complex models.

In [None]:
# Two client attributes plus every monthly ("M_" in the name) balance and volume column.
baseline_feature_columns = ["age", "client_region"] + [
    col
    for col in golden_record_df.columns
    if "M_" in col and ("_balance" in col or "_volume" in col)
]

# Fresh stratified split restricted to the baseline features.
baseline_data_module = create_data_module(golden_record_df, baseline_feature_columns)

print(f"Number of baseline feature columns: {len(baseline_feature_columns)}")
print(f"Baseline feature columns: {baseline_feature_columns}")

Filtering the relevant columns for the baseline model, we have 26 columns.

In [None]:
from sklearn.linear_model import LogisticRegression

# Build the trainer for the baseline logistic regression, then run fit() and eval_train().
baseline_trainer = Trainer(
    baseline_data_module,
    LogisticRegression(max_iter=10000, random_state=SEED),
)
baseline_trainer = baseline_trainer.fit().eval_train()

baseline_visualizer = Visualizer(baseline_trainer, "Baseline Logistic Regression")
baseline_visualizer.plot_validation_metrics()

The baseline model hits quite high scores in all metrics showing good robustness across folds. 

The confusion matrix shows that the model is performing well on the test set with a high number of true positives and true negatives. There is an equal number of false positives and false negatives.

In [None]:
baseline_visualizer.plot_roc_curve_eval(show_folds=True)

The ROC curve shows that the model has a high true positive rate across different thresholds. The AUC score is also quite high, indicating good performance.

## Adding more features

In order to possibly improve the model performance, we will include more features in the training data. We will include all features except for the ones that are not relevant for the model training.

After merging the transactional and non-transactional data, we have many columns that are unnecessary for model training. We will remove all columns containing card-related information, except for the `has_card` column. This decision stems from the fact that 50% of our dataset consists of cardholders and the other 50% consists of non-cardholders, which we matched with the cardholders. Therefore, the data in the non-target card-related columns come from the actual cardholders.

Additionally we will remove all columns that contain time-dependent information, such as dates and IDs, as they are not relevant for the model.

In [None]:
# Column pruning in four passes (card-related, dates, identifiers, std aggregates).
# `golden_record_df` is reassigned after every pass and `num_cols_before` is refreshed
# so that each message reports only the columns removed by that pass.
# NOTE: re-running this cell on the already pruned frame raises a KeyError in `drop`,
# because the listed columns no longer exist.
num_cols_before = len(golden_record_df.columns)
print(f"Number of columns before filtering: {num_cols_before}")

# Keep every column whose name does not contain "card", plus any column whose name
# contains "has_card" (`~` binds tighter than `|`).
golden_record_df = golden_record_df.loc[
    :,
    ~golden_record_df.columns.str.contains("card")
    | golden_record_df.columns.str.contains("has_card"),
]
print(
    f"Removed {num_cols_before - len(golden_record_df.columns)} card-related columns. Now {len(golden_record_df.columns)} columns remain."
)

# Date columns.
num_cols_before = len(golden_record_df.columns)
golden_record_df = golden_record_df.drop(
    columns=["loan_granted_date", "birth_date", "account_created"]
)
print(
    f"Removed {num_cols_before - len(golden_record_df.columns)} time-dependent columns. Now {len(golden_record_df.columns)} columns remain."
)

# Identifier columns; the two district-name columns are dropped in the same pass.
# `client_id` is kept here and excluded from the feature set further below.
num_cols_before = len(golden_record_df.columns)
golden_record_df = golden_record_df.drop(
    columns=[
        "loan_account_id",
        "loan_loan_id",
        "order_account_id",
        "client_district_name",
        "disp_id",
        "account_id",
        "account_district_name",
    ]
)
print(
    f"Removed {num_cols_before - len(golden_record_df.columns)} ID columns. Now {len(golden_record_df.columns)} columns remain."
)

# Substring match: every column whose name contains "std" anywhere is dropped.
num_cols_before = len(golden_record_df.columns)
golden_record_df = golden_record_df.drop(
    columns=[col for col in golden_record_df.columns if "std" in col]
)
print(
    f"Removed {num_cols_before - len(golden_record_df.columns)} std columns. Now {len(golden_record_df.columns)} columns remain."
)

# `client_id` and `has_card` stay in the frame but are not passed as feature columns.
cols_to_exclude_in_train = ["client_id", "has_card"]
all_cols_data_module = create_data_module(
    golden_record_df, golden_record_df.drop(columns=cols_to_exclude_in_train).columns
)

print(f"Number of all feature columns: {len(all_cols_data_module.feature_columns)}")
# Remove the scratch counter from the notebook namespace.
del num_cols_before

In total we remove 30 columns from the dataset. The remaining columns are used for training the models.

## Candidate Models

We will now train multiple candidate models using the expanded feature set and evaluate their performance. The candidate models include:

- Logistic Regression
- Random Forest
- Decision Tree
- Gradient Boosting

We will train each model using the same cross-validation strategy and evaluation metrics to ensure a fair comparison. After training the models, we will evaluate their performance across folds. 

### Logistic Regression

As a direct extension of the baseline model, we will train a logistic regression model using the expanded feature set. The hypothesis is that the additional features will improve the model's performance by capturing more complex relationships but also potentially introduce noise and reduce generalization.

In [None]:
# Same estimator settings as the baseline, this time on the data module with all features.
log_reg_trainer = Trainer(
    all_cols_data_module,
    LogisticRegression(max_iter=10000, random_state=SEED),
)
log_reg_trainer = log_reg_trainer.fit().eval_train()

log_reg_visualizer = Visualizer(log_reg_trainer, "Logistic Regression")
log_reg_visualizer.plot_validation_metrics()

As hypothesized, the logistic regression model with the expanded feature set performs worse than the baseline model. This indicates that the additional features might have introduced noise or overfitting. The model's performance is still quite good, but it is slightly worse than the baseline model. We still have a fairly high AUC score as in the baseline model.

In [None]:
log_reg_visualizer.plot_roc_curve_eval(show_folds=True)

Looking closer at the ROC curves for each fold, we can see some expected variance in the performance across different folds. 

### Random Forest

Next, we will train a Random Forest model to see if it can capture more complex relationships in the data and outperform the logistic regression model.

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest with default hyperparameters (only the seed is fixed), on all features.
rf_trainer = Trainer(all_cols_data_module, RandomForestClassifier(random_state=SEED))
rf_trainer = rf_trainer.fit().eval_train()

rf_visualizer = Visualizer(rf_trainer, "Random Forest")
rf_visualizer.plot_validation_metrics()

In [None]:
rf_visualizer.plot_roc_curve_eval(show_folds=True)

### Decision Tree

We will also train a Decision Tree model to see how it performs compared to the other models. Decision Trees are known for their interpretability and simplicity.

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Decision Tree with default hyperparameters (only the seed is fixed), on all features.
decision_tree_trainer = Trainer(
    all_cols_data_module, DecisionTreeClassifier(random_state=SEED)
)
decision_tree_trainer = decision_tree_trainer.fit().eval_train()

decision_tree_visualizer = Visualizer(decision_tree_trainer, "Decision Tree")
decision_tree_visualizer.plot_validation_metrics()

In [None]:
decision_tree_visualizer.plot_roc_curve_eval(show_folds=True)

### Gradient Boosting

Finally, we will train a Gradient Boosting model to see if it can outperform the other models. Gradient Boosting models are known for their high accuracy and ability to capture complex relationships in the data.

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient Boosting with default hyperparameters (only the seed is fixed), on all features.
gradient_boost_trainer = Trainer(
    all_cols_data_module, GradientBoostingClassifier(random_state=SEED)
)
gradient_boost_trainer = gradient_boost_trainer.fit().eval_train()

gradient_boost_visualizer = Visualizer(gradient_boost_trainer, "Gradient Boosting")
gradient_boost_visualizer.plot_validation_metrics()

In [None]:
gradient_boost_visualizer.plot_roc_curve_eval(show_folds=True)

# Model Comparison & Selection

We have trained and evaluated multiple candidate models using the expanded feature set. We will now compare the models' performance across the  evaluation metrics and select the best-performing model for further analysis. The evaluation metrics include accuracy, F1 score, AUC-ROC, precision, and recall.

In [None]:
# Trainers and visualizers of all candidates in one shared order (baseline first,
# Gradient Boosting last). The per-model customer lists are built from
# `candidate_trainers` and matched to `candidate_labels` purely by position,
# so keep all three sequences in the same order.
candidate_trainers = [
    baseline_trainer,
    log_reg_trainer,
    rf_trainer,
    decision_tree_trainer,
    gradient_boost_trainer,
]
candidate_visualizers = [
    baseline_visualizer,
    log_reg_visualizer,
    rf_visualizer,
    decision_tree_visualizer,
    gradient_boost_visualizer,
]

In [None]:
Visualizer.compare_evaluation_metrics(candidate_visualizers)

The comparison of evaluation metrics across the candidate models shows that the Random Forest model has one of the highest mean scores along with the Baseline and Gradient Boosting models. The Decision Tree model as well as the Logistic Regression model have lower mean scores across the metrics.

Especially the high recall of the Random Forest model is promising, as it indicates that the model can effectively classify positive samples (clients who have a card) without missing many of them.

In [None]:
Visualizer.compare_roc_curves(candidate_visualizers, dataset="eval")

All ROC curves lie above the diagonal, which is a good sign and does not point to any problems with the models. The ROC curve of the Decision Tree model looks almost linear because a fully grown tree makes essentially hard predictions: its leaves are (nearly) pure, so the predicted probabilities are almost always exactly 0 or 1. With so few distinct thresholds, the ROC curve degenerates into a few straight segments, i.e. a piecewise linear curve.


The curves of the baseline, Gradient Boosting, and Random Forest models are very similar, indicating that they perform similarly across different thresholds. The Logistic Regression model has a slightly lower curve, indicating that it performs worse than the other models.

The AUC scores of the models are also quite high, with the Random Forest model having one of the highest scores.

## Top-N Customer Lists

We will now use the trained models to generate a list of the top N% customers who are most likely to get a card (according to the model). Therefore we will look at the customers who are most likely to get a card but don't have one yet. This list can be used by the marketing team to target potential customers who are likely to get a card.

In [None]:
def create_top_n_customers_list(model, data):
    """
    Rank all clients without a card by the model's predicted probability of having one.

    Despite the name, no top-N cut is applied here: every row with `has_card == 0` is scored
    and returned. Callers such as `compare_top_n_lists` apply the actual cut-off.

    Parameters:
    - model: fitted estimator or pipeline exposing `predict_proba`. It receives a copy of the
      filtered frame, including the `client_id` and `has_card` columns.
    - data: pandas.DataFrame containing at least the columns `client_id` and `has_card`.

    Returns:
    - pandas.DataFrame with the columns `Client ID` and `Probability`, sorted by descending
      probability and re-indexed from 0. Clients with equal probability keep their order from `data`.

    Raises:
    - ValueError: if the model has no `predict_proba` or a mandatory column is missing.
    """
    mandatory_columns = ["client_id", "has_card"]

    if not hasattr(model, "predict_proba"):
        raise ValueError("Model does not support probability predictions")

    if not all(col in data.columns for col in mandatory_columns):
        raise ValueError("Mandatory columns not found in data: 'client_id', 'has_card'")

    candidates = data[data["has_card"] == 0]

    if candidates.empty:
        # Nothing to score; return an empty ranking with the usual columns instead of
        # handing a zero-row frame to the model (scikit-learn estimators generally reject it).
        return pd.DataFrame(
            {
                "Client ID": candidates["client_id"].reset_index(drop=True),
                "Probability": pd.Series(dtype="float64"),
            }
        )

    # Column 1 of `predict_proba` is taken as the probability of class 1 (has a card), i.e.
    # the clients who, according to the model, should most likely have a card but do not.
    # NOTE(review): this assumes the positive class is the second entry of the model's classes.
    probabilities = model.predict_proba(candidates.copy())[:, 1]

    results = pd.DataFrame(
        {"Client ID": candidates["client_id"], "Probability": probabilities}
    )

    # A stable sort keeps tied probabilities (common for tree models) in input order, so
    # the ranking, and any top-N cut taken from it, is reproducible.
    return results.sort_values(
        by="Probability", ascending=False, kind="mergesort"
    ).reset_index(drop=True)


def compare_top_n_lists(*lists, labels, top_n_percent):
    """
    Compute the pairwise overlap between the top-N% entries of several ranked customer lists.

    Parameters:
    - lists: pandas.DataFrames with the columns `Client ID` and `Probability`, all of the same
      length and each sorted by descending probability.
    - labels: one label per list; used as both index and columns of the result.
    - top_n_percent: fraction in (0, 1] of each list to compare, e.g. 0.1 for the top 10%.

    Returns:
    - pandas.DataFrame of shape (len(labels), len(labels)). Each cell is the number of client
      IDs shared by the two top-N lists divided by N, i.e. a fraction between 0 and 1.

    Raises:
    - ValueError: if lists and labels do not match in number, the lists differ in length, a list
      is not sorted by descending probability, `top_n_percent` is outside (0, 1], or the
      requested fraction selects no rows.
    """
    if len(lists) != len(labels):
        raise ValueError("Each list must have a corresponding label")

    if len({len(ranking) for ranking in lists}) != 1:
        raise ValueError("All lists must have the same length")

    for ranking in lists:
        if not ranking["Probability"].is_monotonic_decreasing:
            raise ValueError("Lists must be sorted in descending order of probability")

    # A non-positive fraction would make `head` select nothing (or drop rows from the end).
    if not 0 < top_n_percent <= 1:
        raise ValueError("top_n_percent must be a fraction in the interval (0, 1]")

    top_n = int(len(lists[0]) * top_n_percent)
    if top_n == 0:
        # Without this guard the normalization below would divide 0 by 0 and return NaNs.
        raise ValueError("top_n_percent selects no customers from lists of this length")

    # Build each top-N ID set once instead of once per matrix cell.
    top_sets = [set(ranking["Client ID"].head(top_n)) for ranking in lists]

    return pd.DataFrame(
        [[len(first & second) / top_n for second in top_sets] for first in top_sets],
        index=labels,
        columns=labels,
    )


def visualize_overlap_matrix(overlap_matrix, title):
    """
    Draw the part of an overlap matrix above the diagonal as an annotated heatmap.

    Parameters:
    - overlap_matrix: pandas.DataFrame with identical row and column labels, e.g. the result
      of `compare_top_n_lists` (overlap expressed as a fraction of the top-N list).
    - title: string, the title of the plot.
    """
    plt.figure(figsize=(10, 8))

    # Hide the diagonal and the lower triangle: for a symmetric overlap matrix they only
    # repeat the information shown above the diagonal.
    hidden_cells = np.tril(np.ones_like(overlap_matrix, dtype=bool))
    overlap_matrix = overlap_matrix.mask(hidden_cells)

    sns.heatmap(
        overlap_matrix,
        annot=True,
        cmap="Blues",
        # The plotted values are fractions of the top-N list, not percentages.
        cbar_kws={"label": "Common Customers [share of top-N list]"},
    )
    plt.title(title)
    plt.ylabel("List from Model/Method")
    plt.xlabel("List from Model/Method")
    # Centre the tick labels on the heatmap cells.
    plt.xticks(
        ticks=np.arange(len(overlap_matrix.columns)) + 0.5,
        labels=overlap_matrix.columns,
        rotation=45,
        ha="right",
    )
    plt.yticks(
        ticks=np.arange(len(overlap_matrix.index)) + 0.5,
        labels=overlap_matrix.index,
        rotation=0,
    )
    plt.show()

### Top-10% Customer Selection

We will select the top 10% of customers who are most likely to get a card according to each model.

In [None]:
# Display names, in the same order as `candidate_trainers`.
candidate_labels = [
    "Baseline",
    "Logistic Regression",
    "Random Forest",
    "Decision Tree",
    "Gradient Boosting",
]

# One ranked list of clients without a card per candidate model.
customer_lists = [
    create_top_n_customers_list(candidate.get_pipeline(), golden_record_df)
    for candidate in candidate_trainers
]

top_10_overlap_matrix = compare_top_n_lists(
    *customer_lists,
    labels=candidate_labels,
    top_n_percent=0.1,
)
visualize_overlap_matrix(
    top_10_overlap_matrix,
    "Overlap of Top-10% Customer Lists by Model",
)

Looking at the overlap matrix of the top 10% customer lists, we can see that the Random Forest model has a high overlap with the other tree-based models (Decision Tree and Gradient Boosting). The Baseline model overlaps with the Logistic Regression model by only about 50%: as both are linear models they still share part of their top-ranked customers, but far from all of them.

### Top-5% Customer Selection

We will select the top 5% of customers who are most likely to get a card according to each model.

In [None]:
# Same comparison as for the top 10%, reusing the ranked lists and cutting at 5% instead.
top_5_overlap_matrix = compare_top_n_lists(
    *customer_lists, labels=candidate_labels, top_n_percent=0.05
)
visualize_overlap_matrix(
    top_5_overlap_matrix, "Overlap of Top-5% Customer Lists by Model"
)

Looking at the overlap matrix of the top 5% customer lists, we can see that the overlap between the tree-based models is even higher. Especially the overlap between the Decision Tree and Gradient Boosting models got higher. The Logistic Regression model still has a lower overlap with the other models but gained some overlap with the Gradient Boosting model.

## Selected Model: Random Forest

After evaluating the candidate models, we select the Random Forest model for further analysis. The Random Forest model has shown one of the highest mean scores across the evaluation metrics and has a high recall, indicating that it can effectively classify positive samples without missing many of them. The model also has a high AUC score, indicating good performance across different thresholds.

Having selected the Random Forest model, we will now further optimize its hyperparameters using grid search with cross-validation to improve its performance.

In [None]:
best_model_trainer = rf_trainer
best_model_visualizer = rf_visualizer

# Model Optimization

We will perform a GridSearch on a param grid with reasonable values for the Random Forest model to find the best hyperparameters. The GridSearch will be performed using cross-validation with the same settings as the training of the candidate model. The best model will be selected based on the mean AUC score across folds.

The hyperparameters we will tune are:

- `n_estimators`: The number of trees in the forest.
- `max_depth`: The maximum depth of the tree.
- `min_samples_split`: The minimum number of samples required to split an internal node.
- `min_samples_leaf`: The minimum number of samples required to be at a leaf node.

We will use the same data and feature set as before to ensure a fair comparison.

As this process can be computationally expensive, we will cache the trained model to avoid retraining it multiple times.

In [None]:
# Search space: 3 values for each of 4 hyperparameters, i.e. 81 combinations.
# The `model__` prefix presumably addresses a pipeline step named "model" - confirm in Trainer.
gs_param_grid = {
    "model__n_estimators": [50, 100, 200],
    "model__max_depth": [5, 10, 20],
    "model__min_samples_split": [2, 5, 10],
    "model__min_samples_leaf": [1, 2, 4]
}

# Reuse a previously stored trainer if the cache file exists; otherwise train and store it.
# NOTE: the cache is identified by its file path only. After changing `gs_param_grid`,
# the data or the feature set, delete the file, otherwise the stale trainer is loaded.
try:
    gs_trainer = joblib.load(gs_cache_file)
    print("Loaded cached model")
except FileNotFoundError:
    print("No cached model found, proceeding with training...")
    gs_trainer = Trainer(
        all_cols_data_module,
        RandomForestClassifier(random_state=SEED),
        param_grid=gs_param_grid,
        verbose=False
    ).fit().eval_train()
    
    joblib.dump(gs_trainer, gs_cache_file)  

print("Best Parameters:", gs_trainer.get_best_params())

The best hyperparameters found by the GridSearch are:

- `n_estimators`: 200
- `max_depth`: 10
- `min_samples_split`: 2
- `min_samples_leaf`: 4

This indicates that the Random Forest model profits from a higher number of trees in the forest (200 is the largest value in the grid), while a mid-range maximum depth (10) is sufficient. The minimum number of samples required to split an internal node is at the lowest value of the grid (2), whereas the minimum number of samples required at a leaf node is at the highest value of the grid (4), i.e. the model prefers slightly larger leaves.

In [None]:
gs_visualizer = Visualizer(gs_trainer, "Random Forest Grid Search")
gs_visualizer.plot_grid_search(log_scale_params=["n_estimators", "max_depth", "min_samples_split", "min_samples_leaf"])

Looking at the parallel coordinates plot of the GridSearch results, we can see that the model's performance increases with the number of estimators and the maximum depth of the trees. Overall, the impact of the `min_samples_split` and `min_samples_leaf` hyperparameters is less pronounced. The best model is found at the highest value of `n_estimators` and a mid-range value of `max_depth`. In the end, the gains from the optimization are minimal, with the mean test score ranging only between 0.81 and 0.835.

In [None]:
gs_visualizer.plot_validation_metrics()

In [None]:
Visualizer.compare_evaluation_metrics([best_model_visualizer, gs_visualizer])

The Random Forest model after GridSearch optimization shows a slight improvement in the mean scores across the evaluation metrics. Especially the recall profited from the hyperparameter optimization, getting both higher in its mean and lower in its variance.

In [None]:
gs_visualizer.plot_roc_curve_eval(show_folds=True)

In [None]:
Visualizer.compare_roc_curves([best_model_visualizer, gs_visualizer], dataset="eval")

The ROC curves of the Random Forest model before and after hyperparameter optimization are moving closely towards the upper left corner, indicating good performance across different thresholds. However the mean AUC score did not change after the optimization with the mean curves of both models being almost identical and the difference negligible.

As we now optimised the model we will evaluate it on the test set to get a final performance estimate.

In [None]:
gs_trainer.eval_test()
gs_visualizer.plot_test_metrics()

The model performs well on the test set, with high scores across the evaluation metrics. The AUC score is high, indicating good performance across different thresholds.

In [None]:
_, _ = (
    gs_visualizer.plot_confusion_matrix_test(),
    gs_visualizer.plot_classification_report_test()
)

The confusion matrix shows that the model is performing well on the test set with a high number of true positives and true negatives. However, there is an unequal number of false positives and false negatives: The false positives are higher than the false negatives. This hints that the model may be slightly biased towards predicting positive samples (clients who have a card).

In [None]:
gs_visualizer.plot_lift_curve_test()

Looking at the lift curve, we can see some interesting patterns. For the true class there is a sharper drop at around 0.25 of the samples, which then recovers again before continuing with the expected decline. This indicates that there appears to be a problem with certain classifications among the highest predicted probabilities (roughly the top 0.20 of the samples). For the false class, everything looks fine, with a high lift in the beginning and a steady, smooth decline.

In [None]:
best_model_visualizer.plot_roc_curve_test()

In [None]:
_, _ = (
    best_model_visualizer.plot_confusion_matrix_test(),
    best_model_visualizer.plot_classification_report_test(),
)

Looking at the confusion matrix and classification report of the Random Forest model, we can see that the model performs well on the test set. The confusion matrix shows a high number of true positives and true negatives, with a low number of false positives and false negatives. There is a slight imbalance between the false positives and false negatives, with the false positives being higher than the false negatives. This can also be seen in the recall for the False class being lower at 0.79.

# Model Explanation & Reduction

In [None]:
# Reuse a previously stored reduced trainer if the cache file exists; otherwise train and store it.
# NOTE: the cache is identified by its file path only; delete the file to force retraining.
try:
    reduced_best_model_trainer = joblib.load(reduced_model_cache_file)
    print("Loaded cached reduced best model")
except FileNotFoundError:
    print("No cached reduced best model found, proceeding with training...")
    # Refit a Random Forest with the grid-search winner's hyperparameters, this time with
    # the trainer's feature selection enabled.
    # NOTE(review): the grid keys carry a `model__` prefix; unpacking them into
    # RandomForestClassifier only works if get_best_params() returns un-prefixed names - confirm.
    reduced_best_model_trainer = (
        Trainer(
            all_cols_data_module,
            RandomForestClassifier(**gs_trainer.get_best_params(), random_state=SEED),
            select_features=True,
        )
        .fit()
        .eval_train()
    )
    joblib.dump(reduced_best_model_trainer, reduced_model_cache_file)

selected_features = reduced_best_model_trainer.get_selected_features()
print("Selected Features:", selected_features)
print("Number of selected features:", len(selected_features))

reduced_best_model_visualizer = Visualizer(reduced_best_model_trainer, "Reduced Random Forest")

Looking at the selected features, we see some confirmation of our previous analysis. The selection contains every base balance feature (M_2 to M_12). This makes sense, as our analysis showed that, across the board, card holders in the golden record appear to have higher balances than non-card holders.

In [None]:
Visualizer.compare_evaluation_metrics(
    [best_model_visualizer, gs_visualizer, reduced_best_model_visualizer]
)

The comparison shows that the reduced Random Forest model performs in some cases slightly worse than the original Random Forest model. Across all metrics we see a negligible decrease of about 1% in the mean scores. The variance of the scores is also slightly higher, indicating that the model may be less robust. This is a trade-off we have to make when reducing the number of features. Given the gained explainability and interpretability of the model, this trade-off is worth it.

In [None]:
reduced_best_model_visualizer.plot_confusion_matrix_test()

The confusion matrix is almost identical with the exception of one fewer false positive. Therefore the reduced model performs better than the original model in this regard.

In [None]:
reduced_best_model_visualizer.plot_model_explanations_test()

Here we can see the feature importances of the reduced model. The most important feature is the `M_2_balance` feature, which is the base balance of the client. This confirms our previous analysis that the base balance is a strong indicator of whether a client has a card or not. All balance features appear among the important features of the model.

## Lift Curve on reduced model vs grid search model

In [None]:
reduced_best_model_visualizer.plot_lift_curve_test()
gs_visualizer.plot_lift_curve_test()

When comparing the lift curves of the grid search model and the reduced model, we can also see very little difference. For the reduced model the drop appears slightly earlier, at around 0.18 of the samples, but the overall shape of the curve is very similar.

## Top-N Customer List

We will generate a list of the top 10% and top 5% customers who are most likely to get a card according to the reduced Random Forest model.

In [None]:
# The three Random Forest variants and their display names, in matching order;
# the reduced model comes last.
rf_models = [
    rf_trainer,
    gs_trainer,
    reduced_best_model_trainer,
]
rf_labels = [
    "Random Forest",
    "Grid Search Random Forest",
    "Reduced Random Forest",
]

# One ranked list of clients without a card per Random Forest variant.
rf_customer_lists = [
    create_top_n_customers_list(rf_variant.get_pipeline(), golden_record_df)
    for rf_variant in rf_models
]

top_10_overlap_matrix_rf = compare_top_n_lists(
    *rf_customer_lists,
    labels=rf_labels,
    top_n_percent=0.1,
)
visualize_overlap_matrix(
    top_10_overlap_matrix_rf,
    "Overlap of Top-10% Customer Lists by Model",
)

In [None]:
# Same comparison across the Random Forest variants, cutting the ranked lists at 5%.
top_5_overlap_matrix_rf = compare_top_n_lists(
    *rf_customer_lists, labels=rf_labels, top_n_percent=0.05
)

visualize_overlap_matrix(
    top_5_overlap_matrix_rf, "Overlap of Top-5% Customer Lists by Model"
)

Looking at the top 10% and top 5% customer lists again across all Random Forest models we can see that the overlap is partially very high. The reduced and the grid search model have a rather high overlap with 0.78 and 0.62 respectively for top 10% and top 5% of customers. While the reduced and the normal Random Forest model share less overlap at 0.66 for top 10% and 0.56 for top 5% of customers. We do however see that the grid search and normal Random Forest share a similar overlap to the grid search and reduced model.

## Breakdown of selected model on top and bottom prediction.

To better understand the model's predictions, we will visualize the breakdown of the top and bottom predictions from the reduced Random Forest model. 

In [None]:
# Ranked list of the reduced Random Forest: it is the last entry of `rf_customer_lists`.
# (`customer_lists[-1]`, used here before, is the Gradient Boosting list of the candidate
# comparison, so the breakdown and the final top-5 table referred to the wrong model.)
customer_lists_reduced_rf = rf_customer_lists[-1]
print(customer_lists_reduced_rf.head())

# The list is sorted by descending probability, so the first row is the top candidate.
top_prob_client = customer_lists_reduced_rf["Client ID"].iloc[0]

top_client = golden_record_df[golden_record_df["client_id"] == top_prob_client]
# Values that cannot be parsed as numbers become NaN.
top_client = top_client.apply(pd.to_numeric, errors="coerce")

# drop has card and label required for explainer to work properly
top_client = top_client.drop(columns=["has_card", "client_id"])

reduced_best_model_visualizer.plot_breakdown_test(top_client)

Looking at a specific prediction we see how the features contribute to the prediction. The base balance is the most important feature, followed by the volume of the client. The other features have a much smaller impact on the prediction.

In [None]:
# Counterpart to the top-client breakdown: explain the prediction for the client in the
# last row of the ranked list, i.e. the one with the lowest predicted probability.
bottom_prob_client = customer_lists_reduced_rf.iloc[-1]["Client ID"]
print(customer_lists_reduced_rf.tail())

# Select the client's record, coerce every value to numeric (unparseable values become
# NaN) and remove the target and the identifier before handing the row to the explainer.
bottom_client = (
    golden_record_df.loc[golden_record_df["client_id"] == bottom_prob_client]
    .apply(pd.to_numeric, errors="coerce")
    .drop(columns=["has_card", "client_id"])
)

reduced_best_model_visualizer.plot_breakdown_test(bottom_client)

Looking at the client with the lowest predicted probability, we see how the features contribute to the prediction.

# Conclusion

In this project, we have analyzed a dataset containing information about clients and their transactions to predict which clients are most likely to get a card. We looked at multiple different models and evaluated their performance using various evaluation metrics. 

## To Non-Technical Stakeholders

Across the board, high balances appear to be important for predicting whether a client has a card or not. Generally speaking, a high balance over the past 12 (+ 1 lag) months is a strong indicator of a potential card buyer. The analysis also showed that card holders have higher median balances across all months.

In [None]:
reduced_best_model_visualizer.plot_lift_curve_test()

Looking at the lift curve, we can see that our model outperforms a random model by a big margin. It is especially good at identifying non-buyers, reducing the risk of spending effort and time on clients that are unlikely to buy a card.


Here are the top 5 client IDs according to our model. These clients currently do not have a card but are prime candidates for getting one:

In [None]:
customer_lists_reduced_rf.head()

We hope that this helps you in the decision making process and that you can use this information to target potential customers who are likely to get a card.

Good luck!

Dominik & Noah
