In [1]:
import os
import numpy as np
import pandas as pd
import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import scipy.cluster.hierarchy as spc

# Make "plotly_white" the default template for Plotly figures created below.
pio.templates.default = "plotly_white"

In [2]:
# Run-once guard: `first_run` is only defined after this cell has executed, so
# re-running the cell does not climb one more directory level.
try:
    _ = first_run
except NameError:
    first_run = True
    # Move to the parent of the current working directory. `os.path.dirname`
    # handles the platform's path separator (and the filesystem root), unlike
    # splitting on a hard-coded "/".
    os.chdir(os.path.dirname(os.getcwd()))
    # Imported after the chdir — presumably so that the `_aux` package is
    # resolved from the new working directory; confirm against the repo layout.
    from _aux import functions as func

# Load data

In [3]:
# Relative paths; they are resolved against the current working directory.
X_TRAIN_PATH = "../data/train/X_train.csv"
Y_TRAIN_PATH = "../data/train/y_train.csv"

# The target file is parsed once and joined (on the index) wherever needed,
# instead of being re-read for every frame built below.
targets = pd.read_csv(Y_TRAIN_PATH, index_col=0)

# Merchant columns only, labelled with the target; parsed once and then split
# into defaulting / non-defaulting rows.
merchant_labeled = pd.read_csv(
    X_TRAIN_PATH,
    index_col=0,
    usecols=[
        "row_id",
        "merchant_category",
        "merchant_group",
    ],
).join(targets)

default = merchant_labeled.query("default == 1")
not_default = merchant_labeled.query("default == 0")

# Full feature set joined with the target.
df = pd.read_csv(
    X_TRAIN_PATH,
    index_col=0,
).join(targets)

## Overview

"merchant_category" and "merchant_group" are highly correlated with each other. We must explore whether they can help us predict defaults, and the first step is to see how they relate:

In [4]:
def highlight_above_mean(s):
    """
    Build per-cell CSS that highlights, in yellow, every value of a Series
    that is strictly greater than the Series mean.

    Parameters
    ----------
    s : pandas.Series
        Values to compare against their own mean.

    Returns
    -------
    list of str
        One entry per element of ``s``: ``"background-color: yellow"`` where
        the value is above the mean, and an empty string otherwise.
    """
    above_mean = s.gt(s.mean())
    styles = []
    for flagged in above_mean:
        styles.append("background-color: yellow" if flagged else "")
    return styles


# Statistics for every (group, category) pair. "Contamination" is the share of
# rows with default == 1. `func.complement` is a project helper — presumably
# it counts the non-default rows; confirm in `_aux.functions`.
category_stats = df.groupby(["merchant_group", "merchant_category"]).agg(
    customers=("uuid", "nunique"),
    default=("default", "sum"),
    not_default=("default", func.complement),
    cat_contamination=("default", lambda flags: flags.sum() / len(flags)),
)

# Contamination of each merchant group as a whole; joined below so that every
# category row also carries its group's rate.
group_stats = df.groupby("merchant_group").agg(
    group_contamination=("default", lambda flags: flags.sum() / len(flags))
)

contamination_columns = ["cat_contamination", "group_contamination"]
display_formats = {
    "default": "{:.0f}",
    "not_default": "{:.0f}",
    "cat_contamination": "{:.2%}",
    "group_contamination": "{:.2%}",
}

# Highlighting is applied column by column: a cell turns yellow when it is
# above the mean of its own column.
(
    category_stats.join(group_stats, on="merchant_group")
    .style.apply(highlight_above_mean, subset=contamination_columns)
    .format(display_formats)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,customers,default,not_default,cat_contamination,group_contamination
merchant_group,merchant_category,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Automotive Products,Automotive Parts & Accessories,594,7,587,1.18%,1.63%
Automotive Products,Wheels & Tires,82,4,78,4.88%,1.63%
Children Products,Children Clothes & Nurturing products,1082,15,1067,1.39%,1.24%
Children Products,Children toys,460,3,457,0.65%,1.24%
Children Products,Diversified children products,2171,28,2143,1.29%,1.24%
Clothing & Shoes,Adult Shoes & Clothing,224,5,219,2.23%,2.31%
Clothing & Shoes,General Shoes & Clothing,3359,56,3303,1.67%,2.31%
Clothing & Shoes,Underwear,61,2,59,3.28%,2.31%
Clothing & Shoes,Youthful Shoes & Clothing,8335,214,8121,2.57%,2.31%
Electronics,Car electronics,33,2,31,6.06%,2.38%


In the table above, we highlight groups and categories whose contamination (share of defaults) is higher than average. We can see that 12 categories go above average, 3 of which show a nearly 10-fold increase. Likewise, 3 groups are flagged as above average, but the variance within groups is considerably high, so it is not clear that groups would be a good choice.

On the other hand, with 56 distinct values, "merchant_category" is also ineligible as a feature in its raw form. It seems that the best approach is to blacklist a set of categories and create a boolean variable from it. Before doing so, we present an analytical overview of "merchant_category":

In [5]:
# Per-category share of defaults / non-defaults plus the number of
# observations, ordered from the most to the least frequent category.
plot_data = (
    df.groupby("merchant_category")
    .agg(
        default=("default", lambda flags: flags.sum() / len(flags)),
        not_default=(
            "default",
            lambda flags: (len(flags) - flags.sum()) / len(flags),
        ),
        counts=("default", "count"),
    )
    .sort_values("counts", ascending=False)
)

categories = plot_data.index

fig = make_subplots(specs=[[{"secondary_y": True}]])

# Stacked shares on the primary (left) axis; "not_default" is added first so
# it sits at the bottom of each stack.
for share in ("not_default", "default"):
    fig.add_trace(
        go.Bar(x=categories, y=plot_data[share], name=share),
        secondary_y=False,
    )

# Only the bars exist at this point, so only they are made translucent.
fig.update_traces(opacity=0.4)

# Number of observations per category on the secondary (right) axis.
observations_trace = go.Scatter(
    x=categories,
    y=plot_data["counts"],
    name="observations",
    mode="lines+markers",
    line_color="indianred",
    line_width=4,
)
fig.add_trace(observations_trace, secondary_y=True)

layout_options = {
    "title": "Merchant Category",
    "barmode": "stack",
    "yaxis_tickformat": "%",
    "yaxis_title": "",
    "width": 1500,
    "height": 600,
}
fig.update_layout(**layout_options)

fig.update_yaxes(showgrid=False, secondary_y=False)

# The secondary axis is logarithmic, so its range is given in log10 units.
fig.update_yaxes(
    title_text="<b>Observations (log)</b>",
    type="log",
    range=[-1e-3, 4.5],
    secondary_y=True,
)

fig.show()

The plot above conveys two main ideas: the first (left y-axis) shows how defaults are distributed across categories; the second (right y-axis) shows how common each category is among our customers. A few broad observations:
- "Diversified entertainment" holds ~39% of all customers in the sample but sees only half the default rate of the whole sample
- 11 categories have at least 1K customers
- 20 categories have fewer than 100 customers
- "Sex toys" and "Plants & Flowers" see more than 10% of their customers default, and "Tobacco" almost 15%
    - these merchants combined have less than 0.2% of our customers
- In the mid-range, "Dating services" sees almost 10% of its customers default

## 1. Blacklist variable

We use the strategy of "above ~10% contamination" (over 5x the average rate) to choose categories in which defaults happen more frequently. We believe this information combined with all other features we proposed should have enough signal to do a decent job at predicting defaults. The categories we choose are:
- Tobacco
- Sex toys
- Plants & Flowers
- Dating services

In [6]:
# Categories chosen for their high share of defaults (see the list above).
blacklist = ["Tobacco", "Sex toys", "Plants & Flowers", "Dating services"]

# True for rows whose merchant category is blacklisted.
is_blacklisted = df["merchant_category"].isin(blacklist)

# Default counts and contamination for blacklisted vs. other rows.
# `func.complement` is a project helper — presumably it counts the
# non-default rows; confirm in `_aux.functions`.
df[["default"]].assign(var=is_blacklisted).groupby("var").agg(
    default=("default", "sum"),
    not_default=("default", func.complement),
    count=("default", "count"),
    contamination=("default", lambda flags: flags.sum() / len(flags)),
)

Unnamed: 0_level_0,default,not_default,count,contamination
var,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
False,971.0,70447.0,71418,0.013596
True,59.0,503.0,562,0.104982


This indeed looks much better than the 56 categories we would have to one-hot encode. The recall of this feature is far from ideal, but we are hopeful that, along with the others, it will help us predict defaults.
Feature:
- is_merchant_category_blacklisted

---

Now that we are done with exploration, we move on to putting all features together to find out whether we need to drop some or come up with new ones.