In [12]:
import multiprocessing as mp
import os
from typing import List, Tuple
import subprocess
import numpy as np
import pandas as pd
import plotly.io as pio
import plotly.graph_objects as go
from sklearn.utils import resample


# Use plotly's "seaborn" template as the default for every figure created below.
pio.templates.default = "seaborn"

In [2]:
# Run-once guard: the body changes the working directory, so re-executing this
# cell in the same kernel must not climb another directory level.
if "first_run" not in globals():
    first_run = True
    # Move to the parent of the current working directory before importing the
    # project-local helpers (presumably `_aux` lives there -- confirm).
    # os.path.dirname respects the platform's path separator and returns the
    # root itself for a top-level directory, where splitting on "/" would
    # produce an empty string and make os.chdir fail.
    os.chdir(os.path.dirname(os.getcwd()))
    import _aux.functions as func

# Load data

In [3]:
# Parse each CSV a single time: the selected feature columns joined (on the
# index) with the labels from y_train.csv.
train_labelled = pd.read_csv(
    "../data/train/X_train.csv", index_col=0, usecols=["row_id", "age", "name_in_email"]
).join(pd.read_csv("../data/train/y_train.csv", index_col=0))

# Split the joined frame by the value of the `default` label column.
default = train_labelled.query("default == 1")
not_default = train_labelled.query("default == 0")

## 1. Personal Variables
Be it due to less financial stability, more impulsive behaviour or a diminished ability to weigh consequences, common knowledge tells us that younger customers should be more likely to default on their payments than their older counterparts. However, time and time again, common knowledge has proved to be a rather flimsy ally when making decisions and predictions. Next, we test the hypothesis that "younger customers are more likely to default" against the alternative hypothesis that "younger customers are not more likely to default".


In [13]:
# Overlay the age distribution of a 1000-row sample (drawn with replacement,
# fixed seed) from each label group, using 5-year bins from 18 to 100.
fig = go.Figure()

for trace_name, group in (("default", default), ("not_default", not_default)):
    sampled_ages = group.age.sample(1000, replace=True, random_state=42)
    fig.add_trace(
        go.Histogram(
            x=sampled_ages,
            name=trace_name,
            histfunc="count",
            xbins={"start": 18, "end": 100, "size": 5},
        )
    )

fig.update_layout(title="Are youngsters more likely to default?", barmode="overlay")

# Semi-transparent bars so the overlaid histograms stay readable.
fig.update_traces(opacity=0.75)
fig.show()

Although comparing histograms for a bootstrap sample of each label suggests that youngsters are more likely to default, this is not rigorous enough to draw any conclusion. Hence, we employ a one-sided bootstrap hypothesis test.

In [50]:
# Simulate the distribution of the difference in mean age when both groups are
# drawn (with replacement) from the pooled ages of the two labels.
num_iterations = 100_000
# NOTE(review): the resample sizes are fixed constants rather than the actual
# group sizes, while observed_difference below is computed on the full groups
# -- confirm this is intended.
n_default_draws = 500
n_not_default_draws = 5000
combined = np.concatenate((default.age, not_default.age), axis=0)

# Keep only the per-iteration difference of means. Retaining every resampled
# array (num_iterations x 5000 values for one group alone) needs several
# gigabytes of memory and is never used beyond taking its mean.
diff_bootstrap_means = np.empty(num_iterations)
for i in range(num_iterations):
    # resample() draws from numpy's global random state when no random_state
    # is passed, so re-seeding per iteration makes every draw reproducible.
    np.random.seed(i)
    mean_default = resample(combined, n_samples=n_default_draws).mean()
    mean_not_default = resample(combined, n_samples=n_not_default_draws).mean()
    diff_bootstrap_means[i] = mean_default - mean_not_default

# Difference in mean age actually observed between the two label groups.
observed_difference = np.mean(default.age) - np.mean(not_default.age)

In [51]:
# One-sided p-value: share of simulated differences strictly below the observed one.
n_below_observed = np.count_nonzero(diff_bootstrap_means < observed_difference)
p_value = n_below_observed / num_iterations

In [52]:
# Histogram of the simulated differences in mean age, with the observed
# difference marked by a red vertical line annotated with the p-value.
difference_hist = go.Histogram(
    x=diff_bootstrap_means,
    name="sample_difference",
    histfunc="count",
)
fig = go.Figure(data=[difference_hist])

fig.add_vline(
    x=observed_difference,
    line_width=3,
    line_color="red",
    annotation_text=f"{p_value}",
)

fig.update_traces(opacity=0.75)
fig.show()
