In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Run-once guard: `first_run` only exists after this cell has executed once in
# the current kernel, so re-running the cell does not climb another level.
try:
    _ = first_run
except NameError:
    first_run = True
    # Move the working directory to the parent of the current one.
    # os.path.dirname honours the platform's path separator; splitting on a
    # hard-coded "/" leaves the path unchanged on Windows.
    os.chdir(os.path.dirname(os.getcwd()))

# Load Data

In [3]:
# Read the semicolon-separated dataset, then discard rows with no "default" label.
df = pd.read_csv("../data/dataset.csv", sep=";")
df = df.dropna(subset=["default"])

In [4]:
# Summarise each column's dtype and non-null count to see where values are missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 89976 entries, 0 to 89975
Data columns (total 43 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   uuid                                 89976 non-null  object 
 1   default                              89976 non-null  float64
 2   account_amount_added_12_24m          89976 non-null  int64  
 3   account_days_in_dc_12_24m            79293 non-null  float64
 4   account_days_in_rem_12_24m           79293 non-null  float64
 5   account_days_in_term_12_24m          79293 non-null  float64
 6   account_incoming_debt_vs_paid_0_24m  36619 non-null  float64
 7   account_status                       41042 non-null  float64
 8   account_worst_status_0_3m            41042 non-null  float64
 9   account_worst_status_12_24m          29921 non-null  float64
 10  account_worst_status_3_6m            38038 non-null  float64
 11  account_worst_status_6_12m  

# What does a row represent?

Looking at the variables, we can conclude that each row represents a snapshot of an account covering up to 24 months before its most recent appearance in the database. We can go one step further and suppose that such an event happens when the account tries to make a new purchase and the model must infer the probability that the account will default on the payment.

In [5]:
# Summary statistics for "time_hours" (count, mean, std, min, quartiles, max).
df["time_hours"].describe()

count    89976.000000
mean        15.336351
std          5.030674
min          0.000278
25%         11.629167
50%         15.800833
75%         19.548056
max         23.999722
Name: time_hours, dtype: float64

# Is there a timeline between snapshots?

Although the variable "time_hours" can give us a sense of order between snapshots, it is hardly a good candidate for separating past from future: it is not a full timestamp, and its values range only from 0 to 24, which suggests an hour of the day rather than a date.
Thus, for the sake of simplicity, we assume that each account behaves independently of the others and that "time_hours" is intended to be used only as a feature for our models.


# Stratified Split

Before going any further, we must set aside 20% of our dataset to be used as our test (unseen) set. Assuming that our dataset is representative of real-world data, we stratify the split on the target so that the test set keeps the same rate of contamination (the proportion of defaults) as the full dataset.

In [6]:
# Separate the target from the features. DataFrame.pop removes "default" from
# `df` in place, so a second run of this cell finds the column already gone and
# keeps the `y` extracted earlier.
if "default" in df.columns:
    y = df.pop("default")
elif "y" not in globals():
    # Nothing to fall back on: fail here with a clear message instead of letting
    # a later cell hit a NameError on `y`.
    raise KeyError(
        'Column "default" is missing from df and no target `y` exists yet; '
        "re-run the data loading cell first."
    )

In [7]:
# Keep 80% of the rows for training and hold out the rest. Stratifying on `y`
# preserves the target's class proportions in both partitions, and the fixed
# seed makes the shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    y,
    stratify=y,
    shuffle=True,
    random_state=42,
    train_size=0.8,
)

In [8]:
# Persist the held-out split. The original row index is written as a "row_id"
# column so features and labels can be matched up again after reloading.
# to_csv does not create missing folders, so make sure the target exists first.
os.makedirs("../data/test", exist_ok=True)

X_test.index.name = "row_id"
y_test.index.name = "row_id"

X_test.to_csv("../data/test/X_test.csv", index=True)
y_test.to_csv("../data/test/y_test.csv", index=True)

In [9]:
# Persist the training split. The original row index is written as a "row_id"
# column so features and labels can be matched up again after reloading.
# to_csv does not create missing folders, so make sure the target exists first.
os.makedirs("../data/train", exist_ok=True)

X_train.index.name = "row_id"
y_train.index.name = "row_id"

X_train.to_csv("../data/train/X_train.csv", index=True)
y_train.to_csv("../data/train/y_train.csv", index=True)

In [10]:
# Rows with no "default" label are exported separately as the set to predict.
# The source file is read again here and filtered down to exactly those rows.
# to_csv does not create missing folders, so make sure the target exists first.
os.makedirs("../data/predict", exist_ok=True)

(
    pd.read_csv("../data/dataset.csv", delimiter=";")
    .loc[lambda raw: raw["default"].isna()]
    .to_csv("../data/predict/to_predict.csv", index=False)
)