In [None]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Move the working directory up one level, but only the first time this cell is
# executed in the current kernel; the `first_run` flag keeps re-runs from
# climbing further up the tree.
if "first_run" not in globals():
    first_run = True
    # os.path.dirname is separator-aware: it works on Windows paths and yields
    # the root itself (not an empty string) when the cwd sits directly under it.
    os.chdir(os.path.dirname(os.getcwd()))

# Load Data

In [None]:
# Read the semicolon-separated dataset and keep only the rows that have a value
# in the "default" column.
df = (
    pd.read_csv("../data/dataset.csv", sep=";")
    .dropna(subset=["default"])
)

In [None]:
# Print a summary of the loaded frame (columns, dtypes, non-null counts).
df.info()

# What does a row represent?

Looking at the variables, we can conclude that each row represents a snapshot of an account for up to 24 months before its most recent appearance in the database. We can go one step further and suppose that such an event happens when the account tries to make a new purchase, and the model must infer the probability that the account will default on the payment.

In [None]:
# Summary statistics for the "time_hours" column of the loaded frame.
df["time_hours"].describe()

# Is there a timeline between snapshots?

Although the variable "time_hours" can give us a sense of order between snapshots, it is hardly a good candidate to separate the past and future as it is not a full timestamp.
Thus, for the sake of brevity, it is fair to assume that the behaviour of accounts is independent of others and that "time_hours" is intended to be used only as a feature for our models.


# Stratified Split

Before going any further, we must set aside 20% of our dataset to be used as our test (unseen) set. Assuming that our dataset is representative of real-world data, we stratify the split on the target so that the test set keeps the same rate of contamination (i.e. the same proportion of defaults) as the full dataset.

In [None]:
# Detach the target column from the feature frame. When the cell is re-run the
# column is already gone, so the previously extracted `y` is left untouched.
if "default" in df.columns:
    y = df.pop("default")

In [None]:
# Shuffled 80/20 split, stratified on the target and seeded so that the same
# partition is produced on every run.
TRAIN_FRACTION = 0.8
SPLIT_SEED = 42

X_train, X_test, y_train, y_test = train_test_split(
    df,
    y,
    train_size=TRAIN_FRACTION,
    random_state=SPLIT_SEED,
    shuffle=True,
    stratify=y,
)

In [None]:
# Persist the held-out split; the index is named so that it is written to the
# CSV under a "row_id" header.
for held_out, csv_path in (
    (X_test, "../data/test/X_test.csv"),
    (y_test, "../data/test/y_test.csv"),
):
    held_out.index.name = "row_id"
    held_out.to_csv(csv_path, index=True)

In [None]:
# Persist the training split. The index of the *training* objects is named
# here (the original cell renamed the test objects again by mistake), so the
# train CSVs carry a "row_id" header just like the test CSVs.
X_train.index.name = "row_id"
y_train.index.name = "row_id"

X_train.to_csv("../data/train/X_train.csv", index=True)
y_train.to_csv("../data/train/y_train.csv", index=True)

In [None]:
# Re-read the full dataset and export the rows whose "default" value is
# missing, without the index column.
unlabelled = pd.read_csv("../data/dataset.csv", sep=";")
unlabelled = unlabelled[unlabelled["default"].isna()]
unlabelled.to_csv("../data/predict/to_predict.csv", index=False)