In [1]:
!echo $PYTHONPATH

src


In [2]:
import pandas as pd
import os
from enum import Enum
import sys
from pprint import pprint    
sys.path.append("../src")

from config import data_dir
from tqdm import tqdm

# auto reload all modules
%load_ext autoreload
%autoreload 2

In [3]:
# check .env was loaded
os.environ['LOGGING_LEVEL']

'DEBUG'

In [7]:
data_dir()

PosixPath('/app/home-credit-risk/datasets')

### Preprocess descriptions table

In [10]:
descr_filename = "HomeCredit_columns_description.csv"
descr_df = pd.read_csv(data_dir() / descr_filename, encoding='utf-8')

descr_df.table = descr_df.table.map({'application_{train|test}.csv': 'application',
                     'bureau.csv': 'bureau',
                     'bureau_balance.csv': 'bureau_balance',
                     'POS_CASH_balance.csv': 'cash_balance',
                     'credit_card_balance.csv': 'credit_card_balance',
                     'previous_application.csv': 'previous_applications',
                     'installments_payments.csv': 'installments_payments'})
descr_df.table.unique()

descr_df.columns = [col.lower() for col in descr_df.columns]
descr_df.to_csv(data_dir() / "HomeCredit_columns_description.csv", index=False)

### Map dataset name to filename

To easily load from name

In [11]:
class DatasetFilename(Enum):
    APPLICATIONS = "application_train.feather"
    BUREAU_BALANCE = "bureau_balance.feather"
    BUREAU = "bureau.feather"
    CREDIT_CARD_BALANCE = "credit_card_balance.feather"
    INSTALLMENTS_PAYMENTS = "installments_payments.feather"
    PREVIOUS_APPLICATIONS = "previous_application.feather"
    CASH_BALANCE = "POS_CASH_balance.feather"

    @classmethod
    def from_name(cls, name):
        if hasattr(DatasetFilename, name.upper()):
            return getattr(DatasetFilename, name.upper()).value
        else:
            raise ValueError(f"No such dataset: {name}")

In [12]:
DatasetFilename.from_name('cash_balance')

'POS_CASH_balance.feather'

In [13]:
DatasetFilename.CASH_BALANCE

<DatasetFilename.CASH_BALANCE: 'POS_CASH_balance.feather'>

### Class to load data from file
Cache loaded datasets

In [14]:
class DataIO:

    DATASETS = [
        name.split(".")[0].lower() for name, _ in DatasetFilename.__members__.items()
    ]
    DESCRIPTIONS_FILENAME = "HomeCredit_columns_description.csv"

    def __init__(self):

        # store loaded datasets
        self.datasets_ = dict()

    @staticmethod
    def format_dataset(df):
        df.columns = [col.lower() for col in df.columns]
        return df

    def load_dataset(self, dataset_name: str) -> pd.DataFrame:
        assert dataset_name in self.DATASETS, f"Unknown dataset {dataset_name}."

        if dataset_name in self.datasets_:
            print(f'Dataset {dataset_name}: already loaded.')
            df = self.datasets_[dataset_name]
        else:
            print(f'Dataset {dataset_name}: loading from file.')
            df = pd.read_feather(data_dir() / DatasetFilename.from_name(dataset_name))
            self.datasets_[dataset_name] = df

        df = self.format_dataset(df)
        return df

    def load_all(self):
        pbar = tqdm(self.DATASETS)
        for dataset in pbar:
            pbar.set_description(f"Loading dataset: {dataset}")
            self.load_dataset(dataset_name=dataset)

    @classmethod
    def list_available(cls) -> list:
        return cls.DATASETS
    
    @classmethod
    def describe_columns(cls, dataset_name: str) -> pd.DataFrame:
        return pd.read_csv(data_dir() / cls.DESCRIPTIONS_FILENAME, encoding='utf-8')\
                .query('table == @dataset_name')
        [['row', 'description']]
    def list_loaded(self) -> list:
        return list(self.datasets_.keys())

In [15]:
data_io = DataIO()
descr_df = data_io.describe_columns('cash_balance')
descr_df

Unnamed: 0,table,row,description,special


In [23]:
data_io.describe_columns('cash_balance')

Unnamed: 0,table,row,description,special


In [25]:
data_io.datasets_.keys()

dict_keys(['cash_balance', 'applications', 'bureau_balance', 'bureau', 'credit_card_balance', 'installments_payments', 'previous_applications'])

In [16]:
data_io.list_available()

['applications',
 'bureau_balance',
 'bureau',
 'credit_card_balance',
 'installments_payments',
 'previous_applications',
 'cash_balance']

In [17]:
data_io.load_dataset(dataset_name='cash_balance')
data_io.list_loaded()

Dataset cash_balance: loading from file.


['cash_balance']

In [18]:
data_io.load_all()
data_io.list_loaded()

Loading dataset: applications:   0%|                                                                                                                                                            | 0/7 [00:00<?, ?it/s]

Dataset applications: loading from file.


Loading dataset: bureau_balance:  14%|████████████████████▊                                                                                                                             | 1/7 [00:03<00:18,  3.12s/it]

Dataset bureau_balance: loading from file.


Loading dataset: bureau:  29%|████████████████████████████████████████████                                                                                                              | 2/7 [00:09<00:25,  5.14s/it]

Dataset bureau: loading from file.


Loading dataset: credit_card_balance:  43%|████████████████████████████████████████████████████████████▍                                                                                | 3/7 [00:12<00:16,  4.19s/it]

Dataset credit_card_balance: loading from file.


Loading dataset: installments_payments:  57%|███████████████████████████████████████████████████████████████████████████████▍                                                           | 4/7 [00:19<00:15,  5.21s/it]

Dataset installments_payments: loading from file.


Loading dataset: previous_applications:  71%|███████████████████████████████████████████████████████████████████████████████████████████████████▎                                       | 5/7 [00:32<00:16,  8.13s/it]

Dataset previous_applications: loading from file.


Loading dataset: cash_balance: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:42<00:00,  6.10s/it]

Dataset cash_balance: already loaded.





['cash_balance',
 'applications',
 'bureau_balance',
 'bureau',
 'credit_card_balance',
 'installments_payments',
 'previous_applications']

In [20]:
data_io.datasets_['cash_balance'].head()

Unnamed: 0,sk_id_prev,sk_id_curr,months_balance,cnt_instalment,cnt_instalment_future,name_contract_status,sk_dpd,sk_dpd_def
0,1803195,182943,-31,48.0,45.0,Active,0,0
1,1715348,367990,-33,36.0,35.0,Active,0,0
2,1784872,397406,-32,12.0,9.0,Active,0,0
3,1903291,269225,-35,48.0,42.0,Active,0,0
4,2341044,334279,-35,36.0,35.0,Active,0,0
