In [None]:
from collections import OrderedDict
import gc
import pathlib
from pprint import pprint
from typing import Dict, List, Optional, Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import polars as pl


# Explicitly switch on automatic garbage collection (CPython already enables it by default).
gc.enable()

In [None]:
# Directory that contains the `train/` and `test/` parquet sub-folders used below.
PARQUET_ROOT = '/kaggle/input/home-credit-credit-risk-model-stability/parquet_files'
parquet_files_dir = pathlib.Path(PARQUET_ROOT)

In [None]:
# Load the per-feature description table; it is matched on its `Variable` column later on.
feature_definition_csv = '/kaggle/input/home-credit-credit-risk-model-stability/feature_definitions.csv'
feature_definition = pd.read_csv(feature_definition_csv)
display(feature_definition)

In [None]:
# Read the base table of each split and show both.
train_base = pd.read_parquet(parquet_files_dir / 'train/train_base.parquet')
test_base = pd.read_parquet(parquet_files_dir / 'test/test_base.parquet')
display(train_base, test_base)

In [None]:
# Manifest of the training parquet files, keyed by table name.
# The trailing _0 / _1 / _2 of a table name presumably denotes its depth;
# entries are listed in that order (depth 0, then 1, then 2).
train_depth_files = OrderedDict([
    ('train_static_0', [
        'train_static_0_0.parquet',
        'train_static_0_1.parquet',
    ]),
    ('train_static_cb_0', ['train_static_cb_0.parquet']),
    ('train_applprev_1', [
        'train_applprev_1_0.parquet',
        'train_applprev_1_1.parquet',
    ]),
    ('train_other_1', ['train_other_1.parquet']),
    ('train_tax_registry_a_1', ['train_tax_registry_a_1.parquet']),
    ('train_tax_registry_b_1', ['train_tax_registry_b_1.parquet']),
    ('train_tax_registry_c_1', ['train_tax_registry_c_1.parquet']),
    ('train_credit_bureau_a_1', [
        'train_credit_bureau_a_1_0.parquet',
        'train_credit_bureau_a_1_1.parquet',
        'train_credit_bureau_a_1_2.parquet',
        'train_credit_bureau_a_1_3.parquet',
    ]),
    ('train_credit_bureau_b_1', ['train_credit_bureau_b_1.parquet']),
    ('train_deposit_1', ['train_deposit_1.parquet']),
    ('train_person_1', ['train_person_1.parquet']),
    ('train_debitcard_1', ['train_debitcard_1.parquet']),
    ('train_applprev_2', ['train_applprev_2.parquet']),
    ('train_person_2', ['train_person_2.parquet']),
    ('train_credit_bureau_a_2', [
        'train_credit_bureau_a_2_0.parquet',
        'train_credit_bureau_a_2_1.parquet',
        'train_credit_bureau_a_2_2.parquet',
        'train_credit_bureau_a_2_3.parquet',
        'train_credit_bureau_a_2_4.parquet',
        'train_credit_bureau_a_2_5.parquet',
        'train_credit_bureau_a_2_6.parquet',
        'train_credit_bureau_a_2_7.parquet',
        'train_credit_bureau_a_2_8.parquet',
        'train_credit_bureau_a_2_9.parquet',
        'train_credit_bureau_a_2_10.parquet',
    ]),
    ('train_credit_bureau_b_2', ['train_credit_bureau_b_2.parquet']),
])

# Total number of files over all tables.
print(f'the number of train files: {sum(len(names) for names in train_depth_files.values())}')

In [None]:
# Manifest of the test parquet files, keyed by table name.
# The trailing _0 / _1 / _2 of a table name presumably denotes its depth.
test_depth_files = OrderedDict()

# Fixed: this key used to be 'train_static_0', which made the test profile
# report a train table name for every static_0 feature.
test_depth_files['test_static_0'] = ['test_static_0_0.parquet',
                                     'test_static_0_1.parquet',
                                     'test_static_0_2.parquet']
test_depth_files['test_static_cb_0'] = ['test_static_cb_0.parquet']
test_depth_files['test_applprev_1'] = ['test_applprev_1_0.parquet',
                                       'test_applprev_1_1.parquet',
                                       'test_applprev_1_2.parquet']
test_depth_files['test_other_1'] = ['test_other_1.parquet']
test_depth_files['test_tax_registry_a_1'] = ['test_tax_registry_a_1.parquet']
test_depth_files['test_tax_registry_b_1'] = ['test_tax_registry_b_1.parquet']
test_depth_files['test_tax_registry_c_1'] = ['test_tax_registry_c_1.parquet']
test_depth_files['test_credit_bureau_a_1'] = ['test_credit_bureau_a_1_0.parquet',
                                              'test_credit_bureau_a_1_1.parquet',
                                              'test_credit_bureau_a_1_2.parquet',
                                              'test_credit_bureau_a_1_3.parquet',
                                              'test_credit_bureau_a_1_4.parquet']
test_depth_files['test_credit_bureau_b_1'] = ['test_credit_bureau_b_1.parquet']
test_depth_files['test_deposit_1'] = ['test_deposit_1.parquet']
test_depth_files['test_person_1'] = ['test_person_1.parquet']
test_depth_files['test_debitcard_1'] = ['test_debitcard_1.parquet']

test_depth_files['test_applprev_2'] = ['test_applprev_2.parquet']
test_depth_files['test_person_2'] = ['test_person_2.parquet']
test_depth_files['test_credit_bureau_a_2'] = ['test_credit_bureau_a_2_0.parquet',
                                              'test_credit_bureau_a_2_1.parquet',
                                              'test_credit_bureau_a_2_2.parquet',
                                              'test_credit_bureau_a_2_3.parquet',
                                              'test_credit_bureau_a_2_4.parquet',
                                              'test_credit_bureau_a_2_5.parquet',
                                              'test_credit_bureau_a_2_6.parquet',
                                              'test_credit_bureau_a_2_7.parquet',
                                              'test_credit_bureau_a_2_8.parquet',
                                              'test_credit_bureau_a_2_9.parquet',
                                              'test_credit_bureau_a_2_10.parquet',
                                              'test_credit_bureau_a_2_11.parquet']
test_depth_files['test_credit_bureau_b_2'] = ['test_credit_bureau_b_2.parquet']

# Guard against copy-paste slips: every table key must belong to the test
# split and every file name must start with the key it is listed under.
for table_name, file_names in test_depth_files.items():
    assert table_name.startswith('test_'), f'unexpected table key: {table_name}'
    assert all(name.startswith(table_name) for name in file_names), (
        f'file names do not match table key: {table_name}'
    )

print(f'the number of test files: {sum(len(v1) for v1 in test_depth_files.values())}')

In [None]:
# Turn each manifest file name into a full path under the split's sub-folder,
# keeping the table order of the manifests.
train_depth_paths = OrderedDict(
    (table, [parquet_files_dir.joinpath('train/' + name) for name in names])
    for table, names in train_depth_files.items()
)
test_depth_paths = OrderedDict(
    (table, [parquet_files_dir.joinpath('test/' + name) for name in names])
    for table, names in test_depth_files.items()
)

print(f'number of train paths: {sum(len(paths) for paths in train_depth_paths.values())}')
print(f'number of test paths: {sum(len(paths) for paths in test_depth_paths.values())}')

In [None]:
def get_values(depth_paths: Dict[str, List[pathlib.Path]]) -> pd.DataFrame:
    """Profile every non-key column found in the given parquet files.

    Each file is read in full with ``pd.read_parquet``. For every column
    except ``case_id`` the distinct values, the number of missing values and
    the table the column was found in are accumulated over all files.

    Parameters
    ----------
    depth_paths
        Mapping of table name -> list of parquet paths of that table. The
        paths must expose ``.stem`` (used for the progress output).

    Returns
    -------
    pd.DataFrame
        One row per column, in order of first appearance, with the columns

        * ``Variable`` - column name,
        * ``num_unique_values`` - number of distinct values over all files,
        * ``unique_types`` - set of ``type(value).__name__`` of those values,
        * ``num_nan`` - total ``isna()`` count over all files,
        * ``files`` - list of table names containing the column, in the order
          they were first seen.

        An empty mapping yields an empty frame with these columns.

    Notes
    -----
    NOTE(review): NaN does not compare equal to itself, so float NaNs that
    come from different files are likely counted as separate distinct values
    in ``num_unique_values`` - confirm before relying on exact counts.
    """
    distinct_values: Dict[str, set] = OrderedDict()
    nan_counts: Dict[str, int] = OrderedDict()
    # dict used as an insertion-ordered set of table names per column
    source_tables: Dict[str, Dict[str, None]] = OrderedDict()

    for k, path_list in depth_paths.items():
        print(f'loading `{k}`')
        for path in path_list:
            print(f'\tloading {path.stem}')
            data = pd.read_parquet(path)
            for col in data.columns:
                if col == 'case_id':
                    continue
                # Iterate the result of unique() directly (no .tolist()) so
                # the elements keep their original types and type names.
                distinct_values.setdefault(col, set()).update(data[col].unique())
                nan_counts[col] = nan_counts.get(col, 0) + int(data[col].isna().sum())
                source_tables.setdefault(col, {})[k] = None
            # Release the frame before the next file is read; collecting once
            # per file is enough, a collection per column only costs time.
            del data
            gc.collect()

    records = [
        (
            col,
            len(values),
            {type(value).__name__ for value in values},
            nan_counts[col],
            list(source_tables[col]),
        )
        for col, values in distinct_values.items()
    ]
    return pd.DataFrame(
        records,
        columns=['Variable', 'num_unique_values', 'unique_types', 'num_nan', 'files'],
    )


def drop_nontype(x: set) -> list:
    """Return the type names in ``x`` as a list without the 'NoneType' entry."""
    type_names = [*x]
    try:
        type_names.remove('NoneType')
    except ValueError:
        # 'NoneType' was not present; nothing to strip.
        pass
    return type_names

In [None]:
%%time


# Profile the train tables, strip the 'NoneType' marker from each column's
# type names and record how many type names are left.
train_depth_values = get_values(train_depth_paths).assign(
    unique_types=lambda profile: profile['unique_types'].map(drop_nontype),
    num_unique_types=lambda profile: profile['unique_types'].map(len),
)
display(train_depth_values)

In [None]:
%%time


# Profile the test tables, strip the 'NoneType' marker from each column's
# type names and record how many type names are left.
test_depth_values = get_values(test_depth_paths).assign(
    unique_types=lambda profile: profile['unique_types'].map(drop_nontype),
    num_unique_types=lambda profile: profile['unique_types'].map(len),
)
display(test_depth_values)

In [None]:
# For train, then test: print the variables that are defined but not found in
# the files, followed by the variables found in the files but not defined.
defined_variables = set(feature_definition['Variable'])
for depth_values in (train_depth_values, test_depth_values):
    observed_variables = set(depth_values['Variable'])
    print(defined_variables - observed_variables)
    print(observed_variables - defined_variables)

In [None]:
# Give the profile columns a split prefix so both profiles can sit side by
# side after the merge. The names are assigned by position.
train_depth_values.columns = [
    'Variable',
    'train_num_unique_values',
    'train_unique_types',
    'train_num_nan',
    'train_files',
    'train_num_unique_types',
]
test_depth_values.columns = [
    'Variable',
    'test_num_unique_values',
    'test_unique_types',
    'test_num_nan',
    'test_files',
    'test_num_unique_types',
]

# Attach both profiles to the feature definitions, matching on `Variable`.
feature_definition = (
    feature_definition
    .merge(train_depth_values, on='Variable')
    .merge(test_depth_values, on='Variable')
)
display(feature_definition)

In [None]:
# Order the features by their number of distinct train values and renumber the rows.
feature_definition = feature_definition.sort_values(
    by='train_num_unique_values',
    ignore_index=True,
)
display(feature_definition)

# How many features have 0, 1, ... distinct value types in each split.
for type_count_col in ('train_num_unique_types', 'test_num_unique_types'):
    display(feature_definition[type_count_col].value_counts())

In [None]:
# Features with exactly one value type in train and none at all in test.
typed_in_train_only = feature_definition.query(
    'train_num_unique_types==1 & test_num_unique_types==0'
)
display(typed_in_train_only)

In [None]:
def get_unique_type(x: list) -> Optional[str]:
    """Collapse a list of type names to a single type name.

    Parameters
    ----------
    x
        List of type names. A plain string is also accepted and returned
        unchanged, so mapping this function over an already collapsed column
        (e.g. when the cell is run a second time) does not wipe its values.

    Returns
    -------
    Optional[str]
        The first element of a non-empty list, the string itself for a string
        input, and ``None`` for an empty list or any other input.
    """
    if isinstance(x, str):
        # Already collapsed - keep the value so that re-running is harmless.
        return x
    if isinstance(x, list) and len(x) > 0:
        # Only the first entry is kept, even if the list holds several names.
        return x[0]
    return None

# Replace each list of type names by a single value, for both splits.
for types_col in ('train_unique_types', 'test_unique_types'):
    feature_definition[types_col] = feature_definition[types_col].map(get_unique_type)

In [None]:
# Rename the 'bool_' type name to 'bool' in both type columns.
feature_definition = feature_definition.replace(
    {
        'train_unique_types': {'bool_': 'bool'},
        'test_unique_types': {'bool_': 'bool'},
    }
)
for types_col in ('train_unique_types', 'test_unique_types'):
    display(feature_definition[types_col].value_counts())


def _select_features(expr: str) -> pd.DataFrame:
    """Return the rows of `feature_definition` matching `expr`, renumbered from 0."""
    return feature_definition.query(expr).reset_index(drop=True)


# Features whose type agrees between train and test, one frame per type ...
fea_def_both_f64_types = _select_features(
    'train_unique_types=="float64" & test_unique_types=="float64"'
)
fea_def_both_str_types = _select_features(
    'train_unique_types=="str" & test_unique_types=="str"'
)
fea_def_both_bool_types = _select_features(
    'train_unique_types=="bool" & test_unique_types=="bool"'
)
# ... and the features whose train and test types differ.
fea_def_complex_types = _select_features(
    'train_unique_types != test_unique_types'
)

display(
    fea_def_both_f64_types,
    fea_def_both_str_types,
    fea_def_both_bool_types,
    fea_def_complex_types,
)

In [None]:
# Features with differing train/test types are assigned by their train type.
complex_f64_in_train = fea_def_complex_types.query('train_unique_types=="float64"')
complex_str_in_train = fea_def_complex_types.query('train_unique_types=="str"')
complex_bool_in_train = fea_def_complex_types.query('train_unique_types=="bool"')

# Final per-type tables: the agreeing features followed by the train-typed ones.
fea_def_f64_types = pd.concat(
    [fea_def_both_f64_types, complex_f64_in_train],
    ignore_index=True,
)
fea_def_str_types = pd.concat(
    [fea_def_both_str_types, complex_str_in_train],
    ignore_index=True,
)
fea_def_bool_types = pd.concat(
    [fea_def_both_bool_types, complex_bool_in_train],
    ignore_index=True,
)

display(fea_def_f64_types, fea_def_str_types, fea_def_bool_types)

In [None]:
# Write each per-type feature table to its own CSV file, without the row index.
for feature_table, csv_name in (
    (fea_def_f64_types, 'float64_features.csv'),
    (fea_def_str_types, 'string_features.csv'),
    (fea_def_bool_types, 'bool_features.csv'),
):
    feature_table.to_csv(csv_name, index=False)