In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import gc

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    joined_paths = (os.path.join(dirname, filename) for filename in filenames)
    for joined_path in joined_paths:
        print(joined_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/amex-data-integer-dtypes-parquet-format/train.parquet
/kaggle/input/amex-data-integer-dtypes-parquet-format/test.parquet
/kaggle/input/amex-default-prediction/sample_submission.csv
/kaggle/input/amex-default-prediction/train_data.csv
/kaggle/input/amex-default-prediction/test_data.csv
/kaggle/input/amex-default-prediction/train_labels.csv


# Preprocessing

In [2]:
# ====================================================
# Library
# ====================================================
import gc
import warnings
# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')
import scipy as sp
import numpy as np
import pandas as pd
# Raise pandas' display limits (500 rows, 500 columns, 1000 characters wide)
# so wide feature tables are shown with less truncation.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
import itertools

In [3]:
def read_preprocess_data():
    """Build per-customer aggregate features for train and test and save them.

    Each raw parquet file is grouped by ``customer_ID`` and three feature
    groups are computed:

    * numeric columns: mean / std / min / max / last over all of a customer's rows
      (column suffix ``_<stat>``);
    * numeric columns: mean / std / min / max over each customer's last three
      rows (column infix ``_lag3_``);
    * categorical columns: count / last / nunique (column suffix ``_<stat>``).

    The training features are additionally inner-joined with
    ``train_labels.csv``. The same statistic lists are used for train and test
    so both outputs expose the same feature columns (train also carries the
    label columns).

    Side effects:
        Writes ``train_fe.parquet`` and ``test_fe.parquet`` to the current
        working directory. The ``../input`` tree is read-only on Kaggle, so
        the outputs cannot be stored next to the raw files.

    Returns:
        None.
    """
    cat_features = [
        "B_30",
        "B_38",
        "D_114",
        "D_116",
        "D_117",
        "D_120",
        "D_126",
        "D_63",
        "D_64",
        "D_66",
        "D_68",
    ]
    # Ordered lists (not sets) keep the generated column order reproducible.
    num_stats = ['mean', 'std', 'min', 'max', 'last']
    lag3_stats = ['mean', 'std', 'min', 'max']
    cat_stats = ['count', 'last', 'nunique']

    def _flatten(agg, sep):
        """Join the (column, statistic) header pairs with ``sep`` and turn customer_ID back into a column."""
        agg.columns = [sep.join(pair) for pair in agg.columns]
        return agg.reset_index()

    def _aggregate(raw, num_features):
        """Return the (numeric, categorical, last-three-rows) aggregate frames for ``raw``."""
        by_customer = raw.groupby("customer_ID")
        num_agg = _flatten(by_customer[num_features].agg(num_stats), '_')
        # tail(3) keeps each customer's last three rows in file order.
        # NOTE(review): assumes rows are already chronological per customer - confirm.
        recent = by_customer.tail(3).groupby("customer_ID")
        lag3_agg = _flatten(recent[num_features].agg(lag3_stats), '_lag3_')
        cat_agg = _flatten(by_customer[cat_features].agg(cat_stats), '_')
        return num_agg, cat_agg, lag3_agg

    train = pd.read_parquet('../input/amex-data-integer-dtypes-parquet-format/train.parquet')
    # Every column except the customer key and S_2 is treated as a feature.
    features = train.drop(['customer_ID', 'S_2'], axis = 1).columns.to_list()
    num_features = [col for col in features if col not in cat_features]
    print('Starting training feature engineer...')
    train_num_agg, train_cat_agg, train_num_agg_lag3 = _aggregate(train, num_features)
    train_labels = pd.read_csv('../input/amex-default-prediction/train_labels.csv')
    train = (
        train_num_agg
        .merge(train_cat_agg, how = 'inner', on = 'customer_ID')
        .merge(train_labels, how = 'inner', on = 'customer_ID')
        .merge(train_num_agg_lag3, how = 'inner', on = 'customer_ID')
    )
    # Save files to disk (working directory; the input directory is read-only)
    train.to_parquet('train_fe.parquet')
    # Release every training intermediate before the test file is loaded.
    del train_num_agg, train_cat_agg, train_num_agg_lag3, train_labels, train
    gc.collect()

    test = pd.read_parquet('../input/amex-data-integer-dtypes-parquet-format/test.parquet')
    print('Starting test feature engineer...')
    # The numeric feature list derived from the training columns is reused here.
    test_num_agg, test_cat_agg, test_num_agg_lag3 = _aggregate(test, num_features)
    test = (
        test_num_agg
        .merge(test_cat_agg, how = 'inner', on = 'customer_ID')
        .merge(test_num_agg_lag3, how = 'inner', on = 'customer_ID')
    )
    # Save files to disk
    test.to_parquet('test_fe.parquet')
    del test_num_agg, test_cat_agg, test_num_agg_lag3, test
    gc.collect()

In [4]:
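# Left disabled: preprocess_train_data() and preprocess_test_data() below perform the same steps separately.
# read_preprocess_data()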

In [5]:
def preprocess_train_data():
    """Build per-customer aggregate training features and save them.

    Reads the raw training parquet, groups it by ``customer_ID`` and computes:

    * numeric columns: mean / std / min / max / last over all rows;
    * numeric columns: mean / std / min / max over each customer's last three
      rows (column infix ``_lag3_``);
    * categorical columns: count / last / nunique.

    The aggregates are inner-joined with ``train_labels.csv`` on
    ``customer_ID``.

    Side effects:
        Writes ``train_fe.parquet`` to the current working directory.

    Returns:
        None.
    """
    train = pd.read_parquet('../input/amex-data-integer-dtypes-parquet-format/train.parquet')
    # Every column except the customer key and S_2 is treated as a feature.
    features = train.drop(['customer_ID', 'S_2'], axis = 1).columns.to_list()
    cat_features = [
        "B_30",
        "B_38",
        "D_114",
        "D_116",
        "D_117",
        "D_120",
        "D_126",
        "D_63",
        "D_64",
        "D_66",
        "D_68",
    ]
    num_features = [col for col in features if col not in cat_features]
    print('Starting training feature engineer...')
    by_customer = train.groupby("customer_ID")

    train_num_agg = by_customer[num_features].agg(['mean', 'std', 'min', 'max', 'last'])
    train_num_agg.columns = ['_'.join(x) for x in train_num_agg.columns]
    train_num_agg = train_num_agg.reset_index()

    # An ordered list (not a set) keeps the lag-3 column order identical on
    # every run; set iteration order of strings varies between interpreter runs.
    # NOTE(review): tail(3) assumes rows are already chronological per customer - confirm.
    lag3_stats = ['mean', 'std', 'min', 'max']
    train_num_agg_lag3 = by_customer.tail(3).groupby("customer_ID")[num_features].agg(lag3_stats)
    train_num_agg_lag3.columns = ['_lag3_'.join(x) for x in train_num_agg_lag3.columns]
    train_num_agg_lag3 = train_num_agg_lag3.reset_index()

    train_cat_agg = by_customer[cat_features].agg(['count', 'last', 'nunique'])
    train_cat_agg.columns = ['_'.join(x) for x in train_cat_agg.columns]
    train_cat_agg = train_cat_agg.reset_index()

    train_labels = pd.read_csv('../input/amex-default-prediction/train_labels.csv')
    train = (
        train_num_agg
        .merge(train_cat_agg, how = 'inner', on = 'customer_ID')
        .merge(train_labels, how = 'inner', on = 'customer_ID')
        .merge(train_num_agg_lag3, how = 'inner', on = 'customer_ID')
    )
    # Save files to disk
    train.to_parquet('train_fe.parquet')
    del by_customer, train_num_agg, train_cat_agg, train_num_agg_lag3, train_labels, train
    gc.collect()

In [6]:
# Run the training-set feature engineering; this writes train_fe.parquet to the working directory.
preprocess_train_data()

Starting training feature engineer...


In [7]:
def preprocess_test_data():
    """Build per-customer aggregate test features and save them.

    Reads the raw test parquet, groups it by ``customer_ID`` and computes:

    * numeric columns: mean / std / min / max / last over all rows;
    * numeric columns: mean / std / min / max over each customer's last three
      rows (column infix ``_lag3_``);
    * categorical columns: count / last / nunique.

    The lag-3 statistics are deliberately the same four used by
    ``preprocess_train_data`` so the test features line up column-for-column
    with the training features (which have no ``_lag3_last`` columns).

    Side effects:
        Writes ``test_fe.parquet`` to the current working directory.

    Returns:
        None.
    """
    # Free whatever earlier cells left behind before loading the test file.
    gc.collect()
    test = pd.read_parquet('../input/amex-data-integer-dtypes-parquet-format/test.parquet')
    # Every column except the customer key and S_2 is treated as a feature.
    features = test.drop(['customer_ID', 'S_2'], axis = 1).columns.to_list()
    cat_features = [
        "B_30",
        "B_38",
        "D_114",
        "D_116",
        "D_117",
        "D_120",
        "D_126",
        "D_63",
        "D_64",
        "D_66",
        "D_68",
    ]
    num_features = [col for col in features if col not in cat_features]
    print('Starting test feature engineer...')
    by_customer = test.groupby("customer_ID")

    test_num_agg = by_customer[num_features].agg(['mean', 'std', 'min', 'max', 'last'])
    test_num_agg.columns = ['_'.join(x) for x in test_num_agg.columns]
    test_num_agg = test_num_agg.reset_index()

    # Same four statistics as the training lag-3 features; an extra 'last'
    # here would create columns the training table does not contain.
    # NOTE(review): tail(3) assumes rows are already chronological per customer - confirm.
    lag3_stats = ['mean', 'std', 'min', 'max']
    test_num_agg_lag3 = by_customer.tail(3).groupby("customer_ID")[num_features].agg(lag3_stats)
    test_num_agg_lag3.columns = ['_lag3_'.join(x) for x in test_num_agg_lag3.columns]
    test_num_agg_lag3 = test_num_agg_lag3.reset_index()

    test_cat_agg = by_customer[cat_features].agg(['count', 'last', 'nunique'])
    test_cat_agg.columns = ['_'.join(x) for x in test_cat_agg.columns]
    test_cat_agg = test_cat_agg.reset_index()

    test = (
        test_num_agg
        .merge(test_cat_agg, how = 'inner', on = 'customer_ID')
        .merge(test_num_agg_lag3, how = 'inner', on = 'customer_ID')
    )
    gc.collect()
    # Save files to disk
    test.to_parquet('test_fe.parquet')
    del by_customer, test_num_agg, test_cat_agg, test_num_agg_lag3, test
    gc.collect()

In [8]:
# Left disabled here; uncomment to generate test_fe.parquet in the working directory.
# preprocess_test_data()

In [9]:
import os
# Display the current working directory; relative output paths such as
# 'train_fe.parquet' are resolved against it.
os.getcwd()


'/kaggle/working'

In [10]:
import glob

# Collect the entries of both dataset folders first, then report them.
parquet_entries = glob.glob('../input/amex-data-integer-dtypes-parquet-format/*')
csv_entries = glob.glob('../input/amex-default-prediction/*')
print('.parquet file:', parquet_entries)
print('.csv file', csv_entries)

.parquet file: ['../input/amex-data-integer-dtypes-parquet-format/train.parquet', '../input/amex-data-integer-dtypes-parquet-format/test.parquet']
.csv file ['../input/amex-default-prediction/sample_submission.csv', '../input/amex-default-prediction/train_data.csv', '../input/amex-default-prediction/test_data.csv', '../input/amex-default-prediction/train_labels.csv']
