<a href="https://colab.research.google.com/github/meltyyyyy/kaggle-amex/blob/main/Notebooks/Starter/EDA002.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
class Config:
    name = "Starter/EDA002"

    n_splits = 5
    n_neighbors = 10
    seed = 2022
    target = "target"

    # Colab Env
    upload_from_colab = True
    api_path = "/content/drive/MyDrive/workspace/kaggle.json"
    drive_path = "/content/drive/MyDrive/workspace/kaggle-amex"
    
    # Kaggle Env
    kaggle_dataset_path = None

In [2]:
import os
import json
import warnings
import shutil
import logging
import joblib
import random
import datetime
import sys
import gc
import multiprocessing
import joblib
import pickle

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm.auto import tqdm
warnings.filterwarnings('ignore')

In [3]:
COLAB = "google.colab" in sys.modules
if COLAB:
    print("This environment is Google Colab")
    
    # mount
    from google.colab import drive
    if not os.path.isdir("/content/drive"):
        drive.mount('/content/drive') 
	
    # import library
    # ! pip install lightgbm==3.3.1
    # ! pip install --quiet iterative-stratification
    # ! pip install --quiet tensorflow-addons

    # use kaggle api (need kaggle token)
    f = open(Config.api_path, 'r')
    json_data = json.load(f) 
    os.environ["KAGGLE_USERNAME"] = json_data["username"]
    os.environ["KAGGLE_KEY"] = json_data["key"]
    
    # set dirs
    DRIVE = Config.drive_path
    EXP = (Config.name if Config.name is not None 
           else get("http://172.28.0.2:9000/api/sessions").json()[0]["name"][:-6])
    INPUT = os.path.join(DRIVE, "Input")
    OUTPUT = os.path.join(DRIVE, "Output")
    SUBMISSION = os.path.join(DRIVE, "Submission")
    OUTPUT_EXP = os.path.join(OUTPUT, EXP) 
    EXP_MODEL = os.path.join(OUTPUT_EXP, "model")
    EXP_FIG = os.path.join(OUTPUT_EXP, "fig")
    EXP_PREDS = os.path.join(OUTPUT_EXP, "preds")

    # make dirs
    for d in [INPUT, SUBMISSION, EXP_MODEL, EXP_FIG, EXP_PREDS]:
        os.makedirs(d, exist_ok=True)

    if not os.path.isfile(os.path.join(INPUT, "amex-default-prediction.zip")):
        # load dataset
        ! kaggle competitions download -c amex-default-prediction -p $INPUT 
else:
    print("This environment is Kaggle Kernel")
    
    # set dirs
    INPUT = "../input/amex-default-prediction"
    EXP, OUTPUT, SUBMISSION = "./", "./", "./"
    EXP_MODEL = os.path.join(EXP, "model")
    EXP_FIG = os.path.join(EXP, "fig")
    EXP_PREDS = os.path.join(EXP, "preds")
    
    # copy dirs
    if Config.kaggle_dataset_path is not None:
        KD_MODEL = os.path.join(Config.kaggle_dataset_path, "model")
        KD_EXP_PREDS = os.path.join(Config.kaggle_dataset_path, "preds")
        shutil.copytree(KD_MODEL, EXP_MODEL)
        shutil.copytree(KD_EXP_PREDS, EXP_PREDS)

    # make dirs
    for d in [EXP_MODEL, EXP_FIG, EXP_PREDS]:
        os.makedirs(d, exist_ok=True)


This environment is Google Colab


In [4]:
%%time
test = pd.read_parquet(os.path.join(INPUT, 'test_data.parquet') if COLAB else 'test_data.parquet')

CPU times: user 25.4 s, sys: 38.1 s, total: 1min 3s
Wall time: 1min 21s


In [5]:
test.head()

Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,...,D_136,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145
0,00000469ba478561f23a92a868bd366de6f6527a684c9a...,2019-02-19,0.631315,0.001912,0.010728,0.814497,0.007547,0.168651,0.009971,0.002347,...,,,,,0.004669,,,,0.008281,
1,00000469ba478561f23a92a868bd366de6f6527a684c9a...,2019-03-25,0.587042,0.005275,0.011026,0.810848,0.001817,0.241389,0.000166,0.009132,...,,,,0.000142,0.00494,0.009021,,0.003695,0.003753,0.00146
2,00000469ba478561f23a92a868bd366de6f6527a684c9a...,2019-04-25,0.609056,0.003326,0.01639,1.00462,0.000114,0.266976,0.004196,0.004192,...,,,,7.4e-05,0.002114,0.004656,,0.003155,0.002156,0.006482
3,00000469ba478561f23a92a868bd366de6f6527a684c9a...,2019-05-20,0.614911,0.009065,0.021672,0.816549,0.009722,0.188947,0.004123,0.015325,...,,,,0.004743,0.006392,0.00289,,0.006044,0.005206,0.007855
4,00000469ba478561f23a92a868bd366de6f6527a684c9a...,2019-06-15,0.591673,0.238794,0.015923,0.810456,0.002026,0.180035,0.000731,0.011281,...,,,,0.008133,0.004329,0.008384,,0.001008,0.007421,0.009471


In [6]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11363762 entries, 0 to 11363761
Columns: 190 entries, customer_ID to D_145
dtypes: float64(185), int64(1), object(4)
memory usage: 16.1+ GB


### Reduce memory usage by converting float64 to float32.

In [7]:
# https://www.kaggle.com/code/balabaskar/memory-reduction-using-pandas
float_cols = [col for col in test.columns if test[col].dtype == 'float64']
int_cols = [col for col in test.columns if test[col].dtype == 'int64']
len(float_cols), print(int_cols)

['B_31']


(185, None)

In [8]:
for col in float_cols:
    test[col] = test[col].astype('float16')

Sucessfully reduced memory from 16.1GB to 4.3GB

In [9]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11363762 entries, 0 to 11363761
Columns: 190 entries, customer_ID to D_145
dtypes: float16(185), int64(1), object(4)
memory usage: 4.3+ GB


### Reduce memory usage by converting int64 to int8.

In [10]:
test['B_31'].unique()

array([1, 0])

It seems there's no problem for converting it to int8.

In [11]:
test['B_31'] = test['B_31'].astype('int8')

It does not change memory usege. But, it's better thant to do nothing.

In [12]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11363762 entries, 0 to 11363761
Columns: 190 entries, customer_ID to D_145
dtypes: float16(185), int8(1), object(4)
memory usage: 4.3+ GB


### Look at categorycal features

In [13]:
cat_cols = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']
test[cat_cols].head()

Unnamed: 0,B_30,B_38,D_114,D_116,D_117,D_120,D_126,D_63,D_64,D_66,D_68
0,0.0,1.0,,,,,0.0,CR,,,
1,0.0,1.0,,,,,0.0,CR,,,
2,0.0,2.0,,,,,0.0,CR,,,
3,0.0,2.0,,,,,0.0,CR,,,
4,0.0,2.0,0.0,0.0,-1.0,1.0,0.0,CR,U,,6.0


In [14]:
test['B_30'].unique()

array([ 0.,  1.,  2., nan], dtype=float16)

In [15]:
test['B_38'].unique()

array([ 1.,  2.,  5.,  3.,  6.,  7.,  4., nan], dtype=float16)

In [16]:
test['D_114'].unique()

array([nan,  0.,  1.], dtype=float16)

In [17]:
test['D_116'].unique()

array([nan,  0.,  1.], dtype=float16)

In [18]:
test['D_117'].unique()

array([nan, -1.,  3.,  6.,  4.,  2.,  1.,  5.], dtype=float16)

In [19]:
test['D_120'].unique()

array([nan,  1.,  0.], dtype=float16)

In [20]:
test['D_126'].unique()

array([ 0.,  1., -1., nan], dtype=float16)

In [21]:
test['D_63'].unique()

array(['CR', 'CO', 'CL', 'XM', 'XZ', 'XL'], dtype=object)

In [22]:
test['D_64'].unique()

array([None, 'U', 'O', 'R'], dtype=object)

In [23]:
test['D_66'].unique()

array([nan,  1.], dtype=float16)

In [24]:
test['D_68'].unique()

array([nan,  6.,  4.,  5.,  1.,  2.,  3.], dtype=float16)

Some categorical features does not inclueded in training set. Need to analyze later on.

Now let's convert float16 features to categorical

In [25]:
for col in cat_cols:
    test[col] = test[col].astype('category')

In [26]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11363762 entries, 0 to 11363761
Columns: 190 entries, customer_ID to D_145
dtypes: category(11), float16(176), int8(1), object(2)
memory usage: 4.0+ GB


Reduced memory from 4.3 to 4.0 by converting to category.



### Look at object features

In [27]:
obj_cols = [col for col in test.columns if test[col].dtype == 'object']
obj_cols

['customer_ID', 'S_2']

https://www.kaggle.com/competitions/amex-default-prediction/data.  
It says 'S_' column is spend variables. Guessing this is a transaction date?  
D_* = Delinquency variables.  
S_* = Spend variables.  
P_* = Payment variables.  
B_* = Balance variables. 
R_* = Risk variables.  

In [28]:
test['customer_ID'].value_counts()

8034aa3a67acb152f472bd8036f4c579b559d046ba12d7a911d27abd1c4b080b    13
f20b67b121eb3b67fb0558f910d65ab2d426e7240bf6f2a2b2d244335bfe5ed7    13
9b08b73dd6f5cba8f5f4fa0d59ba24c735d259571ac2b735bf3424007c991f53    13
9b08b9d5c195dcec159fc45dc1ff186fa35343bea193c097697e54d4014bd532    13
9b08cab050f7890ab7ef591262ea2718e29e4fbd57eb7d07761aadf37e006b14    13
                                                                    ..
96891d5635c9fe20ea5a7f47c2a798bfe76cd21b8b2766e9d00322fb8db1f07a     1
967923b9af14ea50d0b14884ad2fec253e1ece75de428a7b4ac2b79cd70e4c8b     1
96743f1ec496a95bfc1abac77f3029d00baf84b5510cf25113e09d7c2b5796d9     1
96609d94e00c546b837bead5c705c9ed3cee0e4cb4c806cad5ff7f0b652ce0f3     1
7574744c775a074ed4764473e97e9992a1a7cbc4925461ba266652d2dad3e60a     1
Name: customer_ID, Length: 924621, dtype: int64

One customer has 13 duplicate tables at most. However, time range is totally different from training set. It was between 2017 to 2018, but this is betweenn 2018 to 2019.

In [29]:
test[test['customer_ID'] == '8034aa3a67acb152f472bd8036f4c579b559d046ba12d7a911d27abd1c4b080b']

Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,...,D_136,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145
5681066,8034aa3a67acb152f472bd8036f4c579b559d046ba12d7...,2018-04-11,0.453857,0.474365,0.147827,0.8125,0.502441,0.189087,0.089539,0.012489,...,,,,0.006618,0.006016,0.006691,,0.001047,0.000509,0.006653
5681067,8034aa3a67acb152f472bd8036f4c579b559d046ba12d7...,2018-05-26,0.50293,0.005619,0.008667,0.811523,0.008194,0.167603,0.000905,0.002495,...,,,,0.008606,0.007507,0.003866,,0.008423,0.004566,0.007328
5681068,8034aa3a67acb152f472bd8036f4c579b559d046ba12d7...,2018-06-26,0.419434,0.002306,0.007168,0.819336,0.005177,0.164673,0.00729,0.008423,...,,,,0.001596,0.006973,0.001781,,0.000456,0.002254,0.007248
5681069,8034aa3a67acb152f472bd8036f4c579b559d046ba12d7...,2018-07-27,0.463867,0.000552,0.006317,0.813477,0.001314,0.162354,0.001353,0.008522,...,,,,0.002901,0.004284,0.007629,,0.006069,0.003651,0.003599
5681070,8034aa3a67acb152f472bd8036f4c579b559d046ba12d7...,2018-08-27,0.524414,0.00106,0.002678,0.81543,0.504395,0.164185,0.007168,0.003084,...,,,,0.003866,0.008232,0.000573,,0.002544,0.002958,0.000734
5681071,8034aa3a67acb152f472bd8036f4c579b559d046ba12d7...,2018-09-26,0.497803,0.00197,0.002529,0.819336,0.00041,0.115662,0.006912,0.00975,...,,,,0.002388,0.007797,0.002674,,0.003611,0.001533,0.009026
5681072,8034aa3a67acb152f472bd8036f4c579b559d046ba12d7...,2018-10-29,0.529297,0.003265,0.004173,0.810547,0.501465,,0.003382,0.005268,...,,,,0.000785,0.009407,0.004742,,0.0047,0.009583,0.003979
5681073,8034aa3a67acb152f472bd8036f4c579b559d046ba12d7...,2018-11-26,0.427979,0.033173,0.008408,0.817871,0.001494,,0.005295,0.00507,...,,,,0.001839,0.001595,0.006466,,0.006222,0.003536,0.009354
5681074,8034aa3a67acb152f472bd8036f4c579b559d046ba12d7...,2018-12-27,0.492188,0.037994,0.010933,0.001026,0.008965,,0.009354,0.009186,...,,,,0.0003,0.006195,0.003826,,0.001202,0.002348,0.009781
5681075,8034aa3a67acb152f472bd8036f4c579b559d046ba12d7...,2019-01-26,0.506348,0.032532,0.009392,0.005619,0.002151,,0.003601,0.006924,...,,,,0.006317,0.004047,0.007298,,0.006859,0.002214,0.007881


In [30]:
test['customer_ID'] = test['customer_ID'].astype('str')
test['S_2'] = pd.to_datetime(test['S_2'])

Minimum memory usage seems to be 4.0 GB.  
float32 -> float16.  
int64 -> int8.  

In [31]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11363762 entries, 0 to 11363761
Columns: 190 entries, customer_ID to D_145
dtypes: category(11), datetime64[ns](1), float16(176), int8(1), object(1)
memory usage: 4.0+ GB


to save as parquet for rapid reading, convert float16 to float32 since parquet does not accept half float.

In [32]:
for col in float_cols:
    test[col] = test[col].astype('float32')
for col in cat_cols:
    test[col] = test[col].astype('category')

In [33]:
test.to_parquet(os.path.join(INPUT, 'test.parquet'))

In [34]:
%%time
test = pd.read_parquet(os.path.join(INPUT, 'test.parquet') if COLAB else 'test.parquet')

CPU times: user 25.5 s, sys: 23.4 s, total: 48.9 s
Wall time: 11.9 s


In [35]:
test.head()

Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,...,D_136,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145
0,00000469ba478561f23a92a868bd366de6f6527a684c9a...,2019-02-19,0.631348,0.001912,0.010727,0.814453,0.007545,0.168701,0.009972,0.002348,...,,,,,0.004669,,,,0.008278,
1,00000469ba478561f23a92a868bd366de6f6527a684c9a...,2019-03-25,0.586914,0.005276,0.011024,0.811035,0.001817,0.241333,0.000166,0.009132,...,,,,0.000142,0.00494,0.009018,,0.003695,0.003754,0.00146
2,00000469ba478561f23a92a868bd366de6f6527a684c9a...,2019-04-25,0.608887,0.003326,0.016388,1.004883,0.000114,0.26709,0.004196,0.004192,...,,,,7.4e-05,0.002113,0.004658,,0.003155,0.002155,0.006481
3,00000469ba478561f23a92a868bd366de6f6527a684c9a...,2019-05-20,0.614746,0.009064,0.021667,0.816406,0.00972,0.188965,0.004124,0.015327,...,,,,0.004742,0.006393,0.00289,,0.006042,0.005207,0.007858
4,00000469ba478561f23a92a868bd366de6f6527a684c9a...,2019-06-15,0.591797,0.23877,0.01593,0.810547,0.002026,0.180054,0.000731,0.011284,...,,,,0.008133,0.00433,0.008385,,0.001008,0.00742,0.009468


It seems some float categorical colmuns are converted to float64 when reading parquet.

In [36]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11363762 entries, 0 to 11363761
Columns: 190 entries, customer_ID to D_145
dtypes: category(2), datetime64[ns](1), float32(176), float64(9), int8(1), object(1)
memory usage: 8.4+ GB


In [37]:
del test