<a href="https://colab.research.google.com/github/meltyyyyy/kaggle-amex/blob/main/Notebooks/Starter/EDA003.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
class Config:
    """Static settings for this notebook, read as plain class attributes."""

    # --- experiment identity ---
    # Used as the experiment sub-folder under the Output directory on Colab.
    name = "Starter/EDA003"

    # --- modelling knobs ---
    # NOTE(review): none of these four are referenced in the visible cells;
    # presumably kept for parity with sibling notebooks -- confirm before removing.
    seed = 2022
    target = "target"
    n_splits = 5
    n_neighbors = 10

    # --- Google Colab environment ---
    upload_from_colab = True
    # JSON file holding the Kaggle API "username" and "key".
    api_path = "/content/drive/MyDrive/workspace/kaggle.json"
    # Root folder on Google Drive; Input/Output/Submission live beneath it.
    drive_path = "/content/drive/MyDrive/workspace/kaggle-amex"

    # --- Kaggle kernel environment ---
    # When set, its "model" and "preds" sub-folders are copied into the working dir.
    kaggle_dataset_path = None

In [2]:
import os
import json
import warnings
import shutil
import logging
import joblib
import random
import datetime
import sys
import gc
import multiprocessing
import joblib
import pickle

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm.auto import tqdm
warnings.filterwarnings('ignore')

In [4]:
COLAB = "google.colab" in sys.modules
if COLAB:
    print("This environment is Google Colab")
    
    # mount
    from google.colab import drive
    if not os.path.isdir("/content/drive"):
        drive.mount('/content/drive') 
	
    # import library
    # ! pip install lightgbm==3.3.1
    # ! pip install --quiet iterative-stratification
    # ! pip install --quiet tensorflow-addons

    # use kaggle api (need kaggle token)
    f = open(Config.api_path, 'r')
    json_data = json.load(f) 
    os.environ["KAGGLE_USERNAME"] = json_data["username"]
    os.environ["KAGGLE_KEY"] = json_data["key"]
    
    # set dirs
    DRIVE = Config.drive_path
    EXP = (Config.name if Config.name is not None 
           else get("http://172.28.0.2:9000/api/sessions").json()[0]["name"][:-6])
    INPUT = os.path.join(DRIVE, "Input")
    OUTPUT = os.path.join(DRIVE, "Output")
    SUBMISSION = os.path.join(DRIVE, "Submission")
    OUTPUT_EXP = os.path.join(OUTPUT, EXP) 
    EXP_MODEL = os.path.join(OUTPUT_EXP, "model")
    EXP_FIG = os.path.join(OUTPUT_EXP, "fig")
    EXP_PREDS = os.path.join(OUTPUT_EXP, "preds")

    # make dirs
    for d in [INPUT, SUBMISSION, EXP_MODEL, EXP_FIG, EXP_PREDS]:
        os.makedirs(d, exist_ok=True)

    if not os.path.isfile(os.path.join(INPUT, "train_data.parquet")):
        # load dataset
        ! kaggle competitions download -c amex-default-prediction -p $INPUT 
else:
    print("This environment is Kaggle Kernel")
    
    # set dirs
    INPUT = "../input/amex-default-prediction"
    EXP, OUTPUT, SUBMISSION = "./", "./", "./"
    EXP_MODEL = os.path.join(EXP, "model")
    EXP_FIG = os.path.join(EXP, "fig")
    EXP_PREDS = os.path.join(EXP, "preds")
    
    # copy dirs
    if Config.kaggle_dataset_path is not None:
        KD_MODEL = os.path.join(Config.kaggle_dataset_path, "model")
        KD_EXP_PREDS = os.path.join(Config.kaggle_dataset_path, "preds")
        shutil.copytree(KD_MODEL, EXP_MODEL)
        shutil.copytree(KD_EXP_PREDS, EXP_PREDS)

    # make dirs
    for d in [EXP_MODEL, EXP_FIG, EXP_PREDS]:
        os.makedirs(d, exist_ok=True)


This environment is Google Colab


In [17]:
# Load the raw training table: from the Drive-backed INPUT folder on Colab,
# otherwise from the current working directory.
if COLAB:
    train = pd.read_parquet(os.path.join(INPUT, 'train_data.parquet'))
else:
    train = pd.read_parquet('train_data.parquet')

In [20]:
# Print row/column counts, the dtype breakdown and the memory footprint of the raw frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5531451 entries, 0 to 5531450
Columns: 190 entries, customer_ID to D_145
dtypes: float64(185), int64(1), object(4)
memory usage: 7.8+ GB


In [21]:
# Bucket the column names by the dtype pandas assigned on load.
# `train.dtypes` is indexed by column name, so filtering it and taking the index
# yields the matching names in their original column order.
float_cols = train.dtypes[train.dtypes == 'float64'].index.tolist()
int_cols = train.dtypes[train.dtypes == 'int64'].index.tolist()
obj_cols = train.dtypes[train.dtypes == 'object'].index.tolist()

# Hand-written column list -- presumably the categorical features.
# NOTE(review): not referenced by any of the visible cells.
cat_cols = [
    'B_30', 'B_38',
    'D_114', 'D_116', 'D_117', 'D_120', 'D_126',
    'D_63', 'D_64', 'D_66', 'D_68',
]

In [24]:
# Display the object-dtype column names collected above.
obj_cols

['customer_ID', 'S_2', 'D_63', 'D_64']

In [25]:
# Build the column -> target dtype mapping that is later written to JSON.
# NOTE(review): every float64 column is narrowed to float16 and every int64
# column to int8; confirm the value ranges tolerate that loss of precision/range.
data_types = {}

for col in train.columns:
    # Checked in reverse of the original assignment order so that, should a name
    # ever sit in several lists, the same (last-assigned) dtype still wins.
    if col in obj_cols:
        # customer_ID stays a string, S_2 becomes a timestamp,
        # any other object column becomes a pandas category.
        data_types[col] = {
            'customer_ID': 'str',
            'S_2': 'datetime64[ns]',
        }.get(col, 'category')
    elif col in int_cols:
        data_types[col] = 'int8'
    elif col in float_cols:
        data_types[col] = 'float16'

In [29]:
# Persist the dtype mapping next to the input data as indented JSON.
with open(os.path.join(INPUT, 'data_types.json'), "w") as write_file:
    write_file.write(json.dumps(data_types, indent=4))

In [31]:
# Read the saved dtype mapping back from disk into `dtypes`.
with open(os.path.join(INPUT, 'data_types.json'), "r") as read_file:
    dtypes = json.loads(read_file.read())

In [32]:
# Echo the mapping loaded from data_types.json to check the round trip.
dtypes

{'B_1': 'float16',
 'B_10': 'float16',
 'B_11': 'float16',
 'B_12': 'float16',
 'B_13': 'float16',
 'B_14': 'float16',
 'B_15': 'float16',
 'B_16': 'float16',
 'B_17': 'float16',
 'B_18': 'float16',
 'B_19': 'float16',
 'B_2': 'float16',
 'B_20': 'float16',
 'B_21': 'float16',
 'B_22': 'float16',
 'B_23': 'float16',
 'B_24': 'float16',
 'B_25': 'float16',
 'B_26': 'float16',
 'B_27': 'float16',
 'B_28': 'float16',
 'B_29': 'float16',
 'B_3': 'float16',
 'B_30': 'float16',
 'B_31': 'int8',
 'B_32': 'float16',
 'B_33': 'float16',
 'B_36': 'float16',
 'B_37': 'float16',
 'B_38': 'float16',
 'B_39': 'float16',
 'B_4': 'float16',
 'B_40': 'float16',
 'B_41': 'float16',
 'B_42': 'float16',
 'B_5': 'float16',
 'B_6': 'float16',
 'B_7': 'float16',
 'B_8': 'float16',
 'B_9': 'float16',
 'D_102': 'float16',
 'D_103': 'float16',
 'D_104': 'float16',
 'D_105': 'float16',
 'D_106': 'float16',
 'D_107': 'float16',
 'D_108': 'float16',
 'D_109': 'float16',
 'D_110': 'float16',
 'D_111': 'float16',
 '

In [38]:
%%time
train = pd.read_parquet(os.path.join(INPUT, 'train.parquet') if COLAB else 'train.parquet')

CPU times: user 12 s, sys: 7.78 s, total: 19.8 s
Wall time: 4.37 s


In [39]:
# Inspect dtypes and memory usage of the frame loaded from train.parquet, before casting.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5531451 entries, 0 to 5531450
Columns: 190 entries, customer_ID to D_145
dtypes: category(2), datetime64[ns](1), float32(176), float64(9), int8(1), object(1)
memory usage: 4.1+ GB


In [40]:
# Cast each column to the dtype recorded in the JSON mapping; astype returns a
# new frame, which replaces the previous `train`.
train = train.astype(dtype=dtypes)

In [41]:
# Re-check dtypes and memory usage after the cast to confirm the reduction.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5531451 entries, 0 to 5531450
Columns: 190 entries, customer_ID to D_145
dtypes: category(2), datetime64[ns](1), float16(185), int8(1), object(1)
memory usage: 2.0+ GB
