In [41]:
class Config:
    """Notebook-wide settings: experiment name, CV/seed constants and per-environment paths."""

    # Experiment identifier; joined onto the output directory to form this run's folder.
    name = "EDA/Agg-StatUniv"

    # Fold count and random seed (not referenced by the cells visible in this section).
    n_splits = 5
    seed = 2022
    # Name of the label column in the training frame.
    target = "target"

    # Colab Env
    upload_from_colab = True
    # presumably the Kaggle API credentials file on the mounted Drive -- TODO confirm
    api_path = "/content/drive/MyDrive/workspace/kaggle.json"
    drive_path = "/content/drive/MyDrive/workspace/kaggle-amex"

    # Kaggle Env
    kaggle_dataset_path = None

    # Reka Env
    # Root directory under which the input/output/submissions folders are resolved.
    dir_path = '/home/abe/kaggle/kaggle-amex'

In [42]:
import os
import json
import warnings
import shutil
import logging
import joblib
import random
import datetime
import sys
import gc
import multiprocessing
import joblib
import pickle

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm.auto import tqdm
from IPython import get_ipython
# Register tqdm's progress-bar integration with pandas (e.g. `progress_apply`).
tqdm.pandas()
# Suppress all warnings for the rest of the session; note this also hides
# pandas/sklearn deprecation and copy warnings raised by later cells.
warnings.filterwarnings('ignore')

## Environment Settings

In [43]:
# Top-level project folders, all resolved against Config.dir_path.
INPUT, OUTPUT, SUBMISSION = (
    os.path.join(Config.dir_path, leaf) for leaf in ('input', 'output', 'submissions')
)

# Per-experiment folder and its artefact sub-folders.
OUTPUT_EXP = os.path.join(OUTPUT, Config.name)
EXP_MODEL, EXP_FIG, EXP_PREDS = (
    os.path.join(OUTPUT_EXP, leaf) for leaf in ("model", "fig", "preds")
)

# Create whatever is missing; OUTPUT and OUTPUT_EXP come into existence as
# parents of the experiment sub-folders.
for d in (INPUT, SUBMISSION, EXP_MODEL, EXP_FIG, EXP_PREDS):
    os.makedirs(d, exist_ok=True)

## Load data

In [44]:
# Load the aggregated feature tables (gzip-compressed pickles).
# NOTE: unpickling can execute arbitrary code -- only load files you produced yourself.
train = pd.read_pickle(os.path.join(INPUT, 'train_agg.pkl'), compression='gzip')
test = pd.read_pickle(os.path.join(INPUT, 'test_agg.pkl'), compression='gzip')

# Work on a random subset to keep the analysis fast. The draw is seeded with
# Config.seed so that Restart & Run All reproduces the same rows, and the size
# is capped at the frame length so a smaller input does not raise.
train = train.sample(min(10000, len(train)), random_state=Config.seed)
test = test.sample(min(15000, len(test)), random_state=Config.seed)

In [45]:
# Summarise the sampled training frame: index, column count, dtypes and memory usage.
train.info()

<class 'pandas.core.frame.DataFrame'>
CategoricalIndex: 10000 entries, 96620ffc44c322422d070f1489d2acc112a34fdaf7c0bbda30b7540c5289778f to fd3410b00bbc31a766306e3a5d3ea5a1d290018e4931421ee57ad9fa0d3aace8
Columns: 919 entries, P_2_mean to target
dtypes: category(2), float16(713), float64(178), int64(22), int8(4)
memory usage: 48.6 MB


In [46]:
# Preview the first rows of the training frame (indexed by customer_ID).
train.head()

Unnamed: 0_level_0,P_2_mean,P_2_std,P_2_min,P_2_max,P_2_last,D_39_mean,D_39_std,D_39_min,D_39_max,D_39_last,...,D_64_count,D_64_last,D_64_nunique,D_66_count,D_66_last,D_66_nunique,D_68_count,D_68_last,D_68_nunique,target
customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
96620ffc44c322422d070f1489d2acc112a34fdaf7c0bbda30b7540c5289778f,0.677246,0.056772,0.625977,0.767578,0.767578,0.076294,0.252068,0.000438,0.914551,0.914551,...,10,O,3,0,,0,9,4.0,3,1
d650700e3e2086622713131eb2aaa01b453c3e5d4582f99f2106aac3f753612e,0.941406,0.014912,0.915039,0.960449,0.953125,0.135376,0.264983,0.001035,0.972656,0.003738,...,13,O,1,0,,0,13,6.0,1,0
043632b2effa7d5bed9781cf0a9d0cbb567c4de634621068797289803d697148,0.891602,0.030054,0.835938,0.936523,0.89209,0.163818,0.247653,0.000392,0.825195,0.008141,...,13,O,2,0,,0,13,6.0,1,0
1b887133dbf11080e8e8a0356eb0b8c866717495d291f2307ca25cff45aaf077,0.786621,0.044534,0.71875,0.86084,0.746582,0.217041,0.255277,0.001299,0.714844,0.559082,...,13,O,1,0,,0,13,5.0,4,0
ee2926808c97807af9b9a5a2c30545f1967b207a0ea23c9e71df113a9b40cc6e,0.878906,0.028066,0.82959,0.910156,0.879883,0.00526,0.002566,0.000721,0.009903,0.005997,...,13,U,1,0,,0,13,6.0,1,0


## Transform data type

In [47]:
# Names of the columns that are still stored in 64-bit dtypes, captured before
# any downcasting takes place.
float64_cols = [col for col in train.columns if train[col].dtype == 'float64']
int64_cols = [col for col in train.columns if train[col].dtype == 'int64']

# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() only appended a stray "None" line.
train.info()
test.info()
print()
print("-"*50+' data type transformation '+'-'*50)
print()

def transform_dtype(df):
  """Downcast wide numeric columns of ``df`` to save memory.

  float64/float32 columns become float16, and int64/int32 columns become int8,
  whenever the stored values fit. A column whose values do not fit is given the
  narrowest dtype that still holds them (float32, or int16/int32/unchanged)
  instead of being cast blindly, which would wrap integers around or turn large
  floats into inf. Columns of any other dtype are left untouched.

  Because the choice depends on the data, the same column can end up with
  different widths in two different frames.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame to convert. It is modified in place.

  Returns
  -------
  pandas.DataFrame
      The same ``df`` object, for convenient reassignment.
  """
  float16_max = float(np.finfo(np.float16).max)

  for col in df.columns:
    dtype = df[col].dtype

    if dtype == 'float64' or dtype == 'float32':
      values = df[col].to_numpy()
      # NaN/inf survive any float cast, so only finite magnitudes matter.
      finite = values[np.isfinite(values)]
      fits_half = finite.size == 0 or float(np.abs(finite).max()) <= float16_max
      target = 'float16' if fits_half else 'float32'
    elif dtype == 'int64' or dtype == 'int32':
      if df[col].empty:
        # Nothing to overflow; use the narrowest type.
        target = 'int8'
      else:
        low, high = int(df[col].min()), int(df[col].max())
        # Fall back to the current dtype when no narrower type is wide enough.
        target = dtype
        for candidate in ('int8', 'int16', 'int32'):
          bounds = np.iinfo(candidate)
          if bounds.min <= low and high <= bounds.max:
            target = candidate
            break
    else:
      continue

    if dtype != target:
      df[col] = df[col].astype(target)
  return df

# Downcast both frames, then show the resulting dtypes and memory footprint.
train = transform_dtype(train)
test = transform_dtype(test)

# info() prints its own report and returns None, so it is not wrapped in print().
train.info()
test.info()

<class 'pandas.core.frame.DataFrame'>
CategoricalIndex: 10000 entries, 96620ffc44c322422d070f1489d2acc112a34fdaf7c0bbda30b7540c5289778f to fd3410b00bbc31a766306e3a5d3ea5a1d290018e4931421ee57ad9fa0d3aace8
Columns: 919 entries, P_2_mean to target
dtypes: category(2), float16(713), float64(178), int64(22), int8(4)
memory usage: 48.6 MB
None
<class 'pandas.core.frame.DataFrame'>
CategoricalIndex: 15000 entries, 23d885843a4cb9faa887dce1c45b06194b8b7ef330d52f42c9c5b858d66acff1 to 5e16d0fe36e7124b9b0133d12acf528bb1db0079d2ff9cbaee9996bc4f88f1d9
Columns: 918 entries, P_2_mean to D_68_nunique
dtypes: category(2), float16(713), float64(178), int32(3), int64(22)
memory usage: 82.8 MB
None

-------------------------------------------------- data type transformation --------------------------------------------------

<class 'pandas.core.frame.DataFrame'>
CategoricalIndex: 10000 entries, 96620ffc44c322422d070f1489d2acc112a34fdaf7c0bbda30b7540c5289778f to fd3410b00bbc31a766306e3a5d3ea5a1d290018e49314

## Preprocess

In [48]:
from sklearn.preprocessing import LabelEncoder

# Columns still held in pandas' categorical dtype.
cat_cols = [col for col in train.columns if train[col].dtype == 'category']

for col in cat_cols:
    # A fresh encoder per column: classes are learned from train and the same
    # mapping is then applied to test.
    le = LabelEncoder()
    train[col] = le.fit_transform(train[col])
    # NOTE(review): transform() raises ValueError for labels absent from the
    # fitted classes; since train is a random subsample, a category that occurs
    # only in test would fail here -- confirm this cannot happen on the full data.
    test[col] = le.transform(test[col])

## Select Features to Use

In [49]:
# Re-inspect dtypes and memory usage now that the categorical columns are encoded.
train.info()

<class 'pandas.core.frame.DataFrame'>
CategoricalIndex: 10000 entries, 96620ffc44c322422d070f1489d2acc112a34fdaf7c0bbda30b7540c5289778f to fd3410b00bbc31a766306e3a5d3ea5a1d290018e4931421ee57ad9fa0d3aace8
Columns: 919 entries, P_2_mean to target
dtypes: float16(891), int64(2), int8(26)
memory usage: 37.1 MB


In [12]:
features = []
continuous = []
categorical = []
# Columns that must never be fed to a model.
unuse = ['target', 'customer_ID', 'S_2']

for col in train.columns:
  if col not in unuse:
    features.append(col)
  # Bucket each column by dtype. The `else` branch previously had no body,
  # which is a syntax error and left `continuous` permanently empty.
  # NOTE(review): the bucketing is applied to every column, including those in
  # `unuse`; and if this cell runs after the label-encoding cell, no column is
  # 'category' any more, so everything lands in `continuous` -- confirm intent.
  if train[col].dtype == 'category':
    categorical.append(col)
  else:
    continuous.append(col)

## Statistical Test
For continuous features, we will use Linear Discriminant Analysis.  
For categorical features, we will use the Chi-Square test.

In [24]:
# Column means are taken once, from the training frame before imputation, and
# reused for both frames: test is filled with train statistics, and the
# full-frame reduction is not repeated on already-imputed data.
train_means = train.mean()
train = train.fillna(train_means)
test = test.fillna(train_means)

# Cell output: True if any NaN is left in train (e.g. a column that was entirely
# missing has a NaN mean and therefore cannot be filled).
train.isna().sum().any()

In [35]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Fit LDA on the selected feature columns against the label column.
# store_covariance=True asks the estimator to keep the covariance estimate it
# computes during fitting. The fitted estimator is the cell's displayed value.
lda = LinearDiscriminantAnalysis(store_covariance=True)
lda.fit(X=train[features], y=train[Config.target])

LinearDiscriminantAnalysis(store_covariance=True)