In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns


import numpy as np
import pandas as pd
import tensorflow as tf

import atecml.data
import atecml.fs

from contextlib import contextmanager
from tqdm import tqdm
from time import strftime,time
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import joblib
from concurrent.futures import ProcessPoolExecutor

plt.style.use('ggplot')

In [2]:
# Load the train and test frames through the project loader.
train_df, test_df = atecml.data.load()

# Name of the label column passed to the WoE/IV calculation.
target = 'Fraud'

# Feature columns: every training column not listed in NOT_FEATURE_COLUMNS,
# with 'NaN_LIST' appended explicitly at the end.
predictors = [
    column
    for column in train_df.columns
    if column not in atecml.data.NOT_FEATURE_COLUMNS
] + ['NaN_LIST']

In [3]:
# Coarsen f5 by integer-dividing it by 100, applied identically to both frames.
# NOTE: the column is overwritten in place, so re-running this cell divides again.
for _frame in (train_df, test_df):
    _frame['f5'] = _frame['f5'] // 100

In [4]:
# Distinct f5 values present in the training frame; other cells use `a` to
# decide which test-set f5 values are replaced with NaN.
a = train_df['f5'].unique()

In [5]:
# Replace with NaN every test f5 value that does not occur in the training set.
# Series.where keeps entries where the condition holds and fills the rest with
# NaN. The previous chained form `test_df['f5'][mask] = np.nan` assigns through
# an intermediate object: pandas warns about it (SettingWithCopyWarning) and
# under copy-on-write the assignment never reaches test_df.
test_df['f5'] = test_df['f5'].where(test_df['f5'].isin(a))

In [6]:
def mt_iv(feature):
    """Process-pool worker: run the WoE/IV calculation for a single feature.

    Delegates to ``atecml.data.calc_iv`` using the module-level ``train_df``
    and ``target`` and returns its result unchanged.
    """
    result = atecml.data.calc_iv(
        df=train_df,
        feature=feature,
        target=target,
    )
    return result

# Accumulators merged from the per-feature results returned by mt_iv.
iv_list, woe_table, type_list = {}, {}, {}

# Position i of every worker result is merged into accumulator i:
# 0 -> iv_list, 1 -> woe_table, 2 -> type_list.
accumulators = (iv_list, woe_table, type_list)

# Fan the per-feature calculation out over worker processes; pool.map yields
# the results in the same order as `predictors`.
with ProcessPoolExecutor(max_workers=60) as pool:
    for pool_result in pool.map(mt_iv, predictors):
        for position, accumulator in enumerate(accumulators):
            accumulator.update(pool_result[position])
        

[2018-06-30 16:34:47][WoE/IV Calculation for f1] Begin ...
[2018-06-30 16:34:47][WoE/IV Calculation for f6] Begin ...
[2018-06-30 16:34:47][WoE/IV Calculation for f2] Begin ...
[2018-06-30 16:34:47][WoE/IV Calculation for f3] Begin ...
[2018-06-30 16:34:47][WoE/IV Calculation for f5] Begin ...
[2018-06-30 16:34:47][WoE/IV Calculation for f4] Begin ...
[2018-06-30 16:34:47][WoE/IV Calculation for f10] Begin ...
[2018-06-30 16:34:47][WoE/IV Calculation for f11] Begin ...
[2018-06-30 16:34:47][WoE/IV Calculation for f13] Begin ...
[2018-06-30 16:34:47][WoE/IV Calculation for f12] Begin ...
[2018-06-30 16:34:47][WoE/IV Calculation for f8] Begin ...
[2018-06-30 16:34:47][WoE/IV Calculation for f9] Begin ...
[2018-06-30 16:34:47][WoE/IV Calculation for f7] Begin ...
[2018-06-30 16:34:47][WoE/IV Calculation for f21] Begin ...
[2018-06-30 16:34:47][WoE/IV Calculation for f20] Begin ...
[2018-06-30 16:34:47][WoE/IV Calculation for f19] Begin ...
[2018-06-30 16:34:47][WoE/IV Calculation for f22]

[2018-06-30 16:35:53][WoE/IV Calculation for f41] End   ...[Elapsed: 65.95s]
[2018-06-30 16:35:53][WoE/IV Calculation for f95] Begin ...
[2018-06-30 16:35:54][WoE/IV Calculation for f46] End   ...[Elapsed: 67.05s]
[2018-06-30 16:35:54][WoE/IV Calculation for f96] Begin ...
[2018-06-30 16:35:55][WoE/IV Calculation for f1] End   ...[Elapsed: 67.90s]
[2018-06-30 16:35:55][WoE/IV Calculation for f97] Begin ...
[2018-06-30 16:35:55][WoE/IV Calculation for f31] End   ...[Elapsed: 68.01s]
[2018-06-30 16:35:55][WoE/IV Calculation for f98] Begin ...
[2018-06-30 16:35:55][WoE/IV Calculation for f14] End   ...[Elapsed: 68.59s]
[2018-06-30 16:35:55][WoE/IV Calculation for f99] Begin ...
[2018-06-30 16:35:56][WoE/IV Calculation for f48] End   ...[Elapsed: 68.70s]
[2018-06-30 16:35:56][WoE/IV Calculation for f100] Begin ...
[2018-06-30 16:35:59][WoE/IV Calculation for f23] End   ...[Elapsed: 72.27s]
[2018-06-30 16:35:59][WoE/IV Calculation for f101] Begin ...
[2018-06-30 16:35:59][WoE/IV Calculation

[2018-06-30 16:36:53][WoE/IV Calculation for f154] Begin ...
[2018-06-30 16:36:53][WoE/IV Calculation for f96] End   ...[Elapsed: 58.71s]
[2018-06-30 16:36:53][WoE/IV Calculation for f155] Begin ...
[2018-06-30 16:36:56][WoE/IV Calculation for f104] End   ...[Elapsed: 52.01s]
[2018-06-30 16:36:56][WoE/IV Calculation for f156] Begin ...
[2018-06-30 16:36:57][WoE/IV Calculation for f80] End   ...[Elapsed: 76.78s]
[2018-06-30 16:36:57][WoE/IV Calculation for f157] Begin ...
[2018-06-30 16:37:00][WoE/IV Calculation for f103] End   ...[Elapsed: 56.54s]
[2018-06-30 16:37:00][WoE/IV Calculation for f158] Begin ...
[2018-06-30 16:37:02][WoE/IV Calculation for f105] End   ...[Elapsed: 54.94s]
[2018-06-30 16:37:02][WoE/IV Calculation for f159] Begin ...
[2018-06-30 16:37:03][WoE/IV Calculation for f65] End   ...[Elapsed: 89.13s]
[2018-06-30 16:37:03][WoE/IV Calculation for f160] Begin ...
[2018-06-30 16:37:04][WoE/IV Calculation for f54] End   ...[Elapsed: 137.59s]
[2018-06-30 16:37:04][WoE/IV C

[2018-06-30 16:37:58][WoE/IV Calculation for f213] Begin ...
[2018-06-30 16:37:58][WoE/IV Calculation for f159] End   ...[Elapsed: 56.43s]
[2018-06-30 16:37:58][WoE/IV Calculation for f214] Begin ...
[2018-06-30 16:37:58][WoE/IV Calculation for f147] End   ...[Elapsed: 72.61s]
[2018-06-30 16:37:58][WoE/IV Calculation for f215] Begin ...
[2018-06-30 16:37:59][WoE/IV Calculation for f167] End   ...[Elapsed: 42.46s]
[2018-06-30 16:37:59][WoE/IV Calculation for f216] Begin ...
[2018-06-30 16:37:59][WoE/IV Calculation for f157] End   ...[Elapsed: 61.62s]
[2018-06-30 16:37:59][WoE/IV Calculation for f217] Begin ...
[2018-06-30 16:38:00][WoE/IV Calculation for f162] End   ...[Elapsed: 54.51s]
[2018-06-30 16:38:00][WoE/IV Calculation for f218] Begin ...
[2018-06-30 16:38:00][WoE/IV Calculation for f152] End   ...[Elapsed: 68.46s]
[2018-06-30 16:38:00][WoE/IV Calculation for f219] Begin ...
[2018-06-30 16:38:01][WoE/IV Calculation for f165] End   ...[Elapsed: 49.56s]
[2018-06-30 16:38:01][WoE/I

[2018-06-30 16:38:54][WoE/IV Calculation for f272] Begin ...
[2018-06-30 16:38:54][WoE/IV Calculation for f195] End   ...[Elapsed: 71.79s]
[2018-06-30 16:38:54][WoE/IV Calculation for f273] Begin ...
[2018-06-30 16:38:57][WoE/IV Calculation for f218] End   ...[Elapsed: 57.27s]
[2018-06-30 16:38:57][WoE/IV Calculation for f274] Begin ...
[2018-06-30 16:38:57][WoE/IV Calculation for f204] End   ...[Elapsed: 69.34s]
[2018-06-30 16:38:57][WoE/IV Calculation for f275] Begin ...
[2018-06-30 16:38:58][WoE/IV Calculation for f226] End   ...[Elapsed: 50.11s]
[2018-06-30 16:38:58][WoE/IV Calculation for f276] Begin ...
[2018-06-30 16:39:01][WoE/IV Calculation for f229] End   ...[Elapsed: 46.99s]
[2018-06-30 16:39:01][WoE/IV Calculation for f277] Begin ...
[2018-06-30 16:39:02][WoE/IV Calculation for f227] End   ...[Elapsed: 52.95s]
[2018-06-30 16:39:02][WoE/IV Calculation for f278] Begin ...
[2018-06-30 16:39:03][WoE/IV Calculation for f230] End   ...[Elapsed: 47.52s]
[2018-06-30 16:39:03][WoE/I

[2018-06-30 16:40:10][WoE/IV Calculation for NaN_LIST] End   ...[Elapsed: 46.15s]
[2018-06-30 16:40:16][WoE/IV Calculation for f271] End   ...[Elapsed: 82.77s]


In [7]:
def mt_conv_train(feature):
    """Process-pool worker: WoE-convert one feature of the training frame.

    Delegates to ``atecml.data.woe_convert`` using the module-level
    ``train_df``, ``woe_table`` and ``type_list`` and returns its result
    unchanged.
    """
    converted = atecml.data.woe_convert(
        df=train_df,
        feature=feature,
        woe_table=woe_table,
        type_list=type_list,
    )
    return converted



In [12]:
# Convert every predictor of the training frame in parallel; pool.map returns
# the results in `predictors` order.
with ProcessPoolExecutor(max_workers=60) as pool:
    new_feature = list(pool.map(mt_conv_train, predictors))

# Place the converted features side by side.
_t_new_df = pd.concat(new_feature, axis=1)

# Column renaming: f1..f297 -> n1..n297 and 'NaN_LIST' -> 'n298'.
# Presumably this keeps converted columns distinct from the raw ones when the
# two frames are concatenated later -- confirm against woe_convert's output names.
rename_dict = {'f' + str(idx): 'n' + str(idx) for idx in range(1, 298)}
rename_dict['NaN_LIST'] = 'n298'
_t_new_df = _t_new_df.rename(columns=rename_dict)

#ext_df = pd.concat([train_df[atecml.data.NOT_FEATURE_COLUMNS],_t_new_df],axis=1)
#ext_df.to_pickle('./train.dat')

In [14]:
# Join the original training columns with the renamed converted columns, column-wise.
ext_df = pd.concat([train_df, _t_new_df], axis='columns')

In [15]:
# Persist the combined training frame to ./train.dat in pickle format.
ext_df.to_pickle('./train.dat')

In [16]:
# Re-load the test frame and re-apply the f5 preprocessing to the fresh copy.
test_df = atecml.data.load_test()

# Bucket f5 by integer division, then replace with NaN every bucket that is
# not among the training values in `a`. Series.where keeps entries where the
# condition holds and fills the rest with NaN. The previous chained form
# `test_df['f5'][mask] = np.nan` assigns through an intermediate object:
# pandas warns about it (SettingWithCopyWarning) and under copy-on-write the
# assignment never reaches test_df.
_f5_bucket = test_df['f5'] // 100
test_df['f5'] = _f5_bucket.where(_f5_bucket.isin(a))

def mt_conv_test(feature):
    """Process-pool worker: WoE-convert one feature of the test frame.

    Delegates to ``atecml.data.woe_convert`` using the module-level
    ``test_df``, ``woe_table`` and ``type_list`` and returns its result
    unchanged.
    """
    converted = atecml.data.woe_convert(
        df=test_df,
        feature=feature,
        woe_table=woe_table,
        type_list=type_list,
    )
    return converted

# Convert every predictor of the test frame in parallel; pool.map returns the
# results in `predictors` order.
with ProcessPoolExecutor(max_workers=60) as pool:
    new_feature_tt = list(pool.map(mt_conv_test, predictors))

# Place the converted features side by side and apply the same column renaming
# as for the training frame.
_t_new_tt_df = pd.concat(new_feature_tt, axis=1)
_t_new_tt_df = _t_new_tt_df.rename(columns=rename_dict)

# Join with the original test columns and persist the result.
ext_tt_df = pd.concat([test_df, _t_new_tt_df], axis=1)
ext_tt_df.to_pickle('./test.dat')

In [20]:
# Turn the per-feature dictionaries into Series indexed by feature name.
_iv_series = pd.DataFrame.from_dict(iv_list, orient='index')[0]
_type_series = pd.DataFrame.from_dict(type_list, orient='index')[0]

# Summary table: one row per feature, with its IV and its feature type.
# The frame takes its index from the IV series; 'Type' is aligned onto it.
iv_df = pd.DataFrame()
iv_df['IV'] = _iv_series
iv_df['Type'] = _type_series

In [22]:
# Display the features ranked from highest to lowest IV.
iv_df.sort_values(by='IV', ascending=False)

Unnamed: 0,IV,Type
f31,2.446697,category
f30,2.408117,category
f29,2.164999,category
f28,2.158825,category
f261,2.026063,value
f260,2.021908,value
f259,1.988966,value
f262,1.875872,value
f25,1.875414,category
f52,1.864613,category
