In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
pd.set_option('display.max_columns', 999)

In [4]:
meta = pd.read_csv('data/meta.csv')

In [19]:
meta.head()

Unnamed: 0,varname,role,level,keep,dtype
0,id,id,nominal,False,int64
1,target,target,binary,True,int64
2,ps_ind_01,input,binary,True,int64
3,ps_ind_02_cat,input,nominal,True,int64
4,ps_ind_03,input,nominal,True,int64


# To minimize memory
- 만들어 놓은 meta df를 이용하여 pandas로 불러올때, dtype에 원하는 data type을 넣어준다.
- 원하는 data type은 사전에 data를 확인하여 적절하게 선택한다.

In [5]:
data_type = dict()
for i in range(len(meta)):
    if meta.iloc[i, 4] == 'int64': 
        data_type[meta['varname'][i]] = 'int8'
    else:
        data_type[meta['varname'][i]] = 'float16'

In [6]:
data_type_test = data_type.copy()
del data_type_test['target']

In [7]:
train = pd.read_csv('data/train.csv', dtype=data_type )
test = pd.read_csv('data/test.csv', dtype=data_type_test)

메모리 사용량이 훨씬 줄었다.

# FE

- 일단 train 과 test가 크게 다르지 않다고 생각하고 합쳐서 시작하자.

In [8]:
df = pd.concat([train, test])
print(df.shape)

(1488028, 59)


참고커널
- https://www.kaggle.com/headsortails/steering-wheel-of-fortune-porto-seguro-eda

In [10]:
meta.head()

Unnamed: 0,varname,role,level,keep,dtype
0,id,id,nominal,False,int64
1,target,target,binary,True,int64
2,ps_ind_01,input,binary,True,int64
3,ps_ind_02_cat,input,nominal,True,int64
4,ps_ind_03,input,nominal,True,int64


## FE : feature delete

In [23]:
[i for i in df.columns if 'bin' in i]

['ps_ind_06_bin',
 'ps_ind_07_bin',
 'ps_ind_08_bin',
 'ps_ind_09_bin',
 'ps_ind_10_bin',
 'ps_ind_11_bin',
 'ps_ind_12_bin',
 'ps_ind_13_bin',
 'ps_ind_16_bin',
 'ps_ind_17_bin',
 'ps_ind_18_bin',
 'ps_calc_15_bin',
 'ps_calc_16_bin',
 'ps_calc_17_bin',
 'ps_calc_18_bin',
 'ps_calc_19_bin',
 'ps_calc_20_bin']

- binary
    - too unbalance
        - 'ps_ind_10_bin', 'ps_ind_11_bin', 'ps_ind_12_bin', 'ps_ind_13_bin'
    - _ind_ is good for target

In [24]:
[i for i in df.columns if 'cat' in i]

['ps_ind_02_cat',
 'ps_ind_04_cat',
 'ps_ind_05_cat',
 'ps_car_01_cat',
 'ps_car_02_cat',
 'ps_car_03_cat',
 'ps_car_04_cat',
 'ps_car_05_cat',
 'ps_car_06_cat',
 'ps_car_07_cat',
 'ps_car_08_cat',
 'ps_car_09_cat',
 'ps_car_10_cat',
 'ps_car_11_cat']

- category
    - high cardinality
        - 'ps_car_01_cat', 'ps_car_06_cat', 'ps_car_11_cat'
    - _ind_ is good for target

In [25]:
[i for i in df.columns if ('bin' not in i) and ('cat' not in i)]

['id',
 'target',
 'ps_ind_01',
 'ps_ind_03',
 'ps_ind_14',
 'ps_ind_15',
 'ps_reg_01',
 'ps_reg_02',
 'ps_reg_03',
 'ps_car_11',
 'ps_car_12',
 'ps_car_13',
 'ps_car_14',
 'ps_car_15',
 'ps_calc_01',
 'ps_calc_02',
 'ps_calc_03',
 'ps_calc_04',
 'ps_calc_05',
 'ps_calc_06',
 'ps_calc_07',
 'ps_calc_08',
 'ps_calc_09',
 'ps_calc_10',
 'ps_calc_11',
 'ps_calc_12',
 'ps_calc_13',
 'ps_calc_14']

- rest
    - in particular for ps_ind_14 and ps_car_11 where “0” and “3” are the dominating values, respectively.
    - high cardinality
        - 'ps_ind_03','ps_ind_14'
    - uniform
        - 'ps_calc_01','ps_calc_02','ps_calc_03'

뒤에 generate하고 해당 column delete
- 'ps_ind_10_bin', 'ps_ind_11_bin', 'ps_ind_12_bin', 'ps_ind_13_bin'
- 'ps_calc_01','ps_calc_02','ps_calc_03'

In [9]:
del_col = ['ps_ind_10_bin', 'ps_ind_11_bin', 'ps_ind_12_bin', 'ps_ind_13_bin','ps_calc_01','ps_calc_02','ps_calc_03']

## FE : feature generate

In [33]:
bin = [i for i in train.columns if 'bin' in i]

- NA 인 값들을 feauture로 만든다.
- bin feature들을 더한다.
- category는 최빈값, numeric은 평균값으로 fillna

NA 인 값들을 feauture로 만든다.

In [13]:
na_col = (df[df == -1].sum() != 0)

In [28]:
na = []
i = 0
for col in na_col.to_numpy():
    if col : na.append(na_col.index[i])
    i += 1
na.remove('id')
print(na)

['ps_ind_02_cat', 'ps_ind_04_cat', 'ps_ind_05_cat', 'ps_reg_03', 'ps_car_01_cat', 'ps_car_02_cat', 'ps_car_03_cat', 'ps_car_05_cat', 'ps_car_07_cat', 'ps_car_09_cat', 'ps_car_11', 'ps_car_12', 'ps_car_14']


In [32]:
print(len(na))

13


In [30]:
print('before: ',df.shape)
for col in na:
    df['na' + col] = -1
    df['na' + col] = np.where(df[col]==-1,1,0)
print('after: ',df.shape)

before:  (1488028, 59)
after:  (1488028, 72)


- bin feature들을 더한다.

In [45]:
bin_col = [i for i in df.columns if 'bin' in i]
df['sum_bin'] = -1
for i in bin_col:
    df['sum_bin'] += df[i]
    
df['sum_bin'][:5]

0    4
1    4
2    4
3    1
4    3
Name: sum_bin, dtype: int64

category는 최빈값, numeric은 평균값으로 fillna

In [76]:
from statistics import mode
from tqdm import tqdm

for i in tqdm(na):
    most_freq = mode(df[i])
    df[i] = np.where(df[i]==-1, most_freq, df[i])

100%|██████████████████████████████████████████████████████████████████████████████████| 13/13 [00:06<00:00,  1.95it/s]


In [77]:
del_col

['ps_ind_10_bin',
 'ps_ind_11_bin',
 'ps_ind_12_bin',
 'ps_ind_13_bin',
 'ps_calc_01',
 'ps_calc_02',
 'ps_calc_03']

In [79]:
df = df.drop(del_col, axis=1)

In [80]:
print(df.shape)

(1488028, 66)


## FE : category encoding

In [81]:
df.columns

Index(['id', 'target', 'ps_ind_01', 'ps_ind_02_cat', 'ps_ind_03',
       'ps_ind_04_cat', 'ps_ind_05_cat', 'ps_ind_06_bin', 'ps_ind_07_bin',
       'ps_ind_08_bin', 'ps_ind_09_bin', 'ps_ind_14', 'ps_ind_15',
       'ps_ind_16_bin', 'ps_ind_17_bin', 'ps_ind_18_bin', 'ps_reg_01',
       'ps_reg_02', 'ps_reg_03', 'ps_car_01_cat', 'ps_car_02_cat',
       'ps_car_03_cat', 'ps_car_04_cat', 'ps_car_05_cat', 'ps_car_06_cat',
       'ps_car_07_cat', 'ps_car_08_cat', 'ps_car_09_cat', 'ps_car_10_cat',
       'ps_car_11_cat', 'ps_car_11', 'ps_car_12', 'ps_car_13', 'ps_car_14',
       'ps_car_15', 'ps_calc_04', 'ps_calc_05', 'ps_calc_06', 'ps_calc_07',
       'ps_calc_08', 'ps_calc_09', 'ps_calc_10', 'ps_calc_11', 'ps_calc_12',
       'ps_calc_13', 'ps_calc_14', 'ps_calc_15_bin', 'ps_calc_16_bin',
       'ps_calc_17_bin', 'ps_calc_18_bin', 'ps_calc_19_bin', 'ps_calc_20_bin',
       'naps_ind_02_cat', 'naps_ind_04_cat', 'naps_ind_05_cat', 'naps_reg_03',
       'naps_car_01_cat', 'naps_car_02_cat', '

In [93]:
meta.loc[meta['level']=='nominal', 'varname']

0                id
3     ps_ind_02_cat
4         ps_ind_03
5     ps_ind_04_cat
6     ps_ind_05_cat
23    ps_car_01_cat
24    ps_car_02_cat
25    ps_car_03_cat
26    ps_car_04_cat
27    ps_car_05_cat
28    ps_car_06_cat
29    ps_car_07_cat
30    ps_car_08_cat
31    ps_car_09_cat
32    ps_car_10_cat
33    ps_car_11_cat
34        ps_car_11
Name: varname, dtype: object

In [101]:
encoding_col = [i for i in df.columns if 'cat' in i]

for i in [i for i in encoding_col if 'na' in i]:
    encoding_col.remove(i)
    
encoding_col = encoding_col + ['ps_ind_03','ps_car_11']

In [103]:
import category_encoders as ce

df_cat = df.copy()
catboost_encoder = ce.cat_boost.CatBoostEncoder()
catboost_encoder.fit(df_cat[encoding_col][:595212], train['target'])
df_cat[encoding_col] = catboost_encoder.transform(df_cat[encoding_col])

df_target = df.copy()
target_encoder = ce.target_encoder.TargetEncoder()
target_encoder.fit(df_target[encoding_col][:595212], train['target'])
df_target[encoding_col] = target_encoder.transform(df_target[encoding_col])

In [104]:
import joblib

joblib.dump(df_cat, 'data/df_cat.pkl')
joblib.dump(df_target, 'data/df_target.pkl')

['data/df_target.pkl']

# modeling

In [105]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

In [107]:
def gini(actual, pred, cmpcol = 0, sortcol = 1):
    assert( len(actual) == len(pred) )
    all = np.asarray(np.c_[ actual, pred, np.arange(len(actual)) ], dtype=np.float)
    all = all[ np.lexsort((all[:,2], -1*all[:,1])) ]
    totalLosses = all[:,0].sum()
    giniSum = all[:,0].cumsum().sum() / totalLosses
    
    giniSum -= (len(actual) + 1) / 2.
    return giniSum / len(actual)
 
def gini_normalized(a, p):
    return gini(a, p) / gini(a, a)

#Remove redundant calls
def ginic(actual, pred):
    actual = np.asarray(actual) #In case, someone passes Series or list
    n = len(actual)
    a_s = actual[np.argsort(pred)]
    a_c = a_s.cumsum()
    giniSum = a_c.sum() / a_s.sum() - (n + 1) / 2.0
    return giniSum / n
 
def gini_normalizedc(a, p):
    if p.ndim == 2:#Required for sklearn wrapper
        p = p[:,1] #If proba array contains proba for both 0 and 1 classes, just pick class 1
    return ginic(a, p) / ginic(a, a)

#XGBoost
from sklearn import metrics

def gini_xgb(preds, dtrain):
    labels = dtrain.get_label()
    gini_score = gini_normalizedc(labels, preds)
    return [('gini', gini_score)]

#LightGBM
def gini_lgb(actuals, preds):
    return 'gini', gini_normalizedc(actuals, preds), True

#SKlearn
gini_sklearn = metrics.make_scorer(gini_normalizedc, True, True)

In [110]:
df_cat.head()

Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,ps_ind_14,ps_ind_15,ps_ind_16_bin,ps_ind_17_bin,ps_ind_18_bin,ps_reg_01,ps_reg_02,ps_reg_03,ps_car_01_cat,ps_car_02_cat,ps_car_03_cat,ps_car_04_cat,ps_car_05_cat,ps_car_06_cat,ps_car_07_cat,ps_car_08_cat,ps_car_09_cat,ps_car_10_cat,ps_car_11_cat,ps_car_11,ps_car_12,ps_car_13,ps_car_14,ps_car_15,ps_calc_04,ps_calc_05,ps_calc_06,ps_calc_07,ps_calc_08,ps_calc_09,ps_calc_10,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin,naps_ind_02_cat,naps_ind_04_cat,naps_ind_05_cat,naps_reg_03,naps_car_01_cat,naps_car_02_cat,naps_car_03_cat,naps_car_05_cat,naps_car_07_cat,naps_car_09_cat,naps_car_11,naps_car_12,naps_car_14,sum_bin
0,7,0.0,2,2,5,1,0,0,1,0,0,0,11,0,1,0,0.700195,0.199951,0.718262,10,1,-1,0,1,4,1,0,0,1,12,2,0.399902,0.883789,0.37085,3.605469,3,1,10,1,10,1,5,9,1,5,8,0,1,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,4
1,9,0.0,1,1,7,0,0,0,0,1,0,0,3,0,0,1,0.799805,0.399902,0.766113,11,1,-1,0,-1,11,1,1,2,1,19,3,0.316162,0.618652,0.388672,2.449219,2,1,9,5,8,1,7,3,1,1,9,0,1,1,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,4
2,13,0.0,5,4,9,1,0,0,0,1,0,0,12,1,0,0,0.0,0.0,-1.0,7,1,-1,0,-1,14,1,1,2,1,60,1,0.316162,0.641602,0.347168,3.316406,2,2,9,1,8,2,7,4,2,7,7,0,1,1,0,1,0,0,0,0,1,0,0,1,1,0,0,0,0,0,4
3,16,0.0,0,1,2,0,0,1,0,0,0,0,8,1,0,0,0.899902,0.199951,0.581055,7,1,0,0,1,11,1,1,3,1,104,1,0.374268,0.542969,0.294922,2.0,2,4,7,1,8,4,2,2,2,4,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,17,0.0,0,2,0,1,0,1,0,0,0,0,9,1,0,0,0.700195,0.600098,0.84082,11,1,-1,0,-1,14,1,1,2,1,82,3,0.316162,0.565918,0.36499,2.0,2,2,6,3,10,2,12,3,1,1,3,0,0,0,1,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,3


In [113]:
# df_cat
X = df_cat[:595212].drop(['id', 'target'], axis=1)
y = train['target']

rfc = RandomForestClassifier()

sfkf = StratifiedKFold(n_splits=5, random_state=1).split(X, y)

cv_score = cross_val_score(rfc, X, y, cv=sfkf, scoring=gini_sklearn, verbose=2, n_jobs=-1)

print(f'cross-val-score : {cv_score}')
print(f'cross-val-score(mean) : {cv_score.mean()}')

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  7.9min remaining: 11.9min


cross-val-score : [0.16934305 0.17450451 0.17105258 0.17208369 0.1778606 ]
cross-val-score(mean) : 0.1729688870746614


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  8.0min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  8.0min finished


In [114]:
# df_cat
X = df_target[:595212].drop(['id', 'target'], axis=1)
y = train['target']

rfc = RandomForestClassifier()

sfkf = StratifiedKFold(n_splits=5, random_state=1).split(X, y)

cv_score = cross_val_score(rfc, X, y, cv=sfkf, scoring=gini_sklearn, verbose=2, n_jobs=-1)

print(f'cross-val-score : {cv_score}')
print(f'cross-val-score(mean) : {cv_score.mean()}')

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  7.7min remaining: 11.6min


cross-val-score : [0.15985465 0.17070352 0.19272178 0.17323307 0.18108293]
cross-val-score(mean) : 0.1755191889221171


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  7.8min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  7.8min finished
