# Overall

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import qgrid

from collections import Counter
from IPython.core.interactiveshell import InteractiveShell

# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'

In [2]:
# Load the three input tables from ./data, in order: df_dm, df_train, df_test.
df_dm, df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/001_demo.csv',
        './data/003_train.csv',
        './data/004_test.csv',
    )
)

In [3]:
# Preview the first rows of df_dm (head() shows 5 rows by default).
df_dm.head()

Unnamed: 0,id,c0,c1,c2,c3,c4,n0,n1,n2
0,76371,2,7.0,97,0,1,42,,40000.0
1,44326,2,4.0,97,1,1,47,10000.0,175000.0
2,33717,1,3.0,97,1,1,49,122000.0,85000.0
3,96078,2,4.0,98,1,0,37,95000.0,
4,13591,2,3.0,97,0,1,40,,40000.0


In [16]:
# Treat missing n1 as 0, then summarise only the rows whose n1 is non-zero.
df_dm['n1'] = df_dm['n1'].fillna(value=0)

df_dm.loc[df_dm['n1'].ne(0), 'n1'].describe()

In [23]:
# Bin edges for n1. The leading -1 sits below the 0 fill value, so filled-in
# rows land in the first (right-closed) bin.
# NOTE(review): the remaining edges look like min / quartiles / max of the
# non-zero n1 values taken from describe() — confirm against that output.
range_cut = [-1, 1_000.0, 21_000.0, 44_000.0, 89_000.0, 20_367_000.0]

df_dm['bin_n1'] = pd.cut(df_dm['n1'], bins=range_cut)

In [24]:
# Treat missing n2 as 0, then summarise only the rows whose n2 is non-zero.
df_dm['n2'] = df_dm['n2'].fillna(value=0)

df_dm.loc[df_dm['n2'].ne(0), 'n2'].describe()

count    1.483730e+05
mean     1.596744e+05
std      4.005307e+05
min      4.000000e+03
25%      2.500000e+04
50%      6.000000e+04
75%      1.750000e+05
max      7.500000e+06
Name: n2, dtype: float64

In [25]:
# Bin edges for n2: -1 (below the 0 fill value), then the min, 25%, 50%, 75% and
# max of the non-zero n2 values as reported by describe().
# NOTE(review): bins are right-closed, so a non-zero n2 equal to the 4000
# minimum shares the first bin with the zero-filled rows — confirm intended.
range_cut = [-1, 4_000.0, 25_000.0, 60_000.0, 175_000.0, 7_500_000.0]
df_dm['bin_n2'] = pd.cut(df_dm['n2'], bins=range_cut)

In [26]:
# Display df_dm, which now includes the bin_n1 and bin_n2 columns.
df_dm

Unnamed: 0,id,c0,c1,c2,c3,c4,n0,n1,n2,bin_n1,bin_n2
0,76371,2,7.0,97,0,1,42,0.0,40000.0,"(-1.0, 1000.0]","(25000.0, 60000.0]"
1,44326,2,4.0,97,1,1,47,10000.0,175000.0,"(1000.0, 21000.0]","(60000.0, 175000.0]"
2,33717,1,3.0,97,1,1,49,122000.0,85000.0,"(89000.0, 20367000.0]","(60000.0, 175000.0]"
3,96078,2,4.0,98,1,0,37,95000.0,0.0,"(89000.0, 20367000.0]","(-1.0, 4000.0]"
4,13591,2,3.0,97,0,1,40,0.0,40000.0,"(-1.0, 1000.0]","(25000.0, 60000.0]"
...,...,...,...,...,...,...,...,...,...,...,...
153235,90220,2,5.0,95,0,1,62,0.0,25000.0,"(-1.0, 1000.0]","(4000.0, 25000.0]"
153236,124884,1,3.0,98,0,1,39,0.0,11500.0,"(-1.0, 1000.0]","(4000.0, 25000.0]"
153237,65180,2,3.0,97,1,1,48,68000.0,85000.0,"(44000.0, 89000.0]","(60000.0, 175000.0]"
153238,66450,2,4.0,96,0,1,56,0.0,11500.0,"(-1.0, 1000.0]","(4000.0, 25000.0]"


In [27]:
def addCrossFeatures(*args, sep: str = '') -> pd.DataFrame:
    '''
    Build one crossed categorical feature from the first column of each frame.

    The first column of every DataFrame in `args` is converted to `str` and the
    resulting strings are concatenated row by row, in argument order. Rows are
    paired by position, so the inputs may carry any index (for example a
    filtered frame); the result reuses the index of the first input.

    Args:
        *args: One or more DataFrames with the same number of rows. Only the
            first column of each frame is used.
        sep: Text inserted between the joined values. Defaults to '' so that
            existing calls keep producing the same strings; pass e.g. '_' to
            keep the individual values distinguishable in the crossed string.

    Returns:
        A single-column DataFrame. The column is named after the source
        column names joined with '_', and the index is that of `args[0]`.

    Raises:
        ValueError: If no DataFrame is given or the inputs differ in length.
    '''
    if not args:
        raise ValueError('addCrossFeatures needs at least one DataFrame')

    first_columns = [df.iloc[:, 0] for df in args]
    column = '_'.join(str(series.name) for series in first_columns)

    size = len(first_columns[0])
    if any(len(series) != size for series in first_columns):
        raise ValueError('all DataFrames must have the same number of rows')

    # Drop each input's own index before combining: adding Series aligns on
    # labels, which would yield NaN for any label missing from a fresh
    # RangeIndex instead of pairing the rows one-to-one.
    pieces = [series.astype(str).reset_index(drop=True) for series in first_columns]

    crossed = pieces[0]
    for piece in pieces[1:]:
        crossed = crossed + sep + piece

    res_df = crossed.to_frame(name=column)
    # Restore the caller's index so that assigning the result back to a column
    # of the source frame lines up row for row.
    res_df.index = args[0].index

    return res_df

In [31]:
# Cross c1 with c2 by joining their string forms into one categorical column.
# NOTE(review): c1 is float at this point, so values come out like '7.097'
# ('7.0' + '97') with nothing separating the two parts — confirm this encoding
# is intended.
df_dm['c1_c2_crossed'] = addCrossFeatures(df_dm[['c1']], df_dm[['c2']])

In [32]:
# Cross the binned n1 and n2 columns by joining the string forms of their
# interval labels, e.g. '(-1.0, 1000.0](25000.0, 60000.0]'.
df_dm['n1_n2_crossed'] = addCrossFeatures(df_dm[['bin_n1']], df_dm[['bin_n2']])

In [37]:
# Side-by-side check of the source columns and the crossed features built from them.
df_dm.loc[
    :,
    ['id', 'c1', 'c2', 'c1_c2_crossed', 'bin_n1', 'bin_n2', 'n1_n2_crossed'],
].head()

Unnamed: 0,id,c1,c2,c1_c2_crossed,bin_n1,bin_n2,n1_n2_crossed
0,76371,7.0,97,7.097,"(-1.0, 1000.0]","(25000.0, 60000.0]","(-1.0, 1000.0](25000.0, 60000.0]"
1,44326,4.0,97,4.097,"(1000.0, 21000.0]","(60000.0, 175000.0]","(1000.0, 21000.0](60000.0, 175000.0]"
2,33717,3.0,97,3.097,"(89000.0, 20367000.0]","(60000.0, 175000.0]","(89000.0, 20367000.0](60000.0, 175000.0]"
3,96078,4.0,98,4.098,"(89000.0, 20367000.0]","(-1.0, 4000.0]","(89000.0, 20367000.0](-1.0, 4000.0]"
4,13591,3.0,97,3.097,"(-1.0, 1000.0]","(25000.0, 60000.0]","(-1.0, 1000.0](25000.0, 60000.0]"


In [35]:
# df_dm[['id', 'c1','c2','c1_c2_crossed', 'bin_n1','bin_n2','n1_n2_crossed']].to_csv('./data/100_cross.csv', index=False)

In [46]:
# Replace missing c1 values with 0.
# NOTE(review): this runs after c1_c2_crossed was built from c1, so the crossed
# column still reflects the unfilled values — confirm that ordering is intended.
df_dm['c1'] = df_dm['c1'].fillna(0)

In [57]:
# Assemble the output frame column-wise: the id column followed by one-hot
# encodings of the two crossed features. One-hot encodings of the individual
# c1, c2, bin_n1 and bin_n2 columns are currently disabled.
final = pd.concat(
    [
        df_dm[['id']],
        # pd.get_dummies(df_dm['c1'].astype(int), 'c1'),
        # pd.get_dummies(df_dm['c2'].astype(int), 'c2'),
        pd.get_dummies(df_dm['c1_c2_crossed'], prefix='c1_c2_crossed'),
        # pd.get_dummies(df_dm['bin_n1'], 'bin_n1'),
        # pd.get_dummies(df_dm['bin_n2'], 'bin_n2'),
        pd.get_dummies(df_dm['n1_n2_crossed'], prefix='n1_n2_crossed'),
    ],
    axis='columns',
)

In [59]:
# Write the assembled one-hot feature table to CSV, leaving out the row index.
final.to_csv('./data/100_demo_oh.csv', index=False)