---
### Prepare the data set for Wide and Deep Learning
(Ref. : https://github.com/jrzaurin/Wide-and-Deep-PyTorch)
- These are the steps to prepare the data set for "Wide and Deep Learning" model at `wide_deep/torch_model.py`
- Steps:
  - 1) Load data set
  - 2) Target Labeling & Column Labeling for the Wide and for the Deep
  - 3) Prepare the features for the Wide `wide_cols & crossed_cols` and Deep `embedding_cols, continuous_cols`
  - 4) Split the data set to Train & Test and output as dict type
---

#### 1) Load Data set

In [4]:
import pandas as pd
import numpy as np
import os, time

### Load the data set from '.data/audult_data.csv'
raw_df = pd.read_csv(os.path.join(str(os.getcwd()), 'data/adult_data.csv'))
print(raw_df.shape)
print(raw_df.columns)
print(raw_df.head())

(48842, 16)
Index(['age', 'workclass', 'fnlwgt', 'education', 'education_num',
       'marital_status', 'occupation', 'relationship', 'race', 'gender',
       'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
       'income_bracket', 'income_label'],
      dtype='object')
   age         workclass  fnlwgt  education  education_num  \
0   39         State-gov   77516  Bachelors             13   
1   50  Self-emp-not-inc   83311  Bachelors             13   
2   38           Private  215646    HS-grad              9   
3   53           Private  234721       11th              7   
4   28           Private  338409  Bachelors             13   

       marital_status         occupation   relationship   race  gender  \
0       Never-married       Adm-clerical  Not-in-family  White    Male   
1  Married-civ-spouse    Exec-managerial        Husband  White    Male   
2            Divorced  Handlers-cleaners  Not-in-family  White    Male   
3  Married-civ-spouse  Handlers-cleaners

#### 2) Target Labeling & Column Labeling for the Wide and for the Deep

Step 1. Label the Targets for binary classification
 - Label the targets for binary classification 
 - We set the target with income bracket over 50K (>50K) as `income_label`
   - 1 if >50K else 0

In [30]:
print(raw_df['income_bracket'].unique())

raw_df['income_label'] = raw_df['income_bracket'].apply(lambda x: 1 if '>50K' in x or '>50K.' in x else 0)
print('Target : {} / Non-Target : {}'.format(len(raw_df[raw_df.income_label == 1]), len(raw_df[raw_df.income_label != 1])))
print(raw_df.dtypes)
raw_df.head()

['<=50K' '>50K' '<=50K.' '>50K.']
Target : 11687 / Non-Target : 37155
age                int64
workclass         object
fnlwgt             int64
education         object
education_num      int64
marital_status    object
occupation        object
relationship      object
race              object
gender            object
capital_gain       int64
capital_loss       int64
hours_per_week     int64
native_country    object
income_bracket    object
income_label       int64
dtype: object


Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income_bracket,income_label
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K,0


Step 2. Set the columns for the Wide part & the Deep part and handling of each featues
 - For the Wide part, we are going to use the columns for memorization by `Linear Layer`
 - For the Deep part, we are going to use the columns for generalization by `Embedding Layer and Linear Layer`

In [95]:
# For Wide and Crossed network (for memorization)
wide_cols = ['age','hours_per_week','education', 'relationship','workclass',
             'occupation','native_country','gender']
crossed_cols = (['education', 'occupation'], ['native_country', 'occupation']) # pair-combination 

# For Deep network (embedding + continuous) (for generalization)
continuous_cols = ["age","hours_per_week"]

embeddings_cols = [('education',10), ('relationship',8), ('workclass',10),
                    ('occupation',10),('native_country',12)] # column name / embedding dim.

embdding_dim = dict(embeddings_cols)
embeddings_cols = list(embdding_dim.keys())

deep_cols = embeddings_cols + continuous_cols

print(wide_cols)
print(deep_cols)

['age', 'hours_per_week', 'education', 'relationship', 'workclass', 'occupation', 'native_country', 'gender']
['education', 'relationship', 'workclass', 'occupation', 'native_country', 'age', 'hours_per_week']


In [96]:
# target variable
target_Y = np.array(raw_df['income_label'])

# feature handling for wide and deep columns
tmp_df = raw_df.copy()[list(set(wide_cols + deep_cols))]

# Make crossed cols
crossed_columns = []
for cols in crossed_cols:
    tmp_col_nm = '-'.join(cols)
    tmp_df[tmp_col_nm] = tmp_df[cols].apply(lambda x: '-'.join(x), axis = 1)
    crossed_columns.append(tmp_col_nm)

print(tmp_df[crossed_columns[0]].head())
print()

0       Bachelors-Adm-clerical
1    Bachelors-Exec-managerial
2    HS-grad-Handlers-cleaners
3       11th-Handlers-cleaners
4     Bachelors-Prof-specialty
Name: education-occupation, dtype: object



For encoding of categorical columns, do following 3 steps
 - find unique values for each column
 - set index(encoding values) for each unique values
 - convert categorical values to corresponding encoding values

In [97]:
## encoding for categorical columns

# find categorical variables
categorical_cols = list(tmp_df.select_dtypes(include = 'object').columns)
print(categorical_cols)


# step 1:
unique_values = dict()
for col in categorical_cols:
    unique_values[col] = list(tmp_df[col].unique())
    
# step 2:
val_2_inx = dict()
for k, v in unique_values.items():
    val_2_inx[k] = {v2: i for i, v2 in enumerate(unique_values[k])}
    
# step 3:
for k, v in val_2_inx.items():
    tmp_df[k] = tmp_df[k].apply(lambda x: val_2_inx[k][x])
    

# only for deep cols, make embedding cols info
encoding_dict = {k: v for k, v in val_2_inx.items() if k in deep_cols}
embeddings_input = []
for k, v in encoding_dict.items():
    embeddings_input.append([k, len(v), embdding_dim[k]]) # column name, number of unique items, embedding dims
print(embeddings_input)

['native_country', 'workclass', 'occupation', 'education', 'relationship', 'gender', 'education-occupation', 'native_country-occupation']
[['native_country', 42, 12], ['workclass', 9, 10], ['occupation', 15, 10], ['education', 16, 10], ['relationship', 6, 8]]


Scaling for continuous(numerical) data set

In [100]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
for col in continuous_cols:
    tmp_df[col] = scaler.fit_transform(tmp_df[col].values.reshape(-1,1))
print(np.mean(tmp_df[col].values), np.std(tmp_df[col].values))

3.287798580766198e-17 1.0


#### 3) Prepare the features for the Wide wide_cols & crossed_cols and Deep embedding_cols, continuous_cols

Step 1. split the df to the Wide part and the Deep part dataframe
 - For categorical variables in the Wide part, use one-hot encoding

In [116]:
df_deep = tmp_df[deep_cols]
deep_column_idx = {k:v for v,k in enumerate(df_deep.columns)}
print(df_deep.shape)

# for categorical variables in the Wide part, we are not going to use encoing variable instead one-hot encoding variables
df_wide = tmp_df[wide_cols + crossed_columns]
print(df_wide.shape)
one_hot_cols = [c for c in wide_cols+crossed_columns if c in categorical_cols]
df_wide = pd.get_dummies(df_wide, columns=one_hot_cols)
print(df_wide.shape) # by converting categorical variables to one-hot dummys, columns length increased from 10 to 798

(48842, 7)
(48842, 10)
(48842, 798)


#### 4) Split the data set to Train & Test and output as dict type

In [118]:
from sklearn.model_selection import train_test_split
from collections import namedtuple

seed = 1
x_train_deep, x_test_deep = train_test_split(df_deep.values, test_size = 0.3, random_state = seed)
x_train_wide, x_test_wide = train_test_split(df_wide.values, test_size = 0.3, random_state = seed)
y_train, y_test = train_test_split(target_Y, test_size = 0.3, random_state = seed)

print(x_train_deep.shape)
print(x_train_wide.shape)
print(y_train.shape)


# make the output dictionary
out_dataset = dict()
train_set = namedtuple('train_set', 'wide, deep, labels')
test_set = namedtuple('test_set', 'wide, deep, labels')
out_dataset['train_set'] = train_set(x_train_wide, x_train_deep, y_train)
out_dataset['test_set'] = test_set(x_test_wide, x_test_deep, y_test)
out_dataset['embeddings_input'] = embeddings_input
out_dataset['deep_column_idx'] = deep_column_idx
out_dataset['encoding_dict'] = encoding_dict

(34189, 7)
(34189, 798)
(34189,)


In [119]:
out_dataset['train_set']

train_set(wide=array([[-0.99512893, -0.35689365,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.17187097,  1.57994645,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.02599598, -0.03408696,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-0.48456647,  0.36942139,  1.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.84925394, -0.03408696,  1.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.99512893, -0.03408696,  1.        , ...,  0.        ,
         0.        ,  0.        ]]), deep=array([[ 1.        ,  3.        ,  2.        , ...,  0.        ,
        -0.99512893, -0.35689365],
       [ 1.        ,  1.        ,  2.        , ...,  0.        ,
         0.17187097,  1.57994645],
       [ 6.        ,  4.        ,  2.        , ...,  0.        ,
         0.02599598, -0.03408696],
       ...,
       [ 0.        ,  1.        ,  2.        , ...,  0