In [9]:
import numpy as np
import pandas as pd
from deepctr.feature_column import SparseFeat, DenseFeat,get_feature_names
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import roc_auc_score
import tensorflow as tf
import warnings
# Ignore all Python warnings from this point on. This keeps cell output quiet,
# but it also hides genuine warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

### Task 1: predict whether the person's income exceeds 50K.
### Task 2: predict whether the person's marital status is "never married".

In [22]:
# Column names for the census-income files, which are read with header=None below.
# 'marital_stat' and 'income_50k' are the raw columns the two task labels are built from.
column_names = ['age', 'class_worker', 'det_ind_code', 'det_occ_code', 'education', 'wage_per_hour', 'hs_college',
                'marital_stat', 'major_ind_code', 'major_occ_code', 'race', 'hisp_origin', 'sex', 'union_member',
                'unemp_reason', 'full_or_part_emp', 'capital_gains', 'capital_losses', 'stock_dividends',
                'tax_filer_stat', 'region_prev_res', 'state_prev_res', 'det_hh_fam_stat', 'det_hh_summ',
                'instance_weight', 'mig_chg_msa', 'mig_chg_reg', 'mig_move_reg', 'mig_same', 'mig_prev_sunbelt',
                'num_emp', 'fam_under_18', 'country_father', 'country_mother', 'country_self', 'citizenship',
                'own_or_self', 'vet_question', 'vet_benefits', 'weeks_worked', 'year', 'income_50k']
df_train = pd.read_csv('./data/census-income/census-income.data', header=None, names=column_names)
df_test = pd.read_csv('./data/census-income/census-income.test', header=None, names=column_names)
# Stack the training file on top of the test file so preprocessing sees a single frame;
# the first len(df_train) rows of `data` are the training file.
# ignore_index=True gives the combined frame a unique 0..N-1 index instead of repeating
# each file's own 0..n-1 row labels, so label-based selection stays unambiguous.
data = pd.concat([df_train, df_test], axis=0, ignore_index=True)

In [23]:
# Task 1 label: 'income_50k' -- display how many rows carry each raw income value.
data['income_50k'].value_counts()

 - 50000.    280717
 50000+.      18568
Name: income_50k, dtype: int64

In [24]:
# Task 2 label: 'marital_stat' -- display how many rows carry each raw marital status.
data['marital_stat'].value_counts()

 Never married                      129628
 Married-civilian spouse present    126315
 Divorced                            19160
 Widowed                             15788
 Separated                            5156
 Married-spouse absent                2234
 Married-A F spouse present           1004
Name: marital_stat, dtype: int64

In [25]:
# Turn both raw target columns into 0/1 labels for binary classification.
# The raw strings keep the leading space that these columns carry in the loaded data.
income_label_map = {' - 50000.': 0, ' 50000+.': 1}
label_income = data['income_50k'].map(income_label_map)

# Series.map() returns NaN for any value missing from the mapping; fail loudly here
# instead of silently handing NaN targets to the loss function.
unmapped_mask = label_income.isna().values
if unmapped_mask.any():
    unexpected = sorted(set(map(str, data['income_50k'].values[unmapped_mask])))
    raise ValueError('Unexpected income_50k values: %r' % (unexpected,))

data['label_income'] = label_income.astype('int64')
# Vectorised comparison: 1 for ' Never married', 0 for every other marital status.
data['label_marital'] = (data['marital_stat'] == ' Never married').astype('int64')
# Drop the raw target columns so only the derived labels remain in the frame.
data = data.drop(columns=['income_50k', 'marital_stat'])

In [26]:
# Define dense and sparse features and build the DeepCTR feature columns.
# The functions used here can reference https://deepctr-torch.readthedocs.io/en/latest/Quick-Start.html
columns = data.columns.tolist()
label_columns = ['label_income', 'label_marital']
sparse_features = ['class_worker', 'det_ind_code', 'det_occ_code', 'education', 'hs_college', 'major_ind_code',
                        'major_occ_code', 'race', 'hisp_origin', 'sex', 'union_member', 'unemp_reason',
                        'full_or_part_emp', 'tax_filer_stat', 'region_prev_res', 'state_prev_res', 'det_hh_fam_stat',
                        'det_hh_summ', 'mig_chg_msa', 'mig_chg_reg', 'mig_move_reg', 'mig_same', 'mig_prev_sunbelt',
                        'fam_under_18', 'country_father', 'country_mother', 'country_self', 'citizenship',
                        'vet_question']
# Every column that is neither listed as sparse nor a label is treated as a dense feature.
# NOTE(review): this sweeps in 'instance_weight'; the dataset notes reportedly say that
# field should not be used as a classifier input -- confirm and exclude it if so.
dense_features = [col for col in columns if col not in sparse_features and col not in label_columns]

# Missing values: '-1' becomes its own category for sparse features, 0 is used for dense ones.
data[sparse_features] = data[sparse_features].fillna('-1')
data[dense_features] = data[dense_features].fillna(0)

# Fit the scaler on the training-file rows only (the first len(df_train) rows of `data`,
# which is df_train stacked on df_test) and then apply it to every row. Fitting on the
# full frame would leak the min/max of the validation and test rows into preprocessing.
# Held-out values outside the training range may therefore fall slightly outside [0, 1].
n_fit_rows = len(df_train)
mms = MinMaxScaler(feature_range=(0, 1))
mms.fit(data[dense_features].iloc[:n_fit_rows])
data[dense_features] = mms.transform(data[dense_features])

# Replace every categorical value with an integer id in [0, n_categories).
for feat in sparse_features:
    lbe = LabelEncoder()
    data[feat] = lbe.fit_transform(data[feat])

# One 4-dimensional embedding per sparse feature (vocabulary size = largest id + 1,
# cast to a plain int), plus one scalar input per dense feature.
fixlen_feature_columns = [SparseFeat(feat, int(data[feat].max()) + 1, embedding_dim=4) for feat in sparse_features] \
                        + [DenseFeat(feat, 1) for feat in dense_features]

dnn_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(dnn_feature_columns)

In [41]:
# Following the MMOE paper, the test file is split 1:1 into validation and test rows.
# The validation half is appended to the training file here and is later held out by
# Keras through validation_split, since n_val / len(train) = 0.2.
n_train = len(df_train)
n_val = len(df_test) // 2
split_at = n_train + n_val
train = data.iloc[:split_at]
test = data.iloc[split_at:]

# Model inputs are dictionaries keyed by feature name.
train_model_input = dict((name, train[name]) for name in feature_names)
test_model_input = dict((name, test[name]) for name in feature_names)

In [63]:
# Test the Shared_Bottom model: task 0 is income, task 1 is marital status.
from shared_bottom import Shared_Bottom

model = Shared_Bottom(
    dnn_feature_columns,
    num_tasks=2,
    task_types=['binary', 'binary'],
    task_names=['label_income', 'label_marital'],
    bottom_dnn_units=[16],
    tower_dnn_units_lists=[[8], [8]],
)
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
    loss=["binary_crossentropy", "binary_crossentropy"],
    metrics=['AUC'],
)

# Stop when val_loss has not improved for 10 epochs and restore the best weights seen.
early_stopping_monitor = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=10,
    verbose=1,
    mode='min',
    baseline=None,
    restore_best_weights=True,
)

# Targets are passed in the same order as task_names.
history = model.fit(
    train_model_input,
    [train['label_income'].values, train['label_marital'].values],
    batch_size=1024,
    epochs=100,
    verbose=2,
    validation_split=0.2,
    callbacks=[early_stopping_monitor],
)
pred_ans = model.predict(test_model_input, batch_size=1024)

# pred_ans holds one prediction array per task, in task order.
for task_index, (caption, label_column) in enumerate([("test income AUC", 'label_income'),
                                                      ("test marital AUC", 'label_marital')]):
    print(caption, round(roc_auc_score(test[label_column], pred_ans[task_index]), 4))

Epoch 1/100
195/195 - 4s - loss: 0.3384 - label_income_loss: 0.1546 - label_marital_loss: 0.1833 - label_income_auc: 0.9027 - label_marital_auc_1: 0.9789 - val_loss: 0.2262 - val_label_income_loss: 0.1299 - val_label_marital_loss: 0.0957 - val_label_income_auc: 0.9370 - val_label_marital_auc_1: 0.9940
Epoch 2/100
195/195 - 2s - loss: 0.2226 - label_income_loss: 0.1266 - label_marital_loss: 0.0954 - label_income_auc: 0.9391 - label_marital_auc_1: 0.9939 - val_loss: 0.2219 - val_label_income_loss: 0.1267 - val_label_marital_loss: 0.0945 - val_label_income_auc: 0.9444 - val_label_marital_auc_1: 0.9942
Epoch 3/100
195/195 - 2s - loss: 0.2191 - label_income_loss: 0.1246 - label_marital_loss: 0.0938 - label_income_auc: 0.9407 - label_marital_auc_1: 0.9942 - val_loss: 0.2172 - val_label_income_loss: 0.1229 - val_label_marital_loss: 0.0936 - val_label_income_auc: 0.9446 - val_label_marital_auc_1: 0.9943
Epoch 4/100
195/195 - 2s - loss: 0.2151 - label_income_loss: 0.1227 - label_marital_loss: 0

In [60]:
# Test the ESSM model.
from essm import ESSM

# Marital status is used as the ctr task and income as the ctcvr task,
# so marital comes first in task_names, targets and predictions.
model = ESSM(
    dnn_feature_columns,
    task_type='binary',
    task_names=['label_marital', 'label_income'],
    tower_dnn_units_lists=[[8], [8]],
)
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
    loss=["binary_crossentropy", "binary_crossentropy"],
    metrics=['AUC'],
)

# Stop when val_loss has not improved for 10 epochs and restore the best weights seen.
early_stopping_monitor = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=10,
    verbose=1,
    mode='min',
    baseline=None,
    restore_best_weights=True,
)

history = model.fit(
    train_model_input,
    [train['label_marital'].values, train['label_income'].values],
    batch_size=1024,
    epochs=100,
    verbose=2,
    validation_split=0.2,
    callbacks=[early_stopping_monitor],
)

pred_ans = model.predict(test_model_input, batch_size=1024)

# pred_ans holds one prediction array per task, in task order (marital, income).
for task_index, (caption, label_column) in enumerate([("test marital AUC", 'label_marital'),
                                                      ("test income AUC", 'label_income')]):
    print(caption, round(roc_auc_score(test[label_column], pred_ans[task_index]), 4))

Epoch 1/100
195/195 - 2s - loss: 0.3644 - label_marital_loss: 0.1804 - label_income_loss: 0.1834 - label_marital_auc: 0.9814 - label_income_auc_1: 0.8624 - val_loss: 0.2874 - val_label_marital_loss: 0.1343 - val_label_income_loss: 0.1523 - val_label_marital_auc: 0.9886 - val_label_income_auc_1: 0.9238
Epoch 2/100
195/195 - 2s - loss: 0.2795 - label_marital_loss: 0.1298 - label_income_loss: 0.1488 - label_marital_auc: 0.9900 - label_income_auc_1: 0.9260 - val_loss: 0.2768 - val_label_marital_loss: 0.1243 - val_label_income_loss: 0.1516 - val_label_marital_auc: 0.9909 - val_label_income_auc_1: 0.9303
Epoch 3/100
195/195 - 2s - loss: 0.2712 - label_marital_loss: 0.1250 - label_income_loss: 0.1452 - label_marital_auc: 0.9906 - label_income_auc_1: 0.9307 - val_loss: 0.2712 - val_label_marital_loss: 0.1249 - val_label_income_loss: 0.1454 - val_label_marital_auc: 0.9910 - val_label_income_auc_1: 0.9352
Epoch 4/100
195/195 - 2s - loss: 0.2671 - label_marital_loss: 0.1230 - label_income_loss: 0

In [59]:
# Test the MMOE model with 8 experts: task 0 is income, task 1 is marital status.
from mmoe import MMOE

model = MMOE(
    dnn_feature_columns,
    num_tasks=2,
    task_types=['binary', 'binary'],
    task_names=['income', 'marital'],
    num_experts=8,
    expert_dnn_units=[16],
    gate_dnn_units=None,
    tower_dnn_units_lists=[[8], [8]],
)
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
    loss=["binary_crossentropy", "binary_crossentropy"],
    metrics=['AUC'],
)

# Stop when val_loss has not improved for 10 epochs and restore the best weights seen.
early_stopping_monitor = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=10,
    verbose=1,
    mode='min',
    baseline=None,
    restore_best_weights=True,
)

# Targets are passed in the same order as task_names.
history = model.fit(
    train_model_input,
    [train['label_income'].values, train['label_marital'].values],
    batch_size=1024,
    epochs=100,
    verbose=2,
    validation_split=0.2,
    callbacks=[early_stopping_monitor],
)

pred_ans = model.predict(test_model_input, batch_size=1024)
# pred_ans holds one prediction array per task, in task order.
for task_index, (caption, label_column) in enumerate([("test income AUC", 'label_income'),
                                                      ("test marital AUC", 'label_marital')]):
    print(caption, round(roc_auc_score(test[label_column], pred_ans[task_index]), 4))

Epoch 1/100
195/195 - 4s - loss: 0.3040 - income_loss: 0.1597 - marital_loss: 0.1438 - income_auc: 0.8942 - marital_auc_1: 0.9867 - val_loss: 0.2277 - val_income_loss: 0.1288 - val_marital_loss: 0.0983 - val_income_auc: 0.9418 - val_marital_auc_1: 0.9942
Epoch 2/100
195/195 - 2s - loss: 0.2197 - income_loss: 0.1248 - marital_loss: 0.0942 - income_auc: 0.9413 - marital_auc_1: 0.9941 - val_loss: 0.2157 - val_income_loss: 0.1217 - val_marital_loss: 0.0933 - val_income_auc: 0.9454 - val_marital_auc_1: 0.9943
Epoch 3/100
195/195 - 3s - loss: 0.2162 - income_loss: 0.1232 - marital_loss: 0.0923 - income_auc: 0.9425 - marital_auc_1: 0.9943 - val_loss: 0.2147 - val_income_loss: 0.1214 - val_marital_loss: 0.0926 - val_income_auc: 0.9456 - val_marital_auc_1: 0.9944
Epoch 4/100
195/195 - 2s - loss: 0.2127 - income_loss: 0.1213 - marital_loss: 0.0906 - income_auc: 0.9444 - marital_auc_1: 0.9945 - val_loss: 0.2125 - val_income_loss: 0.1207 - val_marital_loss: 0.0910 - val_income_auc: 0.9466 - val_ma

In [64]:
# Test the PLE_CGC model: task 0 is income, task 1 is marital status.
from ple_cgc import PLE_CGC

model = PLE_CGC(
    dnn_feature_columns,
    num_tasks=2,
    task_types=['binary', 'binary'],
    task_names=['income', 'marital'],
    num_experts_specific=4,
    num_experts_shared=4,
    expert_dnn_units=[16],
    gate_dnn_units=None,
    tower_dnn_units_lists=[[8], [8]],
)
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
    loss=["binary_crossentropy", "binary_crossentropy"],
    metrics=['AUC'],
)

# Stop when val_loss has not improved for 10 epochs and restore the best weights seen.
early_stopping_monitor = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=10,
    verbose=1,
    mode='min',
    baseline=None,
    restore_best_weights=True,
)

# This cell uses a batch size of 512 for both fitting and prediction.
history = model.fit(
    train_model_input,
    [train['label_income'].values, train['label_marital'].values],
    batch_size=512,
    epochs=100,
    verbose=2,
    validation_split=0.2,
    callbacks=[early_stopping_monitor],
)

pred_ans = model.predict(test_model_input, batch_size=512)
# pred_ans holds one prediction array per task, in task order.
for task_index, (caption, label_column) in enumerate([("test income AUC", 'label_income'),
                                                      ("test marital AUC", 'label_marital')]):
    print(caption, round(roc_auc_score(test[label_column], pred_ans[task_index]), 4))

Epoch 1/100
390/390 - 4s - loss: 0.2645 - income_loss: 0.1430 - marital_loss: 0.1210 - income_auc: 0.9185 - marital_auc_1: 0.9904 - val_loss: 0.2250 - val_income_loss: 0.1235 - val_marital_loss: 0.1010 - val_income_auc: 0.9436 - val_marital_auc_1: 0.9940
Epoch 2/100
390/390 - 3s - loss: 0.2177 - income_loss: 0.1241 - marital_loss: 0.0930 - income_auc: 0.9415 - marital_auc_1: 0.9942 - val_loss: 0.2138 - val_income_loss: 0.1212 - val_marital_loss: 0.0918 - val_income_auc: 0.9447 - val_marital_auc_1: 0.9944
Epoch 3/100
390/390 - 4s - loss: 0.2132 - income_loss: 0.1222 - marital_loss: 0.0902 - income_auc: 0.9431 - marital_auc_1: 0.9945 - val_loss: 0.2147 - val_income_loss: 0.1225 - val_marital_loss: 0.0913 - val_income_auc: 0.9446 - val_marital_auc_1: 0.9944
Epoch 4/100
390/390 - 4s - loss: 0.2118 - income_loss: 0.1214 - marital_loss: 0.0895 - income_auc: 0.9442 - marital_auc_1: 0.9946 - val_loss: 0.2124 - val_income_loss: 0.1208 - val_marital_loss: 0.0907 - val_income_auc: 0.9451 - val_ma

In [57]:
# Test the PLE model with num_levels=2: task 0 is income, task 1 is marital status.
from ple import PLE

model = PLE(
    dnn_feature_columns,
    num_tasks=2,
    task_types=['binary', 'binary'],
    task_names=['income', 'marital'],
    num_levels=2,
    num_experts_specific=4,
    num_experts_shared=4,
    expert_dnn_units=[16],
    gate_dnn_units=None,
    tower_dnn_units_lists=[[8], [8]],
)
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
    loss=["binary_crossentropy", "binary_crossentropy"],
    metrics=['AUC'],
)

# Stop when val_loss has not improved for 10 epochs and restore the best weights seen.
early_stopping_monitor = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=10,
    verbose=1,
    mode='min',
    baseline=None,
    restore_best_weights=True,
)

# Targets are passed in the same order as task_names.
history = model.fit(
    train_model_input,
    [train['label_income'].values, train['label_marital'].values],
    batch_size=1024,
    epochs=100,
    verbose=2,
    validation_split=0.2,
    callbacks=[early_stopping_monitor],
)

pred_ans = model.predict(test_model_input, batch_size=1024)
# pred_ans holds one prediction array per task, in task order.
for task_index, (caption, label_column) in enumerate([("test income AUC", 'label_income'),
                                                      ("test marital AUC", 'label_marital')]):
    print(caption, round(roc_auc_score(test[label_column], pred_ans[task_index]), 4))

Epoch 1/100
195/195 - 5s - loss: 0.3415 - income_loss: 0.1676 - marital_loss: 0.1734 - income_auc: 0.8845 - marital_auc_1: 0.9802 - val_loss: 0.2316 - val_income_loss: 0.1314 - val_marital_loss: 0.0996 - val_income_auc: 0.9406 - val_marital_auc_1: 0.9941
Epoch 2/100
195/195 - 3s - loss: 0.2241 - income_loss: 0.1294 - marital_loss: 0.0941 - income_auc: 0.9398 - marital_auc_1: 0.9941 - val_loss: 0.2211 - val_income_loss: 0.1278 - val_marital_loss: 0.0927 - val_income_auc: 0.9435 - val_marital_auc_1: 0.9943
Epoch 3/100
195/195 - 3s - loss: 0.2199 - income_loss: 0.1262 - marital_loss: 0.0930 - income_auc: 0.9423 - marital_auc_1: 0.9942 - val_loss: 0.2193 - val_income_loss: 0.1269 - val_marital_loss: 0.0916 - val_income_auc: 0.9439 - val_marital_auc_1: 0.9944
Epoch 4/100
195/195 - 3s - loss: 0.2155 - income_loss: 0.1239 - marital_loss: 0.0909 - income_auc: 0.9436 - marital_auc_1: 0.9945 - val_loss: 0.2139 - val_income_loss: 0.1223 - val_marital_loss: 0.0908 - val_income_auc: 0.9454 - val_ma