In [1]:
import numpy as np
import pandas as pd
from deepctr.feature_column import SparseFeat, DenseFeat,get_feature_names
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import roc_auc_score
import tensorflow as tf
import warnings
warnings.filterwarnings('ignore')

### task 1 aims to predict whether the education level is at least college;
### task 2 aims to predict whether this person’s marital status is never married.

In [3]:
# Column names for the census-income files. The files are read with header=None,
# so these names are assigned to the columns positionally.
column_names = ['age', 'class_worker', 'det_ind_code', 'det_occ_code', 'education', 'wage_per_hour', 'hs_college',
                'marital_stat', 'major_ind_code', 'major_occ_code', 'race', 'hisp_origin', 'sex', 'union_member',
                'unemp_reason', 'full_or_part_emp', 'capital_gains', 'capital_losses', 'stock_dividends',
                'tax_filer_stat', 'region_prev_res', 'state_prev_res', 'det_hh_fam_stat', 'det_hh_summ',
                'instance_weight', 'mig_chg_msa', 'mig_chg_reg', 'mig_move_reg', 'mig_same', 'mig_prev_sunbelt',
                'num_emp', 'fam_under_18', 'country_father', 'country_mother', 'country_self', 'citizenship',
                'own_or_self', 'vet_question', 'vet_benefits', 'weeks_worked', 'year', 'income_50k']

# Single place to change if the raw files live somewhere else.
DATA_DIR = './data/census-income'

df_train = pd.read_csv(DATA_DIR + '/census-income.data', header=None, names=column_names)
df_test = pd.read_csv(DATA_DIR + '/census-income.test', header=None, names=column_names)

# Stack train on top of test so both go through the same preprocessing.
# ignore_index=True gives the combined frame a unique 0..N-1 index; without it the
# row labels of df_test would repeat those of df_train, which makes any later
# label-based alignment or reindexing on `data` ambiguous.
data = pd.concat([df_train, df_test], axis=0, ignore_index=True)

In [4]:
# Task 1 label source: distribution of the raw 'education' categories.
# Task 1 predicts whether the education level is at least college.
# Note that the category strings carry a leading space (see the output below).
data['education'].value_counts()

 High school graduate                      72554
 Children                                  70864
 Some college but no degree                41774
 Bachelors degree(BA AB BS)                29750
 7th and 8th grade                         12156
 10th grade                                11370
 11th grade                                10399
 Masters degree(MA MS MEng MEd MSW MBA)     9847
 9th grade                                  9335
 Associates degree-occup /vocational        8048
 Associates degree-academic program         6442
 5th or 6th grade                           4991
 12th grade no diploma                      3263
 1st 2nd 3rd or 4th grade                   2705
 Prof school degree (MD DDS DVM LLB JD)     2669
 Doctorate degree(PhD EdD)                  1883
 Less than 1st grade                        1235
Name: education, dtype: int64

In [24]:
# Task 2 label source: distribution of the raw 'marital_stat' categories.
# Task 2 predicts whether the marital status is ' Never married'.
data['marital_stat'].value_counts()

 Never married                      129628
 Married-civilian spouse present    126315
 Divorced                            19160
 Widowed                             15788
 Separated                            5156
 Married-spouse absent                2234
 Married-A F spouse present           1004
Name: marital_stat, dtype: int64

In [17]:
# Turn the two raw target columns into binary (0/1) classification labels.
# The category strings keep their leading space so they match the raw values exactly.
# NOTE(review): the two ' Associates degree-...' categories are not counted as
# college here - confirm that this is the intended definition of "at least college".
college = [
    ' Some college but no degree',
    ' Bachelors degree(BA AB BS)',
    ' Masters degree(MA MS MEng MEd MSW MBA)',
    ' Prof school degree (MD DDS DVM LLB JD)',
    ' Doctorate degree(PhD EdD)',
]

# Vectorised membership / equality checks, cast to integer 0/1 labels.
data['label_education'] = data['education'].isin(college).astype('int64')
data['label_marital'] = (data['marital_stat'] == ' Never married').astype('int64')

# The raw target columns must not remain in the frame as model inputs.
data.drop(columns=['education', 'marital_stat'], inplace=True)

In [20]:
# Define the dense and sparse features and build the DeepCTR feature columns.
# The helpers used here are described at https://deepctr-doc.readthedocs.io/en/latest/Quick-Start.html
columns = data.columns.values.tolist()
sparse_features = ['class_worker', 'det_ind_code', 'det_occ_code', 'hs_college', 'major_ind_code',
                        'major_occ_code', 'race', 'hisp_origin', 'sex', 'union_member', 'unemp_reason',
                        'full_or_part_emp', 'tax_filer_stat', 'region_prev_res', 'state_prev_res', 'det_hh_fam_stat',
                        'det_hh_summ', 'mig_chg_msa', 'mig_chg_reg', 'mig_move_reg', 'mig_same', 'mig_prev_sunbelt',
                        'fam_under_18', 'country_father', 'country_mother', 'country_self', 'citizenship',
                        'vet_question', 'income_50k']
label_columns = ['label_education', 'label_marital']
# Every remaining column that is neither sparse nor a label is treated as dense.
# NOTE(review): this includes 'instance_weight'; the dataset description reportedly
# advises against feeding it to classifiers - confirm whether it should be excluded.
dense_features = [col for col in columns if col not in sparse_features and col not in label_columns]

# Missing values: a dedicated '-1' category for sparse features, 0 for dense ones.
data[sparse_features] = data[sparse_features].fillna('-1')
data[dense_features] = data[dense_features].fillna(0)

# Fit the scaler on the original training rows only (they come first in `data`),
# then apply it to every row. Fitting on the full frame would leak min/max
# statistics of the held-out validation/test rows into preprocessing.
# Held-out values may therefore fall slightly outside [0, 1].
mms = MinMaxScaler(feature_range=(0, 1))
mms.fit(data[dense_features].iloc[:len(df_train)])
data[dense_features] = mms.transform(data[dense_features])

# Encode each categorical column as integer ids 0..n_classes-1. The encoder is fit
# on all rows so that every category seen in any split gets an id.
for feat in sparse_features:
    lbe = LabelEncoder()
    data[feat] = lbe.fit_transform(data[feat])

# Vocabulary size is max id + 1 because the ids produced above start at 0.
fixlen_feature_columns = (
    [SparseFeat(feat, data[feat].max() + 1, embedding_dim=4) for feat in sparse_features]
    + [DenseFeat(feat, 1) for feat in dense_features]
)

dnn_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(dnn_feature_columns)

In [21]:
# The original test file is split 1:1 into validation and test, following the MMOE paper.
# The validation half is appended to the training rows; fit() is later called with
# validation_split=0.2, which Keras takes from the end of the arrays it is given:
# validation_split = n_val/len(train) = 0.2
n_train = len(df_train)
n_val = len(df_test) // 2

# Positional slicing: the first n_train + n_val rows are train (+ validation), the rest is test.
train = data.iloc[:n_train + n_val]
test = data.iloc[n_train + n_val:]

# The models take one named input per feature column.
train_model_input = dict((name, train[name]) for name in feature_names)
test_model_input = dict((name, test[name]) for name in feature_names)

In [24]:
# Train the Shared_Bottom model on both binary tasks and report the per-task test AUC.
from shared_bottom import Shared_Bottom

model = Shared_Bottom(
    dnn_feature_columns,
    num_tasks=2,
    task_types=['binary', 'binary'],
    task_names=['label_education', 'label_marital'],
    bottom_dnn_units=[16],
    tower_dnn_units_lists=[[8], [8]],
)

# Stop when val_loss has not improved for 10 epochs and restore the best weights.
early_stopping_monitor = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', min_delta=0, patience=10, verbose=1,
    mode='min', baseline=None, restore_best_weights=True,
)

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
    loss=["binary_crossentropy", "binary_crossentropy"],
    metrics=['AUC'],
)

# Label order must match task_names: [education, marital].
history = model.fit(
    train_model_input,
    [train['label_education'].values, train['label_marital'].values],
    batch_size=1024,
    epochs=100,
    verbose=2,
    validation_split=0.2,
    callbacks=[early_stopping_monitor],
)
pred_ans = model.predict(test_model_input, batch_size=1024)

# pred_ans holds one prediction array per task, in task_names order.
for task_idx, (title, label_col) in enumerate([("test education AUC", 'label_education'),
                                               ("test marital AUC", 'label_marital')]):
    print(title, round(roc_auc_score(test[label_col], pred_ans[task_idx]), 4))

Epoch 1/100
195/195 - 2s - loss: 0.5787 - label_education_loss: 0.4041 - label_marital_loss: 0.1742 - label_education_auc: 0.8528 - label_marital_auc_1: 0.9797 - val_loss: 0.4874 - val_label_education_loss: 0.3896 - val_label_marital_loss: 0.0971 - val_label_education_auc: 0.8680 - val_label_marital_auc_1: 0.9939
Epoch 2/100
195/195 - 1s - loss: 0.4797 - label_education_loss: 0.3828 - label_marital_loss: 0.0962 - label_education_auc: 0.8717 - label_marital_auc_1: 0.9939 - val_loss: 0.4835 - val_label_education_loss: 0.3849 - val_label_marital_loss: 0.0978 - val_label_education_auc: 0.8692 - val_label_marital_auc_1: 0.9939
Epoch 3/100
195/195 - 1s - loss: 0.4771 - label_education_loss: 0.3818 - label_marital_loss: 0.0944 - label_education_auc: 0.8724 - label_marital_auc_1: 0.9941 - val_loss: 0.4817 - val_label_education_loss: 0.3870 - val_label_marital_loss: 0.0938 - val_label_education_auc: 0.8703 - val_label_marital_auc_1: 0.9942
Epoch 4/100
195/195 - 1s - loss: 0.4737 - label_educati

In [25]:
# Train the ESSM model and report the per-task test AUC.
from essm import ESSM

# Task order differs from the other experiments: marital is used as the ctr task and
# education as the ctcvr task, so labels and predictions are ordered [marital, education].
model = ESSM(dnn_feature_columns, task_type='binary', task_names=['label_marital', 'label_education'],
             tower_dnn_units_lists=[[8], [8]])

# Stop when val_loss has not improved for 10 epochs and restore the best weights.
early_stopping_monitor = tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=10, verbose=1,
                                                          mode='min', baseline=None, restore_best_weights=True)
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
              loss=["binary_crossentropy", "binary_crossentropy"], metrics=['AUC'])
history = model.fit(train_model_input, [train['label_marital'].values, train['label_education'].values],
                    batch_size=1024, epochs=100, verbose=2, validation_split=0.2,
                    callbacks=[early_stopping_monitor])

pred_ans = model.predict(test_model_input, batch_size=1024)

# pred_ans[0] belongs to label_marital and pred_ans[1] to label_education; the
# printed names now match the labels that are actually being scored.
print("test marital AUC", round(roc_auc_score(test['label_marital'], pred_ans[0]), 4))
print("test education AUC", round(roc_auc_score(test['label_education'], pred_ans[1]), 4))

Epoch 1/100
195/195 - 2s - loss: 0.7163 - label_marital_loss: 0.2452 - label_education_loss: 0.4707 - label_marital_auc: 0.9735 - label_education_auc_1: 0.8243 - val_loss: 0.6675 - val_label_marital_loss: 0.2073 - val_label_education_loss: 0.4596 - val_label_marital_auc: 0.9805 - val_label_education_auc_1: 0.8511
Epoch 2/100
195/195 - 1s - loss: 0.6627 - label_marital_loss: 0.2101 - label_education_loss: 0.4519 - label_marital_auc: 0.9825 - label_education_auc_1: 0.8526 - val_loss: 0.6612 - val_label_marital_loss: 0.2185 - val_label_education_loss: 0.4420 - val_label_marital_auc: 0.9833 - val_label_education_auc_1: 0.8568
Epoch 3/100
195/195 - 1s - loss: 0.6567 - label_marital_loss: 0.2062 - label_education_loss: 0.4497 - label_marital_auc: 0.9833 - label_education_auc_1: 0.8562 - val_loss: 0.6606 - val_label_marital_loss: 0.2140 - val_label_education_loss: 0.4457 - val_label_marital_auc: 0.9825 - val_label_education_auc_1: 0.8621
Epoch 4/100
195/195 - 1s - loss: 0.6546 - label_marital

In [26]:
# Train the MMOE model on both binary tasks and report the per-task test AUC.
from mmoe import MMOE

model = MMOE(
    dnn_feature_columns,
    num_tasks=2,
    task_types=['binary', 'binary'],
    task_names=['label_education', 'label_marital'],
    num_experts=8,
    expert_dnn_units=[16],
    gate_dnn_units=None,
    tower_dnn_units_lists=[[8], [8]],
)

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
    loss=["binary_crossentropy", "binary_crossentropy"],
    metrics=['AUC'],
)

# Stop when val_loss has not improved for 10 epochs and restore the best weights.
early_stopping_monitor = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', min_delta=0, patience=10, verbose=1,
    mode='min', baseline=None, restore_best_weights=True,
)

# Label order must match task_names: [education, marital].
history = model.fit(
    train_model_input,
    [train['label_education'].values, train['label_marital'].values],
    batch_size=1024,
    epochs=100,
    verbose=2,
    validation_split=0.2,
    callbacks=[early_stopping_monitor],
)

pred_ans = model.predict(test_model_input, batch_size=1024)

# pred_ans holds one prediction array per task, in task_names order.
for task_idx, (title, label_col) in enumerate([("test education AUC", 'label_education'),
                                               ("test marital AUC", 'label_marital')]):
    print(title, round(roc_auc_score(test[label_col], pred_ans[task_idx]), 4))

Epoch 1/100
195/195 - 3s - loss: 0.5466 - label_education_loss: 0.4058 - label_marital_loss: 0.1404 - label_education_auc: 0.8513 - label_marital_auc_1: 0.9872 - val_loss: 0.4881 - val_label_education_loss: 0.3849 - val_label_marital_loss: 0.1026 - val_label_education_auc: 0.8691 - val_label_marital_auc_1: 0.9938
Epoch 2/100
195/195 - 2s - loss: 0.4790 - label_education_loss: 0.3819 - label_marital_loss: 0.0964 - label_education_auc: 0.8722 - label_marital_auc_1: 0.9938 - val_loss: 0.4842 - val_label_education_loss: 0.3830 - val_label_marital_loss: 0.1004 - val_label_education_auc: 0.8713 - val_label_marital_auc_1: 0.9940
Epoch 3/100
195/195 - 2s - loss: 0.4747 - label_education_loss: 0.3800 - label_marital_loss: 0.0938 - label_education_auc: 0.8739 - label_marital_auc_1: 0.9941 - val_loss: 0.4779 - val_label_education_loss: 0.3836 - val_label_marital_loss: 0.0934 - val_label_education_auc: 0.8715 - val_label_marital_auc_1: 0.9942
Epoch 4/100
195/195 - 2s - loss: 0.4711 - label_educati

In [27]:
# Train the PLE_CGC model on both binary tasks and report the per-task test AUC.
from ple_cgc import PLE_CGC

model = PLE_CGC(
    dnn_feature_columns,
    num_tasks=2,
    task_types=['binary', 'binary'],
    task_names=['label_education', 'label_marital'],
    num_experts_specific=4,
    num_experts_shared=4,
    expert_dnn_units=[16],
    gate_dnn_units=None,
    tower_dnn_units_lists=[[8], [8]],
)

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
    loss=["binary_crossentropy", "binary_crossentropy"],
    metrics=['AUC'],
)

# Stop when val_loss has not improved for 10 epochs and restore the best weights.
early_stopping_monitor = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', min_delta=0, patience=10, verbose=1,
    mode='min', baseline=None, restore_best_weights=True,
)

# Label order must match task_names: [education, marital].
history = model.fit(
    train_model_input,
    [train['label_education'].values, train['label_marital'].values],
    batch_size=1024,
    epochs=100,
    verbose=2,
    validation_split=0.2,
    callbacks=[early_stopping_monitor],
)

pred_ans = model.predict(test_model_input, batch_size=1024)

# pred_ans holds one prediction array per task, in task_names order.
for task_idx, (title, label_col) in enumerate([("test education AUC", 'label_education'),
                                               ("test marital AUC", 'label_marital')]):
    print(title, round(roc_auc_score(test[label_col], pred_ans[task_idx]), 4))

Epoch 1/100
195/195 - 3s - loss: 0.5478 - label_education_loss: 0.4071 - label_marital_loss: 0.1402 - label_education_auc: 0.8507 - label_marital_auc_1: 0.9874 - val_loss: 0.4842 - val_label_education_loss: 0.3847 - val_label_marital_loss: 0.0989 - val_label_education_auc: 0.8697 - val_label_marital_auc_1: 0.9938
Epoch 2/100
195/195 - 2s - loss: 0.4765 - label_education_loss: 0.3807 - label_marital_loss: 0.0950 - label_education_auc: 0.8735 - label_marital_auc_1: 0.9940 - val_loss: 0.4790 - val_label_education_loss: 0.3833 - val_label_marital_loss: 0.0949 - val_label_education_auc: 0.8709 - val_label_marital_auc_1: 0.9941
Epoch 3/100
195/195 - 2s - loss: 0.4733 - label_education_loss: 0.3790 - label_marital_loss: 0.0935 - label_education_auc: 0.8749 - label_marital_auc_1: 0.9942 - val_loss: 0.4772 - val_label_education_loss: 0.3822 - val_label_marital_loss: 0.0940 - val_label_education_auc: 0.8716 - val_label_marital_auc_1: 0.9942
Epoch 4/100
195/195 - 2s - loss: 0.4717 - label_educati

In [28]:
# Train the PLE model (num_levels=2) on both binary tasks and report the per-task test AUC.
from ple import PLE

model = PLE(dnn_feature_columns, num_tasks=2, task_types=['binary', 'binary'],
            task_names=['label_education', 'label_marital'],
            num_levels=2, num_experts_specific=4, num_experts_shared=4, expert_dnn_units=[16],
            gate_dnn_units=None, tower_dnn_units_lists=[[8], [8]])
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.01),
              loss=["binary_crossentropy", "binary_crossentropy"], metrics=['AUC'])

# Stop when val_loss has not improved for 10 epochs and restore the best weights.
early_stopping_monitor = tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=10, verbose=1,
                                                          mode='min', baseline=None, restore_best_weights=True)

# Label order must match task_names: [education, marital].
history = model.fit(train_model_input, [train['label_education'].values, train['label_marital'].values],
                    batch_size=1024, epochs=100, verbose=2, validation_split=0.2,
                    callbacks=[early_stopping_monitor])

pred_ans = model.predict(test_model_input, batch_size=1024)
# A stray trailing character after the first print made this cell a SyntaxError; removed.
print("test education AUC", round(roc_auc_score(test['label_education'], pred_ans[0]), 4))
print("test marital AUC", round(roc_auc_score(test['label_marital'], pred_ans[1]), 4))

Epoch 1/100
195/195 - 4s - loss: 0.8963 - label_education_loss: 0.4108 - label_marital_loss: 0.4851 - label_education_auc: 0.8497 - label_marital_auc_1: 0.8235 - val_loss: 0.5634 - val_label_education_loss: 0.3849 - val_label_marital_loss: 0.1778 - val_label_education_auc: 0.8693 - val_label_marital_auc_1: 0.9852
Epoch 2/100
195/195 - 3s - loss: 0.5222 - label_education_loss: 0.3811 - label_marital_loss: 0.1403 - label_education_auc: 0.8728 - label_marital_auc_1: 0.9881 - val_loss: 0.5060 - val_label_education_loss: 0.3831 - val_label_marital_loss: 0.1221 - val_label_education_auc: 0.8707 - val_label_marital_auc_1: 0.9903
Epoch 3/100
195/195 - 3s - loss: 0.4935 - label_education_loss: 0.3795 - label_marital_loss: 0.1131 - label_education_auc: 0.8743 - label_marital_auc_1: 0.9927 - val_loss: 0.4915 - val_label_education_loss: 0.3826 - val_label_marital_loss: 0.1080 - val_label_education_auc: 0.8716 - val_label_marital_auc_1: 0.9939
Epoch 4/100
195/195 - 3s - loss: 0.4820 - label_educati