In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [4]:
# binary classification problem
# https://research.google/blog/wide-amp-deep-learning-better-together-with-tensorflow/
# https://chromium.googlesource.com/external/github.com/tensorflow/tensorflow/+/r0.10/tensorflow/g3doc/tutorials/wide/index.md
# https://chromium.googlesource.com/external/github.com/tensorflow/tensorflow/+/r0.10/tensorflow/g3doc/tutorials/wide_and_deep/index.md
# https://arxiv.org/abs/1606.07792
# https://www.tensorflow.org/guide/migrate/migrating_feature_columns#complete_training_example

In [5]:
import tensorflow as tf

In [6]:
# remember to enable internet access by verifying your phone number, from your profile section...
# otherwise you can't download data (maybe you can upload it by yourself, it's worth a try)

In [45]:
import tempfile
import urllib
train_file = tempfile.NamedTemporaryFile()
test_file = tempfile.NamedTemporaryFile()
urllib.request.urlretrieve("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data", train_file.name)
urllib.request.urlretrieve("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test", test_file.name)

('/tmp/tmp2brf36tq', <http.client.HTTPMessage at 0x79ba0f487a00>)

In [46]:
import pandas as pd
COLUMNS = ["age", "workclass", "fnlwgt", "education", "education_num",
           "marital_status", "occupation", "relationship", "race", "gender",
           "capital_gain", "capital_loss", "hours_per_week", "native_country",
           "income_bracket"]
df_train = pd.read_csv(train_file, names=COLUMNS, skipinitialspace=True)
df_test = pd.read_csv(test_file, names=COLUMNS, skipinitialspace=True, skiprows=1)

In [47]:
df_train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income_bracket
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [48]:
LABEL_COLUMN = "label"
df_train[LABEL_COLUMN] = (df_train["income_bracket"].apply(lambda x: ">50K" in x)).astype(int)
df_test[LABEL_COLUMN] = (df_test["income_bracket"].apply(lambda x: ">50K" in x)).astype(int)

In [49]:
CATEGORICAL_COLUMNS = ["workclass", "education", "marital_status", "occupation",
                       "relationship", "race", "gender", "native_country"]
CONTINUOUS_COLUMNS = ["age", "education_num", "capital_gain", "capital_loss", "hours_per_week"]

In [12]:
# When training a tf.estimator.Estimator in TensorFlow 1, you usually perform feature preprocessing with
# the tf.feature_column API. In TensorFlow 2, you can do this directly with Keras preprocessing layers.

In [50]:
target=df_train.pop('label')  # !
target_array = target.to_numpy()[:, tf.newaxis]  # expand one dim

# Wide Approach (Linear Regression)

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

X = df_train[CATEGORICAL_COLUMNS]
y = target

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), CATEGORICAL_COLUMNS)
    ])

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=500))
])

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=43)
# model.fit(X_train, y_train)
# accuracy = model.score(X_test, y_test)

X_test=df_test[CATEGORICAL_COLUMNS]
y_test=df_test.pop('label')

model.fit(X,y)
accuracy = model.score(X_test, y_test)

print(f"Accuracy: {accuracy:.3f}")

Accuracy: 0.836


## Now we can try to increase the overall accuracy by adding the non-categorical (numeric) features

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

X = df_train
y = target

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), CATEGORICAL_COLUMNS),
        ('num', StandardScaler(), CONTINUOUS_COLUMNS),
    ])

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=500))
])

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=43)
# model.fit(X_train, y_train)
# accuracy = model.score(X_test, y_test)

'''
X_test=df_test[CATEGORICAL_COLUMNS]
y_test=df_test.pop('label')
'''

model.fit(X,y)
accuracy = model.score(df_test, y_test)

print(f"Accuracy: {accuracy:.3f}")

Accuracy: 0.852


# Deep Learning Approach

In [16]:
inputs = {
    'workclass': tf.keras.Input(shape=(), dtype='string', name='workclass'),
    'education': tf.keras.Input(shape=(), dtype='string', name='education'),
    'marital_status': tf.keras.Input(shape=(), dtype='string', name='marital_status'),
    'occupation': tf.keras.Input(shape=(), dtype='string', name='occupation'),
    'relationship': tf.keras.Input(shape=(), dtype='string', name='relationship'),
    'race': tf.keras.Input(shape=(), dtype='string', name='race'),
    'gender': tf.keras.Input(shape=(), dtype='string', name='gender'),
    'native_country': tf.keras.Input(shape=(), dtype='string', name='native_country')
}

In [17]:
# categorical features management

keys_work = df_train['workclass'].unique().tolist()
keys_edu = df_train['education'].unique().tolist()
keys_marital = df_train['marital_status'].unique().tolist()
keys_occ = df_train['occupation'].unique().tolist()
keys_relation = df_train['relationship'].unique().tolist()
keys_race=["Amer-Indian-Eskimo", "Asian-Pac-Islander", "Black", "Other", "White"]
keys_gender = ["Female", "Male"]
keys_country = df_train['native_country'].unique().tolist()

work = tf.keras.layers.StringLookup(vocabulary=keys_work)(inputs['workclass'])
edu = tf.keras.layers.StringLookup(vocabulary=keys_edu)(inputs['education'])
marital = tf.keras.layers.StringLookup(vocabulary=keys_marital)(inputs['marital_status'])
occ = tf.keras.layers.StringLookup(vocabulary=keys_occ)(inputs['occupation'])
relat = tf.keras.layers.StringLookup(vocabulary=keys_relation)(inputs['relationship'])
race = tf.keras.layers.StringLookup(vocabulary=keys_race)(inputs['race'])
gender = tf.keras.layers.StringLookup(vocabulary=keys_gender)(inputs['gender'])
country = tf.keras.layers.StringLookup(vocabulary=keys_country)(inputs['native_country'])

concatenated_outputs = tf.keras.layers.Concatenate()([work, edu, marital, occ, relat, race, gender, country])

preprocessing_model = tf.keras.Model(inputs, concatenated_outputs)

In [18]:
categoricalTrainSubset = df_train[CATEGORICAL_COLUMNS]

In [19]:
categoricalFeatureDict = {k: v.to_numpy()[:, tf.newaxis] for k,v in dict(categoricalTrainSubset).items()}

In [20]:
categorical_dict_ds = tf.data.Dataset.from_tensor_slices((categoricalFeatureDict , target_array))  # !!!

In [21]:
batch_size=32

categoricalDataset = categorical_dict_ds.map(lambda x, y: (preprocessing_model(x), y),
                                             num_parallel_calls=tf.data.AUTOTUNE)
categoricalDataset = categoricalDataset.batch(batch_size)

In [22]:
# DL Approach

model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(8,)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# 8. Compilazione del modello
optimizer=tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

In [23]:
model.summary()

In [24]:
# 9. Allenamento del modello
model.fit(categoricalDataset, epochs=20)

Epoch 1/20
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.7399 - loss: 0.5442
Epoch 2/20
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.7909 - loss: 0.4349
Epoch 3/20
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.7986 - loss: 0.4216
Epoch 4/20
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8043 - loss: 0.4145
Epoch 5/20
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8072 - loss: 0.4103
Epoch 6/20
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8080 - loss: 0.4070
Epoch 7/20
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.8089 - loss: 0.4047
Epoch 8/20
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8086 - loss: 0.4026
Epoch 9/20
[1m1018/1018

<keras.src.callbacks.history.History at 0x79bae6385870>

## Testing phase

In [None]:
categoricalTestSubset = df_test[CATEGORICAL_COLUMNS]
target_arrayTest =  y_test.to_numpy()[:, tf.newaxis]

categoricalFeatureDictTest = {k: v.to_numpy()[:, tf.newaxis] for k,v in dict(categoricalTestSubset).items()}
categorical_dict_dsTest = tf.data.Dataset.from_tensor_slices((categoricalFeatureDictTest , target_arrayTest))  # !!!
batch_size=32

testDataset = categorical_dict_dsTest.map(lambda x, y: (preprocessing_model(x), y),
                                             num_parallel_calls=tf.data.AUTOTUNE)
testDataset = testDataset.batch(batch_size)

model.evaluate(testDataset)

## Add numeric features

In [51]:
inputs = {
    'workclass': tf.keras.Input(shape=(), dtype='string', name='workclass'),
    'education': tf.keras.Input(shape=(), dtype='string', name='education'),
    'marital_status': tf.keras.Input(shape=(), dtype='string', name='marital_status'),
    'occupation': tf.keras.Input(shape=(), dtype='string', name='occupation'),
    'relationship': tf.keras.Input(shape=(), dtype='string', name='relationship'),
    'race': tf.keras.Input(shape=(), dtype='string', name='race'),
    'gender': tf.keras.Input(shape=(), dtype='string', name='gender'),
    'native_country': tf.keras.Input(shape=(), dtype='string', name='native_country'),
    'age': tf.keras.Input(shape=(), dtype='int64', name='age'),
    'education_num': tf.keras.Input(shape=(), dtype='int64', name='education_num'),
    'capital_gain': tf.keras.Input(shape=(), dtype='int64', name='capital_gain'),
    'capital_loss': tf.keras.Input(shape=(), dtype='int64', name='capital_loss'),
    'hours_per_week': tf.keras.Input(shape=(), dtype='int64', name='hours_per_week')
}

In [52]:
# categorical features management
age_mean = df_train.loc[:, 'age'].mean()
age_std = df_train.loc[:, 'age'].std()
education_num_mean = df_train.loc[:, 'education_num'].mean()
education_num_std = df_train.loc[:, 'education_num'].std()
capital_gain_mean = df_train.loc[:, 'capital_gain'].mean()
capital_gain_std = df_train.loc[:, 'capital_gain'].std()
capital_loss_mean = df_train.loc[:, 'capital_loss'].mean()
capital_loss_std = df_train.loc[:, 'capital_loss'].std()
hours_per_week_mean = df_train.loc[:, 'hours_per_week'].mean()
hours_per_week_std = df_train.loc[:, 'hours_per_week'].std()

keys_work = df_train['workclass'].unique().tolist()
keys_edu = df_train['education'].unique().tolist()
keys_marital = df_train['marital_status'].unique().tolist()
keys_occ = df_train['occupation'].unique().tolist()
keys_relation = df_train['relationship'].unique().tolist()
keys_race=["Amer-Indian-Eskimo", "Asian-Pac-Islander", "Black", "Other", "White"]
keys_gender = ["Female", "Male"]
keys_country = df_train['native_country'].unique().tolist()

work = tf.keras.layers.StringLookup(vocabulary=keys_work)(inputs['workclass'])
edu = tf.keras.layers.StringLookup(vocabulary=keys_edu)(inputs['education'])
marital = tf.keras.layers.StringLookup(vocabulary=keys_marital)(inputs['marital_status'])
occ = tf.keras.layers.StringLookup(vocabulary=keys_occ)(inputs['occupation'])
relat = tf.keras.layers.StringLookup(vocabulary=keys_relation)(inputs['relationship'])
race = tf.keras.layers.StringLookup(vocabulary=keys_race)(inputs['race'])
gender = tf.keras.layers.StringLookup(vocabulary=keys_gender)(inputs['gender'])
country = tf.keras.layers.StringLookup(vocabulary=keys_country)(inputs['native_country'])
age_output = tf.keras.layers.Normalization(
      axis=None, mean=age_mean, variance=age_std)(inputs['age'])
education_num_output = tf.keras.layers.Normalization(
      axis=None, mean=education_num_mean, variance=education_num_std)(inputs['education_num'])
capital_gain_output = tf.keras.layers.Normalization(
      axis=None, mean=capital_gain_mean, variance=capital_gain_std)(inputs['capital_gain'])
capital_loss_output = tf.keras.layers.Normalization(
      axis=None, mean=capital_loss_mean, variance=capital_loss_std)(inputs['capital_loss'])
hours_per_week_output = tf.keras.layers.Normalization(
      axis=None, mean=hours_per_week_mean, variance=hours_per_week_std)(inputs['hours_per_week'])

concatenated_outputs = tf.keras.layers.Concatenate()([work, edu, marital, occ, relat, race, gender, country,
                                                      age_output, education_num_output, capital_gain_output,
                                                      capital_loss_output, hours_per_week_output])

preprocessing_model = tf.keras.Model(inputs, concatenated_outputs)

In [53]:
categoricalTrainSubset = df_train

In [54]:
categoricalFeatureDict = {k: v.to_numpy()[:, tf.newaxis] for k,v in dict(categoricalTrainSubset).items()}

In [55]:
categorical_dict_ds = tf.data.Dataset.from_tensor_slices((categoricalFeatureDict , target_array))  # !!!

In [56]:
batch_size=32

categoricalDataset = categorical_dict_ds.map(lambda x, y: (preprocessing_model(x), y),
                                             num_parallel_calls=tf.data.AUTOTUNE)
categoricalDataset = categoricalDataset.batch(batch_size)

In [59]:
# DL Approach

model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(13,)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# 8. Compilazione del modello
optimizer=tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

In [60]:
# 9. Allenamento del modello
model.fit(categoricalDataset, epochs=20)

Epoch 1/20
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.7985 - loss: 0.4474
Epoch 2/20
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.8278 - loss: 0.3773
Epoch 3/20
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.8351 - loss: 0.3548
Epoch 4/20
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.8382 - loss: 0.3483
Epoch 5/20
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.8411 - loss: 0.3404
Epoch 6/20
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.8418 - loss: 0.3374
Epoch 7/20
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.8443 - loss: 0.3343
Epoch 8/20
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.8433 - loss: 0.3345
Epoch 9/20
[1m1018/1018

<keras.src.callbacks.history.History at 0x79ba0eb22aa0>

## Testing phase

In [61]:
categoricalTestSubset = df_test
target_arrayTest =  y_test.to_numpy()[:, tf.newaxis]

categoricalFeatureDictTest = {k: v.to_numpy()[:, tf.newaxis] for k,v in dict(categoricalTestSubset).items()}
categorical_dict_dsTest = tf.data.Dataset.from_tensor_slices((categoricalFeatureDictTest , target_arrayTest))  # !!!
batch_size=32

testDataset = categorical_dict_dsTest.map(lambda x, y: (preprocessing_model(x), y),
                                             num_parallel_calls=tf.data.AUTOTUNE)
testDataset = testDataset.batch(batch_size)

model.evaluate(testDataset)

[1m509/509[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8465 - loss: 0.3267


[0.32561537623405457, 0.8492107391357422]

# Maybe increase model size

In [63]:
# DL Approach

model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(13,)),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# 8. Compilazione del modello
optimizer=tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

In [64]:
# 9. Allenamento del modello
model.fit(categoricalDataset, epochs=20)

Epoch 1/20
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.8092 - loss: 0.4224
Epoch 2/20
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.8337 - loss: 0.3559
Epoch 3/20
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.8387 - loss: 0.3438
Epoch 4/20
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.8414 - loss: 0.3397
Epoch 5/20
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.8429 - loss: 0.3339
Epoch 6/20
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.8443 - loss: 0.3298
Epoch 7/20
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.8454 - loss: 0.3281
Epoch 8/20
[1m1018/1018[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.8464 - loss: 0.3255
Epoch 9/20
[1m1018/1018

<keras.src.callbacks.history.History at 0x79ba0eb22c80>

In [65]:
categoricalTestSubset = df_test
target_arrayTest =  y_test.to_numpy()[:, tf.newaxis]

categoricalFeatureDictTest = {k: v.to_numpy()[:, tf.newaxis] for k,v in dict(categoricalTestSubset).items()}
categorical_dict_dsTest = tf.data.Dataset.from_tensor_slices((categoricalFeatureDictTest , target_arrayTest))  # !!!
batch_size=32

testDataset = categorical_dict_dsTest.map(lambda x, y: (preprocessing_model(x), y),
                                             num_parallel_calls=tf.data.AUTOTUNE)
testDataset = testDataset.batch(batch_size)

model.evaluate(testDataset)

[1m509/509[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8438 - loss: 0.3378


[0.338194876909256, 0.8463239073753357]

# Next steps:
1. bucket the 'age' column
2. Intersecting Multiple Columns with CrossedColumn
3. Adding Regularization to Prevent Overfitting
4. real implementation of the paper