# Approaching Categorical Variables II

Now that we have covered all the preprocessing of categorical features, we can use these techniques in model training and evaluation. Contrary to the book, native scikit-learn classes and methods are used for pipelines and cross validation.

- **Preprocessing:** This step is run through a pipeline and feature distinction is performed automatically using `make_column_selector` method. This is important because inproduction, data ingestion and processing step needs to automated.
- **Model evaluation:** Scikit-learn provides a cross validation function yielding scores instead of preparing folds manually. In addition, different folding strategies can be passed to the function `cross_val_score`.

In [3]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

M = 20_000 # Sample size

df = pd.read_csv("data/catinthedat_train.csv").drop("id", axis=1)
df_X = df.drop("target", axis=1)
df_y = df.target.values

categorical_columns = df_X.select_dtypes(exclude="number").columns
df_X[categorical_columns] = df_X[categorical_columns].astype("category")

simple_imputer = SimpleImputer(strategy="median")
one_hot_encoder = OneHotEncoder(handle_unknown="ignore")

ct = ColumnTransformer(
    [
        ("numeric_transform", simple_imputer, selector(dtype_exclude="category")),
        ("categoric_transform", one_hot_encoder, selector(dtype_include="category")),
    ],
    remainder="passthrough",
    n_jobs=-1,
    verbose=True,
)

X_transformed = ct.fit_transform(df_X)

linear_regression = LogisticRegression(max_iter=1000)

cross_val_score(linear_regression, X_transformed[:M,:], df_y[:M], n_jobs=-1, scoring="roc_auc")

[ColumnTransformer]  (1 of 2) Processing numeric_transform, total=   0.7s
[ColumnTransformer]  (2 of 2) Processing categoric_transform, total=   2.6s


array([0.72555338, 0.71580379, 0.73621327, 0.73979058, 0.74117373])

Book implementation

In [None]:
from sklearn import linear_model
from sklearn import preprocessing
from sklearn import metrics


def run(fold):
    df = pd.read_csv('data/catinthedat_train_folds.csv')
    
    features = [ f for f in df.columns if f not in ['id','target','kfold']]
    
    for col in features:
        df.loc[:,col] = df[col].astype(str).fillna('NONE')
        
    df_train = df[df.kfold != fold].reset_index(drop=True)
    
    df_val = df[df.kfold == fold].reset_index(drop=True)
    ohe = preprocessing.OneHotEncoder()
    
    full_data = pd.concat([df_train[features], df_val[features]], axis=0)
    ohe.fit(full_data[features])
    x_train = ohe.transform(df_train[features])
    x_valid = ohe.transform(df_val[features])
    
    model = linear_model.LogisticRegression()
    
    model.fit(X=x_train, y=df_train.target.values)
    valid_predictions = model.predict_proba(x_valid)[:,1]
    auc = metrics.roc_auc_score(df_val.target.values, valid_predictions)
    print(f'Fold = {fold}, Accuracy = {auc}')
    return x_train, x_valid, model
    
for fold_ in range(1):
    x_train, x_valid, model = run(fold_)

In [None]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import xgboost as xgb
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder

M = 100_000  # Sample size

df = pd.read_csv("data/catinthedat_train.csv").drop("id", axis=1)
df_X = df.drop("target", axis=1)
df_y = df.target.values.astype(int)

categorical_columns = df_X.select_dtypes(exclude="number").columns
df_X[categorical_columns] = df_X[categorical_columns].astype(str).fillna("None")

simple_imputer = SimpleImputer(strategy="median")
one_hot_encoder = OneHotEncoder(handle_unknown="ignore")
feature_encoder = OrdinalEncoder()

ct = ColumnTransformer(
    [
        ("numeric_transform", simple_imputer, selector(dtype_include="number")),
        ("categoric_transform", feature_encoder, selector(dtype_exclude="number"))
    ],
    remainder="passthrough",
    # n_jobs=-1,
    verbose=True,
)

X_transformed = ct.fit_transform(df_X)

xtreme_gradient_boosting = xgb.XGBClassifier(
    n_jobs=-1,
    max_depth=7,
    n_estimators=200,
    use_label_encoder=False,
)

cross_val_score(
    xtreme_gradient_boosting,
    X_transformed[:M, :],
    df_y[:M],
    n_jobs=-1
)


[ColumnTransformer]  (1 of 2) Processing numeric_transform, total=   0.4s
[ColumnTransformer]  (2 of 2) Processing categoric_transform, total=   2.0s


  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index




array([0.80795, 0.8095 , 0.80755, 0.80645, 0.8138 ])

## US Adult Census Data

- This is a somewhat more manageable dataset consisting of few numerical features. 

In [None]:
df_adult = pd.read_csv('data/adult.csv')
df_adult = df_adult.rename(columns={'income':'target'})

In [None]:
from sklearn.model_selection import StratifiedKFold

df_adult = df_adult.sample(frac=1).reset_index(drop=True)
df_adult['kfold'] = -1
kf = StratifiedKFold(n_splits=5)
y = df_adult.target.values

for fold_, (train_, val_) in enumerate(kf.split(df_adult, y=y)):
    df_adult.loc[val_, 'kfold'] = fold_

df_adult.to_csv('data/adult_folds.csv', index=False)

In [None]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# nnum_features = [f for f in df_adult.columns 
#                  if f not in ['age','fnlwgt','education.num',
#                                                'capital.gain','capital.loss',
#                                                'hours.per.week','kfold']]

# for col in nnum_features:
#     lbl_encoder = LabelEncoder()
#     lbl_encoder.fit(df_adult[col])
#     df_adult[col] = lbl_encoder.transform(df_adult[col])


In [None]:
import pandas as pd
from sklearn import ensemble
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
import xgboost as xgb

def run(fold):
    df = pd.read_csv('data/adult_folds.csv')

    # list of numerical columns
    num_cols = [
    "fnlwgt",
    "age",
    "capital.gain",
    "capital.loss",
    "hours.per.week",
    ]
    target_maps = {'<=50K':0,'>50K':1}
    df.loc[:,'target'] = df.target.map(target_maps)
    # drop numerical columns
#     df = df.drop(num_cols, axis=1)
    
    features = [f for f in df.columns if f not in ['kfold','target']]
    
    for col in features:
        if col not in num_cols:
            df.loc[:,col] = df[col].astype(str).fillna('NONE')
            lbl_encoder = LabelEncoder()
            lbl_encoder.fit(df[col])
            df[col] = lbl_encoder.transform(df[col])
    
    df_train = df[df.kfold != fold].reset_index(drop=True)
    x_train = df_train[features].values
    y_train = df_train.target.values
    
    df_valid = df[df.kfold == fold].reset_index(drop=True)
    x_valid = df_valid[features].values
    y_valid = df_valid.target.values
    
    model = xgb.XGBClassifier(
        n_jobs=-1,
        max_depth=7,
        n_estimators=100
        )
#     model = LogisticRegression()
    model.fit(x_train, y_train)
    
    valid_preds = model.predict_proba(x_valid)[:, 1]
    # get roc auc score
    auc = metrics.roc_auc_score(y_valid, valid_preds)
    print(f"Fold = {fold}, AUC = {auc}")
    
for fold in range(5):
    run(fold)

In [None]:
# Let's write a feature engineering functionf

def feature_eng(df, cat_cols):
    import itertools
    
    feat_comb = list(itertools.combinations(cat_cols,2))
    
    for c1,c2 in feat_comb:
        feat_name = f'{c1}_{c2}'
        df.loc[:,feat_name] = df[c1].astype(str) + '_' + df[c2].astype(str)
        
    return df

In [None]:
import pandas as pd
from sklearn import ensemble
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
import xgboost as xgb

def run(fold):
    df = pd.read_csv('data/adult_folds.csv')

    # list of numerical columns
    num_cols = [
    "fnlwgt",
    "age",
    "capital.gain",
    "capital.loss",
    "hours.per.week",
    ]
    
    cat_cols = [f for f in df.columns 
                if f not in num_cols and f not in ['kfold','target'] ]
    
    target_maps = {'<=50K':0,'>50K':1}
    df.loc[:,'target'] = df.target.map(target_maps)
    # drop numerical columns
#     df = df.drop(num_cols, axis=1)
    df = feature_eng(df, cat_cols)
    
    features = [f for f in df.columns if f not in ['kfold','target']]
    
    for col in features:
        if col not in num_cols:
            df.loc[:,col] = df[col].astype(str).fillna('NONE')
            lbl_encoder = LabelEncoder()
            lbl_encoder.fit(df[col])
            df[col] = lbl_encoder.transform(df[col])
    
    df_train = df[df.kfold != fold].reset_index(drop=True)
    x_train = df_train[features].values
    y_train = df_train.target.values
    
    df_valid = df[df.kfold == fold].reset_index(drop=True)
    x_valid = df_valid[features].values
    y_valid = df_valid.target.values
    
    model = xgb.XGBClassifier(
        n_jobs=-1,
        max_depth=7,
        n_estimators=100
        )
#     model = LogisticRegression()
    model.fit(x_train, y_train)
    
    valid_preds = model.predict_proba(x_valid)[:, 1]
    # get roc auc score
    auc = metrics.roc_auc_score(y_valid, valid_preds)
    print(f"Fold = {fold}, AUC = {auc}")
    
for fold in range(5):
    run(fold)

# Entity Embedding

When number of categorical features increases, transformed matrices might have enormous number of columns. That's why we need another way to denote categories.

In [1]:
import os
import gc
import joblib
import pandas as pd
import numpy as np
from sklearn import metrics, preprocessing
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras.models import Model, load_model
from tensorflow.keras import callbacks
from tensorflow.keras import backend as K
from tensorflow.keras import utils


def create_model(data, catcols):
    """
    This function returns a compiled tf.keras model
    for entity embeddings
    :param data: this is a pandas dataframe
    :param catcols: list of categorical column names
    :return: compiled tf.keras model
    """
    # init list of inputs for embeddings
    inputs = []
    # init list of outputs for embeddings
    outputs = []
    # loop over all categorical columns
    for c in catcols:
        # find the number of unique values in the column
        num_unique_values = int(data[c].nunique())
        # simple dimension of embedding calculator
        # min size is half of the number of unique values
        # max size is 50. max size depends on the number of unique
        # categories too. 50 is quite sufficient most of the times
        # but if you have millions of unique values, you might need
        # a larger dimension
        embed_dim = int(min(np.ceil((num_unique_values) / 2), 50))
        # simple keras input layer with size 1
        inp = layers.Input(shape=(1,))
        # add embedding layer to raw input
        # embedding size is always 1 more than unique values in input
        out = layers.Embedding(num_unique_values + 1, embed_dim, name=c)(inp)
        # 1-d spatial dropout is the standard for emebedding layers
        # you can use it in NLP tasks too
        out = layers.SpatialDropout1D(0.3)(out)
        # reshape the input to the dimension of embedding
        # this becomes our output layer for current feature
        out = layers.Reshape(target_shape=(embed_dim,))(out)

        # add input to input list
        inputs.append(inp)
        # add output to output list

        outputs.append(out)
        # concatenate all output layers
        x = layers.Concatenate()(outputs)

        # x add a batchnorm layer.
        # from here, everything is up to you
        # you can try different architectures
        # this is the architecture I like to use
        # if you have numerical features, you should add
        # them here or in concatenate layer

        x = layers.BatchNormalization()(x)

        x = layers.Dense(300, activation="relu")(x)
        x = layers.Dropout(0.3)(x)
        x = layers.BatchNormalization()(x)

        x = layers.Dense(300, activation="relu")(x)
        x = layers.Dropout(0.3)(x)
        x = layers.BatchNormalization()(x)

        y = layers.Dense(2, activation="softmax")(x)
        model = Model(inputs=inputs, outputs=y)
        model.compile(loss="binary_crossentropy", optimizer="adam")
        return model


def run(fold):
    df = pd.read_csv("data/catinthedat_train_folds.csv").sample(frac=0.2).reset_index(drop=True)
    features = [f for f in df.columns if f not in ("id", "target", "kfold")]

    for col in features:
        df.loc[:, col] = df[col].astype(str).fillna("None")

    for col in features:
        feature_encoder = preprocessing.LabelEncoder()
        df.loc[:, col] = feature_encoder.fit_transform(df[col].values)

    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_validation = df[df.kfold == fold].reset_index(drop=True)

    model = create_model(df, features)

    X_train = tuple([df_train[features].values[:, k] for k in range(len(features))])
    X_validation = tuple([df_validation[features].values[:, k] for k in range(len(features))])

    y_train = df_train.target.values
    y_validation = df_validation.target.values

    y_train_cat = utils.to_categorical(y_train)
    y_valid_cat = utils.to_categorical(y_validation)
    # fit the model

    model.fit(
        X_train,
        y_train_cat,
        validation_data=(X_validation, y_valid_cat),
        verbose=1,
        batch_size=1024,
        epochs=1,
    )
    valid_preds = model.predict(x_valid)[:,1]
    print(metrics.roc_auc_score(y_validation, valid_preds))
    K.clear_session()

run(1)


2022-04-11 15:52:24.204239: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-04-11 15:52:24.253593: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-04-11 15:52:24.254071: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-04-11 15:52:24.255867: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

ValueError: in user code:

    File "/home/onur/companions/aaamlp/venv/lib/python3.8/site-packages/keras/engine/training.py", line 1021, in train_function  *
        return step_function(self, iterator)
    File "/home/onur/companions/aaamlp/venv/lib/python3.8/site-packages/keras/engine/training.py", line 1010, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/home/onur/companions/aaamlp/venv/lib/python3.8/site-packages/keras/engine/training.py", line 1000, in run_step  **
        outputs = model.train_step(data)
    File "/home/onur/companions/aaamlp/venv/lib/python3.8/site-packages/keras/engine/training.py", line 859, in train_step
        y_pred = self(x, training=True)
    File "/home/onur/companions/aaamlp/venv/lib/python3.8/site-packages/keras/utils/traceback_utils.py", line 67, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/home/onur/companions/aaamlp/venv/lib/python3.8/site-packages/keras/engine/input_spec.py", line 200, in assert_input_compatibility
        raise ValueError(f'Layer "{layer_name}" expects {len(input_spec)} input(s),'

    ValueError: Layer "model" expects 1 input(s), but it received 23 input tensors. Inputs received: [<tf.Tensor 'IteratorGetNext:0' shape=(None,) dtype=int64>, <tf.Tensor 'IteratorGetNext:1' shape=(None,) dtype=int64>, <tf.Tensor 'IteratorGetNext:2' shape=(None,) dtype=int64>, <tf.Tensor 'IteratorGetNext:3' shape=(None,) dtype=int64>, <tf.Tensor 'IteratorGetNext:4' shape=(None,) dtype=int64>, <tf.Tensor 'IteratorGetNext:5' shape=(None,) dtype=int64>, <tf.Tensor 'IteratorGetNext:6' shape=(None,) dtype=int64>, <tf.Tensor 'IteratorGetNext:7' shape=(None,) dtype=int64>, <tf.Tensor 'IteratorGetNext:8' shape=(None,) dtype=int64>, <tf.Tensor 'IteratorGetNext:9' shape=(None,) dtype=int64>, <tf.Tensor 'IteratorGetNext:10' shape=(None,) dtype=int64>, <tf.Tensor 'IteratorGetNext:11' shape=(None,) dtype=int64>, <tf.Tensor 'IteratorGetNext:12' shape=(None,) dtype=int64>, <tf.Tensor 'IteratorGetNext:13' shape=(None,) dtype=int64>, <tf.Tensor 'IteratorGetNext:14' shape=(None,) dtype=int64>, <tf.Tensor 'IteratorGetNext:15' shape=(None,) dtype=int64>, <tf.Tensor 'IteratorGetNext:16' shape=(None,) dtype=int64>, <tf.Tensor 'IteratorGetNext:17' shape=(None,) dtype=int64>, <tf.Tensor 'IteratorGetNext:18' shape=(None,) dtype=int64>, <tf.Tensor 'IteratorGetNext:19' shape=(None,) dtype=int64>, <tf.Tensor 'IteratorGetNext:20' shape=(None,) dtype=int64>, <tf.Tensor 'IteratorGetNext:21' shape=(None,) dtype=int64>, <tf.Tensor 'IteratorGetNext:22' shape=(None,) dtype=int64>]
