# Dealing with categorical variables

In [26]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, OneHotEncoder 
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Input, concatenate
from tensorflow.keras.utils import plot_model

In [4]:
data = pd.read_csv('D:/data/csv/breast_cancer.csv', header=None)
dataset = data.values
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,'40-49','premeno','15-19','0-2','yes','3','right','left_up','no','recurrence-events'
1,'50-59','ge40','15-19','0-2','no','1','right','central','no','no-recurrence-events'
2,'50-59','ge40','35-39','0-2','no','2','left','left_low','no','recurrence-events'
3,'40-49','premeno','35-39','0-2','yes','3','right','left_low','yes','no-recurrence-events'
4,'40-49','premeno','30-34','3-5','yes','2','left','right_up','no','recurrence-events'


In [5]:
# load the dataset
def load_dataset(filename):
    # load the dataset as a pandas DataFrame
    data = pd.read_csv(filename, header=None)
    # retrieve numpy array
    dataset = data.values
    # split into input (X) and output (y) variables
    X = dataset[:, :-1]
    y = dataset[:,-1]
    # format all fields as string
    X = X.astype(str)
    # reshape target to be a 2d array
    y = y.reshape((len(y), 1))
    return X, y

In [6]:
# load the dataset
X, y = load_dataset('D:/data/csv/breast_cancer.csv')
# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
# summarize
print('Train', X_train.shape, y_train.shape)
print('Test', X_test.shape, y_test.shape)

Train (191, 9) (191, 1)
Test (95, 9) (95, 1)


## Ordinal encoding

In [13]:
# prepare input data
def prepare_inputs(X_train, X_test):
    oe = OrdinalEncoder()
    oe.fit(X_train)
    X_train_enc = oe.transform(X_train)
    X_test_enc = oe.transform(X_test)
    return X_train_enc, X_test_enc

In [14]:
# prepare target
def prepare_targets(y_train, y_test):
    le = LabelEncoder()
    le.fit(y_train)
    y_train_enc = le.transform(y_train)
    y_test_enc = le.transform(y_test)
    return y_train_enc, y_test_enc

In [15]:
# prepare input data
X_train_enc, X_test_enc = prepare_inputs(X_train, X_test)
# prepare output data
y_train_enc, y_test_enc = prepare_targets(y_train, y_test)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


## Build the model

In [16]:
# define the model
model = Sequential()
model.add(Dense(10, input_dim=X_train_enc.shape[1], activation='relu', kernel_initializer='he_normal'))
model.add(Dense(1, activation='sigmoid'))
# compile the keras model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit the keras model on the dataset
model.fit(X_train_enc, y_train_enc, epochs=100, batch_size=16, verbose=2)
# evaluate the keras model
_, accuracy = model.evaluate(X_test_enc, y_test_enc, verbose=0)
print('Accuracy: %.2f' % (accuracy*100))

Epoch 1/100
12/12 - 0s - loss: 0.6949 - accuracy: 0.5497
Epoch 2/100
12/12 - 0s - loss: 0.6534 - accuracy: 0.6545
Epoch 3/100
12/12 - 0s - loss: 0.6374 - accuracy: 0.6806
Epoch 4/100
12/12 - 0s - loss: 0.6306 - accuracy: 0.6806
Epoch 5/100
12/12 - 0s - loss: 0.6218 - accuracy: 0.6911
Epoch 6/100
12/12 - 0s - loss: 0.6183 - accuracy: 0.6859
Epoch 7/100
12/12 - 0s - loss: 0.6122 - accuracy: 0.6963
Epoch 8/100
12/12 - 0s - loss: 0.6076 - accuracy: 0.7068
Epoch 9/100
12/12 - 0s - loss: 0.6029 - accuracy: 0.6963
Epoch 10/100
12/12 - 0s - loss: 0.5984 - accuracy: 0.7173
Epoch 11/100
12/12 - 0s - loss: 0.5941 - accuracy: 0.7382
Epoch 12/100
12/12 - 0s - loss: 0.5900 - accuracy: 0.7382
Epoch 13/100
12/12 - 0s - loss: 0.5874 - accuracy: 0.7382
Epoch 14/100
12/12 - 0s - loss: 0.5822 - accuracy: 0.7382
Epoch 15/100
12/12 - 0s - loss: 0.5807 - accuracy: 0.7382
Epoch 16/100
12/12 - 0s - loss: 0.5780 - accuracy: 0.7382
Epoch 17/100
12/12 - 0s - loss: 0.5726 - accuracy: 0.7435
Epoch 18/100
12/12 - 0s

## One hot encoding

In [20]:
# prepare input data
def prepare_inputs(X_train, X_test):
    ohe = OneHotEncoder()
    ohe.fit(X_train)
    X_train_enc = ohe.transform(X_train)
    X_test_enc = ohe.transform(X_test)
    return X_train_enc, X_test_enc

# prepare target
def prepare_targets(y_train, y_test):
    le = LabelEncoder()
    le.fit(y_train)
    y_train_enc = le.transform(y_train)
    y_test_enc = le.transform(y_test)
    return y_train_enc, y_test_enc

# prepare input data
X_train_enc, X_test_enc = prepare_inputs(X_train, X_test)
# prepare output data
y_train_enc, y_test_enc = prepare_targets(y_train, y_test)
# define the  model
model = Sequential()
model.add(Dense(10, input_dim=X_train_enc.shape[1], activation='relu', kernel_initializer='he_normal'))
model.add(Dense(1, activation='sigmoid'))
# compile the keras model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit the keras model on the dataset
model.fit(X_train_enc, y_train_enc, epochs=100, batch_size=16, verbose=2)
# evaluate the keras model
_, accuracy = model.evaluate(X_test_enc, y_test_enc, verbose=0)
print('Accuracy: %.2f' % (accuracy*100))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Epoch 1/100
12/12 - 0s - loss: 0.7595 - accuracy: 0.4503
Epoch 2/100
12/12 - 0s - loss: 0.7045 - accuracy: 0.5183
Epoch 3/100
12/12 - 0s - loss: 0.6570 - accuracy: 0.5916
Epoch 4/100
12/12 - 0s - loss: 0.6267 - accuracy: 0.6230
Epoch 5/100
12/12 - 0s - loss: 0.6006 - accuracy: 0.6754
Epoch 6/100
12/12 - 0s - loss: 0.5821 - accuracy: 0.7016
Epoch 7/100
12/12 - 0s - loss: 0.5688 - accuracy: 0.7120
Epoch 8/100
12/12 - 0s - loss: 0.5585 - accuracy: 0.7277
Epoch 9/100
12/12 - 0s - loss: 0.5498 - accuracy: 0.7330
Epoch 10/100
12/12 - 0s - loss: 0.5430 - accuracy: 0.7382
Epoch 11/100
12/12 - 0s - loss: 0.5375 - accuracy: 0.7487
Epoch 12/100
12/12 - 0s - loss: 0.5332 - accuracy: 0.7539
Epoch 13/100
12/12 - 0s - loss: 0.5266 - accuracy: 0.7644
Epoch 14/100
12/12 - 0s - loss: 0.5229 - accuracy: 0.7749
Epoch 15/100
12/12 - 0s - loss: 0.5164 - accuracy: 0.7749
Epoch 16/100
12/12 - 0s - loss: 0.5126 - accuracy: 0.7749
Epoch 17/100
12/12 - 0s - loss: 0.5092 - accuracy: 0.7749
Epoch 18/100
12/12 - 0s

## Learned Embeddings

In [28]:
# prepare input data
def prepare_inputs(X_train, X_test):
    X_train_enc, X_test_enc = list(), list()
    # label encode each column
    for i in range(X_train.shape[1]):
        le = LabelEncoder()
        le.fit(X_train[:, i])
        # encode
        train_enc = le.transform(X_train[:, i])
        test_enc = le.transform(X_test[:, i])
        # store
        X_train_enc.append(train_enc)
        X_test_enc.append(test_enc)
        return X_train_enc, X_test_enc

# prepare target
def prepare_targets(y_train, y_test):
    le = LabelEncoder()
    le.fit(y_train)
    y_train_enc = le.transform(y_train)
    y_test_enc = le.transform(y_test)
    return y_train_enc, y_test_enc

In [39]:
X_train.shape, X_test.shape

((191, 9), (95, 9))

In [36]:
# prepare input data
X_train_enc, X_test_enc = prepare_inputs(X_train, X_test)
print(len(X_train_enc), len(X_test_enc)) 
# prepare output data
y_train_enc, y_test_enc = prepare_targets(y_train, y_test)
print(len(y_train_enc), len(y_test_enc))             

1 1
191 95


In [33]:
# make output 3d
y_train_enc = y_train_enc.reshape((len(y_train_enc), 1, 1))
y_test_enc = y_test_enc.reshape((len(y_test_enc), 1, 1))

# prepare each input head
in_layers = list()
em_layers = list()
for i in range(len(X_train_enc)):
    # calculate the number of unique inputs
    n_labels = len(np.unique(X_train_enc[i]))
    # define input layer
    in_layer = Input(shape=(1,))
    # define embedding layer
    em_layer = Embedding(n_labels, 10)(in_layer)
    # store layers
    in_layers.append(in_layer)
    em_layers.append(em_layer)

1

In [31]:
# concat all embeddings
merge = concatenate(em_layers)
dense = Dense(10, activation='relu', kernel_initializer='he_normal')(merge)
output = Dense(1, activation='sigmoid')(dense)
model = Model(inputs=in_layers, outputs=output)
# compile the keras model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# plot graph
plot_model(model, show_shapes=True, to_file='embeddings.png')
# fit the keras model on the dataset
model.fit(X_train_enc, y_train_enc, epochs=20, batch_size=16, verbose=2)
# evaluate the keras model
_, accuracy = model.evaluate(X_test_enc, y_test_enc, verbose=0)
print('Accuracy: %.2f' % (accuracy*100))

ValueError: A `Concatenate` layer should be called on a list of at least 2 inputs

## Credits & Links

https://mc.ai/3-ways-to-encode-categorical-variables-for-deep-learning/
https://towardsdatascience.com/deep-embeddings-for-categorical-variables-cat2vec-b05c8ab63ac0