## DNN - Keras and categorical feature embedding
### Demostration code from Kaggle:
#### https://www.kaggle.com/blaskowitz100/dnn-keras-and-categorical-feature-embedding

In [43]:
# Import the necessary libraries
import os
import time
import datetime
import numpy as np
import pandas as pd

# Keras
import tensorflow.keras as keras
from tensorflow.keras import layers
from tensorflow.keras import models
from tensorflow.keras import callbacks
from tensorflow.keras import backend as K

# Standard ML stuff
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import FastICA
from sklearn.random_projection import GaussianRandomProjection 
from sklearn.random_projection import SparseRandomProjection

from keras.models import Sequential
from keras.layers import Dense

# Oversampling of minority class 'Churn customers'
from imblearn.over_sampling import SMOTE

# Plotting
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

In [44]:
#setting the data directory
os.chdir('/home/mike/Documents/mkp_code/Institute of Data Course/telco-customer-churn-project/data/processed')

In [45]:
customer_data = pd.read_csv('Telco-Customer-Churn-Processed.csv')

* Remove the Unnamed: 0 Column

In [46]:
# Remove an unnecessary columns
customer_data.drop('Unnamed: 0',axis =1, inplace=True)

In [62]:
pca = PCA(n_components=3)
_X = pca.fit_transform(customer_data)
pca_data = pd.DataFrame(_X, columns=["PCA1", "PCA2", "PCA3"])
customer_data[["PCA1", "PCA2", "PCA3"]] = pca_data

fica = FastICA(n_components=3)
_X = fica.fit_transform(customer_data)
fica_data = pd.DataFrame(_X, columns=["FICA1", "FICA2", "FICA3"])
customer_data[["FICA1", "FICA2", "FICA3"]] = fica_data

tsvd = TruncatedSVD(n_components=3)
_X = tsvd.fit_transform(customer_data)
tsvd_data = pd.DataFrame(_X, columns=["TSVD1", "TSVD2", "TSVD3"])
customer_data[["TSVD1", "TSVD2", "TSVD3"]] = tsvd_data

grp = GaussianRandomProjection(n_components=3)
_X = grp.fit_transform(customer_data)
grp_data = pd.DataFrame(_X, columns=["GRP1", "GRP2", "GRP3"])
customer_data[["GRP1", "GRP2", "GRP3"]] = grp_data

srp = SparseRandomProjection(n_components=3)
_X = srp.fit_transform(customer_data)
srp_data = pd.DataFrame(_X, columns=["SRP1", "SRP2", "SRP3"])
customer_data[["SRP1", "SRP2", "SRP3"]] = srp_data

numeric_cols.extend(pca_data.columns.values)
numeric_cols.extend(fica_data.columns.values)
numeric_cols.extend(tsvd_data.columns.values)
numeric_cols.extend(grp_data.columns.values)
numeric_cols.extend(srp_data.columns.values)



In [66]:
# Set Constants
categorical_cols = ['gender', 'Partner', 'Dependents', 'PhoneService','MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'tenure_group']

FEATURE_COLS = customer_data.columns
TARGET_COL = 'Churn'
EPOCHS = 50
BATCH_SIZE = 4
CLASS_WEIGHTS = {0 : 1., 1 : 2.5}

In [67]:
# Placeholders for the model
cat_inputs = []
num_inputs = []
embeddings = []
embedding_layer_names = []
emb_n = 10

In [69]:
# Embedding for categorical features
for col in categorical_cols:
    _input = layers.Input(shape=[1], name=col)
    _embed = layers.Embedding(customer_data[col].max() + 1, emb_n, name=col+'_emb')(_input)
    cat_inputs.append(_input)
    embeddings.append(_embed)
    embedding_layer_names.append(col+'_emb')
    
# Simple inputs for the numeric features
for col in numeric_cols:
    numeric_input = layers.Input(shape=(1,), name=col)
    num_inputs.append(numeric_input)
    
# Merge the numeric inputs
merged_num_inputs = layers.concatenate(num_inputs)
#numeric_dense = layers.Dense(20, activation='relu')(merged_num_inputs)

# Merge embedding and use a Droput to prevent overfittting
merged_inputs = layers.concatenate(embeddings)
spatial_dropout = layers.SpatialDropout1D(0.2)(merged_inputs)
flat_embed = layers.Flatten()(spatial_dropout)

# Merge embedding and numeric features
all_features = layers.concatenate([flat_embed, merged_num_inputs])

# MLP for classification
x = layers.Dropout(0.2)(layers.Dense(100, activation='relu')(all_features))
x = layers.Dropout(0.2)(layers.Dense(50, activation='relu')(x))
x = layers.Dropout(0.2)(layers.Dense(25, activation='relu')(x))
x = layers.Dropout(0.2)(layers.Dense(15, activation='relu')(x))

# Final model
output = layers.Dense(1, activation='sigmoid')(x)
model = models.Model(inputs=cat_inputs + num_inputs, outputs=output)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [70]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
gender (InputLayer)             [(None, 1)]          0                                            
__________________________________________________________________________________________________
Partner (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
Dependents (InputLayer)         [(None, 1)]          0                                            
__________________________________________________________________________________________________
PhoneService (InputLayer)       [(None, 1)]          0                                            


In [86]:
# TB Callback
log_folder = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d-%H-%M-%S')
tb_callback = callbacks.TensorBoard(
    log_dir=os.path.join('tb-logs', log_folder),)

In [87]:
# Best model callback
bm_callback = callbacks.ModelCheckpoint(
    filepath=os.path.join('tb-logs', log_folder, 'bm.h5'),
    save_best_only=True,
    save_weights_only=False
)

In [94]:
train_df, test_df = train_test_split(customer_data, test_size=0.15, random_state=42)
print(train_df.shape)

(5986, 36)


In [95]:
def get_keras_dataset(df):
    X = {str(col) : np.array(df[col]) for col in df.columns}
    return X

In [96]:
_hist = model.fit(
    x=get_keras_dataset(train_df[FEATURE_COLS]),
    y=train_df[TARGET_COL],
    validation_data=(get_keras_dataset(test_df[FEATURE_COLS]), test_df[TARGET_COL]),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    class_weight=CLASS_WEIGHTS,
    callbacks=[tb_callback, bm_callback],
    verbose=2
)

Train on 5986 samples, validate on 1057 samples
Epoch 1/50


ValueError: Could not pack sequence. Structure had 48 elements, but flat_sequence had 16 elements.  Structure: [<tf.Tensor 'gender_emb/embedding_lookup/Identity_1:0' shape=(?, 1, 10) dtype=float32>, <tf.Tensor 'Partner_emb/embedding_lookup/Identity_1:0' shape=(?, 1, 10) dtype=float32>, <tf.Tensor 'Dependents_emb/embedding_lookup/Identity_1:0' shape=(?, 1, 10) dtype=float32>, <tf.Tensor 'PhoneService_emb/embedding_lookup/Identity_1:0' shape=(?, 1, 10) dtype=float32>, <tf.Tensor 'MultipleLines_emb/embedding_lookup/Identity_1:0' shape=(?, 1, 10) dtype=float32>, <tf.Tensor 'InternetService_emb/embedding_lookup/Identity_1:0' shape=(?, 1, 10) dtype=float32>, <tf.Tensor 'OnlineSecurity_emb/embedding_lookup/Identity_1:0' shape=(?, 1, 10) dtype=float32>, <tf.Tensor 'OnlineBackup_emb/embedding_lookup/Identity_1:0' shape=(?, 1, 10) dtype=float32>, <tf.Tensor 'DeviceProtection_emb/embedding_lookup/Identity_1:0' shape=(?, 1, 10) dtype=float32>, <tf.Tensor 'TechSupport_emb/embedding_lookup/Identity_1:0' shape=(?, 1, 10) dtype=float32>, <tf.Tensor 'StreamingTV_emb/embedding_lookup/Identity_1:0' shape=(?, 1, 10) dtype=float32>, <tf.Tensor 'StreamingMovies_emb/embedding_lookup/Identity_1:0' shape=(?, 1, 10) dtype=float32>, <tf.Tensor 'Contract_emb/embedding_lookup/Identity_1:0' shape=(?, 1, 10) dtype=float32>, <tf.Tensor 'PaperlessBilling_emb/embedding_lookup/Identity_1:0' shape=(?, 1, 10) dtype=float32>, <tf.Tensor 'PaymentMethod_emb/embedding_lookup/Identity_1:0' shape=(?, 1, 10) dtype=float32>, <tf.Tensor 'tenure_group_emb/embedding_lookup/Identity_1:0' shape=(?, 1, 10) dtype=float32>, <tf.Tensor 'gender_emb_1/embedding_lookup/Identity_1:0' shape=(?, 1, 10) dtype=float32>, <tf.Tensor 'Partner_emb_1/embedding_lookup/Identity_1:0' shape=(?, 1, 10) dtype=float32>, <tf.Tensor 'Dependents_emb_1/embedding_lookup/Identity_1:0' shape=(?, 1, 10) dtype=float32>, <tf.Tensor 'PhoneService_emb_1/embedding_lookup/Identity_1:0' shape=(?, 1, 10) dtype=float32>, <tf.Tensor 'MultipleLines_emb_1/embedding_lookup/Identity_1:0' shape=(?, 1, 10) dtype=float32>, <tf.Tensor 'InternetService_emb_1/embedding_lookup/Identity_1:0' shape=(?, 1, 10) dtype=float32>, <tf.Tensor 'OnlineSecurity_emb_1/embedding_lookup/Identity_1:0' shape=(?, 1, 10) dtype=float32>, <tf.Tensor 'OnlineBackup_emb_1/embedding_lookup/Identity_1:0' shape=(?, 1, 10) dtype=float32>, <tf.Tensor 'DeviceProtection_emb_1/embedding_lookup/Identity_1:0' shape=(?, 1, 10) dtype=float32>, <tf.Tensor 'TechSupport_emb_1/embedding_lookup/Identity_1:0' shape=(?, 1, 10) dtype=float32>, <tf.Tensor 'StreamingTV_emb_1/embedding_lookup/Identity_1:0' shape=(?, 1, 10) dtype=float32>, <tf.Tensor 'StreamingMovies_emb_1/embedding_lookup/Identity_1:0' shape=(?, 1, 10) dtype=float32>, <tf.Tensor 'Contract_emb_1/embedding_lookup/Identity_1:0' shape=(?, 1, 10) dtype=float32>, <tf.Tensor 'PaperlessBilling_emb_1/embedding_lookup/Identity_1:0' shape=(?, 1, 10) dtype=float32>, <tf.Tensor 'PaymentMethod_emb_1/embedding_lookup/Identity_1:0' shape=(?, 1, 10) dtype=float32>, <tf.Tensor 'tenure_group_emb_1/embedding_lookup/Identity_1:0' shape=(?, 1, 10) dtype=float32>, <tf.Tensor 'gender_emb_2/embedding_lookup/Identity_1:0' shape=(?, 1, 10) dtype=float32>, <tf.Tensor 'Partner_emb_2/embedding_lookup/Identity_1:0' shape=(?, 1, 10) dtype=float32>, <tf.Tensor 'Dependents_emb_2/embedding_lookup/Identity_1:0' shape=(?, 1, 10) dtype=float32>, <tf.Tensor 'PhoneService_emb_2/embedding_lookup/Identity_1:0' shape=(?, 1, 10) dtype=float32>, <tf.Tensor 'MultipleLines_emb_2/embedding_lookup/Identity_1:0' shape=(?, 1, 10) dtype=float32>, <tf.Tensor 'InternetService_emb_2/embedding_lookup/Identity_1:0' shape=(?, 1, 10) dtype=float32>, <tf.Tensor 'OnlineSecurity_emb_2/embedding_lookup/Identity_1:0' shape=(?, 1, 10) dtype=float32>, <tf.Tensor 'OnlineBackup_emb_2/embedding_lookup/Identity_1:0' shape=(?, 1, 10) dtype=float32>, <tf.Tensor 'DeviceProtection_emb_2/embedding_lookup/Identity_1:0' shape=(?, 1, 10) dtype=float32>, <tf.Tensor 'TechSupport_emb_2/embedding_lookup/Identity_1:0' shape=(?, 1, 10) dtype=float32>, <tf.Tensor 'StreamingTV_emb_2/embedding_lookup/Identity_1:0' shape=(?, 1, 10) dtype=float32>, <tf.Tensor 'StreamingMovies_emb_2/embedding_lookup/Identity_1:0' shape=(?, 1, 10) dtype=float32>, <tf.Tensor 'Contract_emb_2/embedding_lookup/Identity_1:0' shape=(?, 1, 10) dtype=float32>, <tf.Tensor 'PaperlessBilling_emb_2/embedding_lookup/Identity_1:0' shape=(?, 1, 10) dtype=float32>, <tf.Tensor 'PaymentMethod_emb_2/embedding_lookup/Identity_1:0' shape=(?, 1, 10) dtype=float32>, <tf.Tensor 'tenure_group_emb_2/embedding_lookup/Identity_1:0' shape=(?, 1, 10) dtype=float32>], flat_sequence: [<tensorflow.python.keras.utils.tf_utils.ListWrapper object at 0x7f546010db38>, <tensorflow.python.keras.utils.tf_utils.ListWrapper object at 0x7f546010d3c8>, <tensorflow.python.keras.utils.tf_utils.ListWrapper object at 0x7f546010dfd0>, <tensorflow.python.keras.utils.tf_utils.ListWrapper object at 0x7f546010d978>, <tensorflow.python.keras.utils.tf_utils.ListWrapper object at 0x7f5460123e48>, <tensorflow.python.keras.utils.tf_utils.ListWrapper object at 0x7f5460123748>, <tensorflow.python.keras.utils.tf_utils.ListWrapper object at 0x7f5460123b38>, <tensorflow.python.keras.utils.tf_utils.ListWrapper object at 0x7f5460123c88>, <tensorflow.python.keras.utils.tf_utils.ListWrapper object at 0x7f5460123ac8>, <tensorflow.python.keras.utils.tf_utils.ListWrapper object at 0x7f5460123f98>, <tensorflow.python.keras.utils.tf_utils.ListWrapper object at 0x7f5460123908>, <tensorflow.python.keras.utils.tf_utils.ListWrapper object at 0x7f54601235f8>, <tensorflow.python.keras.utils.tf_utils.ListWrapper object at 0x7f5460123f60>, <tensorflow.python.keras.utils.tf_utils.ListWrapper object at 0x7f5460123208>, <tensorflow.python.keras.utils.tf_utils.ListWrapper object at 0x7f5460123cf8>, <tensorflow.python.keras.utils.tf_utils.ListWrapper object at 0x7f54601233c8>].

In [72]:
# Plot the results of the training
def plot_history(history):
    fig = plt.figure(figsize=(15,8))
    ax = plt.subplot(211)
    
    plt.xlabel('Epoch')
    plt.ylabel('loss, acc')
    
    # losses
    ax.plot(history.epoch, history.history['loss'], label='Train LOSS')
    ax.plot(history.epoch, history.history['val_loss'], label='Val LOSS')
    ax.plot(history.epoch, history.history['acc'], label ='Train Accuracy')
    ax.plot(history.epoch, history.history['val_acc'], label='Val Accuracy')
    plt.legend()
    
    # Plot the learning_rate
    if 'lr' in history.history:
        ax = plt.subplot(212)
        plt.ylabel('Learning rate')
        ax.plot(history.epoch, history.history['lr'], label='learning_rate')
        plt.legend()
    plt.show()

In [59]:
# Export the transformed data to verify 
customer_data.to_csv('/home/mike/Documents/mkp_code/Institute of Data Course/telco-customer-churn-project/data/processed/Telco-Customer-Churn-Keras.csv')