## DNN - Keras and categorical feature embedding
### Demostration code from Kaggle:
#### https://www.kaggle.com/blaskowitz100/dnn-keras-and-categorical-feature-embedding

In [1]:
# Import the necessary libraries
import os
import time
import datetime
import numpy as np
import pandas as pd

# Keras
import tensorflow.keras as keras
from tensorflow.keras import layers
from tensorflow.keras import models
from tensorflow.keras import callbacks
from tensorflow.keras import backend as K

# Standard ML stuff
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA, TruncatedSVD, FastICA
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection

from keras.models import Sequential
from keras.layers import Dense

# Oversampling of minority class 'Churn customers'
# from imblearn.over_sampling import SMOTE

# Plotting
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

Using TensorFlow backend.


In [2]:
#setting the data directory
os.chdir('/home/mike/Documents/mkp_code/Institute of Data Course/telco-customer-churn-project/data/processed')

In [3]:
customer_data = pd.read_csv('Telco-Customer-Churn-Processed.csv')

* Remove the Unnamed: 0 Column

In [4]:
customer_data.drop('Unnamed: 0',axis =1, inplace=True)

In [5]:
# Churn - target column
customer_data['Churn'].replace('Yes',1, inplace = True)
customer_data['Churn'].replace('No',0, inplace = True)

In [38]:
train_df, test_df = train_test_split(customer_data, test_size=0.15, random_state=42)
print(train_df.shape)

(5986, 21)


* Helper Functions

In [6]:
def get_keras_dataset(df):
    X = {str(col) : np.array(df[col]) for col in df.columns}
    return X

In [7]:
# Plot the results of the training
def plot_history(history):
    fig = plt.figure(figsize=(15,8))
    ax = plt.subplot(211)
    
    plt.xlabel('Epoch')
    plt.ylabel('loss, acc')
    
    # losses
    ax.plot(history.epoch, history.history['loss'], label='Train LOSS')
    ax.plot(history.epoch, history.history['val_loss'], label='Val LOSS')
    ax.plot(history.epoch, history.history['acc'], label ='Train Accuracy')
    ax.plot(history.epoch, history.history['val_acc'], label='Val Accuracy')
    plt.legend()
    
    # Plot the learning_rate
    if 'lr' in history.history:
        ax = plt.subplot(212)
        plt.ylabel('Learning rate')
        ax.plot(history.epoch, history.history['lr'], label='learning_rate')
        plt.legend()
    plt.show()

In [8]:
# Create feature category for customer tenure
def tenure_lab(customer_data) :
    if customer_data["tenure"] <= 12 :
        return "Tenure_0-12"
    elif (customer_data["tenure"] > 12) & (customer_data["tenure"] <= 24 ):
        return "Tenure_12-24"
    elif (customer_data["tenure"] > 24) & (customer_data["tenure"] <= 48) :
        return "Tenure_24-48"
    elif (customer_data["tenure"] > 48) & (customer_data["tenure"] <= 60) :
        return "Tenure_48-60"
    elif customer_data["tenure"] > 60 :
        return "Tenure_gt_60"

In [9]:
# Add the tenure group to the dataset:
customer_data["tenure_group"] = customer_data.apply(lambda customer_data:tenure_lab(customer_data), axis=1)

In [10]:
# Divide the numeric columns from the non-numeric 
numeric_cols = ['MonthlyCharges', 'TotalCharges', 'tenure']
target_col = ['Churn']

# Select categorical 
categorical_cols = customer_data.select_dtypes(include='object').columns
categorical_cols = [col for col in categorical_cols if col not in target_col]

In [11]:
# Use LabelEncoder instead of dummy categories
for col in categorical_cols:
    customer_data[col] = LabelEncoder().fit_transform(customer_data[col])

In [12]:
# Standardise the numeric data before fitting to the data to a model
customer_data[numeric_cols] = StandardScaler().fit_transform(customer_data[numeric_cols])

In [13]:
# Initialise the models
K.clear_session()

In [14]:
# Define constants
FEATURE_COLS = numeric_cols + categorical_cols
TARGET_COL = 'Churn'
EPOCHS = 50
BATCH_SIZE = 4
CLASS_WEIGHTS = {0 : 1., 1 : 2.5}

In [15]:
# model outputs
cat_inputs = []
num_inputs = []
embeddings = []
embedding_layer_names = []
emb_n = 10

In [33]:
# Embedding for categorical features
for col in categorical_cols:
    _input = layers.Input(shape=[1], name=col)
    _embed = layers.Embedding(customer_data[col].max() + 1, emb_n, name=col+'_emb')(_input)
    cat_inputs.append(_input)
    embeddings.append(_embed)
    embedding_layer_names.append(col+'_emb')
    

In [34]:
# Simple inputs for the numeric features
for col in numeric_cols:
    numeric_input = layers.Input(shape=(1,), name=col)
    num_inputs.append(numeric_input)

In [35]:
# Merge the numeric inputs
merged_num_inputs = layers.concatenate(num_inputs)
#numeric_dense = layers.Dense(20, activation='relu')(merged_num_inputs)


In [36]:
# MLP for classification
x = layers.Dropout(0.2)(layers.Dense(100, activation='relu')(all_features))
x = layers.Dropout(0.2)(layers.Dense(50, activation='relu')(x))
x = layers.Dropout(0.2)(layers.Dense(25, activation='relu')(x))
x = layers.Dropout(0.2)(layers.Dense(15, activation='relu')(x))

NameError: name 'all_features' is not defined

In [29]:
# Final model
output = layers.Dense(1, activation='sigmoid')(x)
model = models.Model(inputs=cat_inputs + num_inputs, outputs=output)

NameError: name 'x' is not defined

In [30]:
def dice_coef(y_true, y_pred, smooth=1): intersection = K.sum(K.abs(y_true y_pred), axis=-1) return (2. intersection + smooth) / (K.sum(K.square(y_true),-1) + K.sum(K.square(y_pred),-1) + smooth)

def dice_coef_loss(y_true, y_pred): return 1-dice_coef(y_true, y_pred)

SyntaxError: invalid syntax (<ipython-input-30-6ab0a19527a5>, line 1)

In [31]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 12)                264       
_________________________________________________________________
dense_1 (Dense)              (None, 21)                273       
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 22        
Total params: 559
Trainable params: 559
Non-trainable params: 0
_________________________________________________________________
None


In [32]:
# Export the transformed data to verify 
customer_data.to_csv('/home/mike/Documents/mkp_code/Institute of Data Course/telco-customer-churn-project/data/processed/Telco-Customer-Churn-Keras.csv')