In [56]:
 %matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os

from sklearn import tree

In [57]:
# Read the customer-value CSV from the local Resources folder and preview
# the first rows.
csv_path = './Resources/WA_Fn-UseC_-Marketing-Customer-Value-Analysis.csv'
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,Customer,State,Customer Lifetime Value,Response,Coverage,Education,Effective To Date,EmploymentStatus,Gender,Income,...,Months Since Policy Inception,Number of Open Complaints,Number of Policies,Policy Type,Policy,Renew Offer Type,Sales Channel,Total Claim Amount,Vehicle Class,Vehicle Size
0,BU79786,Washington,2763.519279,No,Basic,Bachelor,2/24/11,Employed,F,56274,...,5,0,1,Corporate Auto,Corporate L3,Offer1,Agent,384.811147,Two-Door Car,Medsize
1,QZ44356,Arizona,6979.535903,No,Extended,Bachelor,1/31/11,Unemployed,F,0,...,42,0,8,Personal Auto,Personal L3,Offer3,Agent,1131.464935,Four-Door Car,Medsize
2,AI49188,Nevada,12887.43165,No,Premium,Bachelor,2/19/11,Employed,F,48767,...,38,0,2,Personal Auto,Personal L3,Offer1,Agent,566.472247,Two-Door Car,Medsize
3,WW63253,California,7645.861827,No,Basic,Bachelor,1/20/11,Unemployed,M,0,...,65,0,7,Corporate Auto,Corporate L2,Offer1,Call Center,529.881344,SUV,Medsize
4,HB64268,Washington,2813.692575,No,Basic,Bachelor,2/3/11,Employed,M,43836,...,44,0,1,Personal Auto,Personal L1,Offer1,Agent,138.130879,Four-Door Car,Medsize


In [58]:
# Build a working copy without the 'Effective To Date' and 'Customer' columns.
dropped_columns = ['Effective To Date', 'Customer']
new_data = data.drop(dropped_columns, axis=1)

In [59]:
# Add a column of 'Quartile LTV'. This column will determine the quartile where each
# customer falls in terms of Customer Lifetime Value.

# First, find the quartile threshold values.
# Naming used in this notebook: q3 is the 25th percentile, q2 the median and
# q1 the 75th percentile, i.e. the numbering runs from the top of the
# distribution downwards.

ltv = new_data["Customer Lifetime Value"]

q3 = np.quantile(ltv, .25)
q2 = np.quantile(ltv, .50)
q1 = np.quantile(ltv, .75)

# Report each threshold once it has been computed.
for label, threshold in (("Q3", q3), ("Q2", q2), ("Q1", q1)):
    print(label + " quantile of arr : ", threshold)

Q3 quantile of arr :  3994.25179425
Q2 quantile of arr :  5780.182197
Q1 quantile of arr :  8962.16704125


In [60]:
# create Quartile LTV column
# Vectorised bucketing of every lifetime value against the thresholds:
#   4 -> 0  < value <= q3
#   3 -> q3 < value <= q2
#   2 -> q2 < value <= q1
#   1 -> everything else (the catch-all bucket, which also receives any value
#        that is not greater than 0 or that fails every comparison, e.g. NaN)
ltv_values = np.asarray(ltv)
bucket_conditions = [
    (ltv_values > 0) & (ltv_values <= q3),
    (ltv_values > q3) & (ltv_values <= q2),
    (ltv_values > q2) & (ltv_values <= q1),
]
# np.select picks the choice of the first condition that holds for each row;
# .tolist() keeps `quartile_ltv` a plain list of Python ints.
quartile_ltv = np.select(bucket_conditions, [4, 3, 2], default=1).tolist()
new_data['Quartile LTV'] = quartile_ltv

In [61]:
# Sanity check: boolean mask of rows whose 'Quartile LTV' is missing, then
# display those rows.
test_2 = new_data['Quartile LTV'].isna()
new_data.loc[test_2]

Unnamed: 0,State,Customer Lifetime Value,Response,Coverage,Education,EmploymentStatus,Gender,Income,Location Code,Marital Status,...,Number of Open Complaints,Number of Policies,Policy Type,Policy,Renew Offer Type,Sales Channel,Total Claim Amount,Vehicle Class,Vehicle Size,Quartile LTV


In [62]:
# Drop the raw 'Customer Lifetime Value' column from the working frame and
# preview the result.
new_data_quartileLTV = new_data.drop('Customer Lifetime Value', axis=1)
new_data_quartileLTV.head()

Unnamed: 0,State,Response,Coverage,Education,EmploymentStatus,Gender,Income,Location Code,Marital Status,Monthly Premium Auto,...,Number of Open Complaints,Number of Policies,Policy Type,Policy,Renew Offer Type,Sales Channel,Total Claim Amount,Vehicle Class,Vehicle Size,Quartile LTV
0,Washington,No,Basic,Bachelor,Employed,F,56274,Suburban,Married,69,...,0,1,Corporate Auto,Corporate L3,Offer1,Agent,384.811147,Two-Door Car,Medsize,4
1,Arizona,No,Extended,Bachelor,Unemployed,F,0,Suburban,Single,94,...,0,8,Personal Auto,Personal L3,Offer3,Agent,1131.464935,Four-Door Car,Medsize,2
2,Nevada,No,Premium,Bachelor,Employed,F,48767,Suburban,Married,108,...,0,2,Personal Auto,Personal L3,Offer1,Agent,566.472247,Two-Door Car,Medsize,1
3,California,No,Basic,Bachelor,Unemployed,M,0,Suburban,Married,106,...,0,7,Corporate Auto,Corporate L2,Offer1,Call Center,529.881344,SUV,Medsize,2
4,Washington,No,Basic,Bachelor,Employed,M,43836,Rural,Single,73,...,0,1,Personal Auto,Personal L1,Offer1,Agent,138.130879,Four-Door Car,Medsize,4


In [63]:
# Dummy coding for categorical variables (get_dummies)
# Machine learning algorithms work with numerical data, so the string-valued
# columns have to be converted into meaningful numbers. Common choices are
# integer, one-hot, or binary encoding. Sklearn provides a preprocessing library
# for these standard techniques; pandas also offers get_dummies, which generates
# one indicator column per category from a DataFrame.

categorical_columns = [
    'State', 'Response', 'Coverage', 'Education', 'EmploymentStatus',
    'Gender', 'Location Code', 'Marital Status', 'Policy Type', 'Policy',
    'Renew Offer Type', 'Sales Channel', 'Vehicle Class', 'Vehicle Size',
]
data_binary_encoded = pd.get_dummies(
    new_data_quartileLTV,
    columns=categorical_columns,
)
data_binary_encoded.head()

Unnamed: 0,Income,Monthly Premium Auto,Months Since Last Claim,Months Since Policy Inception,Number of Open Complaints,Number of Policies,Total Claim Amount,Quartile LTV,State_Arizona,State_California,...,Sales Channel_Web,Vehicle Class_Four-Door Car,Vehicle Class_Luxury Car,Vehicle Class_Luxury SUV,Vehicle Class_SUV,Vehicle Class_Sports Car,Vehicle Class_Two-Door Car,Vehicle Size_Large,Vehicle Size_Medsize,Vehicle Size_Small
0,56274,69,32,5,0,1,384.811147,4,0,0,...,0,0,0,0,0,0,1,0,1,0
1,0,94,13,42,0,8,1131.464935,2,1,0,...,0,1,0,0,0,0,0,0,1,0
2,48767,108,18,38,0,2,566.472247,1,0,0,...,0,0,0,0,0,0,1,0,1,0
3,0,106,18,65,0,7,529.881344,2,0,1,...,0,0,0,0,1,0,0,0,1,0
4,43836,73,12,44,0,1,138.130879,4,0,0,...,0,1,0,0,0,0,0,0,1,0


In [64]:
# Show the column index of the encoded frame (numeric columns plus the
# indicator columns produced by get_dummies).
data_binary_encoded.columns

Index(['Income', 'Monthly Premium Auto', 'Months Since Last Claim',
       'Months Since Policy Inception', 'Number of Open Complaints',
       'Number of Policies', 'Total Claim Amount', 'Quartile LTV',
       'State_Arizona', 'State_California', 'State_Nevada', 'State_Oregon',
       'State_Washington', 'Response_No', 'Response_Yes', 'Coverage_Basic',
       'Coverage_Extended', 'Coverage_Premium', 'Education_Bachelor',
       'Education_College', 'Education_Doctor',
       'Education_High School or Below', 'Education_Master',
       'EmploymentStatus_Disabled', 'EmploymentStatus_Employed',
       'EmploymentStatus_Medical Leave', 'EmploymentStatus_Retired',
       'EmploymentStatus_Unemployed', 'Gender_F', 'Gender_M',
       'Location Code_Rural', 'Location Code_Suburban', 'Location Code_Urban',
       'Marital Status_Divorced', 'Marital Status_Married',
       'Marital Status_Single', 'Policy Type_Corporate Auto',
       'Policy Type_Personal Auto', 'Policy Type_Special Auto',
    

In [65]:
# Split the encoded frame into the target ('Quartile LTV') and the feature
# matrix (every remaining column), then report both shapes.
target_column = "Quartile LTV"
y = data_binary_encoded[target_column]
X = data_binary_encoded.drop(columns=[target_column])
print(X.shape, y.shape)

(9134, 64) (9134,)


In [66]:
from sklearn.model_selection import train_test_split

# Hold out a test set; the seed is fixed so the split is repeatable, and the
# split proportions are left at scikit-learn's defaults.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [67]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

# Fit the scaler on the training features only, so scaling statistics are not
# derived from the test split.
X_scaler = StandardScaler()
X_scaler.fit(X_train)

In [68]:
# Apply the scaler fitted on the training features to both splits.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

In [69]:
from tensorflow.keras.utils import to_categorical

In [70]:
# Drop rows with missing values from the encoded frame and list the distinct
# target labels that remain.
# NOTE(review): X and y were already taken from data_binary_encoded (and split
# into train/test) in earlier cells, so this dropna does not change them; if
# the intent was to clean the model inputs, it needs to happen before that.
data_binary_encoded = data_binary_encoded.dropna()
distinct_quartiles = data_binary_encoded['Quartile LTV'].unique()
print(distinct_quartiles)

[4 2 1 3]


In [71]:
# One-hot encoding
# Step 1: Label-encode the targets. The encoder is fitted on the training
# labels only and then applied to both splits.
label_encoder = LabelEncoder().fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

In [72]:
# Step 2: Convert encoded labels to one-hot-encoding
# The one-hot width is pinned to the number of classes the label encoder
# learned. Without num_classes, to_categorical infers the width from the
# largest label present in each array, so a split that happens to lack the
# highest class would come out narrower than the other split.
n_label_classes = len(label_encoder.classes_)
y_train_categorical = to_categorical(encoded_y_train, num_classes=n_label_classes)
y_test_categorical = to_categorical(encoded_y_test, num_classes=n_label_classes)

In [73]:
# Build model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [74]:
# model parameters
# Input and output widths are read from the prepared arrays instead of being
# hard-coded, so the network stays in sync if the encoded feature set or the
# number of target classes changes.
number_inputs = X_train_scaled.shape[1]
number_hidden_nodes = 100
number_classes = y_train_categorical.shape[1]

# One hidden ReLU layer followed by a softmax output with one unit per class.
model = Sequential()
model.add(Dense(units=number_hidden_nodes, activation='relu', input_dim=number_inputs))
model.add(Dense(units=number_classes, activation='softmax'))

In [75]:
 # Print the layer-by-layer summary (output shapes and parameter counts).
 model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_4 (Dense)              (None, 100)               6500      
_________________________________________________________________
dense_5 (Dense)              (None, 4)                 404       
Total params: 6,904
Trainable params: 6,904
Non-trainable params: 0
_________________________________________________________________


In [76]:
# Configure training: Adam optimizer, categorical cross-entropy loss (the
# targets are one-hot encoded), and accuracy as the reported metric.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [77]:
# Train on the scaled training features and one-hot targets. The fit call is
# left as the cell's final expression so its return value is displayed.
fit_options = dict(batch_size=64, epochs=50, shuffle=False, verbose=2)
model.fit(X_train_scaled, y_train_categorical, **fit_options)

Epoch 1/50
108/108 - 0s - loss: 1.3099 - accuracy: 0.4013
Epoch 2/50
108/108 - 0s - loss: 1.0766 - accuracy: 0.5648
Epoch 3/50
108/108 - 0s - loss: 0.9571 - accuracy: 0.6190
Epoch 4/50
108/108 - 0s - loss: 0.8495 - accuracy: 0.6653
Epoch 5/50
108/108 - 0s - loss: 0.7504 - accuracy: 0.7099
Epoch 6/50
108/108 - 0s - loss: 0.6653 - accuracy: 0.7533
Epoch 7/50
108/108 - 0s - loss: 0.5951 - accuracy: 0.7940
Epoch 8/50
108/108 - 0s - loss: 0.5373 - accuracy: 0.8232
Epoch 9/50
108/108 - 0s - loss: 0.4898 - accuracy: 0.8455
Epoch 10/50
108/108 - 0s - loss: 0.4501 - accuracy: 0.8634
Epoch 11/50
108/108 - 0s - loss: 0.4168 - accuracy: 0.8761
Epoch 12/50
108/108 - 0s - loss: 0.3889 - accuracy: 0.8850
Epoch 13/50
108/108 - 0s - loss: 0.3651 - accuracy: 0.8917
Epoch 14/50
108/108 - 0s - loss: 0.3443 - accuracy: 0.8966
Epoch 15/50
108/108 - 0s - loss: 0.3262 - accuracy: 0.9009
Epoch 16/50
108/108 - 0s - loss: 0.3101 - accuracy: 0.9054
Epoch 17/50
108/108 - 0s - loss: 0.2957 - accuracy: 0.9088
Epoch 

<tensorflow.python.keras.callbacks.History at 0x16defa48630>