In [1]:
# ETL
import pandas as pd
import numpy as np
from copy import deepcopy
from sklearn.preprocessing import Normalizer, StandardScaler

# Evaluating, Scoring and splitting
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold, train_test_split, cross_val_score

# Saving and loading model
import pickle

# Data Visualisation
import matplotlib.pyplot as plt
import seaborn as sns

# Models
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier

# Utils
import warnings
warnings.filterwarnings("ignore")

In [20]:
## Configs
# Central place for every path and tunable value used by the cells below.

# paths
# Raw customer dataset; re-read at the end so the predictions can be attached to it.
original_data_path = '../02_data/raw/ml_project1_data.csv'
# Processed dataset (includes the 'Cluster' column) used as modelling input.
data_path = '../02_data/processed/01_customer_clusters.csv'
# Destination CSV for the raw data enriched with 'Cluster' and 'acceptance_prob'.
save_data_path = '../02_data/processed/02_customer_acceptance_prob.csv'
# Destination of the pickled model.
save_model_path = '../05_models/customer_customer_acceptance.pkl'

# models config
# Fraction of rows held out by train_test_split.
test_size = 0.33
# Number of folds for K-fold cross-validation.
num_folds = 10
# Random seed passed as random_state wherever one is accepted.
seed = 5
# Number of estimators in each BaggingClassifier ensemble (used for every
# wrapped model, not only tree-based ones).
num_trees = 100

In [3]:
# Load the processed customer data; `df` is kept untouched so predictions can be
# attached to it later.
df = pd.read_csv(data_path)

# Independent working copy for modelling, without the enrollment-date column.
df_aux = df.drop(columns=['Dt_Customer']).copy(deep=True)

In [4]:
# Data type fixing: replace each categorical column with its integer category code.
# pandas orders inferred categories by sorted value, so the codes for text columns
# follow alphabetical order (not any domain ordering); missing values become -1.
categorical_columns = [
    'Response',
    'Complain',
    'AcceptedCmp1',
    'AcceptedCmp2',
    'AcceptedCmp3',
    'AcceptedCmp4',
    'AcceptedCmp5',
    'Education',
    'Marital_Status',
    'accepted_cmp_before',
]

for column_name in categorical_columns:
    df_aux[column_name] = df_aux[column_name].astype('category').cat.codes

In [5]:
# Keep only the modelling columns: the predictors first and the 'Response' target
# last, because the feature-selection cell separates the target by position.
feature_columns = [
    'YearsOfEnrollment', 'Age', 'Education', 'Marital_Status',
    'Kidhome', 'Teenhome', 'Recency', 'Income',
    'MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts',
    'MntSweetProducts', 'MntGoldProds',
    'NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases',
    'NumStorePurchases', 'NumWebVisitsMonth',
    'AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5',
    'Complain', 'accepted_cmp_before', 'qtd_cmp_accepted', 'Cluster',
]
target_column = 'Response'

df_aux = df_aux[feature_columns + [target_column]]

In [6]:
# Feature Selection
array = df_aux.values

# creating input and output arrays
X = array[:,:28]
Y = array[:,28]

# Creating feature selection model
modelo = ExtraTreesClassifier()
modelo.fit(X, Y)

# Print Results
res = [[round(modelo.feature_importances_[i],4),df_aux.columns[i]] for i in range(len(df_aux.columns)-1)]
res.sort(); res

[[0.002, 'Complain'],
 [0.0097, 'AcceptedCmp2'],
 [0.0118, 'AcceptedCmp4'],
 [0.0153, 'Kidhome'],
 [0.0236, 'Teenhome'],
 [0.0255, 'AcceptedCmp1'],
 [0.0258, 'accepted_cmp_before'],
 [0.0344, 'Education'],
 [0.0351, 'NumDealsPurchases'],
 [0.0362, 'NumWebPurchases'],
 [0.0367, 'MntFishProducts'],
 [0.0374, 'Cluster'],
 [0.0376, 'MntFruits'],
 [0.038, 'Age'],
 [0.038, 'NumWebVisitsMonth'],
 [0.0387, 'Income'],
 [0.039, 'MntGoldProds'],
 [0.0399, 'MntSweetProducts'],
 [0.0412, 'Marital_Status'],
 [0.0414, 'NumStorePurchases'],
 [0.0423, 'AcceptedCmp3'],
 [0.0427, 'NumCatalogPurchases'],
 [0.0439, 'MntWines'],
 [0.0459, 'AcceptedCmp5'],
 [0.0465, 'YearsOfEnrollment'],
 [0.0476, 'MntMeatProducts'],
 [0.0551, 'qtd_cmp_accepted'],
 [0.0687, 'Recency']]

In [7]:
# Keep only the features whose importance is at least 0.037
# (`res` holds [importance, feature_name] pairs, so x[0] is the score and x[1] the name).
# NOTE(review): `informative_features` is not referenced again in the cells shown
# here; the models below are fit on every column of X. Confirm whether that is intended.
informative_features = [x[1] for x in res if x[0] >= 0.037]
informative_features

['Cluster',
 'MntFruits',
 'Age',
 'NumWebVisitsMonth',
 'Income',
 'MntGoldProds',
 'MntSweetProducts',
 'Marital_Status',
 'NumStorePurchases',
 'AcceptedCmp3',
 'NumCatalogPurchases',
 'MntWines',
 'AcceptedCmp5',
 'YearsOfEnrollment',
 'MntMeatProducts',
 'qtd_cmp_accepted',
 'Recency']

In [8]:
## Testing various models

# Scaling the data (StandardScaler: zero mean and unit variance per feature).
# NOTE(review): the scaler is fit on the full X before cross-validation, so the
# validation folds contribute to the scaling statistics. X is left scaled here
# because the later cells reuse it as-is.
scaler = StandardScaler().fit(X)
X = scaler.transform(X)

# Preparing a list of (name, estimator) candidates. Estimators with internal
# randomness are seeded so the scores below are reproducible across runs.
models = [
    ('LDA', LinearDiscriminantAnalysis()),
    ('NB', GaussianNB()),
    ('KNN', KNeighborsClassifier()),
    ('DTC', DecisionTreeClassifier(random_state = seed)),
    ('SVM', SVC()),
    ('RFC', RandomForestClassifier(random_state = seed)),
]

# One seeded, shuffled splitter shared by every model, so all candidates are
# scored on the same folds.
kfold = KFold(n_splits = num_folds, random_state = seed, shuffle=True)

# Evaluating each model in a loop: per-fold accuracies are collected in `results`
# and the matching model names in `names`.
results = []
names = []

for name, model in models:
    cv_results = cross_val_score(model, X, Y, cv = kfold, scoring = 'accuracy')
    results.append(cv_results)
    names.append(name)
    # Mean accuracy followed by its standard deviation across folds.
    msg = "%s: %4f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

LDA: 0.885268 (0.028167)
NB: 0.829464 (0.020242)
KNN: 0.874107 (0.020144)
DTC: 0.848661 (0.019505)
SVM: 0.880357 (0.028079)
RFC: 0.883929 (0.030080)


LinearDiscriminantAnalysis, SVM, KNN and RandomForestClassifier were the most accurate models, so we will continue working with them.

In [9]:
## Optimization
# Wrap each shortlisted model in a bagging ensemble and cross-validate it.

# Separating data in folds. Arguments are passed by keyword: current scikit-learn
# releases only accept `shuffle` as a keyword argument.
kfold = KFold(n_splits = num_folds, shuffle = True, random_state = seed)


# Creating a list with best models
best_models = [('RFC', RandomForestClassifier()),
               ('LDA', LinearDiscriminantAnalysis()),
               ('SVM', SVC()),
               ('KNN', KNeighborsClassifier())]

# Creating a dict to store resulting models (name -> bagging ensemble)
results = dict()

for name, model_ in best_models:

    # The wrapped estimator is passed positionally because its keyword was renamed
    # from `base_estimator` to `estimator` across scikit-learn versions; the
    # positional form works with both.
    model = BaggingClassifier(model_, n_estimators = num_trees, random_state = seed)
    # The ensemble stored here is the one fit later by the evaluation cell.
    results[name] = model
    result = cross_val_score(model, X, Y, cv = kfold)

    # Mean accuracy in percent, followed by the standard deviation across folds.
    print(name, "- Accuracy: %.2f%% (%.4f)" % (result.mean() * 100,  result.std()))

RFC - Accuracy: 88.39% (0.0284)
LDA - Accuracy: 88.57% (0.0289)
SVM - Accuracy: 87.99% (0.0281)
KNN - Accuracy: 87.54% (0.0218)


The four algorithms performed similarly, so let's choose the model with the highest mean cross-validated accuracy (Linear Discriminant Analysis).

In [17]:
# Splitting the data into a training set and a hold-out test set
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y,
    test_size = test_size,
    random_state = seed,
)

# Selecting the bagged LDA model, fitting it on the training set and
# evaluating its accuracy on the hold-out set
model = results['LDA'].fit(X_train, Y_train)
Y_pred = model.predict(X_test)

print(accuracy_score(Y_test, Y_pred))

0.8851351351351351


In [18]:
# Creating the acceptance-probability column: score every row of X with the
# fitted model and keep the probability in column 1 of predict_proba's output.
# NOTE(review): column 1 corresponds to model.classes_[1], presumably the
# "accepted" Response class; verify against model.classes_.
probs = np.asarray(model.predict_proba(X))
acceptance_proba_list = list(probs[:, 1])

df['acceptance_prob'] = acceptance_proba_list

In [19]:
# Preview the first rows to check the new 'acceptance_prob' column.
df.head()

Unnamed: 0,Dt_Customer,YearsOfEnrollment,Age,Education,Marital_Status,Kidhome,Teenhome,Recency,Income,MntWines,...,AcceptedCmp2,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,Complain,Response,accepted_cmp_before,qtd_cmp_accepted,Cluster,acceptance_prob
0,2012-09-04,2,55,Graduation,Single,0,0,58,58138.0,635,...,0,0,0,0,0,1,0,0,1,0.422973
1,2014-03-08,0,60,Graduation,Single,1,1,38,46344.0,11,...,0,0,0,0,0,0,0,0,2,0.004031
2,2013-08-21,1,48,Graduation,Together,0,0,26,71613.0,426,...,0,0,0,0,0,0,0,0,4,0.02672
3,2014-02-10,0,30,Graduation,Together,1,0,26,26646.0,11,...,0,0,0,0,0,0,0,0,2,0.009109
4,2014-01-19,0,33,PhD,Married,1,0,94,58293.0,173,...,0,0,0,0,0,0,0,0,5,0.01491


In [26]:
# Adding acceptance probability and customer cluster to the original dataset.
# Both columns are matched to the raw rows by index label.
# NOTE(review): this assumes the processed frame `df` has the same rows, in the
# same order, as the raw file; confirm that no rows were dropped upstream.
raw_data = pd.read_csv(original_data_path).assign(
    Cluster = df['Cluster'],
    acceptance_prob = df['acceptance_prob'],
)

In [25]:
# Preview the raw dataset with the 'Cluster' and 'acceptance_prob' columns attached.
# NOTE(review): the stored output for this cell also lists rows near the end of the
# frame, which head() would not show; it looks stale, so re-run the notebook in order.
raw_data.head()

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response,Cluster,acceptance_prob
0,5524,1957,Graduation,Single,58138.0,0,0,2012-09-04,58,635,...,0,0,0,0,0,3,11,1,1,0.422973
1,2174,1954,Graduation,Single,46344.0,1,1,2014-03-08,38,11,...,0,0,0,0,0,3,11,0,2,0.004031
2,4141,1965,Graduation,Together,71613.0,0,0,2013-08-21,26,426,...,0,0,0,0,0,3,11,0,4,0.026720
3,6182,1984,Graduation,Together,26646.0,1,0,2014-02-10,26,11,...,0,0,0,0,0,3,11,0,2,0.009109
4,5324,1981,PhD,Married,58293.0,1,0,2014-01-19,94,173,...,0,0,0,0,0,3,11,0,5,0.014910
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2235,10870,1967,Graduation,Married,61223.0,0,1,2013-06-13,46,709,...,0,0,0,0,0,3,11,0,3,0.041146
2236,4001,1946,PhD,Together,64014.0,2,1,2014-06-10,56,406,...,0,0,1,0,0,3,11,0,2,0.126946
2237,7270,1981,Graduation,Divorced,56981.0,0,0,2014-01-25,91,908,...,1,0,0,0,0,3,11,0,5,0.004473
2238,8235,1956,Master,Together,69245.0,0,1,2014-01-24,8,428,...,0,0,0,0,0,3,11,0,4,0.013269


In [27]:
# Saving dataset: write the enriched raw data to CSV without the index column
raw_data.to_csv(save_data_path, index=False)

# Saving model: serialize the fitted model with pickle
# NOTE(review): only `model` is persisted. It was trained on X after StandardScaler
# was applied, and the scaler is not saved in the cells shown here; confirm it is
# available wherever this model is loaded.
with open(save_model_path, 'wb') as model_file:
    pickle.dump(model, model_file)