In [16]:
import pandas as pd
import numpy as np
import seaborn as sns
from datetime import datetime
import matplotlib.pyplot as plt
import category_encoders as ce

date_format = "%m/%d/%Y"

<h1>I. Data Understanding</h1>
<p>Bla bla bla</p>

<h1>II. Data Preparation</h1>
<p>We put together all preparations done in the previous file in a function, so we can execute again</p>

In [17]:
def dataProcessing(file):
    df = pd.read_excel(file, index_col="Custid")
    # DROPING COLUMNS
    cols_to_drop = ["Year_Birth", "Dt_Customer", "Kidhome", "Teenhome", "MntLighting", "MntCameras", "MntDoor_Locks", "MntThermostats", "MntSecurity_Systems", "MntPremium", "Z_CostContact", "Z_Revenue", "MntTotal", "AcceptedCmp2", "AcceptedCmp3", "AcceptedCmp4", "AcceptedCmp5", "Education", "Marital_Status"]

    # Droping group info
    df.drop(['Group', 'Element1', 'Element2', 'Element3', 'Element4', 'Element5'], axis=1, inplace=True)

    #Creating columns for age and days as a Customer instead of using the regular values
    df['Age'] = 2020 - df['Year_Birth']
    df['DaysAsCustomer'] = (datetime.strptime('03/18/2021', date_format) - df['Dt_Customer']).dt.days

    #Changing the columns teenhome and kidhome to boolean
    df["Kidhome"] = np.where(df["Kidhome"] == 0, df["Kidhome"], 1)
    df["Teenhome"] = np.where(df["Teenhome"] == 0, df["Teenhome"], 1)
    #Creating new column to know if the person has kid, teen, both or none
    df["Children"] = df["Kidhome"] + df["Teenhome"]

    #Dealing with missing values
    #Income
    df['Income'] = df.groupby(['Education', 'Children'])['Income'].apply(lambda x: x.fillna(x.median()))
    #Filling NaN on MntSecurity_Systems and MntPremium by 0
    df['MntSecurity_Systems'].fillna(0, inplace=True)
    df['MntPremium'].fillna(0, inplace=True)
    
    #Preparation DF
    df_prep = df.copy(deep=True)

    #Loggin the Mnt Values
    df_prep["logLighting"] = np.where(df_prep["MntLighting"] == 0, df_prep["MntLighting"], np.log(df_prep["MntLighting"]))
    df_prep["logCameras"] = np.where(df_prep["MntCameras"] == 0, df_prep["MntCameras"], np.log(df_prep["MntCameras"]))
    df_prep["logDoor_Locks"] = np.where(df_prep["MntDoor_Locks"] == 0, df_prep["MntDoor_Locks"], np.log(df_prep["MntDoor_Locks"]))
    df_prep["logThermostats"] = np.where(df_prep["MntThermostats"] == 0, df_prep["MntThermostats"], np.log(df_prep["MntThermostats"]))
    df_prep["logSecurity_Systems"] = np.where(df_prep["MntSecurity_Systems"] == 0, df_prep["MntSecurity_Systems"], np.log(df_prep["MntSecurity_Systems"]))
    df_prep["logPremium"] = np.where(df_prep["MntPremium"] == 0, df_prep["MntPremium"], np.log(df_prep["MntPremium"]))

    #Creating a column with the sum of the Mnt Values
    df_prep["MntTotal"] = df_prep["MntLighting"] + df_prep["MntCameras"] + df_prep["MntDoor_Locks"] + df_prep["MntThermostats"] + df_prep["MntSecurity_Systems"] + df_prep["MntPremium"]
    df_prep["logTotal"] = df_prep["logLighting"] + df_prep["logCameras"] + df_prep["logDoor_Locks"] + df_prep["logThermostats"] + df_prep["logSecurity_Systems"] + df_prep["logPremium"]

    incomeCap = 150000
    df_prep = df_prep[df_prep["Income"] < incomeCap]

    #Summing the Acceptance Campaigns
    df_prep["AcceptCmpTotal"] = df_prep["AcceptedCmp2"] + df_prep["AcceptedCmp3"] + df_prep["AcceptedCmp4"] + df_prep["AcceptedCmp5"]

    #Encoding categorica columns
    # Enconding the columns that are strings or categorical
    colsToEncode = ["Education", "Marital_Status"]
    dummies = pd.get_dummies(df_prep[colsToEncode])
    df_prep = pd.concat([df_prep, dummies], axis=1)

    #Droping original columns
    df_prep.drop(cols_to_drop, axis=1, inplace=True)

    return df_prep

In [18]:
campaign = "Group35_SmartHome_Campaign.xlsx"

df_campaign = dataProcessing(campaign)

  warn("Workbook contains no default style, apply openpyxl's default")
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


In [19]:
df_campaign.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2492 entries, 350001144 to 350011119
Data columns (total 30 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Income                   2492 non-null   float64
 1   Recency                  2492 non-null   int64  
 2   NumDealsPurchases        2492 non-null   int64  
 3   NumWebPurchases          2492 non-null   int64  
 4   NumCatalogPurchases      2492 non-null   int64  
 5   NumStorePurchases        2492 non-null   int64  
 6   NumWebVisitsMonth        2492 non-null   int64  
 7   AcceptedCmp1             2492 non-null   int64  
 8   Complain                 2492 non-null   int64  
 9   DepVar                   2492 non-null   int64  
 10  Age                      2492 non-null   int64  
 11  DaysAsCustomer           2492 non-null   int64  
 12  Children                 2492 non-null   int64  
 13  logLighting              2492 non-null   float64
 14  logCameras 

<h1>Modeling</h1>

In [20]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import learning_curve
from sklearn import tree
from sklearn.metrics import classification_report, confusion_matrix

<h4>Useful Functions</h4>

In [21]:
def get_revenue(y_test, y_pred, revenue, cost):
    from sklearn.metrics import confusion_matrix 
    cm = confusion_matrix(y_test, y_pred)
    total_revenue = (cm[1][1]) * revenue
    return total_revenue

In [22]:
def get_profit(y_test, y_pred, revenue, cost):
    from sklearn.metrics import confusion_matrix 
    cm = confusion_matrix(y_test, y_pred)
    total_cost = (cm[0][1] + cm[1][1]) * cost
    total_revenue = (cm[1][1]) * revenue
    profit = total_revenue - total_cost
    return profit

In [23]:
def get_cost(y_test, y_pred, revenue, cost):
    from sklearn.metrics import confusion_matrix 
    cm = confusion_matrix(y_test, y_pred)
    total_cost = (cm[0][1] + cm[1][1]) * cost
    return total_cost

In [24]:
def get_roi(y_test, y_pred, revenue, cost):
    from sklearn.metrics import confusion_matrix 
    cm = confusion_matrix(y_test, y_pred)
    total_cost = (cm[0][1] + cm[1][1]) * cost
    total_revenue = (cm[1][1]) * revenue
    roi = (total_revenue / total_cost) * 100
    return roi

<h3>X & y | Train/Test Split</h3>

In [25]:
X = df_campaign.copy(deep=True)

y = X["DepVar"]

X.drop("DepVar", axis=1, inplace=True)

In [26]:
# Split the dataset intro train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.75, test_size = 0.25, random_state = 123, stratify=y)

<h3>Learning Courve - Tunning the Model</h3>
<p>In this phase, we will find the best tunning to our Decision Tree. We will twist three main elements:</p>
<ul>
    <li>Maximum Tree Depth: </li>
    <li>Min Samples per Leaf: </li>
    <li>Classifier Threshold: </li>
</ul>

In [27]:
#Best number of depth and leaf

# Create an empty dataframe for the resuls
treePerformance = pd.DataFrame({'Max_Depth': pd.Series([], dtype='string'),
                             'Threshold': pd.Series([], dtype='string'),
                             'Min_Samples_Leaf': pd.Series([], dtype='string'),
                             'Revenue': pd.Series([], dtype='float64'),
                             'Cost': pd.Series([], dtype='float64'),
                             'Profit': pd.Series([], dtype='float64'),
                             'ROI': pd.Series([], dtype='float64')
                             })

D = range(3,15)
for d in D:
    L = range(1,8)

    for l in L:
        T = range(3,6)
        for t in T:
            t = t/10

            dt_regr = tree.DecisionTreeClassifier(random_state=123, max_depth=d, min_samples_leaf=l)
            dt_regr.fit(X_train, y_train)
            model = dt_regr.fit(X_train, y_train)

            # predict y for X_train and X_test
            y_pred_train = (dt_regr.predict_proba(X_train)[:,1] >= t).astype(bool)
            y_pred_test = (dt_regr.predict_proba(X_test)[:,1] >= t).astype(bool)

            rev = get_revenue(y_test, y_pred_test, 14, 4)
            cost = get_cost(y_test, y_pred_test, 14, 4)
            profity = get_profit(y_test, y_pred_test, 14, 4)
            roi = get_roi(y_test, y_pred_test, 14, 4)

            treePerformance = treePerformance.append({'Max_Depth': d,
                                'Threshold': t,
                                'Min_Samples_Leaf': l,
                                'Revenue': rev,
                                'Cost': cost,
                                'Profit': profity,
                                'ROI': roi
                                }, ignore_index=True)

In [30]:
treePerformance

Unnamed: 0,Max_Depth,Threshold,Min_Samples_Leaf,Revenue,Cost,Profit,ROI
0,3.0,0.3,1.0,756.0,508.0,248.0,148.818898
1,3.0,0.4,1.0,448.0,216.0,232.0,207.407407
2,3.0,0.5,1.0,196.0,84.0,112.0,233.333333
3,3.0,0.3,2.0,756.0,508.0,248.0,148.818898
4,3.0,0.4,2.0,448.0,216.0,232.0,207.407407
...,...,...,...,...,...,...,...
247,14.0,0.4,6.0,574.0,360.0,214.0,159.444444
248,14.0,0.5,6.0,574.0,360.0,214.0,159.444444
249,14.0,0.3,7.0,532.0,372.0,160.0,143.010753
250,14.0,0.4,7.0,490.0,312.0,178.0,157.051282


<h3>Create Model</h3>

In [28]:
# Create and train the model
dt_regr = tree.DecisionTreeClassifier(random_state=123, max_depth=8, min_samples_leaf=3)
dt_regr.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=8, min_samples_leaf=3, random_state=123)

<h3>Prediction (y_test, y_train)</h3>

In [29]:
model = dt_regr.fit(X_train, y_train)

# predict y for X_train and X_test
y_pred_train = (dt_regr.predict_proba(X_train)[:,1] >= 0.5).astype(bool)
y_pred_test = (dt_regr.predict_proba(X_test)[:,1] >= 0.5).astype(bool)

<h2>Evaluate the Model</h2>