## Predicting the Success of a Bank's Marketing Campaign

This model was built with a real-life dataset from the UCI Machine Learning Repository: http://archive.ics.uci.edu/ml/datasets/Bank+Marketing#

The classification goal is to predict whether the client will subscribe to a term deposit (variable y).

In [25]:
# Libraries Needed
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
sns.set()

In [26]:
# Load the bank marketing records from the CSV file into a DataFrame
raw_data = pd.read_csv('bank-additional-full.csv')
# Display the column labels to confirm what was loaded
raw_data.columns

Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'y'],
      dtype='object')

In [27]:
# Count the missing values in every column (numeric and categorical alike)
raw_data.isna().sum()

age               0
job               0
marital           0
education         0
default           0
housing           0
loan              0
contact           0
month             0
day_of_week       0
duration          0
campaign          0
pdays             0
previous          0
poutcome          0
emp.var.rate      0
cons.price.idx    0
cons.conf.idx     0
euribor3m         0
nr.employed       0
y                 0
dtype: int64

#### Convert categorical variables to numeric

In [28]:
# Work on an independent (deep) copy so that raw_data stays untouched
data = raw_data.copy(deep=True)

In [29]:
# Preview the target column before it is encoded
data['y']

0         no
1         no
2         no
3         no
4         no
        ... 
41183    yes
41184     no
41185     no
41186    yes
41187     no
Name: y, Length: 41188, dtype: object

In [30]:
# Encode the target as 1 ('yes') / 0 ('no').
# The mapping reads from the untouched raw_data column so that re-running this
# cell always gives the same result; mapping data['y'] onto itself would turn
# the already-encoded 0/1 values into NaN on a second run.
data['y'] = raw_data['y'].map({'yes': 1, 'no': 0})

In [31]:
# One-hot encode the remaining categorical (string) columns with get_dummies.
# An integer dtype is pinned so the indicator columns are numeric regardless of
# the default dummy dtype of the installed pandas version.
data_with_dummies = pd.get_dummies(data, dtype=np.uint8)

In [32]:
# List the columns after one-hot encoding to see the generated dummy names
data_with_dummies.columns

Index(['age', 'duration', 'campaign', 'pdays', 'previous', 'emp.var.rate',
       'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed', 'y',
       'job_admin.', 'job_blue-collar', 'job_entrepreneur', 'job_housemaid',
       'job_management', 'job_retired', 'job_self-employed', 'job_services',
       'job_student', 'job_technician', 'job_unemployed', 'job_unknown',
       'marital_divorced', 'marital_married', 'marital_single',
       'marital_unknown', 'education_basic.4y', 'education_basic.6y',
       'education_basic.9y', 'education_high.school', 'education_illiterate',
       'education_professional.course', 'education_university.degree',
       'education_unknown', 'default_no', 'default_unknown', 'default_yes',
       'housing_no', 'housing_unknown', 'housing_yes', 'loan_no',
       'loan_unknown', 'loan_yes', 'contact_cellular', 'contact_telephone',
       'month_apr', 'month_aug', 'month_dec', 'month_jul', 'month_jun',
       'month_mar', 'month_may', 'month_nov', 'mon

#### Declaring the independent and dependent variables

Rearranging the columns so that y comes first

In [33]:
# Put the target first and keep every other column in its existing order.
# Deriving the list from the frame avoids hand-maintaining the dummy column
# names, which would silently go stale if a category level appeared or vanished.
cols = ['y'] + [col for col in data_with_dummies.columns if col != 'y']

In [34]:
# Reorder the encoded frame according to cols (target column first)
data_preprocessed = data_with_dummies.loc[:, cols]

In [35]:
# Dependent variable: the encoded target column
y = data_preprocessed.loc[:, 'y']
# Independent variables: every remaining column
x1 = data_preprocessed.drop(columns='y')
# statsmodels does not add an intercept by itself, so prepend a constant column.
# NOTE(review): every level of each categorical was kept as a dummy, so together
# with this constant the design matrix is perfectly collinear (dummy-variable
# trap) - consider dropping one level per category.
X = sm.add_constant(x1)

#### Create the logistic regression

In [36]:
# Hold out 25% of the rows for testing; the fixed random_state keeps the split
# reproducible between runs. train_test_split is imported in the notebook's
# import cell at the top rather than mid-notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

In [37]:
# Set up a logistic regression of the training target on the training features
reg_log = sm.Logit(endog=y_train, exog=X_train)
# Fit the model, allowing the optimizer at most 100 iterations.
# NOTE(review): the recorded run stopped exactly at the 100-iteration cap, which
# suggests the optimizer may not have converged - confirm before trusting the fit.
results_log = reg_log.fit(maxiter=100)
# results_log.summary()

         Current function value: 0.208692
         Iterations: 100




### Testing the Model Accuracy

In [38]:
# This is used to describe the performance of our classification model
def confusion_matrix(data, actual_values, model, threshold=0.5):
    """Build a 2x2 confusion matrix and the accuracy of a binary classifier.

    Parameters
    ----------
    data : array-like
        Feature matrix, passed unchanged to ``model.predict``.
    actual_values : array-like
        Observed class labels coded as 0 and 1, one per row of ``data``.
    model : object
        Fitted model whose ``predict(data)`` returns one value in [0, 1]
        per row (e.g. the predicted probability of class 1).
    threshold : float, default 0.5
        Cut-off applied to the predictions: values below it count as class 0,
        values at or above it count as class 1. Must lie strictly between
        0 and 1.

    Returns
    -------
    cm : numpy.ndarray of shape (2, 2)
        Counts with the actual class on the rows and the predicted class on
        the columns, i.e. ``cm[actual, predicted]``.
    accuracy : numpy.float64
        Share of observations on the diagonal of ``cm``; ``nan`` when there
        are no observations.

    Raises
    ------
    ValueError
        If ``threshold`` is not strictly between 0 and 1.
    """
    if not 0 < threshold < 1:
        raise ValueError(f'threshold must be strictly between 0 and 1, got {threshold!r}')

    # Predict the values using the fitted model
    pred_values = model.predict(data)

    # Bin edges: actual labels are split at 0.5 (0 -> first bin, 1 -> second),
    # predictions are split at the chosen threshold. The last bin is closed on
    # both sides, so a prediction of exactly 1 still lands in class 1.
    actual_bins = np.array([0, 0.5, 1])
    pred_bins = np.array([0, threshold, 1])

    # The 2-D histogram over (actual, predicted) is the confusion matrix
    cm = np.histogram2d(actual_values, pred_values, bins=[actual_bins, pred_bins])[0]

    # Accuracy = correctly classified observations / all observations
    total = cm.sum()
    if total == 0:
        # No observations: avoid a 0/0 division warning and report nan
        return cm, np.float64('nan')
    accuracy = (cm[0, 0] + cm[1, 1]) / total

    # Return the confusion matrix and the accuracy
    return cm, accuracy

In [39]:
# Confusion matrix and accuracy on the training data
confusion_matrix(data=X_train, actual_values=y_train, model=results_log)

(array([[26668.,   741.],
        [ 2016.,  1466.]]),
 0.9107507040885695)

In [40]:
# Evaluate on the held-out test data and keep both results for the report below
cm, accuracy = confusion_matrix(data=X_test, actual_values=y_test, model=results_log)

In [41]:
# Correct predictions sit on the diagonal of the confusion matrix;
# the sum of all cells is the number of observations that were scored.
correct_pred = np.trace(cm)
total_pred = np.sum(cm)

print(f'Correct Predictions: {correct_pred}')
print(f'Total Predictions: {total_pred}')

Correct Predictions: 9395.0
Total Predictions: 10297.0


In [42]:
print(f'The model got {correct_pred} predictions right out of {total_pred} observations.\n')
# Round to two decimals as before, then format the percentage with one decimal
# so that float products such as 0.57 * 100 never print as 56.99999999999999
print(f'Therefore the accuracy of this model is {accuracy.round(2) * 100:.1f}%')

The model got 9395.0 predictions right out of 10297.0 observations.

Therefore the accuracy of this model is 91.0%
