# Testing the model

Using your solution so far, test the model on new data.

The new data is located in ‘Bank_data_testing.csv’.

Good luck!

## Import the relevant libraries

In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

## Load the data

Load the training dataset (‘Bank_data.csv’ in the exercise text; read below from `../data/bank_data_full.csv`).

In [2]:
# Load the training data that the models below are fitted on
# (the held-out test set is read later from 'bank_data_testing.csv').
df = pd.read_csv('../data/bank_data_full.csv')

In [3]:
# Inspect the loaded training frame
df

Unnamed: 0.1,Unnamed: 0,interest_rate,credit,march,may,previous,duration,y
0,0,1.334,0.0,1.0,0.0,0.0,117.0,no
1,1,0.767,0.0,0.0,2.0,1.0,274.0,yes
2,2,4.858,0.0,1.0,0.0,0.0,167.0,no
3,3,4.120,0.0,0.0,0.0,0.0,686.0,yes
4,4,4.856,0.0,1.0,0.0,0.0,157.0,no
...,...,...,...,...,...,...,...,...
513,513,1.334,0.0,1.0,0.0,0.0,204.0,no
514,514,0.861,0.0,0.0,2.0,1.0,806.0,yes
515,515,0.879,0.0,0.0,0.0,0.0,290.0,no
516,516,0.877,0.0,0.0,5.0,1.0,473.0,yes


### Declare the dependent and independent variables

Use 'duration' as the independent variable.

In [4]:
# variables
# Predictor for the simple model: the 'duration' column only
x1 = df['duration']
# Encode the outcome as 1 ('yes') / 0 ('no'); any other label would become NaN under .map
y = df['y'].map({'yes':1, 'no':0})

### Simple Logistic Regression

Run the regression and graph the scatter plot.

In [5]:
# logistic regression of y on 'duration' alone
# add_constant supplies the intercept column (reported as 'const' in the summary)
x = sm.add_constant(x1)
reg = sm.Logit(y, x)
# Fit the model; this prints the optimizer's convergence message
results = reg.fit()

Optimization terminated successfully.
         Current function value: 0.546118
         Iterations 7


In [6]:
# interpretation: coefficient table and fit statistics of the duration-only model
results.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,518.0
Model:,Logit,Df Residuals:,516.0
Method:,MLE,Df Model:,1.0
Date:,"Fri, 19 Feb 2021",Pseudo R-squ.:,0.2121
Time:,22:07:33,Log-Likelihood:,-282.89
converged:,True,LL-Null:,-359.05
Covariance Type:,nonrobust,LLR p-value:,5.387e-35

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-1.7001,0.192,-8.863,0.000,-2.076,-1.324
duration,0.0051,0.001,9.159,0.000,0.004,0.006


## Expand the model

We may be omitting many causal factors in our simple logistic model, so we switch to a multivariate logistic regression model instead. Add the ‘interest_rate’, ‘march’, ‘credit’ and ‘previous’ predictors to the model and run the regression again.

### Declare the independent variable(s)

In [16]:
# Expand the model to include more features
# NOTE: this rebinds x1 (previously the single 'duration' column) to a five-predictor frame
x1 = df[['interest_rate', 'march', 'credit', 'previous', 'duration']]
# Same 0/1 encoding of the outcome as in the simple model
y = df['y'].map({'yes':1, 'no':0})

In [17]:
# logistic regression on the expanded predictor set
# add_constant supplies the intercept column (reported as 'const' in the summary)
x = sm.add_constant(x1)
reg = sm.Logit(y, x)
# NOTE: overwrites `results` from the duration-only model; later cells use this fit
results = reg.fit()
results.summary2()

Optimization terminated successfully.
         Current function value: 0.336664
         Iterations 7


0,1,2,3
Model:,Logit,Pseudo R-squared:,0.514
Dependent Variable:,y,AIC:,360.7836
Date:,2021-02-19 22:14,BIC:,386.2834
No. Observations:,518,Log-Likelihood:,-174.39
Df Model:,5,LL-Null:,-359.05
Df Residuals:,512,LLR p-value:,1.2114e-77
Converged:,1.0000,Scale:,1.0
No. Iterations:,7.0000,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
const,-0.0211,0.3113,-0.0677,0.9460,-0.6313,0.5891
interest_rate,-0.8001,0.0895,-8.9434,0.0000,-0.9755,-0.6248
march,-1.8322,0.3297,-5.5563,0.0000,-2.4785,-1.1859
credit,2.3585,1.0875,2.1688,0.0301,0.2271,4.4900
previous,1.5363,0.5010,3.0666,0.0022,0.5544,2.5182
duration,0.0070,0.0007,9.3810,0.0000,0.0055,0.0084


### Confusion Matrix

Find the confusion matrix of the model and estimate its accuracy. 

<i> For convenience we have already provided you with a function that finds the confusion matrix and the model accuracy.</i>

In [9]:
def confusion_matrix(data, actual_values, model):
    """Build a fitted model's confusion matrix at a 0.5 cut-off and compute its accuracy.

    Parameters
    ----------
    data : data frame or array
        Inputs formatted in the same way as the data the model was fitted on
        (without the actual values), e.g. const, var1, var2, etc.
        Column order is very important.
    actual_values : data frame or array
        The observed outcomes. For a logistic regression this should be a
        single column of 0s and 1s.
    model : a LogitResults object
        The fitted model; only its ``predict`` method is called.

    Returns
    -------
    tuple
        ``(cm, accuracy)`` where ``cm`` is the 2x2 array of counts (actual
        class along the rows, predicted class along the columns) and
        ``accuracy`` is the share of observations on the diagonal.
    """
    # Predicted values from the fitted model
    probabilities = model.predict(data)

    # The same edges are used on both axes: values in [0, 0.5) count as 0,
    # values in [0.5, 1] count as 1.
    edges = np.array([0, 0.5, 1])
    counts, _, _ = np.histogram2d(actual_values, probabilities, bins=edges)

    # Correct classifications sit on the diagonal
    hits = counts[0, 0] + counts[1, 1]
    return counts, hits / counts.sum()

In [18]:
# Confusion matrix and accuracy on the training data (the same x and y the model was fitted on)
confusion_matrix(x, y, results)

(array([[218.,  41.],
        [ 30., 229.]]),
 0.862934362934363)

## Test the model

Load the test data from the ‘Bank_data_testing.csv’ file provided. (Remember to convert the outcome variable ‘y’ to binary 0/1 values.)

### Load new data 

In [28]:
# Load the held-out test data
test = pd.read_csv('../data/bank_data_testing.csv')

In [29]:
# Encode the outcome as 1 ('yes') / 0 ('no'). The identity entries for 1 and 0
# keep this cell idempotent: without them, re-running it on the already-encoded
# column would map every value to NaN.
test['y'] = test['y'].map({'yes': 1, 'no': 0, 1: 1, 0: 0})
# Drop the leftover index column. errors='ignore' turns a re-run into a no-op
# instead of a KeyError once the column has already been removed.
test = test.drop(['Unnamed: 0'], axis=1, errors='ignore')
test

Unnamed: 0,interest_rate,credit,march,may,previous,duration,y
0,1.313,0.0,1.0,0.0,0.0,487.0,0
1,4.961,0.0,0.0,0.0,0.0,132.0,0
2,4.856,0.0,1.0,0.0,0.0,92.0,0
3,4.120,0.0,0.0,0.0,0.0,1468.0,1
4,4.963,0.0,0.0,0.0,0.0,36.0,0
...,...,...,...,...,...,...,...
217,4.963,0.0,0.0,0.0,0.0,458.0,1
218,1.264,0.0,1.0,1.0,0.0,397.0,1
219,1.281,0.0,1.0,0.0,0.0,34.0,0
220,0.739,0.0,0.0,2.0,0.0,233.0,0


### Declare the dependent and the independent variables

In [30]:
# Training outcome (0/1 encoded), shown for reference
y

0      0
1      1
2      0
3      1
4      0
      ..
513    0
514    1
515    0
516    1
517    0
Name: y, Length: 518, dtype: int64

In [31]:
# Training design matrix (including the 'const' column), shown for reference
x

Unnamed: 0,const,interest_rate,march,credit,previous,duration
0,1.0,1.334,1.0,0.0,0.0,117.0
1,1.0,0.767,0.0,0.0,1.0,274.0
2,1.0,4.858,1.0,0.0,0.0,167.0
3,1.0,4.120,0.0,0.0,0.0,686.0
4,1.0,4.856,1.0,0.0,0.0,157.0
...,...,...,...,...,...,...
513,1.0,1.334,1.0,0.0,0.0,204.0
514,1.0,0.861,0.0,0.0,1.0,806.0
515,1.0,0.879,0.0,0.0,0.0,290.0
516,1.0,0.877,0.0,0.0,1.0,473.0


Determine the test confusion matrix and the test accuracy and compare them with the train confusion matrix and the train accuracy.

In [33]:
# Observed outcomes of the test set
test_actual = test['y']
# Same predictors, in the same order, as the training design matrix --
# the column order must match what the model was fitted on.
test_data = test[['interest_rate', 'march', 'credit', 'previous', 'duration']]
# has_constant='add' forces the intercept column to be added. With the default
# ('skip'), add_constant returns the frame unchanged when it detects an existing
# constant column (e.g. a dummy that takes a single value in the test sample),
# which would leave the design matrix one column short of what the model expects.
test_data = sm.add_constant(test_data, has_constant='add')
test_data

Unnamed: 0,const,interest_rate,march,credit,previous,duration
0,1.0,1.313,1.0,0.0,0.0,487.0
1,1.0,4.961,0.0,0.0,0.0,132.0
2,1.0,4.856,1.0,0.0,0.0,92.0
3,1.0,4.120,0.0,0.0,0.0,1468.0
4,1.0,4.963,0.0,0.0,0.0,36.0
...,...,...,...,...,...,...
217,1.0,4.963,0.0,0.0,0.0,458.0
218,1.0,1.264,1.0,0.0,0.0,397.0
219,1.0,1.281,1.0,0.0,0.0,34.0
220,1.0,0.739,0.0,0.0,0.0,233.0


In [34]:
def confusion_matrix(data, actual_values, model):
    """Return ``(confusion matrix, accuracy)`` for ``model`` evaluated on ``data``.

    NOTE: this redefines (and shadows) the helper of the same name defined
    earlier in this notebook; the computation is the same.

    ``data`` must be laid out like the inputs the model was fitted on,
    ``actual_values`` holds the observed 0/1 outcomes, and ``model`` is the
    fitted results object whose ``predict`` method is called. The matrix has
    the actual class along the rows and the predicted class along the columns.
    """
    predicted = model.predict(data)
    # Shared edges for both axes: [0, 0.5) -> class 0, [0.5, 1] -> class 1
    table = np.histogram2d(actual_values, predicted, bins=np.array([0, 0.5, 1]))[0]
    correct = table[0, 0] + table[1, 1]
    return table, correct / table.sum()

In [39]:
# Confusion matrix and accuracy on the test data.
# NOTE: despite its name, `cm` holds the whole (matrix, accuracy) tuple returned by confusion_matrix.
cm = confusion_matrix(test_data, test_actual, results)
cm

(array([[93., 18.],
        [13., 98.]]),
 0.8603603603603603)

In [41]:
# misclassification rate = # misclassified / # elements
# `cm` is the (matrix, accuracy) tuple; element 0 is the 2x2 count matrix
C = cm[0]
# The off-diagonal cells are the misclassified observations
mr = (C[0,1] + C[1,0]) / C.sum()
mr

0.13963963963963963