# Testing the model

Using your solution so far, test the model on new data.

The new data is located in the ‘Bank-data-testing.csv’ file.

Good luck!

## Import the relevant libraries

In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

## Load the data

Load the ‘Bank-data.csv’ dataset.

In [2]:
# Read the training set as-is (the file's unnamed leading column is dropped in a later cell)
# and preview the first rows.
raw_data = pd.read_csv(filepath_or_buffer='Bank-data.csv')
raw_data.head(n=5)

Unnamed: 0.1,Unnamed: 0,interest_rate,credit,march,may,previous,duration,y
0,0,1.334,0.0,1.0,0.0,0.0,117.0,no
1,1,0.767,0.0,0.0,2.0,1.0,274.0,yes
2,2,4.858,0.0,1.0,0.0,0.0,167.0,no
3,3,4.12,0.0,0.0,0.0,0.0,686.0,yes
4,4,4.856,0.0,1.0,0.0,0.0,157.0,no


In [3]:
# Build the working frame in one chain so raw_data itself is never modified:
# discard the leftover 'Unnamed: 0' column and encode the outcome as 1 (yes) / 0 (no).
data = (
    raw_data
    .drop(columns=['Unnamed: 0'])
    .assign(y=lambda frame: frame['y'].map({'yes': 1, 'no': 0}))
)

In [4]:
# Summary statistics of the cleaned training frame; y is numeric after the mapping, so it is included.
data.describe()

Unnamed: 0,interest_rate,credit,march,may,previous,duration,y
count,518.0,518.0,518.0,518.0,518.0,518.0,518.0
mean,2.835776,0.034749,0.266409,0.388031,0.127413,382.177606,0.5
std,1.876903,0.183321,0.442508,0.814527,0.333758,344.29599,0.500483
min,0.635,0.0,0.0,0.0,0.0,9.0,0.0
25%,1.04275,0.0,0.0,0.0,0.0,155.0,0.0
50%,1.466,0.0,0.0,0.0,0.0,266.5,0.5
75%,4.9565,0.0,1.0,0.0,0.0,482.75,1.0
max,4.97,1.0,1.0,5.0,1.0,2653.0,1.0


### Declare the dependent and independent variables

Use 'duration' as the independent variable.

In [23]:
# Outcome (0/1) first, then the single regressor used by the simple model.
y = data['y']
x1 = data['duration']

### Simple Logistic Regression

Run the regression and graph the scatter plot.

In [6]:
# Add the intercept column to the regressor, set up the logit model and fit it.
x = sm.add_constant(x1)
reg_log = sm.Logit(endog=y, exog=x)
results_log = reg_log.fit()

Optimization terminated successfully.
         Current function value: 0.546118
         Iterations 7


In [7]:
# Fit report for the simple model (constant + duration).
results_log.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,518.0
Model:,Logit,Df Residuals:,516.0
Method:,MLE,Df Model:,1.0
Date:,"Tue, 30 Jul 2019",Pseudo R-squ.:,0.2121
Time:,19:59:32,Log-Likelihood:,-282.89
converged:,True,LL-Null:,-359.05
,,LLR p-value:,5.387e-35

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-1.7001,0.192,-8.863,0.000,-2.076,-1.324
duration,0.0051,0.001,9.159,0.000,0.004,0.006


## Expand the model

We may be omitting many causal factors in our simple logistic model, so we switch to a multivariate logistic regression model instead. Add the ‘interest_rate’, ‘march’, ‘credit’ and ‘previous’ estimators to our model and run the regression again.

### Declare the independent variable(s)

In [28]:
# Every column except the outcome becomes a regressor; note that this keeps 'may' as well.
X1 = data.drop(columns=['y'])

In [29]:
# Same recipe as the simple model, now with the full regressor matrix.
# reg_log / results_log are rebound here, so from this cell on they refer to the expanded model.
X = sm.add_constant(X1)
reg_log = sm.Logit(endog=y, exog=X)
results_log = reg_log.fit()

Optimization terminated successfully.
         Current function value: 0.335942
         Iterations 7


In [30]:
# Fit report for the expanded model.
results_log.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,518.0
Model:,Logit,Df Residuals:,511.0
Method:,MLE,Df Model:,6.0
Date:,"Tue, 30 Jul 2019",Pseudo R-squ.:,0.5153
Time:,21:23:33,Log-Likelihood:,-174.02
converged:,True,LL-Null:,-359.05
,,LLR p-value:,7.579e-77

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.1385,0.339,-0.408,0.683,-0.804,0.527
interest_rate,-0.7802,0.092,-8.471,0.000,-0.961,-0.600
credit,2.4028,1.090,2.205,0.027,0.267,4.538
march,-1.8097,0.332,-5.459,0.000,-2.459,-1.160
may,0.1946,0.229,0.849,0.396,-0.255,0.644
previous,1.2746,0.583,2.186,0.029,0.132,2.417
duration,0.0070,0.001,9.386,0.000,0.006,0.008


### Confusion Matrix

Find the confusion matrix of the model and estimate its accuracy. 

<i> For convenience we have already provided you with a function that finds the confusion matrix and the model accuracy.</i>

In [12]:
def confusion_matrix(data, actual_values, model, threshold=0.5):
    """Return the 2x2 confusion matrix of a fitted binary model and its accuracy.

    Parameters
    ----------
    data : data frame or array
        Inputs formatted in the same way as the data the model was fitted on
        (without the actual values), e.g. const, var1, var2, etc.
        Column order is very important!
    actual_values : data frame or array
        The actual outcomes that belong to ``data``. For a logistic
        regression this should be a single column of 0s and 1s.
    model : a LogitResults object
        The fitted model, e.g. ``results_log`` in this notebook. Only its
        ``predict`` method is called.
    threshold : float, optional
        Cut-off applied to the predicted values: predictions below it count
        as 0, predictions at or above it count as 1. Defaults to 0.5.

    Returns
    -------
    cm : numpy.ndarray
        2x2 array of counts. Rows are the actual classes (0, 1), columns are
        the predicted classes (0, 1).
    accuracy : float
        Share of observations on the main diagonal of ``cm``.

    Raises
    ------
    ValueError
        If ``threshold`` is not between 0 and 1.
    """
    if not 0 <= threshold <= 1:
        raise ValueError("threshold must lie between 0 and 1")

    # Predict the values using the fitted model
    pred_values = model.predict(data)

    # Actual values are always split at 0.5 (0 -> first bin, 1 -> second bin),
    # independently of the cut-off chosen for the predictions.
    actual_bins = np.array([0, 0.5, 1])
    # Predictions in [0, threshold) are considered 0, those in [threshold, 1] are considered 1
    pred_bins = np.array([0, threshold, 1])

    # 2-D histogram of (actual, predicted) pairs: first axis = actual, second axis = predicted
    cm = np.histogram2d(actual_values, pred_values, bins=[actual_bins, pred_bins])[0]

    # Correctly classified observations sit on the diagonal
    accuracy = (cm[0, 0] + cm[1, 1]) / cm.sum()

    # Return the confusion matrix and the accuracy
    return cm, accuracy

In [31]:
# Confusion matrix and accuracy on the data the expanded model was fitted on.
confusion_matrix(X,y,results_log)

(array([[220.,  39.],
        [ 31., 228.]]), 0.8648648648648649)

## Test the model

Load the test data from the ‘Bank-data-testing.csv’ file provided. (Remember to convert the outcome variable ‘y’ into 1s and 0s, exactly as was done for the training data.)

### Load new data 

In [32]:
# Read the held-out set as-is (its unnamed leading column is dropped in the next cell)
# and preview the first rows.
test = pd.read_csv(filepath_or_buffer='Bank-data-testing.csv')
test.head(n=5)

Unnamed: 0.1,Unnamed: 0,interest_rate,credit,march,may,previous,duration,y
0,0,1.313,0.0,1.0,0.0,0.0,487.0,no
1,1,4.961,0.0,0.0,0.0,0.0,132.0,no
2,2,4.856,0.0,1.0,0.0,0.0,92.0,no
3,3,4.12,0.0,0.0,0.0,0.0,1468.0,yes
4,4,4.963,0.0,0.0,0.0,0.0,36.0,no


In [33]:
# Apply the same cleaning as for the training frame, leaving `test` untouched:
# discard the leftover 'Unnamed: 0' column and encode the outcome as 1 (yes) / 0 (no).
test_data = (
    test
    .drop(columns=['Unnamed: 0'])
    .assign(y=lambda frame: frame['y'].map({'yes': 1, 'no': 0}))
)
test_data.describe()

Unnamed: 0,interest_rate,credit,march,may,previous,duration,y
count,222.0,222.0,222.0,222.0,222.0,222.0,222.0
mean,2.922095,0.031532,0.274775,0.346847,0.099099,398.86036,0.5
std,1.891766,0.175144,0.44741,0.75595,0.29947,410.565798,0.50113
min,0.639,0.0,0.0,0.0,0.0,6.0,0.0
25%,1.04925,0.0,0.0,0.0,0.0,144.75,0.0
50%,1.714,0.0,0.0,0.0,0.0,255.5,0.5
75%,4.96,0.0,1.0,0.0,0.0,525.25,1.0
max,4.968,1.0,1.0,4.0,1.0,3643.0,1.0


### Declare the dependent and the independent variables

In [34]:
# Split the test frame into outcome and regressors, mirroring the training setup.
y_test = test_data['y']
X1_test = test_data.drop(['y'],axis=1)
# has_constant='add' forces the intercept column to be added. With the default ('skip'),
# add_constant leaves the frame unchanged whenever one of its columns is already constant,
# and X_test would then end up one column short of what results_log expects.
X_test = sm.add_constant(X1_test, has_constant='add')

Determine the test confusion matrix and the test accuracy and compare them with the train confusion matrix and the train accuracy.

In [35]:
# Confusion matrix and accuracy on the held-out test data.
confusion_matrix(X_test,y_test,results_log)

(array([[94., 17.],
        [12., 99.]]), 0.8693693693693694)

In [36]:
# Training-set result repeated here for a side-by-side comparison with the test result.
confusion_matrix(X,y,results_log)

(array([[220.,  39.],
        [ 31., 228.]]), 0.8648648648648649)