### 1. Import modules and data

In [13]:
import pandas as pd

cc_apps = pd.read_csv('Documents/Credit_Cards/cc_approvals.csv', header = None)

cc_apps.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+


### 2. Data Inspection

In [14]:
# Inspecting the applications and the data
print(cc_apps.describe())

print('\n')

print(cc_apps.info())

print('\n')

print(cc_apps.tail())

               2           7          10             14
count  690.000000  690.000000  690.00000     690.000000
mean     4.758725    2.223406    2.40000    1017.385507
std      4.978163    3.346513    4.86294    5210.102598
min      0.000000    0.000000    0.00000       0.000000
25%      1.000000    0.165000    0.00000       0.000000
50%      2.750000    1.000000    0.00000       5.000000
75%      7.207500    2.625000    3.00000     395.500000
max     28.000000   28.500000   67.00000  100000.000000


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 16 columns):
0     690 non-null object
1     690 non-null object
2     690 non-null float64
3     690 non-null object
4     690 non-null object
5     690 non-null object
6     690 non-null object
7     690 non-null float64
8     690 non-null object
9     690 non-null object
10    690 non-null int64
11    690 non-null object
12    690 non-null object
13    690 non-null object
14    690 non-null int64

### 3. Handling missing values

In [15]:
# A lot of values have ? Let's replace these values with NaN
import numpy as np

cc_apps = cc_apps.replace('?', np.NaN)

cc_apps.tail(17)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
673,,29.5,2.0,y,p,e,h,2.0,f,f,0,f,g,256,17,-
674,a,37.33,2.5,u,g,i,h,0.21,f,f,0,f,g,260,246,-
675,a,41.58,1.04,u,g,aa,v,0.665,f,f,0,f,g,240,237,-
676,a,30.58,10.665,u,g,q,h,0.085,f,t,12,t,g,129,3,-
677,b,19.42,7.25,u,g,m,v,0.04,f,t,1,f,g,100,1,-
678,a,17.92,10.21,u,g,ff,ff,0.0,f,f,0,f,g,0,50,-
679,a,20.08,1.25,u,g,c,v,0.0,f,f,0,f,g,0,0,-
680,b,19.5,0.29,u,g,k,v,0.29,f,f,0,f,g,280,364,-
681,b,27.83,1.0,y,p,d,h,3.0,f,f,0,f,g,176,537,-
682,b,17.08,3.29,u,g,i,v,0.335,f,f,0,t,g,140,2,-


In [16]:
# Filling the missing values with the mean
cc_apps.fillna(cc_apps.mean(), inplace = True)

# Verify if we don't have missing values any more
print('number of NaN :')
print(cc_apps.isnull().values.sum())
print('\n')
print(cc_apps.isnull().sum())

number of NaN :
67


0     12
1     12
2      0
3      6
4      6
5      9
6      9
7      0
8      0
9      0
10     0
11     0
12     0
13    13
14     0
15     0
dtype: int64


In [17]:
# Iterate over each column of cc_apps to replace the NaN values in non numeric columns
for col in cc_apps.columns:
    if cc_apps[col].dtypes == 'object':
        # Take the most frequent value in the column to replace the NaN
        cc_apps = cc_apps.fillna(cc_apps[col].value_counts().index[0])
        
# Verify if we don't have NaN values any more
print('number of NaN :')
print(cc_apps.isnull().values.sum())
print('\n')
print(cc_apps.isnull().sum())

number of NaN :
0


0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
dtype: int64


### 4. Preprocessing the data and feature engineering

In [18]:
# We will use label encoder
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

# Iterate over all the values of each column and extract their dtypes
for col in cc_apps.columns:
    if cc_apps[col].values.dtype == 'object':
        # Use LabelEncoder to do the numeric transformation
        cc_apps[col] = le.fit_transform(cc_apps[col])

cc_apps.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,1,156,0.0,2,1,13,8,1.25,1,1,1,0,0,68,0,0
1,0,328,4.46,2,1,11,4,3.04,1,1,6,0,0,11,560,0
2,0,89,0.5,2,1,11,4,1.5,1,0,0,0,0,96,824,0
3,1,125,1.54,2,1,13,8,3.75,1,1,5,1,0,31,3,0
4,1,43,5.625,2,1,13,8,1.71,1,0,0,0,2,37,0,0


### 5. Split the data into Train and Test datasets

In [19]:
# Import train_test_split
from sklearn.model_selection import train_test_split

# Drop the features 11 (Drivers Licence) and 13 (Zip code), they are not useful for the prediction
cc_apps = cc_apps.drop([11, 13], axis = 1)
# Transform the DataFrame into a numpy array
cc_apps = cc_apps.values

# We will separate the features (X) that we will use in the training and the prediction (y) column (column 13)
X, y = cc_apps[:, 0:13], cc_apps[:,13]

# Split into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 42)

In [20]:
# Import MinMaxScaler to scale all the features from 0 to 1
from sklearn.preprocessing import MinMaxScaler

# Instantiate MinMaxScaler and use it to rescale X_train and X_test
scaler = MinMaxScaler(feature_range = (0, 1))
rescaledX_train = scaler.fit_transform(X_train)
rescaledX_test = scaler.fit_transform(X_test)

### 6. Fit a logistic regression model

In [21]:
# Import LogisticRegression
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression()

# Training the model
logreg.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

### 7. Making predictions and evaluating the model

In [22]:
# Import confusion_matrix
from sklearn.metrics import confusion_matrix

# Use logreg to predict instances from the test set and store it
y_pred = logreg.predict(rescaledX_test)

# Get the accuracy score of logreg model and print it
print("Accuracy of logistic regression classifier: ", logreg.score(rescaledX_test, y_test))

# Print the confusion matrix of the logreg model
confusion_matrix(y_test, y_pred)

Accuracy of logistic regression classifier:  0.8421052631578947


array([[95,  8],
       [28, 97]])

For the confusion matrix, the first element of the of the first row of the confusion matrix denotes the true negatives meaning the number of negative instances (denied applications) predicted by the model correctly. And the last element of the second row of the confusion matrix denotes the true positives meaning the number of positive instances (approved applications) predicted by the model correctly.

### 8. making the model perform better

In [23]:
# Import GridSearchCV
from sklearn.model_selection import GridSearchCV

# Define the grid of values for tol and max_iter
tol = [0.01, 0.001, 0.0001]
max_iter = [100, 150, 200]

# Create a dictionary where tol and max_iter are keys and the lists of their values are corresponding values
param_grid = dict(tol = tol, max_iter = max_iter)

In [24]:
# ---Finding the best performing model---

# Instantiate GridSearchCV with the required parameters
grid_model = GridSearchCV(estimator=logreg, param_grid=param_grid, cv=5)

# Use scaler to rescale X and assign it to rescaledX
rescaledX = scaler.fit_transform(X)

# Fit data to grid_model
grid_model_result = grid_model.fit(rescaledX, y)

# Summarize results
best_score, best_params = grid_model_result.best_score_, grid_model_result.best_params_
print("Best: %f using %s" % (best_score, best_params))

Best: 0.853623 using {'max_iter': 100, 'tol': 0.01}


