# Importing libraries

In [157]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import wget
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV

In [68]:
column_names=['Gender', 'Age', 'Debt', 'Married', 'BankCustomer', 'EducationLevel', 'Ethnicity', 'YearsEmployed', 'PriorDefault', 'Employed', 'CreditScore', 'DriversLicense', 'Citizen', 'ZipCode', 'Income', 'ApprovalStatus']

In [69]:
cc_app= pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/credit-screening/crx.data', header=None, names=column_names)

In [70]:
cc_app.head()

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,EducationLevel,Ethnicity,YearsEmployed,PriorDefault,Employed,CreditScore,DriversLicense,Citizen,ZipCode,Income,ApprovalStatus
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+


In [71]:
cc_app.describe()

Unnamed: 0,Debt,YearsEmployed,CreditScore,Income
count,690.0,690.0,690.0,690.0
mean,4.758725,2.223406,2.4,1017.385507
std,4.978163,3.346513,4.86294,5210.102598
min,0.0,0.0,0.0,0.0
25%,1.0,0.165,0.0,0.0
50%,2.75,1.0,0.0,5.0
75%,7.2075,2.625,3.0,395.5
max,28.0,28.5,67.0,100000.0


In [72]:
cc_app.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 16 columns):
Gender            690 non-null object
Age               690 non-null object
Debt              690 non-null float64
Married           690 non-null object
BankCustomer      690 non-null object
EducationLevel    690 non-null object
Ethnicity         690 non-null object
YearsEmployed     690 non-null float64
PriorDefault      690 non-null object
Employed          690 non-null object
CreditScore       690 non-null int64
DriversLicense    690 non-null object
Citizen           690 non-null object
ZipCode           690 non-null object
Income            690 non-null int64
ApprovalStatus    690 non-null object
dtypes: float64(2), int64(2), object(12)
memory usage: 86.3+ KB


In [73]:
cc_app.tail(17)

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,EducationLevel,Ethnicity,YearsEmployed,PriorDefault,Employed,CreditScore,DriversLicense,Citizen,ZipCode,Income,ApprovalStatus
673,?,29.5,2.0,y,p,e,h,2.0,f,f,0,f,g,256,17,-
674,a,37.33,2.5,u,g,i,h,0.21,f,f,0,f,g,260,246,-
675,a,41.58,1.04,u,g,aa,v,0.665,f,f,0,f,g,240,237,-
676,a,30.58,10.665,u,g,q,h,0.085,f,t,12,t,g,129,3,-
677,b,19.42,7.25,u,g,m,v,0.04,f,t,1,f,g,100,1,-
678,a,17.92,10.21,u,g,ff,ff,0.0,f,f,0,f,g,0,50,-
679,a,20.08,1.25,u,g,c,v,0.0,f,f,0,f,g,0,0,-
680,b,19.5,0.29,u,g,k,v,0.29,f,f,0,f,g,280,364,-
681,b,27.83,1.0,y,p,d,h,3.0,f,f,0,f,g,176,537,-
682,b,17.08,3.29,u,g,i,v,0.335,f,f,0,t,g,140,2,-


## Replacing missing values from '?' to NaN

In [82]:
cc_app=cc_app.replace(to_replace=['?'], value=np.NaN)
cc_app.tail(17)

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,EducationLevel,Ethnicity,YearsEmployed,PriorDefault,Employed,CreditScore,DriversLicense,Citizen,ZipCode,Income,ApprovalStatus
673,,29.5,2.0,y,p,e,h,2.0,f,f,0,f,g,256,17,-
674,a,37.33,2.5,u,g,i,h,0.21,f,f,0,f,g,260,246,-
675,a,41.58,1.04,u,g,aa,v,0.665,f,f,0,f,g,240,237,-
676,a,30.58,10.665,u,g,q,h,0.085,f,t,12,t,g,129,3,-
677,b,19.42,7.25,u,g,m,v,0.04,f,t,1,f,g,100,1,-
678,a,17.92,10.21,u,g,ff,ff,0.0,f,f,0,f,g,0,50,-
679,a,20.08,1.25,u,g,c,v,0.0,f,f,0,f,g,0,0,-
680,b,19.5,0.29,u,g,k,v,0.29,f,f,0,f,g,280,364,-
681,b,27.83,1.0,y,p,d,h,3.0,f,f,0,f,g,176,537,-
682,b,17.08,3.29,u,g,i,v,0.335,f,f,0,t,g,140,2,-


### Impute the missing values with mean imputation

In [83]:
cc_app.fillna(cc_app.mean(), inplace=True)

In [84]:
pd.isna(cc_app)
cc_app.tail(17)

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,EducationLevel,Ethnicity,YearsEmployed,PriorDefault,Employed,CreditScore,DriversLicense,Citizen,ZipCode,Income,ApprovalStatus
673,,29.5,2.0,y,p,e,h,2.0,f,f,0,f,g,256,17,-
674,a,37.33,2.5,u,g,i,h,0.21,f,f,0,f,g,260,246,-
675,a,41.58,1.04,u,g,aa,v,0.665,f,f,0,f,g,240,237,-
676,a,30.58,10.665,u,g,q,h,0.085,f,t,12,t,g,129,3,-
677,b,19.42,7.25,u,g,m,v,0.04,f,t,1,f,g,100,1,-
678,a,17.92,10.21,u,g,ff,ff,0.0,f,f,0,f,g,0,50,-
679,a,20.08,1.25,u,g,c,v,0.0,f,f,0,f,g,0,0,-
680,b,19.5,0.29,u,g,k,v,0.29,f,f,0,f,g,280,364,-
681,b,27.83,1.0,y,p,d,h,3.0,f,f,0,f,g,176,537,-
682,b,17.08,3.29,u,g,i,v,0.335,f,f,0,t,g,140,2,-


In [92]:
# Iterate over each column of cc_apps
for col in cc_app:
    # Check if the column is of object type
    if cc_app[col].dtype == 'object':
        # Impute with the most frequent value
        cc_app = cc_app.fillna(cc_app[col].value_counts().index[0])

In [98]:
cc_app.isna().sum()

Gender            0
Age               0
Debt              0
Married           0
BankCustomer      0
EducationLevel    0
Ethnicity         0
YearsEmployed     0
PriorDefault      0
Employed          0
CreditScore       0
DriversLicense    0
Citizen           0
ZipCode           0
Income            0
ApprovalStatus    0
dtype: int64

### Preprocessing data

### converting non-numerical data into numerical using LabelEncoder

In [101]:
le= LabelEncoder()

In [107]:
for col in cc_app: 
    # compare if the column is object
    if cc_app[col].dtype==object:
        # use encoder to do the numeric transformation
        cc_app[col]=le.fit_transform(cc_app[col])
cc_app.head()

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,EducationLevel,Ethnicity,YearsEmployed,PriorDefault,Employed,CreditScore,DriversLicense,Citizen,ZipCode,Income,ApprovalStatus
0,1,156,0.0,2,1,13,8,1.25,1,1,1,0,0,68,0,0
1,0,328,4.46,2,1,11,4,3.04,1,1,6,0,0,11,560,0
2,0,89,0.5,2,1,11,4,1.5,1,0,0,0,0,96,824,0
3,1,125,1.54,2,1,13,8,3.75,1,1,5,1,0,31,3,0
4,1,43,5.625,2,1,13,8,1.71,1,0,0,0,2,37,0,0


## Feature selecting
### As we know that DriverLicence and ZipCode has minimal affect in determining the credit card approvals when compared to the other features. Hence we will drop the 2 features 

In [110]:
cc_app= cc_app.drop(columns=['DriversLicense', 'ZipCode'], axis= 1)

In [112]:
cc_app.head()

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,EducationLevel,Ethnicity,YearsEmployed,PriorDefault,Employed,CreditScore,Citizen,Income,ApprovalStatus
0,1,156,0.0,2,1,13,8,1.25,1,1,1,0,0,0
1,0,328,4.46,2,1,11,4,3.04,1,1,6,0,560,0
2,0,89,0.5,2,1,11,4,1.5,1,0,0,0,824,0
3,1,125,1.54,2,1,13,8,3.75,1,1,5,0,3,0
4,1,43,5.625,2,1,13,8,1.71,1,0,0,2,0,0


## Train- Test Split

In [113]:
y = cc_app['ApprovalStatus']

In [115]:
cc_app=cc_app.drop('ApprovalStatus', axis = 1)

In [116]:
cc_app.head()

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,EducationLevel,Ethnicity,YearsEmployed,PriorDefault,Employed,CreditScore,Citizen,Income
0,1,156,0.0,2,1,13,8,1.25,1,1,1,0,0
1,0,328,4.46,2,1,11,4,3.04,1,1,6,0,560
2,0,89,0.5,2,1,11,4,1.5,1,0,0,0,824
3,1,125,1.54,2,1,13,8,3.75,1,1,5,0,3
4,1,43,5.625,2,1,13,8,1.71,1,0,0,2,0


In [117]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: ApprovalStatus, dtype: int64

In [119]:
x_train, x_test, y_train, y_test = train_test_split(cc_app, y, test_size = 0.33, random_state = 53)

### Feature Scaling

In [131]:
scaler= MinMaxScaler(feature_range=(0,1))

In [132]:
rescaledX_train= scaler.fit_transform(x_train)

In [135]:
rescaledX_test = scaler.fit_transform(x_test)

### Build Classification Model

In [136]:
logreg= LogisticRegression()

In [137]:
logreg.fit(rescaledX_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [146]:
y_pred = logreg.predict(rescaledX_test)

In [158]:
score = accuracy_score(y_test, y_pred)

In [159]:
print(score)

0.8596491228070176


In [160]:
cm= confusion_matrix(y_test, y_pred)

In [161]:
print(cm)

[[ 75  17]
 [ 15 121]]


## Hyperparameter tunning using GridSearchCV

In [162]:
tol = [0.01, 0.001, 0.0001]

In [163]:
max_iter = [100, 150, 200]

In [168]:
param_grid = dict(tol= tol ,max_iter= max_iter)

In [171]:
param_grid

{'tol': [0.01, 0.001, 0.0001], 'max_iter': [100, 150, 200]}

In [172]:
grid_model= GridSearchCV(estimator=logreg, param_grid=param_grid, cv=5)

In [173]:
rescaledX = scaler.fit_transform(cc_app)

In [174]:
grid_model_result= grid_model.fit(rescaledX, y)

In [177]:
grid_model_result

GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'tol': [0.01, 0.001, 0.0001], 'max_iter': [100, 150, 200]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [178]:
best_scores, best_param= grid_model_result.best_score_, grid_model_result.best_params_

In [180]:
print("best score:{}, best parameters: {}".format(best_scores, best_param))

best score:0.8536231884057971, best parameters: {'max_iter': 100, 'tol': 0.01}
