In [1]:
#importing required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
#import warnings
#warnings.filterwarnings("ignore")

#### Read data
In this demo, we work with the credit_risk dataset, in which each customer is assigned a class label - good or bad - based on a set of customer attributes.

In [2]:
# Load the credit-risk records from the CSV file into a DataFrame
csv_path = "datasets/credit_risk.csv"
credit_data = pd.read_csv(csv_path)

In [3]:
# Preview the first five rows to get a feel for the columns and the kind of values they hold
credit_data.head()

Unnamed: 0,over_draft,credit_usage,credit_history,purpose,current_balance,Average_Credit_Balance,employment,location,personal_status,other_parties,...,property_magnitude,cc_age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,<0,6,critical/other existing credit,radio/tv,1169,no known savings,>=7,4,male single,none,...,real estate,67,none,own,2,skilled,1,yes,yes,good
1,0<=X<200,48,existing paid,radio/tv,5951,<100,1<=X<4,2,female div/dep/mar,none,...,real estate,22,none,own,1,skilled,1,none,yes,bad
2,no checking,12,critical/other existing credit,education,2096,<100,4<=X<7,2,male single,none,...,real estate,49,none,own,1,unskilled resident,2,none,yes,good
3,<0,42,existing paid,furniture/equipment,7882,<100,4<=X<7,2,male single,guarantor,...,life insurance,45,none,for free,1,skilled,2,none,yes,good
4,<0,24,delayed previously,new car,4870,<100,1<=X<4,3,male single,none,...,no known property,53,none,for free,2,skilled,2,none,yes,bad


#### Feature Engineering
Let us now look at the data to get insights on it, which will help us build a good model.

In [4]:
# Summarise every column: its dtype and its non-null count.
# A non-null count lower than the number of rows would signal missing values,
# in which case data preprocessing has to be done before modelling.
credit_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 21 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   over_draft              1000 non-null   object
 1   credit_usage            1000 non-null   int64 
 2   credit_history          1000 non-null   object
 3   purpose                 1000 non-null   object
 4   current_balance         1000 non-null   int64 
 5   Average_Credit_Balance  1000 non-null   object
 6   employment              1000 non-null   object
 7   location                1000 non-null   int64 
 8   personal_status         1000 non-null   object
 9   other_parties           1000 non-null   object
 10  residence_since         1000 non-null   int64 
 11  property_magnitude      1000 non-null   object
 12  cc_age                  1000 non-null   int64 
 13  other_payment_plans     1000 non-null   object
 14  housing                 1000 non-null   object
 15  exist

In [5]:
# List the distinct labels that the 'class' column (the target column in this analysis) can take
credit_data['class'].unique()

array(['good', 'bad'], dtype=object)

All columns except `class` are used as features (predictors).
The `class` column is the target.

In [6]:
# Predictor column labels: every column of the frame except the 'class' target.
# Note that X holds the column names, not the feature values themselves.
X = credit_data.drop(columns="class").columns

# Target: the 'class' column
y = credit_data["class"]

#### Encoding the categorical variables

In [7]:
# One-hot encode the predictors with get_dummies(): each categorical column is
# replaced by numeric indicator columns, while numeric columns are kept as they are.
feature_frame = credit_data[X]
credit_data_encoded = pd.get_dummies(feature_frame)


In [8]:
# Inspect the (rows, columns) shape of the encoded frame. Notice that, after
# encoding, the number of predictor/feature columns has increased, because each
# categorical column is expanded into one column per category.
credit_data_encoded.shape

(1000, 61)

In [9]:
# List the encoded column names: numeric columns keep their original names, while
# each categorical column appears as '<column>_<category>' indicator columns.
credit_data_encoded.columns

Index(['credit_usage', 'current_balance', 'location', 'residence_since',
       'cc_age', 'existing_credits', 'num_dependents', 'over_draft_0<=X<200',
       'over_draft_<0', 'over_draft_>=200', 'over_draft_no checking',
       'credit_history_all paid',
       'credit_history_critical/other existing credit',
       'credit_history_delayed previously', 'credit_history_existing paid',
       'credit_history_no credits/all paid', 'purpose_business',
       'purpose_domestic appliance', 'purpose_education',
       'purpose_furniture/equipment', 'purpose_new car', 'purpose_other',
       'purpose_radio/tv', 'purpose_repairs', 'purpose_retraining',
       'purpose_used car', 'Average_Credit_Balance_100<=X<500',
       'Average_Credit_Balance_500<=X<1000', 'Average_Credit_Balance_<100',
       'Average_Credit_Balance_>=1000',
       'Average_Credit_Balance_no known savings', 'employment_1<=X<4',
       'employment_4<=X<7', 'employment_<1', 'employment_>=7',
       'employment_unemployed', 'p

#### Splitting the data into train and test set in a ratio of 85:15

In [10]:
# Importing the required module
from sklearn.model_selection import train_test_split

# Hold out 15% of the rows as the test set; the fixed random_state makes the
# split repeatable from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    credit_data_encoded,
    y,
    test_size=0.15,
    random_state=100,
)

# Report the shape of each of the four resulting pieces
for label, part in (
    ("Shape of X_train:", X_train),
    ("Shape of y_train:", y_train),
    ("Shape of X_test:", X_test),
    ("Shape of y_test:", y_test),
):
    print(label, part.shape)

Shape of X_train: (850, 61)
Shape of y_train: (850,)
Shape of X_test: (150, 61)
Shape of y_test: (150,)


#### Building a logistic regression model

In [11]:
# Importing the estimator used for model building
from sklearn.linear_model import LogisticRegression

# Create the (not yet trained) logistic regression estimator.
# NOTE(review): max_iter=400 presumably gives the solver extra iterations to
# converge on the unscaled features - confirm.
model = LogisticRegression(max_iter=400)

# Train the model on the training split; fit() is left as the last expression
# so that the fitted estimator is displayed as the cell output.
model.fit(X=X_train, y=y_train)

LogisticRegression(max_iter=400)

In [12]:
# Predict the class label for every row of the training set and of the test set
train_predictions, test_predictions = (
    model.predict(features) for features in (X_train, X_test)
)

#### Evaluate model performance on train and test data

In [13]:
# Accuracy = no. of correct predictions / total predictions.
# The model's score() method computes it for the given features and true labels.
train_accuracy = model.score(X_train, y_train)
test_accuracy = model.score(X_test, y_test)

print("Train accuracy = ", train_accuracy)
print("Test accuracy = ", test_accuracy)

Train accuracy =  0.788235294117647
Test accuracy =  0.7333333333333333


#### Confusion Matrix
A confusion matrix helps assess how well the model performs on each individual class of the outcome.
It compares the actual target values (rows) with the predicted target values (columns).

In [14]:
# Importing the required function
from sklearn.metrics import confusion_matrix

# Creating a confusion matrix on the training data.
# Rows are the actual classes and columns are the predicted classes.
# Passing labels=model.classes_ pins the row/column order to exactly the labels
# used to name the DataFrame axes below, instead of relying on the order that
# confusion_matrix would otherwise infer from the data.
train_conf_matrix = confusion_matrix(y_train, train_predictions, labels=model.classes_)

# Converting the train_conf_matrix into a DataFrame for better readability
pd.DataFrame(train_conf_matrix, columns=model.classes_, index=model.classes_)

Unnamed: 0,bad,good
bad,135,122
good,58,535


In [15]:
# Confusion matrix on the test data (rows = actual classes, columns = predicted classes).
# labels=model.classes_ keeps the matrix the same size and order as the axis labels
# given to the DataFrame, even if a class were missing from the test split or
# from the predictions.
test_conf_matrix = confusion_matrix(y_test, test_predictions, labels=model.classes_)
pd.DataFrame(test_conf_matrix, columns=model.classes_, index=model.classes_)

Unnamed: 0,bad,good
bad,18,25
good,15,92


$$
\text{Accuracy} = \frac{\text{No. of correct predictions}}{\text{No. of predictions made}}
$$

In [16]:
# Calculating train accuracy from the confusion matrix.
# Correct predictions lie on the main diagonal (actual class == predicted class),
# so their total is the trace of the matrix; unlike indexing [0][0] and [1][1],
# this does not assume there are exactly two classes.
train_correct_predictions = np.trace(train_conf_matrix)
# Every prediction falls in exactly one cell, so the sum of all cells is the total
train_total_predictions = train_conf_matrix.sum()
train_accuracy = train_correct_predictions/train_total_predictions
print("Train accuracy = ", train_accuracy)

Train accuracy =  0.788235294117647


In [17]:
# Calculating test accuracy from the confusion matrix.
# Correct predictions lie on the main diagonal (actual class == predicted class),
# so their total is the trace of the matrix; unlike indexing [0][0] and [1][1],
# this does not assume there are exactly two classes.
test_correct_predictions = np.trace(test_conf_matrix)
# Every prediction falls in exactly one cell, so the sum of all cells is the total
total_predictions = test_conf_matrix.sum()
test_accuracy = test_correct_predictions/total_predictions

print(test_accuracy)

0.7333333333333333


#### Classification report
* precision for class 'A' = (number of outcomes correctly predicted as class 'A' by the model) /  (total number of instances predicted as class 'A' by the model)
* recall for class 'A' = (number of outcomes correctly predicted as class 'A' by the model) /  (total number of class 'A' instances present in the dataset)
* f1-score for class 'A'- harmonic mean of precision and recall for class 'A'
* support for class 'A' - number of instances whose actual (true) label is class 'A' in the data being evaluated

In [18]:
# Importing the required function
from sklearn.metrics import classification_report

# Build the per-class precision / recall / f1-score / support table for the
# test set, then print it
test_report = classification_report(y_true=y_test, y_pred=test_predictions)
print(test_report)

              precision    recall  f1-score   support

         bad       0.55      0.42      0.47        43
        good       0.79      0.86      0.82       107

    accuracy                           0.73       150
   macro avg       0.67      0.64      0.65       150
weighted avg       0.72      0.73      0.72       150

