In [1]:
# The relevant libraries
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
%matplotlib inline
import statsmodels.api as sm
import seaborn as sns
# sns.set()

In [37]:
data = pd.read_csv('Loan_Default_Dataset.csv')
data.columns

Index(['ID', 'isOffender', 'Term', 'Gender', 'Purpose', 'Home_Ownership',
       'Age', 'Credit Score'],
      dtype='object')

In [3]:
data = data.drop('ID', axis=1)

### Making sure the data is properly cleaned

In [4]:
data['Age'].unique()

array(['>25', '20-25'], dtype=object)

In [5]:
data['Credit Score'].unique()

array(['300-500', '>500'], dtype=object)

In [6]:
data['Home_Ownership'].unique()

array(['Mortgage', 'Rent', 'Own'], dtype=object)

In [7]:
data['Purpose'].unique()

array(['House', 'Car', 'Other', 'Personal', 'Wedding', 'Medical', 'other'],
      dtype=object)

Here, Other and other are two different factors, so i would merge them together.

In [8]:
# This is to merge the other variables spelt differently together
import fuzzywuzzy
from fuzzywuzzy import process
import chardet



In [9]:
purpose = data['Purpose']
matches = fuzzywuzzy.process.extract("Other", purpose, limit=10, scorer=fuzzywuzzy.fuzz.token_sort_ratio)

In [10]:
matches

[('Other', 100, 11),
 ('Other', 100, 15),
 ('Other', 100, 41),
 ('Other', 100, 50),
 ('Other', 100, 62),
 ('Other', 100, 87),
 ('Other', 100, 91),
 ('Other', 100, 97),
 ('Other', 100, 103),
 ('Other', 100, 104)]

In [11]:
def replace_matches_in_column(df, column, string_to_match, min_ratio = 47):
    # get a list of unique strings
    strings = df[column].unique()
    
    # get the top 10 closest matches to our input string
    matches = fuzzywuzzy.process.extract(string_to_match, strings, 
                                         limit=10, scorer=fuzzywuzzy.fuzz.token_sort_ratio)

    # only get matches with a ratio > 90
    close_matches = [matches[0] for matches in matches if matches[1] >= min_ratio]

    # get the rows of all the close matches in our dataframe
    rows_with_matches = df[column].isin(close_matches)

    # replace all rows with close matches with the input matches 
    df.loc[rows_with_matches, column] = string_to_match
    
    # let us know the function's done
    print("All done!")

In [12]:
replace_matches_in_column(df=data, column='Purpose', string_to_match="other")


All done!


In [13]:
data['Purpose'].unique()

array(['House', 'Car', 'other', 'Personal', 'Wedding', 'Medical'],
      dtype=object)

In [14]:
data['Gender'].unique()

array(['Female', 'Male'], dtype=object)

In [15]:
data['Term'].unique()

array(['36 months', '60 months'], dtype=object)

In [16]:
data.isnull().sum()

isOffender        0
Term              0
Gender            0
Purpose           0
Home_Ownership    0
Age               0
Credit Score      0
dtype: int64

In [17]:
data.columns

Index(['isOffender', 'Term', 'Gender', 'Purpose', 'Home_Ownership', 'Age',
       'Credit Score'],
      dtype='object')

In [18]:
cleaned_data = data

The dataset is ready for further analysis

### Convert categorical variables to numeric

In [19]:
cleaned_data.head()

Unnamed: 0,isOffender,Term,Gender,Purpose,Home_Ownership,Age,Credit Score
0,1,36 months,Female,House,Mortgage,>25,300-500
1,0,36 months,Female,House,Rent,20-25,>500
2,1,36 months,Female,House,Rent,>25,300-500
3,1,36 months,Female,Car,Mortgage,>25,300-500
4,1,36 months,Female,House,Rent,>25,300-500


In [20]:
data_with_dummies = pd.get_dummies(cleaned_data, drop_first = True)

In [36]:
data_with_dummies.columns

Index(['isOffender', 'Term_60 months', 'Gender_Male', 'Purpose_House',
       'Purpose_Medical', 'Purpose_Personal', 'Purpose_Wedding',
       'Purpose_other', 'Home_Ownership_Own', 'Home_Ownership_Rent', 'Age_>25',
       'Credit Score_>500'],
      dtype='object')

### Selecting the Dependent and Independent variables

In [35]:
len(data_with_dummies.columns)

12

In [23]:
y = data_with_dummies['isOffender']
x1 = data_with_dummies[['Term_60 months', 'Gender_Male', 'Purpose_House',
       'Purpose_Medical', 'Purpose_Personal', 'Purpose_Wedding',
       'Purpose_other', 'Home_Ownership_Own', 'Home_Ownership_Rent', 'Age_>25',
       'Credit Score_>500']]
X = sm.add_constant(x1)

### Create the Logistic regression

In [24]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state = 45)

In [25]:
from sklearn.linear_model import LogisticRegression

# instantiate the model (using the default parameters)
logreg = LogisticRegression()

# fit the model with data
logreg.fit(X_train,y_train)

y_pred=logreg.predict(X_test)

### Testing the model

In [26]:
# This is used to describe the performance of the classification model
def confusion_matrix(data,actual_values,model):
 
        #Predict the values using the Logit model
        pred_values = model.predict(data)
        # Specify the bins 
        bins=np.array([0,0.5,1])
        # Create a histogram, where if values are between 0 and 0.5 tell will be considered 0
        # if they are between 0.5 and 1, they will be considered 1
        cm = np.histogram2d(actual_values, pred_values, bins=bins)[0]
        # Calculate the accuracy
        accuracy = (cm[0,0]+cm[1,1])/cm.sum()
        # Return the confusion matrix and 
        return cm, accuracy

In [27]:
confusion_matrix(X_train,y_train,logreg)

(array([[1853., 1026.],
        [ 743., 5039.]]),
 0.7957510680060039)

In [28]:
cm, accuracy = confusion_matrix(X_test,y_test,logreg)

In [29]:
# Test Confusion Matrix
cm

array([[ 616.,  332.],
       [ 250., 1689.]])

In [30]:
correct_pred = cm[0,0] + cm[1,1]
print(f'Correct Predictions: {correct_pred}')

total_pred = cm.sum()
print(f'Total Predictions: {total_pred}')

Correct Predictions: 2305.0
Total Predictions: 2887.0


In [31]:
print(f'The model got {correct_pred} predictions right out of {total_pred} observations.\n')
print(f'Therefore the accuracy of this model is {accuracy.round(2)*(100/1)}%')

The model got 2305.0 predictions right out of 2887.0 observations.

Therefore the accuracy of this model is 80.0%
