Import libraries

In [1]:
import pandas as pd
import numpy as np

Load the dataset

In [2]:
data_path='/Users/starboy/Documents/Projects/Bank_loan_status/Dataset/credit_train.csv'

Normalise column names

In [3]:
def load_normalize(path):
    df = pd.read_csv(path)
    df.columns = [c.lower().strip().replace(' ','_') for c in df.columns]
    return df

In [4]:
df=load_normalize(data_path)

Basic Exploration

In [5]:
print(f"Dataset shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")
print(f"\nFirst 5 rows: ")
print(df.head())

Dataset shape: (100514, 19)
Columns: ['loan_id', 'customer_id', 'loan_status', 'current_loan_amount', 'term', 'credit_score', 'annual_income', 'years_in_current_job', 'home_ownership', 'purpose', 'monthly_debt', 'years_of_credit_history', 'months_since_last_delinquent', 'number_of_open_accounts', 'number_of_credit_problems', 'current_credit_balance', 'maximum_open_credit', 'bankruptcies', 'tax_liens']

First 5 rows: 
                                loan_id                           customer_id  \
0  14dd8831-6af5-400b-83ec-68e61888a048  981165ec-3274-42f5-a3b4-d104041a9ca9   
1  4771cc26-131a-45db-b5aa-537ea4ba5342  2de017a3-2e01-49cb-a581-08169e83be29   
2  4eed4e6a-aa2f-4c91-8651-ce984ee8fb26  5efb2b2b-bf11-4dfd-a572-3761a2694725   
3  77598f7b-32e7-4e3b-a6e5-06ba0d98fe8a  e777faab-98ae-45af-9a86-7ce5b33b1011   
4  d4062e70-befa-4995-8643-a0de73938182  81536ad9-5ccf-4eb8-befb-47a4d608658e   

  loan_status  current_loan_amount        term  credit_score  annual_income  \
0  Fully Paid

Check Datatypes

In [6]:
print(f"\nData info: {df.info()}")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100514 entries, 0 to 100513
Data columns (total 19 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   loan_id                       100000 non-null  object 
 1   customer_id                   100000 non-null  object 
 2   loan_status                   100000 non-null  object 
 3   current_loan_amount           100000 non-null  float64
 4   term                          100000 non-null  object 
 5   credit_score                  80846 non-null   float64
 6   annual_income                 80846 non-null   float64
 7   years_in_current_job          95778 non-null   object 
 8   home_ownership                100000 non-null  object 
 9   purpose                       100000 non-null  object 
 10  monthly_debt                  100000 non-null  float64
 11  years_of_credit_history       100000 non-null  float64
 12  months_since_last_delinquent  46859 non-null

In [8]:
print(f"\nMissing Values: ")
print(df.isnull().sum())


Missing Values: 
loan_id                           514
customer_id                       514
loan_status                       514
current_loan_amount               514
term                              514
credit_score                    19668
annual_income                   19668
years_in_current_job             4736
home_ownership                    514
purpose                           514
monthly_debt                      514
years_of_credit_history           514
months_since_last_delinquent    53655
number_of_open_accounts           514
number_of_credit_problems         514
current_credit_balance            514
maximum_open_credit               516
bankruptcies                      718
tax_liens                         524
dtype: int64


Create a copy for cleaning

In [9]:
dfc = df.copy()

In [10]:
dfc=dfc.dropna(subset=['loan_status'])
print(f"After removing missing loan status: {len(dfc)} records")

After removing missing loan status: 100000 records


In [11]:
print(dfc.isnull().sum())

loan_id                             0
customer_id                         0
loan_status                         0
current_loan_amount                 0
term                                0
credit_score                    19154
annual_income                   19154
years_in_current_job             4222
home_ownership                      0
purpose                             0
monthly_debt                        0
years_of_credit_history             0
months_since_last_delinquent    53141
number_of_open_accounts             0
number_of_credit_problems           0
current_credit_balance              0
maximum_open_credit                 2
bankruptcies                      204
tax_liens                          10
dtype: int64


In [12]:
numerical_cols=[
                'current_loan_amount',
                'credit_score',
                'annual_income',
                'monthly_debt',
                'years_of_credit_history',
                'months_since_last_delinquent',
                'number_of_open_accounts',
                'number_of_credit_problems',
                'current_credit_balance',
                'maximum_open_credit',
                'bankruptcies',
                'tax_liens']

In [13]:
for col in numerical_cols:
    dfc[col].fillna(dfc[col].median(),inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dfc[col].fillna(dfc[col].median(),inplace=True)


Categorical imputation

In [14]:
dfc.years_in_current_job.fillna("Unknown",inplace=True)
dfc.term.fillna(dfc.term.mode()[0],inplace=True)
dfc.home_ownership.fillna('Unknown',inplace=True)
dfc.purpose.fillna('Unknown',inplace=True)


In [15]:
print(dfc.isnull().sum())

loan_id                         0
customer_id                     0
loan_status                     0
current_loan_amount             0
term                            0
credit_score                    0
annual_income                   0
years_in_current_job            0
home_ownership                  0
purpose                         0
monthly_debt                    0
years_of_credit_history         0
months_since_last_delinquent    0
number_of_open_accounts         0
number_of_credit_problems       0
current_credit_balance          0
maximum_open_credit             0
bankruptcies                    0
tax_liens                       0
dtype: int64


In [16]:
dfc.months_since_last_delinquent.fillna(999,inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dfc.months_since_last_delinquent.fillna(999,inplace=True)


In [17]:
dfc.to_csv("Cleaned_data.csv",index=False)
print("Cleaned dataset is saved")

Cleaned dataset is saved
