# Loan Default Prediction

In [1]:
import pandas as pd

# Read the loan records from the CSV file into a DataFrame
file_path = 'data/lending-club-loan-fix.csv'
loan = pd.read_csv(filepath_or_buffer=file_path)

## Data Exploration and Understanding

In [2]:
# Preview the top five rows to get a feel for the columns and their values
print(loan.head(n=5))

   loan_amnt        term  int_rate  installment grade sub_grade  \
0     8000.0   36 months     16.20       282.05     C        C4   
1     9000.0   36 months     14.33       309.05     C        C1   
2    12000.0   60 months     12.49       269.92     B        B5   
3    20000.0   36 months      6.62       614.08     A        A2   
4    13600.0   36 months      7.29       421.74     A        A4   

             emp_title emp_length home_ownership  annual_inc  ... open_acc  \
0            Counselor    7 years           RENT     50000.0  ...     13.0   
1  American Portfolios    4 years           RENT     40000.0  ...     13.0   
2       Senior Analyst    4 years       MORTGAGE     73000.0  ...     11.0   
3           QA Manager    9 years       MORTGAGE    110000.0  ...      8.0   
4    Caxton Associates    7 years       MORTGAGE    115000.0  ...      9.0   

  pub_rec revol_bal revol_util total_acc  initial_list_status  \
0     1.0    9034.0       68.4      30.0                    w  

In [3]:
# Display the basic information about the dataframe (row count, columns,
# non-null counts and dtypes).
# DataFrame.info() writes its report straight to stdout and returns None,
# so it is called directly; wrapping it in print() would append a stray
# "None" line after the report.
loan.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79206 entries, 0 to 79205
Data columns (total 27 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   loan_amnt             79206 non-null  float64
 1   term                  79206 non-null  object 
 2   int_rate              79206 non-null  float64
 3   installment           79206 non-null  float64
 4   grade                 79206 non-null  object 
 5   sub_grade             79206 non-null  object 
 6   emp_title             74679 non-null  object 
 7   emp_length            75542 non-null  object 
 8   home_ownership        79206 non-null  object 
 9   annual_inc            79206 non-null  float64
 10  verification_status   79206 non-null  object 
 11  issue_d               79206 non-null  object 
 12  loan_status           79206 non-null  object 
 13  purpose               79206 non-null  object 
 14  title                 78852 non-null  object 
 15  dti                

In [4]:
# Compute count, mean, std, min, quartiles and max for the numeric columns,
# then show the resulting table
summary_stats = loan.describe()
print(summary_stats)

          loan_amnt      int_rate   installment    annual_inc           dti  \
count  79206.000000  79206.000000  79206.000000  7.920600e+04  79206.000000   
mean   14090.385198     13.639490    431.035773  7.399969e+04     17.408581   
std     8343.716293      4.461456    249.793983  5.718671e+04      8.119132   
min      500.000000      5.320000     16.080000  4.200000e+03      0.000000   
25%     8000.000000     10.490000    250.290000  4.500000e+04     11.340000   
50%    12000.000000     13.330000    374.970000  6.400000e+04     16.980000   
75%    20000.000000     16.490000    566.560000  9.000000e+04     23.010000   
max    40000.000000     30.990000   1428.700000  5.000000e+06     93.860000   

           open_acc       pub_rec     revol_bal    revol_util     total_acc  \
count  79206.000000  79206.000000  7.920600e+04  79147.000000  79206.000000   
mean      11.340075      0.174734  1.588698e+04     53.938152     25.458867   
std        5.137967      0.504655  2.076770e+04    

In [5]:
# List the dtype pandas assigned to every column
column_dtypes = loan.dtypes
print(column_dtypes)

loan_amnt               float64
term                     object
int_rate                float64
installment             float64
grade                    object
sub_grade                object
emp_title                object
emp_length               object
home_ownership           object
annual_inc              float64
verification_status      object
issue_d                  object
loan_status              object
purpose                  object
title                    object
dti                     float64
earliest_cr_line         object
open_acc                float64
pub_rec                 float64
revol_bal               float64
revol_util              float64
total_acc               float64
initial_list_status      object
application_type         object
mort_acc                float64
pub_rec_bankruptcies    float64
address                  object
dtype: object


In [6]:
# List the distinct labels that appear in the target column
print(loan.loc[:, 'loan_status'].unique())

['Fully Paid' 'Charged Off']


In [7]:
# Tally how many rows carry each target label
status_counts = loan['loan_status'].value_counts()
print(status_counts)

loan_status
Fully Paid     63556
Charged Off    15650
Name: count, dtype: int64


In [8]:
# Count the missing entries in each column
print(loan.isna().sum(axis=0))

loan_amnt                  0
term                       0
int_rate                   0
installment                0
grade                      0
sub_grade                  0
emp_title               4527
emp_length              3664
home_ownership             0
annual_inc                 0
verification_status        0
issue_d                    0
loan_status                0
purpose                    0
title                    354
dti                        0
earliest_cr_line           0
open_acc                   0
pub_rec                    0
revol_bal                  0
revol_util                59
total_acc                  0
initial_list_status        0
application_type           0
mort_acc                7573
pub_rec_bankruptcies      97
address                    0
dtype: int64


In [9]:
# Discard every row that has at least one missing value
loan = loan.dropna(axis=0, how='any')

In [10]:
# Count the rows that are exact repeats of an earlier row
print(loan.duplicated(keep='first').sum())

0


In [11]:
# Walk through every column and print the distinct values it contains
for column in loan.columns:
    unique_values = loan[column].unique()
    # A single print with a newline separator emits the same two lines
    # (plus the trailing blank line) as two consecutive print calls.
    print(f"Column: {column}", f"Unique values: {unique_values}\n", sep="\n")

Column: loan_amnt
Unique values: [ 8000.  9000. 12000. ... 30175. 34600. 27950.]

Column: term
Unique values: [' 36 months' ' 60 months']

Column: int_rate
Unique values: [16.2  14.33 12.49  6.62 10.64 17.27 15.1  14.65  9.99 17.77 19.05 13.53
 22.99 25.8  12.69 12.12 10.16 18.49  9.17 18.75 17.57 15.31 16.29 12.88
  6.24  7.89 13.11 10.99 20.99 15.61 17.14 19.52 11.53 17.86 20.49 14.98
 21.18 11.14  5.32  8.9  13.99 18.25 13.33 16.99  7.9  13.98 11.99 20.31
 12.99 18.85 15.59 19.2  12.39 14.99 14.09 14.31 16.55 10.15 22.4  18.92
 17.76 18.24  7.99  8.38 24.5  17.56 15.8  15.99  8.39 19.99 15.88 17.99
 18.55 13.67 20.75  8.18  6.03 14.16 13.49 24.24 11.67  7.62 20.5  23.83
  8.67 12.35 14.3  11.39  9.25 19.19 21.   22.39 10.74  8.24  8.6  27.34
 17.1  17.97 22.15  9.71 21.98 16.49  6.68 14.47 21.99  9.49 12.29 14.49
 16.78 12.59 11.49  8.19 23.13 24.08  7.49 10.75  6.89 16.59 13.35 26.77
 23.43 12.85 23.99 13.05 10.49 11.44  7.69  6.49  8.49 18.99 11.47  6.92
 11.48 13.66 13.18 11.55 1

In [12]:
# The second entry of the frame's shape is its column count
num_columns = loan.shape[1]
print(f"Number of columns: {num_columns}")

Number of columns: 27
