In [3]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

file_name = "credit_risk_dataset.csv"  
df = pd.read_csv(file_name)
if df is None:
    print("Failed to load the DataFrame")
df.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32581 entries, 0 to 32580
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   person_age                  32581 non-null  int64  
 1   person_income               32581 non-null  int64  
 2   person_home_ownership       32581 non-null  object 
 3   person_emp_length           31686 non-null  float64
 4   loan_intent                 32581 non-null  object 
 5   loan_grade                  32581 non-null  object 
 6   loan_amnt                   32581 non-null  int64  
 7   loan_int_rate               29465 non-null  float64
 8   loan_status                 32581 non-null  int64  
 9   loan_percent_income         32581 non-null  float64
 10  cb_person_default_on_file   32581 non-null  object 
 11  cb_person_cred_hist_length  32581 non-null  int64  
dtypes: float64(3), int64(5), object(4)
memory usage: 3.0+ MB


In [5]:
df.shape


(32581, 12)

In [6]:
df.columns


Index(['person_age', 'person_income', 'person_home_ownership',
       'person_emp_length', 'loan_intent', 'loan_grade', 'loan_amnt',
       'loan_int_rate', 'loan_status', 'loan_percent_income',
       'cb_person_default_on_file', 'cb_person_cred_hist_length'],
      dtype='object')

In [7]:
df['loan_status'].value_counts()

loan_status
0    25473
1     7108
Name: count, dtype: int64

In [8]:
df.isna().sum()

person_age                       0
person_income                    0
person_home_ownership            0
person_emp_length              895
loan_intent                      0
loan_grade                       0
loan_amnt                        0
loan_int_rate                 3116
loan_status                      0
loan_percent_income              0
cb_person_default_on_file        0
cb_person_cred_hist_length       0
dtype: int64

In [9]:
df = df.dropna(axis=0, subset=['loan_int_rate'])

In [10]:
df = df.dropna(axis = 0, subset = ['person_emp_length'])

In [11]:
df.isna().sum()

person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     0
loan_int_rate                 0
loan_status                   0
loan_percent_income           0
cb_person_default_on_file     0
cb_person_cred_hist_length    0
dtype: int64

In [12]:
df.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


In [13]:
df['loan_status'].value_counts()

loan_status
0    22435
1     6203
Name: count, dtype: int64

In [14]:
df.loan_status.value_counts(normalize=True).mul(100).round(1).astype(str) + '%' 

loan_status
0    78.3%
1    21.7%
Name: proportion, dtype: object

In [15]:
df.describe()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_cred_hist_length
count,28638.0,28638.0,28638.0,28638.0,28638.0,28638.0,28638.0,28638.0
mean,27.727216,66649.37,4.788672,9656.493121,11.039867,0.2166,0.169488,5.793736
std,6.310441,62356.45,4.154627,6329.683361,3.229372,0.411935,0.106393,4.038483
min,20.0,4000.0,0.0,500.0,5.42,0.0,0.0,2.0
25%,23.0,39480.0,2.0,5000.0,7.9,0.0,0.09,3.0
50%,26.0,55956.0,4.0,8000.0,10.99,0.0,0.15,4.0
75%,30.0,80000.0,7.0,12500.0,13.48,0.0,0.23,8.0
max,144.0,6000000.0,123.0,35000.0,23.22,1.0,0.83,30.0


In [16]:
df.duplicated().sum()

np.int64(137)

In [17]:
df.drop_duplicates(inplace=True)

In [18]:
df.duplicated().sum()

np.int64(0)

In [19]:
df[df['person_age']>70].count()

person_age                    10
person_income                 10
person_home_ownership         10
person_emp_length             10
loan_intent                   10
loan_grade                    10
loan_amnt                     10
loan_int_rate                 10
loan_status                   10
loan_percent_income           10
cb_person_default_on_file     10
cb_person_cred_hist_length    10
dtype: int64

In [20]:
df = df[df['person_age'] <= 70]

In [21]:
df = df[df['person_emp_length']<100]

In [22]:
df[df['loan_int_rate']>21.85].count()

person_age                    6
person_income                 6
person_home_ownership         6
person_emp_length             6
loan_intent                   6
loan_grade                    6
loan_amnt                     6
loan_int_rate                 6
loan_status                   6
loan_percent_income           6
cb_person_default_on_file     6
cb_person_cred_hist_length    6
dtype: int64

In [23]:
df = df[df['loan_int_rate'] <= 21.85]

In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 28483 entries, 1 to 32580
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   person_age                  28483 non-null  int64  
 1   person_income               28483 non-null  int64  
 2   person_home_ownership       28483 non-null  object 
 3   person_emp_length           28483 non-null  float64
 4   loan_intent                 28483 non-null  object 
 5   loan_grade                  28483 non-null  object 
 6   loan_amnt                   28483 non-null  int64  
 7   loan_int_rate               28483 non-null  float64
 8   loan_status                 28483 non-null  int64  
 9   loan_percent_income         28483 non-null  float64
 10  cb_person_default_on_file   28483 non-null  object 
 11  cb_person_cred_hist_length  28483 non-null  int64  
dtypes: float64(3), int64(5), object(4)
memory usage: 2.8+ MB


In [25]:
home_owner = pd.get_dummies(df['person_home_ownership'], dtype=int, drop_first=True)
loan_intent = pd.get_dummies(df['loan_intent'],dtype=int, drop_first=True)
loan_grade = pd.get_dummies(df['loan_grade'],dtype=int, drop_first=True)
def_on_file = pd.get_dummies(df['cb_person_default_on_file'],dtype=int, drop_first=True)

df = pd.concat([df, home_owner, loan_intent, loan_grade, def_on_file], axis = 1)
df.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,...,MEDICAL,PERSONAL,VENTURE,B,C,D,E,F,G,Y
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,...,0,0,0,1,0,0,0,0,0,0
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,...,1,0,0,0,1,0,0,0,0,0
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,...,1,0,0,0,1,0,0,0,0,0
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,...,1,0,0,0,1,0,0,0,0,1
5,21,9900,OWN,2.0,VENTURE,A,2500,7.14,1,0.25,...,0,0,1,0,0,0,0,0,0,0


In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 28483 entries, 1 to 32580
Data columns (total 27 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   person_age                  28483 non-null  int64  
 1   person_income               28483 non-null  int64  
 2   person_home_ownership       28483 non-null  object 
 3   person_emp_length           28483 non-null  float64
 4   loan_intent                 28483 non-null  object 
 5   loan_grade                  28483 non-null  object 
 6   loan_amnt                   28483 non-null  int64  
 7   loan_int_rate               28483 non-null  float64
 8   loan_status                 28483 non-null  int64  
 9   loan_percent_income         28483 non-null  float64
 10  cb_person_default_on_file   28483 non-null  object 
 11  cb_person_cred_hist_length  28483 non-null  int64  
 12  OTHER                       28483 non-null  int64  
 13  OWN                         28483 no

In [27]:
columns_to_drop = df.select_dtypes(include=['object']).columns
df = df.drop(columns=columns_to_drop)

In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 28483 entries, 1 to 32580
Data columns (total 23 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   person_age                  28483 non-null  int64  
 1   person_income               28483 non-null  int64  
 2   person_emp_length           28483 non-null  float64
 3   loan_amnt                   28483 non-null  int64  
 4   loan_int_rate               28483 non-null  float64
 5   loan_status                 28483 non-null  int64  
 6   loan_percent_income         28483 non-null  float64
 7   cb_person_cred_hist_length  28483 non-null  int64  
 8   OTHER                       28483 non-null  int64  
 9   OWN                         28483 non-null  int64  
 10  RENT                        28483 non-null  int64  
 11  EDUCATION                   28483 non-null  int64  
 12  HOMEIMPROVEMENT             28483 non-null  int64  
 13  MEDICAL                     28483 no

In [29]:
df.head()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_cred_hist_length,OTHER,OWN,...,MEDICAL,PERSONAL,VENTURE,B,C,D,E,F,G,Y
1,21,9600,5.0,1000,11.14,0,0.1,2,0,1,...,0,0,0,1,0,0,0,0,0,0
2,25,9600,1.0,5500,12.87,1,0.57,3,0,0,...,1,0,0,0,1,0,0,0,0,0
3,23,65500,4.0,35000,15.23,1,0.53,2,0,0,...,1,0,0,0,1,0,0,0,0,0
4,24,54400,8.0,35000,14.27,1,0.55,4,0,0,...,1,0,0,0,1,0,0,0,0,1
5,21,9900,2.0,2500,7.14,1,0.25,2,0,1,...,0,0,1,0,0,0,0,0,0,0


In [30]:

# x = df[['person_age', 'person_income',
#        'person_emp_length', 'loan_amnt',
#        'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length', 'OTHER',
#        'OWN', 'RENT', 'EDUCATION', 'HOMEIMPROVEMENT', 'MEDICAL', 'PERSONAL',
#        'VENTURE', 'B', 'C', 'D', 'E', 'F', 'G', 'Y']]
# y = df['loan_status']

y = df.loan_status   
list = ['loan_status']                      
x = df.drop(list,axis = 1 )
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.max_colwidth', None) 
x.head()


Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,OTHER,OWN,RENT,EDUCATION,HOMEIMPROVEMENT,MEDICAL,PERSONAL,VENTURE,B,C,D,E,F,G,Y
1,21,9600,5.0,1000,11.14,0.1,2,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0
2,25,9600,1.0,5500,12.87,0.57,3,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0
3,23,65500,4.0,35000,15.23,0.53,2,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0
4,24,54400,8.0,35000,14.27,0.55,4,0,0,1,0,0,1,0,0,0,1,0,0,0,0,1
5,21,9900,2.0,2500,7.14,0.25,2,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0


In [34]:
from imblearn.over_sampling import RandomOverSampler, SMOTE
oversample = SMOTE()

x, y = oversample.fit_resample(x, y)

In [35]:
y.value_counts()

loan_status
0    22301
1    22301
Name: count, dtype: int64

In [36]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

X_train,X_test,Y_train,Y_test = train_test_split(x,y,test_size=0.3,random_state=0)

scaler = StandardScaler()
X_train_scale = scaler.fit_transform(X_train)
X_test_scale = scaler.transform(X_test)

In [37]:
regressor = LinearRegression()
regressor.fit(X_train_scale,Y_train)

In [38]:
len(X_test)

13381

In [39]:
X_train.shape


(31221, 22)

In [40]:
len(X_train)

31221

In [41]:
Y_pred = regressor.predict(X_test_scale)

In [42]:
from sklearn.metrics import mean_absolute_error

mae_lr = mean_absolute_error(Y_test, Y_pred)
print("Mean Absolute Error:", mae_lr)

Mean Absolute Error: 0.25793907977402136


In [43]:
from sklearn.metrics import mean_squared_error
mse_lr = mean_squared_error(Y_test, Y_pred)  
print("Mean Square Error:",mse_lr)

Mean Square Error: 0.10832018105910078


In [44]:
from sklearn.metrics import r2_score
r2_lr = r2_score(Y_test, Y_pred)
print("R-Squared:", r2_lr)

R-Squared: 0.5666915688880867


In [45]:
print("Linear Regression Results:")
print("Mean Absolute Error (LR):", mae_lr)
print("Mean Squared Error (LR):", mse_lr)
print("R-Squared (LR):", r2_lr)

Linear Regression Results:
Mean Absolute Error (LR): 0.25793907977402136
Mean Squared Error (LR): 0.10832018105910078
R-Squared (LR): 0.5666915688880867


In [46]:
from sklearn.metrics import accuracy_score, classification_report
Y_pred_binary = [1 if pred >= 0.5 else 0 for pred in Y_pred]
Accuracy_lr =  accuracy_score(Y_test, Y_pred_binary)
print("Accuracy_LR:",Accuracy_lr)
print(classification_report(Y_test, Y_pred_binary, zero_division=0))

Accuracy_LR: 0.8649577759509752
              precision    recall  f1-score   support

           0       0.84      0.90      0.87      6637
           1       0.89      0.83      0.86      6744

    accuracy                           0.86     13381
   macro avg       0.87      0.87      0.86     13381
weighted avg       0.87      0.86      0.86     13381



In [47]:
trial_data = np.array([[21,9600,5.0,1000,11.14,0.10,2,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0]])

In [48]:
print("Prediction for Trial Data:", regressor.predict(trial_data)[0])

Prediction for Trial Data: -9.09584209663752


In [417]:
def get_user_input(column_names):
    input_data = []
    for column_name in column_names:
        input_value = float(input(f"Enter the value for {column_name}: "))
        input_data.append(input_value)
    # return pd.DataFrame([input_data], columns=column_names)
    return np.asarray(input_data).reshape(1, -1)
    
column_names = df.columns[:-1]  

user_input = get_user_input(column_names)
# 21,9600,5.0,1000,11.14,0.10,2,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0


Enter the value for person_age:  21
Enter the value for person_income:  9600
Enter the value for person_emp_length:  5.0
Enter the value for loan_amnt:  1000
Enter the value for loan_int_rate:  11.14
Enter the value for loan_status:  0.10
Enter the value for loan_percent_income:  2
Enter the value for cb_person_cred_hist_length:  0
Enter the value for OTHER:  1
Enter the value for OWN:  0
Enter the value for RENT:  1
Enter the value for EDUCATION:  0
Enter the value for HOMEIMPROVEMENT:  0
Enter the value for MEDICAL:  0
Enter the value for PERSONAL:  0
Enter the value for VENTURE:  1
Enter the value for B:  0
Enter the value for C:  0
Enter the value for D:  0
Enter the value for E:  0
Enter the value for F:  0
Enter the value for G:  0


In [418]:
prediction = regressor.predict(user_input)
threshold = 0.5
predicted_class = 1 if prediction[0] >= threshold else 0  

if predicted_class == 1:
    print("predicted outcome:",predicted_class, "," , " the loan is accepted. ")
else:
    print("predicted outcome:" ,predicted_class , "," , " the loan isn't accepted. ")
    


predicted outcome: 0 ,  the loan isn't accepted. 
