In [1]:
#pip install -r requirements.txt

In [20]:
import torch
from torch import nn
import torch.nn.functional as F
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import requests
# Import visualization library
import matplotlib.pyplot as plt


torch.__version__

'2.1.2+cpu'

In [3]:
# Check to see if we have a GPU to use for training
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('A {} device was detected.'.format(device))


if device=='cuda':
    print (torch.cuda.get_device_name(device=device))

A cpu device was detected.


In [4]:
# Use Pandas to do our dataprocessing on the dataset
api_url= "https://debt-api-4301881a2ff8.herokuapp.com/loan/get_all"
res = requests.get(api_url)
df = pd.DataFrame(res.json()["data"])
test_data = pd.read_csv('test.csv')

In [5]:
df.head()

Unnamed: 0,loan_id,gender,married,dependents,education,self_employed,applicantIncome,coapplicant_income,loan_amount,loan_amount_term,credit_history,property_area,loan_status
0,LP001002,Male,No,0,Graduate,No,5849,0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0,141.0,360.0,1.0,Urban,Y


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   loan_id             614 non-null    object 
 1   gender              601 non-null    object 
 2   married             611 non-null    object 
 3   dependents          599 non-null    object 
 4   education           614 non-null    object 
 5   self_employed       582 non-null    object 
 6   applicantIncome     614 non-null    int64  
 7   coapplicant_income  614 non-null    object 
 8   loan_amount         592 non-null    float64
 9   loan_amount_term    600 non-null    float64
 10  credit_history      564 non-null    float64
 11  property_area       614 non-null    object 
 12  loan_status         614 non-null    object 
dtypes: float64(3), int64(1), object(9)
memory usage: 62.5+ KB


In [7]:
df['coapplicant_income'] = df['coapplicant_income'].astype(float)
print(df['coapplicant_income'].dtype)
df['applicantIncome'] = df['applicantIncome'].astype(float)
print(df['coapplicant_income'].dtype)

float64
float64


In [8]:
df.dtypes

loan_id                object
gender                 object
married                object
dependents             object
education              object
self_employed          object
applicantIncome       float64
coapplicant_income    float64
loan_amount           float64
loan_amount_term      float64
credit_history        float64
property_area          object
loan_status            object
dtype: object

In [None]:
plt.figure(figsize=(8, 8))
explode_values = [0.1] * len(df.dtypes.unique())
df.dtypes.value_counts().plot.pie(explode=explode_values, autopct='%1.1f%%')

plt.title('Data Type %')
plt.show()


In [10]:
df.isnull().sum()

loan_id                0
gender                13
married                3
dependents            15
education              0
self_employed         32
applicantIncome        0
coapplicant_income     0
loan_amount           22
loan_amount_term      14
credit_history        50
property_area          0
loan_status            0
dtype: int64

In [11]:
df['gender'] = df['gender'].fillna(df['gender'].mode()[0])
df['married'] = df['married'].fillna(df['married'].mode()[0])
df['dependents'] = df['dependents'].fillna(df['dependents'].mode()[0])
df['self_employed'] = df['self_employed'].fillna(df['self_employed'].mode()[0])

In [12]:
df['loan_amount'] = df['loan_amount'].fillna(df['loan_amount'].mean())
df['loan_amount_term'] = df['loan_amount_term'].fillna(df['loan_amount_term'].mean())
df['credit_history'] = df['credit_history'].fillna(df['credit_history'].mean())

In [13]:
features = ['gender', 'married', 'dependents', 'education', 'self_employed',
            'applicantIncome', 'coapplicant_income', 'loan_amount', 'loan_amount_term',
            'credit_history', 'property_area']

X = df[features]
y = df['loan_status']

In [14]:
X = pd.get_dummies(X)
X.head()

Unnamed: 0,applicantIncome,coapplicant_income,loan_amount,loan_amount_term,credit_history,gender_Female,gender_Male,married_No,married_Yes,dependents_0,dependents_1,dependents_2,dependents_3+,education_Graduate,education_Not Graduate,self_employed_No,self_employed_Yes,property_area_Rural,property_area_Semiurban,property_area_Urban
0,5849.0,0.0,146.412162,360.0,1.0,False,True,True,False,True,False,False,False,True,False,True,False,False,False,True
1,4583.0,1508.0,128.0,360.0,1.0,False,True,False,True,False,True,False,False,True,False,True,False,True,False,False
2,3000.0,0.0,66.0,360.0,1.0,False,True,False,True,True,False,False,False,True,False,False,True,False,False,True
3,2583.0,2358.0,120.0,360.0,1.0,False,True,False,True,True,False,False,False,False,True,True,False,False,False,True
4,6000.0,0.0,141.0,360.0,1.0,False,True,True,False,True,False,False,False,True,False,True,False,False,False,True


In [15]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
X_train.head()

Unnamed: 0,applicantIncome,coapplicant_income,loan_amount,loan_amount_term,credit_history,gender_Female,gender_Male,married_No,married_Yes,dependents_0,dependents_1,dependents_2,dependents_3+,education_Graduate,education_Not Graduate,self_employed_No,self_employed_Yes,property_area_Rural,property_area_Semiurban,property_area_Urban
83,6000.0,2250.0,265.0,360.0,0.842199,False,True,False,True,True,False,False,False,True,False,True,False,False,True,False
90,2958.0,2900.0,131.0,360.0,1.0,False,True,False,True,True,False,False,False,True,False,True,False,False,True,False
227,6250.0,1695.0,210.0,360.0,1.0,False,True,False,True,False,False,True,False,True,False,True,False,False,True,False
482,2083.0,3150.0,128.0,360.0,1.0,False,True,False,True,True,False,False,False,True,False,True,False,False,True,False
464,4166.0,0.0,98.0,360.0,0.0,False,True,True,False,True,False,False,False,True,False,True,False,False,True,False


In [17]:
y_train.head()

83     N
90     Y
227    Y
482    Y
464    N
Name: loan_status, dtype: object

In [18]:
X_train.shape, X_val.shape,y_train.shape, y_val.shape

((491, 20), (123, 20), (491,), (123,))

In [21]:
param_grid_nb = {
    'var_smoothing': np.logspace(0,-9, num=100)
}

In [24]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
naive_bayes_model = GridSearchCV(estimator=GaussianNB(), param_grid=param_grid_nb, verbose=1, cv=10, n_jobs=-1)
naive_bayes_model.fit(X_train, y_train)
print(naive_bayes_model.best_estimator_)

GaussianNB(priors=None, var_smoothing=1.0)

Fitting 10 folds for each of 100 candidates, totalling 1000 fits


GaussianNB(var_smoothing=1.519911082952933e-09)


In [25]:
y_pred = naive_bayes_model.predict(X_val)
print(y_pred)

['Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'N' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y'
 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'N'
 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'N' 'N' 'N' 'Y' 'Y' 'N' 'Y'
 'Y' 'Y' 'Y' 'Y' 'N' 'Y' 'N' 'Y' 'N' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'N' 'Y'
 'Y' 'Y' 'N' 'Y' 'Y' 'N' 'N' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'N' 'Y' 'Y'
 'Y' 'N' 'N' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'N' 'Y' 'N'
 'Y' 'Y' 'Y' 'N' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'Y' 'N']


In [29]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_val, y_pred), ": is the confusion matrix")
from sklearn.metrics import accuracy_score
print(accuracy_score(y_val, y_pred), ": is the accuracy score")
from sklearn.metrics import precision_score
#print(precision_score(y_val, y_pred), ": is the precision score")
from sklearn.metrics import recall_score
#print(recall_score(y_val, y_pred), ": is the recall score")
from sklearn.metrics import f1_score
#print(f1_score(y_val, y_pred), ": is the f1 score")

[[18 25]
 [ 2 78]] : is the confusion matrix
0.7804878048780488 : is the accuracy score


In [30]:
import pickle
#with open('naive_bayes_model.pkl', 'wb') as file:
 #   pickle.dump(naive_bayes_model, file)

In [31]:
with open('naive_bayes_model.pkl', 'rb') as file:
    naive_bayes_model = pickle.load(file)

In [32]:
for index, row in df.iterrows():
    if pd.isnull(row['gender']):
        row['gender'] = 'Male'
    if pd.isnull(row['married']):
        row['married'] = 'No'
    if pd.isnull(row['dependents']):
        row['dependents'] = 0
    if pd.isnull(row['education']):
        row['education'] = 'Not Graduate'
    if pd.isnull(row['self_employed']):
        row['self_employed'] = 'No'
    if pd.isnull(row['applicantIncome']):
        row['applicantIncome'] = 0
    if pd.isnull(row['coapplicant_income']):
        row['coapplicant_income'] = 0
    if pd.isnull(row['loan_amount']):
        row['loan_amount'] = 0
    if pd.isnull(row['loan_amount_term']):
        row['loan_amount_term'] = 0
    if pd.isnull(row['credit_history']):
        row['credit_history'] = 0
    if pd.isnull(row['property_area']):
        row['property_area'] = 'Rural'

    input_data_df = pd.DataFrame([row])
    input_data_df = pd.get_dummies(input_data_df, columns=['gender', 'married', 'dependents', 'education', 'self_employed', 'property_area'])

    input_data_df = input_data_df.reindex(columns=naive_bayes_model.feature_names_in_, fill_value=0)
    pred = naive_bayes_model.predict(input_data_df)[0]
    pred_boolean = pred
    #pred_label = 'Y' if pred_boolean else 'N'

    print(f'{pred} {row["loan_status"]}')

Y Y
Y N
Y Y
Y Y
Y Y
Y Y
Y Y
N N
Y Y
N N
Y Y
Y Y
N Y
Y N
Y Y
Y Y
Y Y
N N
Y N
Y Y
N N
Y Y
N N
N N
Y N
Y Y
Y Y
Y Y
Y N
Y Y
Y N
Y N
Y N
Y Y
Y N
Y Y
N N
Y Y
N Y
Y Y
Y N
Y Y
Y Y
Y Y
Y Y
Y Y
Y Y
Y Y
N N
Y Y
Y Y
Y Y
Y N
Y N
N N
Y Y
Y Y
Y N
Y Y
Y Y
Y Y
Y Y
N N
N N
N N
Y N
N N
Y Y
N Y
N N
Y Y
Y Y
Y Y
N N
Y Y
Y N
Y N
Y N
N N
Y Y
Y Y
Y Y
Y N
Y N
Y Y
Y Y
Y Y
Y Y
Y Y
Y Y
Y Y
Y Y
Y Y
Y Y
Y Y
Y N
Y Y
Y Y
Y Y
Y Y
Y Y
Y Y
Y Y
Y Y
Y Y
Y Y
Y Y
Y N
N N
Y Y
Y Y
Y Y
N N
Y Y
Y Y
Y Y
Y Y
Y Y
Y N
Y Y
Y Y
Y Y
N Y
Y Y
Y Y
Y Y
N Y
Y Y
N N
Y N
N Y
Y Y
Y Y
Y Y
Y Y
Y N
Y N
Y Y
N N
Y N
Y N
Y Y
Y Y
Y Y
Y Y
Y Y
Y Y
Y Y
Y N
Y Y
N N
Y Y
Y N
Y N
Y Y
N Y
Y Y
Y Y
Y Y
Y Y
Y Y
Y N
N N
Y Y
Y Y
Y Y
Y N
Y Y
N N
Y Y
Y Y
N Y
Y N
Y Y
Y N
Y Y
Y Y
N N
Y Y
N N
N N
Y N
Y Y
N N
Y Y
N Y
N N
Y Y
Y Y
Y Y
Y Y
Y N
Y N
Y Y
Y Y
Y N
Y Y
Y Y
Y Y
Y N
Y Y
N Y
Y N
Y Y
Y Y
Y Y
Y Y
Y Y
Y Y
Y N
Y N
N N
Y Y
Y Y
Y Y
Y Y
Y N
Y Y
N N
Y Y
N N
Y Y
Y Y
Y Y
Y Y
Y N
Y N
Y Y
Y Y
Y Y
Y Y
Y Y
Y Y
Y Y
Y Y
Y Y
Y N
Y Y
Y Y
Y Y
Y Y
Y N
N Y
Y Y
Y Y
Y N
Y Y
Y Y
Y Y
Y Y
