In [None]:
# Hides all warnings for the rest of the session. The "ignore" filter is
# unconditional, so deprecation notices raised by the libraries used below
# are silenced as well.
# NOTE(review): consider narrowing this to specific warning categories while
# developing, so genuine problems are not hidden.
import warnings
warnings.filterwarnings("ignore")

# 1. Introduction

## Data Set Problems

The company seeks to automate (in real time) the loan qualifying procedure based on information given by customers while filling out an online application form. It is expected that the development of ML models that can help the company predict loan approval will accelerate the decision-making process for determining whether an applicant is eligible for a loan or not.

## Objectives of Notebook

**This notebook aims to:**
*   Analyze customer data provided in data set (EDA)
*   Build various ML models that can predict loan approval

**The machine learning models used in this project are:** 
1. Logistic Regression
2. K-Nearest Neighbour (KNN)
3. Support Vector Machine (SVM)
4. Naive Bayes
5. Decision Tree
6. Random Forest
7. Gradient Boost

## Data Set Description

There are 13 variables in this data set:
*   8 categorical variables
*   4 continuous variables
*   1 ID variable

The structure of the data set is the following:
<br>
<table style="width:100%">
<thead>
<tr>
<th style="text-align:center; font-weight: bold; font-size:14px">Variable Name</th>
<th style="text-align:center; font-weight: bold; font-size:14px">Description</th>
<th style="text-align:center; font-weight: bold; font-size:14px">Sample Data</th>
</tr>
</thead>
<tbody>
<tr>
<td><b>Loan_ID</b></td>
<td>Loan reference number <br> (unique ID)</td>
<td>LP001002; LP001003; ...</td>
</tr>
<tr>
<td><b>Gender</b></td>
<td>Applicant gender <br> (Male or Female)</td>
<td>Male; Female</td>
</tr>
<tr>
<td><b>Married</b></td>
<td>Applicant marital status <br> (Married or not married)</td>
<td>Yes; No</td>
</tr>
<tr>
<td><b>Dependents</b></td>
<td>Number of family members</td>
<td>0; 1; 2; 3+</td>
</tr>
<tr>
<td><b>Education</b></td>
<td>Applicant education/qualification <br> (graduate or not graduate)</td>
<td>Graduate; Not Graduate</td>
</tr>
<tr>
<td><b>Self_Employed</b></td>
<td>Applicant employment status <br> (yes for self-employed, no for employed/others)</td>
<td>Yes; No</td>
</tr>
<tr>
<td><b>ApplicantIncome</b></td>
<td>Applicant's monthly salary/income</td>
<td>5849; 4583; ...</td>
</tr>
<tr>
<td><b>CoapplicantIncome</b></td>
<td>Additional applicant's monthly salary/income</td>
<td>1508; 2358; ...</td>
</tr>
<tr>
<td><b>LoanAmount</b></td>
<td>Loan amount</td>
<td>128; 66; ...</td>
</tr>
<tr>
<td><b>Loan_Amount_Term</b></td>
<td>The loan's repayment period (in days)</td>
<td>360; 120; ...</td>
</tr>
<tr>
<td><b>Credit_History</b></td>
<td>Records of previous credit history <br> (0: bad credit history, 1: good credit history)</td>
<td>0; 1</td>
</tr>
<tr>
<td><b>Property_Area</b></td>
<td>The location of property <br> (Rural/Semiurban/Urban)</td>
<td>Rural; Semiurban; Urban</td>
</tr>
<tr>
<td><b>Loan_Status</b></td>
<td>Status of loan <br> (Y: accepted, N: not accepted)</td>
<td>Y; N</td>
</tr>
</tbody>
</table>

# 2. Importing Libraries

In [None]:
!pip install missingno imblearn xgboost skl2onnx onnx onnxruntime

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import missingno as mso
import seaborn as sns
import warnings
import os

import scipy
from scipy import stats
from scipy.stats import pearsonr
from scipy.stats import ttest_ind

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import CategoricalNB
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from imblearn.over_sampling import SMOTE

import skl2onnx
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType

import onnx
import onnxruntime as ort

# 3. Reading Data Set

In [None]:
# Load the raw loan applications and preview the first rows.
loan_csv_path = "../dataset/loan_data_set.csv"
df = pd.read_csv(loan_csv_path)
df.head()

In [None]:
# (rows, columns) of the raw frame.
n_observations, n_columns = df.shape
print((n_observations, n_columns))

👉 There are **13 columns** and **614 observations** in the data set.

# 4. Data Exploration
This section will perform data exploration of the raw data set that has been imported.

## 4.1 Categorical Variables

### 4.1.1 Loan ID

In [None]:
# Occurrences of every Loan_ID value; dropna=False also counts missing entries.
df["Loan_ID"].value_counts(dropna=False)

👉 There are 614 unique IDs in the data set.

### 4.1.2 Gender

In [None]:
# Occurrences of every Gender value; dropna=False also counts missing entries.
df["Gender"].value_counts(dropna=False)

In [None]:
# Bar chart of row counts per Gender value (hue is the same column, legend hidden).
sns.countplot(
    data=df,
    x="Gender",
    hue="Gender",
    palette="coolwarm",
    legend=False,
)
plt.show()

In [None]:
# Share of each Gender value, and of missing entries, over all rows.
n_rows = len(df.Gender)
countMale = int((df.Gender == 'Male').sum())
countFemale = int((df.Gender == 'Female').sum())
countNull = int(df.Gender.isnull().sum())

print(f"Percentage of Male applicants: {countMale / n_rows * 100:.2f}%")
print(f"Percentage of Female applicants: {countFemale / n_rows * 100:.2f}%")
print(f"Missing values percentage: {countNull / n_rows * 100:.2f}%")

👉 The number of male applicants is higher compared to female applicants. And we can observe that there are missing values in this column.

### 4.1.3 Married

In [None]:
# Occurrences of every Married value; dropna=False also counts missing entries.
df["Married"].value_counts(dropna=False)

In [None]:
# Bar chart of row counts per Married value (hue is the same column, legend hidden).
sns.countplot(
    data=df,
    x="Married",
    hue="Married",
    palette="Pastel1",
    legend=False,
)
plt.show()

In [None]:
# Share of each Married value, and of missing entries, over all rows.
n_rows = len(df.Married)
countMarried = int((df.Married == 'Yes').sum())
countNotMarried = int((df.Married == 'No').sum())
countNull = int(df.Married.isnull().sum())

print(f"Percentage of Married applicants: {countMarried / n_rows * 100:.2f}%")
print(f"Percentage of Unmarried applicants: {countNotMarried / n_rows * 100:.2f}%")
print(f"Missing values percentage: {countNull / n_rows * 100:.2f}%")

👉 The number of married applicants is higher compared to unmarried applicants. And we can observe that there are missing values in this column.

### 4.1.4 Education

In [None]:
# Occurrences of every Education value; dropna=False also counts missing entries.
df["Education"].value_counts(dropna=False)

In [None]:
# Bar chart of row counts per Education value (hue is the same column, legend hidden).
sns.countplot(
    data=df,
    x="Education",
    hue="Education",
    palette="Pastel2",
    legend=False,
)
plt.show()

In [None]:
# Share of each Education value, and of missing entries, over all rows.
n_rows = len(df.Education)
countGraduate = int((df.Education == 'Graduate').sum())
countNotGraduate = int((df.Education == 'Not Graduate').sum())
countNull = int(df.Education.isnull().sum())

print(f"Percentage of Graduate applicants: {countGraduate / n_rows * 100:.2f}%")
print(f"Percentage of Not Graduate applicants: {countNotGraduate / n_rows * 100:.2f}%")
print(f"Missing values percentage: {countNull / n_rows * 100:.2f}%")

👉 The number of applicants that have graduated is higher than the number of applicants that haven't graduated. There are no missing values in this column.

### 4.1.5 Self Employed

In [None]:
# Occurrences of every Self_Employed value; dropna=False also counts missing entries.
df["Self_Employed"].value_counts(dropna=False)

In [None]:
# Bar chart of row counts per Self_Employed value (hue is the same column, legend hidden).
sns.countplot(
    data=df,
    x="Self_Employed",
    hue="Self_Employed",
    palette="Pastel1",
    legend=False,
)
plt.show()

In [None]:
# Share of each Self_Employed value, and of missing entries, over all rows.
n_rows = len(df.Self_Employed)
countNo = int((df.Self_Employed == 'No').sum())
countYes = int((df.Self_Employed == 'Yes').sum())
countNull = int(df.Self_Employed.isnull().sum())

print(f"Percentage of Not Self-employed applicants: {countNo / n_rows * 100:.2f}%")
print(f"Percentage of Self-employed applicants: {countYes / n_rows * 100:.2f}%")
print(f"Missing values percentage: {countNull / n_rows * 100:.2f}%")

👉 The number of self-employed applicants is lower compared to not self-employed applicants. And we can observe that there are missing values in this column.

### 4.1.6 Credit History

In [None]:
# Occurrences of every Credit_History value; dropna=False also counts missing entries.
df["Credit_History"].value_counts(dropna=False)

In [None]:
# Bar chart of row counts per Credit_History value (hue is the same column, legend hidden).
sns.countplot(
    data=df,
    x="Credit_History",
    hue="Credit_History",
    palette="Pastel1",
    legend=False,
)
plt.show()

In [None]:
# Share of good (1) and bad (0) credit history, and of missing entries, over all rows.
n_rows = len(df.Credit_History)
count1 = int((df.Credit_History == 1).sum())
count0 = int((df.Credit_History == 0).sum())
countNull = int(df.Credit_History.isnull().sum())

print(f"Percentage of Good credit history: {count1 / n_rows * 100:.2f}%")
print(f"Percentage of Bad credit history: {count0 / n_rows * 100:.2f}%")
print(f"Missing values percentage: {countNull / n_rows * 100:.2f}%")

👉 The number of applicants who have good credit history is higher compared to applicants who have bad credit history. And we can observe that there are missing values in this column.

### 4.1.7 Property Area

In [None]:
# Occurrences of every Property_Area value; dropna=False also counts missing entries.
df["Property_Area"].value_counts(dropna=False)

In [None]:
# Bar chart of row counts per Property_Area value (hue is the same column, legend hidden).
sns.countplot(
    data=df,
    x="Property_Area",
    hue="Property_Area",
    palette="Accent",
    legend=False,
)
plt.show()

In [None]:
# Share of each Property_Area value, and of missing entries, over all rows.
n_rows = len(df.Property_Area)
countUrban = int((df.Property_Area == 'Urban').sum())
countRural = int((df.Property_Area == 'Rural').sum())
countSemiurban = int((df.Property_Area == 'Semiurban').sum())
countNull = int(df.Property_Area.isnull().sum())

print(f"Percentage of Urban: {countUrban / n_rows * 100:.2f}%")
print(f"Percentage of Rural: {countRural / n_rows * 100:.2f}%")
print(f"Percentage of Semi-urban: {countSemiurban / n_rows * 100:.2f}%")
print(f"Missing values percentage: {countNull / n_rows * 100:.2f}%")

👉 This column has a balanced distribution between Urban, Rural, and Semi-urban property area. There are no missing values in this column.

### 4.1.8 Loan Status

In [None]:
# Occurrences of every Loan_Status value; dropna=False also counts missing entries.
df["Loan_Status"].value_counts(dropna=False)

In [None]:
# Bar chart of row counts per Loan_Status value (hue is the same column, legend hidden).
sns.countplot(
    data=df,
    x="Loan_Status",
    hue="Loan_Status",
    palette="Pastel2",
    legend=False,
)
plt.show()

In [None]:
# Share of approved (Y) and rejected (N) loans, and of missing entries, over all rows.
n_rows = len(df.Loan_Status)
countY = int((df.Loan_Status == 'Y').sum())
countN = int((df.Loan_Status == 'N').sum())
countNull = int(df.Loan_Status.isnull().sum())

print(f"Percentage of Approved: {countY / n_rows * 100:.2f}%")
print(f"Percentage of Rejected: {countN / n_rows * 100:.2f}%")
print(f"Missing values percentage: {countNull / n_rows * 100:.2f}%")

👉 The number of approved loans is higher than the number of rejected loans. There are no missing values in this column.

### 4.1.9 Loan Amount Term

In [None]:
# Occurrences of every Loan_Amount_Term value; dropna=False also counts missing entries.
df["Loan_Amount_Term"].value_counts(dropna=False)

In [None]:
# Bar chart of row counts per Loan_Amount_Term value (hue is the same column, legend hidden).
sns.countplot(
    data=df,
    x="Loan_Amount_Term",
    hue="Loan_Amount_Term",
    palette="RdPu",
    legend=False,
)
plt.show()

In [None]:
# Share of every loan term, and of missing entries, over all rows.
# The terms are kept as an explicit list so that a term with no occurrences
# still prints a 0.00% line. One loop replaces the former one-variable-per-term
# copies (count12 ... count480), which no other cell in this notebook reads.
loan_terms = [12, 36, 60, 84, 120, 180, 240, 300, 360, 480]
n_rows = len(df.Loan_Amount_Term)

for term in loan_terms:
    term_count = int((df.Loan_Amount_Term == term).sum())
    print("Percentage of {}: {:.2f}%".format(term, term_count / n_rows * 100))

countNull = int(df.Loan_Amount_Term.isnull().sum())
print("Missing values percentage: {:.2f}%".format(countNull / n_rows * 100))

👉 As we can see from the results, the 360-day loan duration is the most popular compared to other durations.

## 4.2 Numerical Variables

### 4.2.1 Describe Numerical Variables

In [None]:
# Summary statistics for the three continuous columns.
continuous_columns = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']
df[continuous_columns].describe()

### 4.2.2 Distribution of Numerical Variable

#### 4.2.2.1 Histogram Distribution

In [None]:
sns.set(style="darkgrid")

# One histogram (with KDE overlay) per continuous column, stacked vertically.
fig, axs = plt.subplots(3, 1, figsize=(20, 15))
hist_specs = [
    ("ApplicantIncome", 'green'),
    ("CoapplicantIncome", 'skyblue'),
    ("LoanAmount", 'orange'),
]
for panel, (feature, colour) in zip(axs, hist_specs):
    sns.histplot(data=df, x=feature, kde=True, ax=panel, color=colour)

#### 4.2.2.2 Violin Plot

In [None]:
sns.set(style="darkgrid")

# One violin plot per continuous column, side by side.
fig, axs1 = plt.subplots(1, 3, figsize=(20, 8))
violin_specs = [
    ("ApplicantIncome", 'green'),
    ("CoapplicantIncome", 'skyblue'),
    ("LoanAmount", 'orange'),
]
for panel, (feature, colour) in zip(axs1, violin_specs):
    sns.violinplot(data=df, y=feature, ax=panel, color=colour)

👉 The distribution of `ApplicantIncome`, `CoapplicantIncome`, and `LoanAmount` are **positively skewed** and have **outliers**.

## 4.3 Other Exploration

### 4.3.1 Heatmap

In [None]:
# Keep the float and int columns only; correlation is undefined for the text columns.
numeric_df = df.select_dtypes(include=[float, int])
correlations = numeric_df.corr()

# Annotated heatmap of the pairwise correlations.
fig, heatmap_ax = plt.subplots(figsize=(10, 7))
sns.heatmap(correlations, annot=True, cmap='RdPu', ax=heatmap_ax)
plt.show()

👉 There is a **positive correlation** between `LoanAmount` and `ApplicantIncome`.

### 4.3.2 Categorical Vs Categorical Variables

In [None]:
# Stacked bars: Married counts within each Gender.
gender_married = pd.crosstab(df.Gender, df.Married)
ax = gender_married.plot(kind="bar", stacked=True, figsize=(5,5), color=['tab:blue','tab:orange'])
ax.set_title('Gender vs Married')
ax.set_xlabel('Gender')
ax.set_ylabel('Frequency')
plt.xticks(rotation=0)
plt.show()

👉 More male applicants are married compared to female applicants (relatively). Also, the number of unmarried male applicants is overall higher compared to unmarried female applicants.

In [None]:
# Stacked bars: Credit_History counts within each Self_Employed value.
employment_credit = pd.crosstab(df.Self_Employed, df.Credit_History)
ax = employment_credit.plot(kind="bar", stacked=True, figsize=(5,5), color=['tab:purple','gold'])
ax.set_title('Self Employed vs Credit History')
ax.set_xlabel('Self Employed')
ax.set_ylabel('Frequency')
ax.legend(["Bad Credit", "Good Credit"])
plt.xticks(rotation=0)
plt.show()

👉 More self-employed applicants have good credit compared to not self-employed applicants (relatively).

In [None]:
# Stacked bars: Loan_Status counts within each Property_Area.
area_status = pd.crosstab(df.Property_Area, df.Loan_Status)
ax = area_status.plot(kind="bar", stacked=True, figsize=(5,5), color=['tab:red','tab:green'])
ax.set_title('Property Area vs Loan Status')
ax.set_xlabel('Property Area')
ax.set_ylabel('Frequency')
plt.xticks(rotation=0)
plt.show()

👉 More loans were accepted for applicants with property in Semiurban areas compared to Urban and Rural areas.

### 4.3.3 Categorical Vs Numerical Variables

In [None]:
# ApplicantIncome distribution split by loan outcome.
sns.violinplot(
    data=df,
    x="Loan_Status",
    y="ApplicantIncome",
    hue="Loan_Status",
    palette="Pastel2",
    legend=False,
);

👉 There are lots of outliers in `ApplicantIncome`, and the distribution is also positively skewed.

In [None]:
# CoapplicantIncome distribution split by loan outcome.
sns.violinplot(
    data=df,
    x="Loan_Status",
    y="CoapplicantIncome",
    hue="Loan_Status",
    palette="Pastel2",
    legend=False,
);

👉 There are lots of outliers in `CoapplicantIncome`, and the distribution is also positively skewed.

In [None]:
# LoanAmount box plot split by loan outcome.
sns.boxplot(
    data=df,
    x="Loan_Status",
    y="LoanAmount",
    hue="Loan_Status",
    palette="Pastel2",
    legend=False,
);

👉 The column `LoanAmount` has a high number of outliers, and its distribution is also positively skewed.

### 4.3.4 Numerical Vs Numerical Variables

In [None]:
# Scatter of applicant vs co-applicant income, followed by their correlation coefficient.
scatter_ax = df.plot(x='ApplicantIncome', y='CoapplicantIncome', style='.', legend=False)
scatter_ax.set_xlabel('ApplicantIncome')
scatter_ax.set_ylabel('CoapplicantIncome')
plt.show()

income_correlation = df['ApplicantIncome'].corr(df['CoapplicantIncome'])
print('Correlation:', income_correlation)

👉 There is a **very slightly negative correlation** between `ApplicantIncome` and `CoapplicantIncome`.

## 4.4 Null Values

In [None]:
# Number of missing values per column.
df.isna().sum()

👉 There are a few null values in the dataset, but they do not appear in all columns.

# 5. Data Preprocessing

## 5.1 Drop Unnecessary Variables

In [43]:
# Loan_ID is only an identifier; errors='ignore' keeps the cell safe to re-run
# once the column is already gone.
df = df.drop(columns=['Loan_ID'], errors='ignore')

## 5.2 Set Variable types

In [44]:
# Target dtype of every column that remains after dropping Loan_ID.
# Casting to "str" turns missing values into the literal text "nan"; the
# following cell maps that text back to None.
col_types = {'Gender':"str",
             'Married':"str",
             'Dependents':"str",
             'Education':"str",
             'Self_Employed':"str",
             'ApplicantIncome':"float",
             'CoapplicantIncome':"float",
             'LoanAmount':"float",
             'Loan_Amount_Term':"float",
             'Credit_History':"bool",
             'Property_Area':"str",
             'Loan_Status':"str"}

# Casting NaN to bool yields True, which would silently turn every missing
# Credit_History into "good credit" and hide it from the imputation step.
# Fill the gaps with the column mode first so the substitution is explicit
# and stays correct even if the most frequent value is 0.
df['Credit_History'] = df['Credit_History'].fillna(df['Credit_History'].mode()[0])

df = df.astype(dtype = col_types)

In [45]:
# The string casts above leave the literal text "nan" where values were missing;
# map it back to None so null checks and imputation see those cells again.
df = df.replace(to_replace="nan", value=None)

## 5.3 Data Imputation
*Imputation* is a technique for substituting an estimated value for missing values in a dataset.

### 5.3.1 Categorical Variables
In this section, the imputation for categorical variables will be performed using the **mode**.

In [46]:
# Columns whose missing entries are replaced by the column's most frequent value.
columns_to_fill = [
    'Gender',
    'Married',
    'Dependents',
    'Self_Employed',
    'Credit_History',
    'Loan_Amount_Term',
]

for column in columns_to_fill:
    most_frequent = df[column].mode()[0]
    df[column] = df[column].fillna(most_frequent)

In [None]:
# Missing values per column after the mode imputation.
df.isna().sum()

### 5.3.2 Numerical Variables
In this section, the imputation for numerical variables will be performed using the **mean**.

In [48]:
# Columns whose missing entries are replaced by the column mean.
columns_to_fill = ['LoanAmount']

for column in columns_to_fill:
    column_mean = df[column].mean()
    df[column] = df[column].fillna(column_mean)

In [None]:
# Missing values per column after the mean imputation.
df.isna().sum()

## 5.4 One-hot Encoding of Categorical Variables
In this section, we will transform categorical variables into binary columns that can be processed by ML algorithms.

In [50]:
# Two-level categories become booleans. `Series.map` is used instead of
# `Series.replace`: swapping strings for booleans in an object column only ends
# up with a bool dtype through pandas' deprecated silent downcasting, whereas
# map plus an explicit cast gives a bool column on every pandas version.
binary_encodings = {
    'Gender': {'Female': False, 'Male': True},
    'Married': {'No': False, 'Yes': True},
    'Education': {'Not Graduate': False, 'Graduate': True},
    'Self_Employed': {'No': False, 'Yes': True},
    'Loan_Status': {'N': False, 'Y': True},
}

for column, encoding in binary_encodings.items():
    if df[column].dtype == bool:
        continue  # already encoded, e.g. the cell was run a second time
    encoded = df[column].map(encoding)
    # map() yields NaN for anything outside the mapping; fail here with a clear
    # message rather than letting a stray value surface later in the pipeline.
    if encoded.isnull().any():
        raise ValueError(f"Unexpected or missing value in '{column}'; cannot encode it as a boolean.")
    df[column] = encoded.astype(bool)

In [51]:
# One-hot encode the multi-level categoricals: each level becomes its own
# indicator column and the source column is removed.
one_hot_columns = ["Dependents", "Property_Area"]
df = pd.get_dummies(df, columns=one_hot_columns)

## 5.5 Remove Outliers for Numerical Variables

In [52]:
# Select only numeric columns except Loan_Amount_Term which is categorical
numeric_df = df[['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']]

# Compute mean and standard deviation of each column
mu = np.mean(numeric_df, axis=0)
sigma = np.std(numeric_df, axis=0)

# A row is an outlier when any of the three values lies more than two standard
# deviations away from its column mean.
is_outlier = ((numeric_df < (mu - 2 * sigma)) | (numeric_df > (mu + 2 * sigma))).any(axis=1)

# .copy() makes the filtered frame independent of the original, so the column
# assignments in later cells do not write to a frame pandas tracks as a slice
# (the source of SettingWithCopyWarning).
df = df[~is_outlier].copy()

In [None]:
sns.set(style="darkgrid")

# Same three histograms as in the exploration section, now without the outlier rows.
fig, axs = plt.subplots(3, 1, figsize=(20, 15))
hist_specs = [
    ("ApplicantIncome", 'green'),
    ("CoapplicantIncome", 'skyblue'),
    ("LoanAmount", 'orange'),
]
for panel, (feature, colour) in zip(axs, hist_specs):
    sns.histplot(data=df, x=feature, kde=True, ax=panel, color=colour)

## 5.6 Skewed Distribution Treatment
We will use **square root transformation** to normalize the distribution.

In [54]:
# Square-root transform the three skewed numeric columns.
# NOTE: not idempotent — running this cell twice applies the root twice.
for skewed_column in ('ApplicantIncome', 'CoapplicantIncome', 'LoanAmount'):
    df.loc[:, skewed_column] = np.sqrt(df[skewed_column])

In [None]:
sns.set(style="darkgrid")

# Same three histograms again, now on the square-root scale.
fig, axs = plt.subplots(3, 1, figsize=(20, 15))
hist_specs = [
    ("ApplicantIncome", 'green'),
    ("CoapplicantIncome", 'skyblue'),
    ("LoanAmount", 'orange'),
]
for panel, (feature, colour) in zip(axs, hist_specs):
    sns.histplot(data=df, x=feature, kde=True, ax=panel, color=colour)

## 5.7 Features Separating
The target variable will be separated from the input features.

In [56]:
# y is the loan outcome; X is every remaining column.
y = df["Loan_Status"]
X = df.drop(columns=["Loan_Status"])

## 5.8 Balance Data Set
In the earlier exploration, we saw that the numbers of approved and rejected loans are imbalanced. In this section, we will use the **SMOTE oversampling technique** to balance the two classes, so that the models are not biased towards the majority class.

In [57]:
# Fixed seed so the synthetic minority samples, and every metric computed after
# this cell, are reproducible across runs.
# NOTE(review): the oversampling happens before the train/test split further
# down, so synthetic points derived from rows that end up in the test set can
# land in the training set. Consider resampling the training fold only.
X, y = SMOTE(random_state=0).fit_resample(X, y)

In [None]:
sns.set_theme(style="darkgrid")

# Class counts of the resampled target.
# NOTE(review): `data=df` is the frame from before resampling while `y` is the
# resampled vector; the argument looks redundant — confirm before removing.
balance_ax = sns.countplot(y=y, data=df, palette="PiYG", hue=y, legend=False)
balance_ax.set_ylabel('Loan Status')
balance_ax.set_xlabel('Total')
plt.show()

## 5.9 Data Normalization Over Data Set

In [59]:
# Learn each feature's min/max on X, then rescale X with them. `scaler` is kept
# because the inference helper at the end of the notebook reuses it.
# NOTE(review): the scaler is fitted before the train/test split, so test rows
# influence the learned ranges.
scaler = MinMaxScaler()
scaler.fit(X)
X = scaler.transform(X)

## 5.10 Splitting Data Set
The data set will be split into **80% train and 20% test**.

In [60]:
# Hold out 20% of the rows for evaluation; the seed fixes the partition.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# 6. Models

## 6.1 Logistic Regression

In [61]:
# Fit logistic regression on the training split, then predict the held-out rows.
LRclassifier = LogisticRegression(
    solver='saga',
    max_iter=500,
    random_state=1,
).fit(X_train, y_train)

y_pred = LRclassifier.predict(X_test)

In [None]:
# Confusion matrix of the test-set predictions, drawn as an annotated heatmap.
test_confusion = confusion_matrix(y_test, y_pred)
sns.heatmap(test_confusion, annot=True, cmap='RdPu');

In [None]:
from sklearn.metrics import accuracy_score

# Per-class precision/recall/F1, followed by overall accuracy as a percentage.
print(classification_report(y_test, y_pred))

accuracy_pct = accuracy_score(y_pred, y_test) * 100
print('Accuracy: {:.2f}%'.format(accuracy_pct))

In [65]:
# Export the fitted logistic regression to ONNX and open it with onnxruntime.
from skl2onnx import to_onnx
# Cast the scaled feature matrix to float32; the same dtype is used for the
# inference input produced by transform() below.
X_cast = X.astype(np.float32)
# Disable zipmap for compatibility
options = {id(LRclassifier): {"zipmap": False}}
# A single sample row is passed alongside the model.
# NOTE(review): presumably the converter infers the input type/shape from it —
# confirm against the skl2onnx `to_onnx` documentation.
onx = to_onnx(LRclassifier, X_cast[:1], options=options, target_opset=19)
# First write: the model exactly as produced by the converter.
with open("model.onnx", "wb") as f:
    f.write(onx.SerializeToString())
# Load model in onnx format and validate it
onnx_model = onnx.load("model.onnx")
onnx.checker.check_model(onnx_model)
# Max IR_VERSION is 9
# NOTE(review): presumably the limit of the installed onnxruntime — confirm.
# The model is re-saved after this change without being re-checked.
onnx_model.ir_version = 9
onnx.save_model(onnx_model, "model.onnx")
# Open an inference session on the saved file and record the tensor names
# needed to feed it (`output_name` is not used elsewhere in this notebook).
ort_session = ort.InferenceSession("model.onnx")
input_name = ort_session.get_inputs()[0].name
output_name = ort_session.get_outputs()[0].name

In [67]:
# Example application payload, shaped like the JSON the service would receive.
# Keys follow the raw CSV column names; there is no Loan_Status because that is
# what the model predicts.
data = dict(
    Loan_ID="LP001002",
    Gender="Male",
    Married="No",
    Dependents="0",
    Education="Graduate",
    Self_Employed="No",
    ApplicantIncome=5849,
    CoapplicantIncome=0.0,
    LoanAmount=120,
    Loan_Amount_Term=360.0,
    Credit_History=1.0,
    Property_Area="Urban",
)

In [68]:
def transform(data):
    """Turn one raw application record into the scaled feature row the model expects.

    Mirrors the training-time preprocessing of this notebook: drop ``Loan_ID``,
    cast columns with ``col_types``, boolean-encode the two-level categories,
    one-hot encode ``Dependents`` and ``Property_Area``, square-root the skewed
    numeric columns and finally apply the fitted ``scaler``.

    Parameters
    ----------
    data : dict
        Maps raw column names (as in the CSV, without ``Loan_Status``) to the
        scalar values of a single applicant.

    Returns
    -------
    numpy.ndarray
        ``float32`` array holding one row of scaled features, suitable as input
        for the ONNX inference session.

    Notes
    -----
    Missing values are not imputed here; callers must send complete records.
    Relies on the notebook-level ``col_types`` and ``scaler`` objects.
    """
    # Named `row` so the notebook-level `df` is not shadowed.
    row = pd.DataFrame.from_dict(data, orient='index').transpose()

    # Drop Unnecessary Variables
    if 'Loan_ID' in row.columns:
        row = row.drop(['Loan_ID'], axis=1)

    # Set Variable types
    row = row.astype(dtype = {key: value for key, value in col_types.items() if key != 'Loan_Status'})
    row = row.replace("nan", None)

    # Boolean encoding of the two-level categories
    row['Gender'] = row['Gender'].replace({'Female': False, 'Male': True})
    row['Married'] = row['Married'].replace({'No': False, 'Yes': True})
    row['Education'] = row['Education'].replace({'Not Graduate': False, 'Graduate': True})
    row['Self_Employed'] = row['Self_Employed'].replace({'No': False, 'Yes': True})

    # One-hot encoding, using the same column names that get_dummies produced
    # on the training frame.
    for level in ('0', '1', '2', '3+'):
        row['Dependents_' + level] = row['Dependents'] == level
    row = row.drop('Dependents', axis=1)
    for area in ('Rural', 'Semiurban', 'Urban'):
        row['Property_Area_' + area] = row['Property_Area'] == area
    row = row.drop('Property_Area', axis=1)

    # Skewed Distribution Treatment: training square-roots these columns before
    # the scaler is fitted, so the scaler's ranges are on the square-root scale.
    # Without this step raw incomes would be scaled far outside the trained range.
    for column in ('ApplicantIncome', 'CoapplicantIncome', 'LoanAmount'):
        row[column] = np.sqrt(row[column])

    # Put the columns in the order the scaler was fitted with (when it recorded
    # one), so the result does not depend on the key order of the incoming dict.
    expected_columns = getattr(scaler, 'feature_names_in_', None)
    if expected_columns is not None:
        row = row[list(expected_columns)]

    # Data Normalization Over Data Set
    scaled = scaler.transform(row)

    return scaled.astype(np.float32)

In [None]:
# Preprocess the sample payload and score it with the ONNX session.
candidat_test = transform(data)
onnx_feed = {input_name: candidat_test}
outputs = ort_session.run(None, onnx_feed)

# The first output of the session is taken as the predictions.
predictions = outputs[0]
print(f"Predictions: {predictions}")