In [None]:
# Install third-party packages for this notebook (streamlit is imported below as `slt`).
!pip install streamlit
# NOTE(review): the visible code imports the standard-library `pickle`, never `pickle5`;
# this install looks unnecessary -- confirm before keeping it.
!pip install pickle5

In [None]:
#import required libraries.
import pickle
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
import streamlit as slt
from imblearn import over_sampling
from scipy import stats as st
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score,precision_score,accuracy_score,f1_score,roc_curve,roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler    
from sklearn.tree import DecisionTreeClassifier


In [None]:
# Report accuracy, precision, recall and F1 score as indicators of a model's performance
# and draw the confusion matrix.
def conf_metrics_plot_cal(Y_test,Y_pred,cnf_matrix):
    """Print classification metrics and plot a confusion-matrix heatmap.

    Parameters
    ----------
    Y_test : array-like
        True labels of the evaluated samples.
    Y_pred : array-like
        Labels predicted by the model for the same samples.
    cnf_matrix : array-like
        Confusion matrix to draw. The tick labels are fixed to '-1' and '1',
        so a 2x2 matrix for those two classes is expected.

    Returns
    -------
    None
        The metrics are printed and the heatmap is drawn on a new figure.
    """
    # accuracy = (TP + TN) / (TP + FP + FN + TN)
    accuracy = accuracy_score(Y_test,Y_pred)

    # recall = TP / (TP + FN)
    recall = recall_score(Y_test,Y_pred)

    # precision = TP / (TP + FP)
    precision = precision_score(Y_test,Y_pred)

    # F1 = 2 * precision * recall / (precision + recall)
    f1 = f1_score(Y_test,Y_pred)

    print(f'''accuracy is :{accuracy} \n
          \n precision is :{precision}
          \n recall is :{recall}
          \nF1_Score is : {f1}''')

    # Draw on an explicit axes object instead of relying on pyplot's "current axes".
    fig, ax = plt.subplots()
    sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap="YlGnBu", fmt='g',
                xticklabels=['-1','1'], yticklabels=['-1','1'], ax=ax)
    ax.set_title('Confusion matrix', y=1.1)
    ax.set_ylabel('Actual label')
    ax.set_xlabel('Predicted label')

    # Lay out the figure last so the title and axis labels are taken into account.
    fig.tight_layout()

 

In [None]:
# Load the loan dataset from its CSV file into a pandas DataFrame.
csv_path = "/content/drive/MyDrive/Colab Notebooks/loan_data_set.csv"
data = pd.read_csv(csv_path)


In [None]:
# Encode the binary categorical labels numerically: the first group becomes 1,
# the second group becomes -1. The replacement is applied to the whole DataFrame.
positive_labels = ["Male","Yes","Graduate","Y"]
negative_labels = ["Female","No","Not Graduate","N"]
for labels, code in ((positive_labels, 1), (negative_labels, -1)):
    data.replace(labels, code, inplace=True)

In [None]:
# Replace null values in the LoanAmount column with the mean of that column.
# The result is assigned back to the DataFrame: calling fillna(inplace=True) on a
# column selection is a chained operation that may act on a temporary copy and
# leave `data` unchanged (pandas copy-on-write behaviour).
# Series.mean() skips nulls, so the mean is taken over the known amounts only.
data["LoanAmount"] = data["LoanAmount"].fillna(data["LoanAmount"].mean())


In [None]:
# Remove the Loan_ID column from the DataFrame.
# `columns=` is passed by keyword: recent pandas versions accept only the labels
# argument positionally, so drop("Loan_ID", "columns") raises a TypeError there.
data.drop(columns="Loan_ID", inplace=True)

In [None]:
# Convert the Dependents column to floats: strings are stripped of every non-digit
# character first, integers are cast directly, anything else (e.g. nulls) is kept.
def _dependents_to_float(value):
    """Return `value` as a float when it is a digit-bearing string or an integer.

    Strings have all non-digit characters removed before conversion (so a value
    such as "3+" would become 3.0 -- presumably the reason for the regex; confirm
    against the dataset). Booleans and all other types are returned unchanged.
    """
    if isinstance(value, str):
        return float(re.sub('[^0-9]',"",value))
    if isinstance(value, (int, np.integer)) and not isinstance(value, bool):
        return float(value)
    return value

# Series.map replaces the original row-by-row loop, which relied on
# Series.iteritems() (removed in pandas 2.0) and on chained assignment
# (data["Dependents"][i] = ...), which may write to a temporary copy.
data["Dependents"] = data["Dependents"].map(_dependents_to_float)

In [None]:
# Replace every null value with the mode (most frequent value) of its column,
# for all columns except LoanAmount.
for column in data.columns[data.columns != "LoanAmount"]:
    # Series.mode ignores nulls (dropna=True) and returns the modes sorted, so the
    # first entry is the smallest of the most frequent values. It also works for
    # non-numeric columns, which scipy.stats.mode does not reliably support.
    column_modes = data[column].mode(dropna=True)
    if column_modes.empty:
        # Column holds only nulls: there is no mode to fill with.
        continue
    # Assign back rather than fillna(inplace=True) on a column selection, which
    # may act on a temporary copy and leave `data` unchanged.
    data[column] = data[column].fillna(column_modes.iloc[0])

In [None]:
# Separate the DataFrame into model inputs (every column except Loan_Status)
# and the target (the Loan_Status column).
feature_columns = [name for name in data.columns if name != "Loan_Status"]
X = data[feature_columns]
Y = data["Loan_Status"]


In [None]:
# Pie chart of the share of accepted (label 1) and refused (label -1) loan applications.
ones = int((Y == 1).sum())
zeros = int((Y == -1).sum())  # count of the -1 ("refused") class
fig = plt.figure(figsize=(10, 7))
plt.pie(
    [ones, zeros],
    labels=["accepted", "refused"],
    autopct='%1.1f%%',
)
plt.show()

In [None]:
# Balance the two classes (accepted / refused) with SMOTE oversampling, then plot a
# pie chart of the class distribution after balancing.
# random_state is fixed so the synthetic samples are the same on every run.
# NOTE(review): oversampling is applied to the full dataset before the train/test
# split, so synthetic samples derived from test rows can end up in the training
# set -- consider resampling the training split only.
a = over_sampling.SMOTE(random_state=40)
X, Y = a.fit_resample(X, Y)
ones = int((Y == 1).sum())
zeros = int((Y == -1).sum())  # count of the -1 ("refused") class
fig = plt.figure(figsize=(10, 7))
plt.pie([ones, zeros], labels=["accepted", "refused"], autopct='%1.1f%%')
plt.show()

In [None]:
# Separate the data into 80% for training (X_train, Y_train) and 20% for testing
# (X_test, Y_test). train_test_split is imported explicitly rather than reached
# through the `sklearn.model_selection` attribute, which exists only if some other
# import happened to load that submodule.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=40)

# Feature-scale the inputs: the scaler is fitted on the training inputs only and
# then applied, unchanged, to the test inputs.
st_x = StandardScaler()
X_train = st_x.fit_transform(X_train)
X_test = st_x.transform(X_test)

In [None]:
# Train a logistic regression classifier on the scaled training data.
logistic_regression = LogisticRegression(solver='liblinear')
logistic_regression.fit(X_train,Y_train)

# Save the fitted model with pickle (note: the file name ends in .joblib, but the
# content is a plain pickle). The `with` block closes the file handle after writing.
filename1 = "Completed_llogistic_regression_model.joblib"
with open(filename1, 'wb') as model_file:
    pickle.dump(logistic_regression, model_file)

# Predict the outcomes of the input values in the test group.
Y_pred1 = logistic_regression.predict(X_test)

# Compute and plot the confusion matrix from the predicted and the real outcomes.
cnf_matrix_1 = metrics.confusion_matrix(Y_test, Y_pred1)
conf_metrics_plot_cal(Y_test,Y_pred1,cnf_matrix_1)

# Plot the ROC (Receiver Operating Characteristic) curve and compute the AUC
# (Area Under Curve). Both use the predicted probability of the positive class
# instead of the hard labels, which would reduce the curve to a single threshold.
# predict_proba columns follow the sorted classes_, so column 1 is label 1.
Y_score1 = logistic_regression.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(Y_test, Y_score1)
auc = roc_auc_score(Y_test, Y_score1)
fig2 = plt.figure("Figure 2")
plt.plot(fpr, tpr, marker='.', label=f'AUC : {auc}')
plt.ylabel('TruePositiveRate')
plt.xlabel('FalsePositiveRate')
plt.legend()


In [None]:
# Train a decision tree classifier on the scaled training data.
# random_state is fixed so the fitted tree is the same on every run.
tree = DecisionTreeClassifier(random_state=40)
tree.fit(X_train,Y_train)

# Save the fitted decision tree with pickle (the file name ends in .joblib, but the
# content is a plain pickle). The original cell dumped `logistic_regression` here,
# so the saved file did not contain the tree. The `with` block closes the file.
filename2 = "Completed_tree_model.joblib"
with open(filename2, 'wb') as model_file:
    pickle.dump(tree, model_file)

# Predict the outcomes of the input values in the test group.
y_pred2 = tree.predict(X_test)

# Compute and plot the confusion matrix from the predicted and the real outcomes.
cnf_matrix_2 = metrics.confusion_matrix(Y_test, y_pred2)
conf_metrics_plot_cal(Y_test,y_pred2,cnf_matrix_2)

# Plot the ROC (Receiver Operating Characteristic) curve and compute the AUC
# (Area Under Curve) from the predicted probability of the positive class.
# predict_proba columns follow the sorted classes_, so column 1 is label 1.
y_score2 = tree.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(Y_test, y_score2)
auc = roc_auc_score(Y_test, y_score2)
fig2 = plt.figure("Figure 2")
plt.plot(fpr, tpr, marker='.', label=f'AUC : {auc}')
plt.ylabel('TruePositiveRate')
plt.xlabel('FalsePositiveRate')
plt.legend()

In [None]:
# Train a Gaussian Naive Bayes classifier on the scaled training data.
GNB = GaussianNB()
GNB.fit(X_train, Y_train)

# Save the fitted Naive Bayes model with pickle (the file name ends in .joblib, but
# the content is a plain pickle). The original cell dumped `logistic_regression`
# here, so the saved file did not contain this model. The `with` block closes the file.
filename3 = "Completed_Bayesian_model.joblib"
with open(filename3, 'wb') as model_file:
    pickle.dump(GNB, model_file)

# Predict the outcomes of the input values in the test group.
y_pred3 = GNB.predict(X_test)

# Compute and plot the confusion matrix from the predicted and the real outcomes.
cnf_matrix_3 = metrics.confusion_matrix(Y_test, y_pred3)
conf_metrics_plot_cal(Y_test,y_pred3,cnf_matrix_3)

# Plot the ROC (Receiver Operating Characteristic) curve and compute the AUC
# (Area Under Curve) from the predicted probability of the positive class.
# predict_proba columns follow the sorted classes_, so column 1 is label 1.
y_score3 = GNB.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(Y_test, y_score3)
auc = roc_auc_score(Y_test, y_score3)
fig2 = plt.figure("Figure 2")
plt.plot(fpr, tpr, marker='.', label=f'AUC : {auc}')
plt.ylabel('TruePositiveRate')
plt.xlabel('FalsePositiveRate')
plt.legend()