In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder

In [3]:
data = pd.read_csv('./../../files/csv/train_data.csv')

In [4]:
data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0.0,Graduate,No,5849,0.0,12000,12.0,1.0,Urban,Y
1,LP001003,Male,Yes,1.0,Graduate,No,4583,1508.0,12800,12.0,1.0,Rural,N
2,LP001005,Male,Yes,0.0,Graduate,Yes,3000,0.0,6600,12.0,1.0,Urban,Y
3,LP001006,Male,Yes,0.0,Not Graduate,No,2583,2358.0,12000,12.0,1.0,Urban,Y
4,LP001008,Male,No,0.0,Graduate,No,6000,0.0,14100,12.0,1.0,Urban,Y


In [5]:
data.shape

(614, 13)

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    float64
 4   Education          614 non-null    object 
 5   Self_Employed      612 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         614 non-null    int64  
 9   Loan_Amount_Term   614 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(2), object(7)
memory usage: 62.5+ KB


In [7]:
data.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed         2
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            0
Loan_Amount_Term      0
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [8]:
data.isnull().sum()*100 / len(data)

Loan_ID              0.000000
Gender               2.117264
Married              0.488599
Dependents           2.442997
Education            0.000000
Self_Employed        0.325733
ApplicantIncome      0.000000
CoapplicantIncome    0.000000
LoanAmount           0.000000
Loan_Amount_Term     0.000000
Credit_History       8.143322
Property_Area        0.000000
Loan_Status          0.000000
dtype: float64

In [9]:
data = data.drop('Loan_ID',axis=1)

In [10]:
columns = ['Gender','Dependents','LoanAmount','Loan_Amount_Term']

In [11]:
data = data.dropna(subset=columns)

In [12]:
data.isnull().sum()*100 / len(data)

Gender               0.000000
Married              0.000000
Dependents           0.000000
Education            0.000000
Self_Employed        0.341297
ApplicantIncome      0.000000
CoapplicantIncome    0.000000
LoanAmount           0.000000
Loan_Amount_Term     0.000000
Credit_History       8.361775
Property_Area        0.000000
Loan_Status          0.000000
dtype: float64

In [13]:
data['Self_Employed'].mode()[0]

'No'

In [14]:
data['Self_Employed'] =data['Self_Employed'].fillna(data['Self_Employed'].mode()[0])

In [15]:
data.isnull().sum()*100 / len(data)

Gender               0.000000
Married              0.000000
Dependents           0.000000
Education            0.000000
Self_Employed        0.000000
ApplicantIncome      0.000000
CoapplicantIncome    0.000000
LoanAmount           0.000000
Loan_Amount_Term     0.000000
Credit_History       8.361775
Property_Area        0.000000
Loan_Status          0.000000
dtype: float64

In [16]:
data['Gender'].unique()

array(['Male', 'Female'], dtype=object)

In [17]:
data['Self_Employed'].unique()

array(['No', 'Yes'], dtype=object)

In [18]:

data['Credit_History'].mode()[0]

1.0

In [19]:
data['Credit_History'] =data['Credit_History'].fillna(data['Credit_History'].mode()[0])

In [20]:
data.isnull().sum()*100 / len(data)

Gender               0.0
Married              0.0
Dependents           0.0
Education            0.0
Self_Employed        0.0
ApplicantIncome      0.0
CoapplicantIncome    0.0
LoanAmount           0.0
Loan_Amount_Term     0.0
Credit_History       0.0
Property_Area        0.0
Loan_Status          0.0
dtype: float64

In [21]:
data.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0.0,Graduate,No,5849,0.0,12000,12.0,1.0,Urban,Y
1,Male,Yes,1.0,Graduate,No,4583,1508.0,12800,12.0,1.0,Rural,N
2,Male,Yes,0.0,Graduate,Yes,3000,0.0,6600,12.0,1.0,Urban,Y
3,Male,Yes,0.0,Not Graduate,No,2583,2358.0,12000,12.0,1.0,Urban,Y
4,Male,No,0.0,Graduate,No,6000,0.0,14100,12.0,1.0,Urban,Y


In [22]:
data.shape

(586, 12)

In [23]:
data['Dependents'].unique()

array([0., 1., 2., 3.])

In [24]:
data['Gender'] = data['Gender'].map({'Male':1,'Female':0}).astype('int')
data['Married'] = data['Married'].map({'Yes':1,'No':0}).astype('int')
data['Education'] = data['Education'].map({'Graduate':1,'Not Graduate':0}).astype('int')
data['Self_Employed'] = data['Self_Employed'].map({'Yes':1,'No':0}).astype('int')
data['Property_Area'] = data['Property_Area'].map({'Rural':0,'Semiurban':2,'Urban':1}).astype('int')
data['Loan_Status'] = data['Loan_Status'].map({'Y':1,'N':0}).astype('int')

In [25]:
data.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,1,0,0.0,1,0,5849,0.0,12000,12.0,1.0,1,1
1,1,1,1.0,1,0,4583,1508.0,12800,12.0,1.0,0,0
2,1,1,0.0,1,1,3000,0.0,6600,12.0,1.0,1,1
3,1,1,0.0,0,0,2583,2358.0,12000,12.0,1.0,1,1
4,1,0,0.0,1,0,6000,0.0,14100,12.0,1.0,1,1


In [26]:
y = data['Loan_Status']

In [27]:
y

0      1
1      0
2      1
3      1
4      1
      ..
609    1
610    1
611    1
612    1
613    0
Name: Loan_Status, Length: 586, dtype: int64

In [28]:
X = data.drop('Loan_Status',axis=1)

In [29]:
cols = ['Gender','Married','Dependents','Education','Self_Employed','ApplicantIncome','CoapplicantIncome','LoanAmount','Loan_Amount_Term']

In [30]:
from sklearn.preprocessing import StandardScaler
st = StandardScaler()
X[cols]=st.fit_transform(X[cols])

In [31]:
X

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,0.478029,-1.358183,-0.748542,0.531296,-0.397621,0.093956,-0.551193,-0.280006,0.281452,1.0,1
1,0.478029,0.736278,0.248380,0.531296,-0.397621,-0.122556,-0.043947,-0.183493,0.281452,1.0,0
2,0.478029,0.736278,-0.748542,0.531296,2.514955,-0.393281,-0.551193,-0.931467,0.281452,1.0,1
3,0.478029,0.736278,-0.748542,-1.882189,-0.397621,-0.464597,0.241968,-0.280006,0.281452,1.0,1
4,0.478029,-1.358183,-0.748542,0.531296,-0.397621,0.119780,-0.551193,-0.026660,0.281452,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...
609,-2.091924,-1.358183,-0.748542,0.531296,-0.397621,-0.410383,-0.551193,-0.871146,0.281452,1.0,0
610,0.478029,0.736278,2.242223,0.531296,-0.397621,-0.204132,-0.551193,-1.245133,-2.507682,1.0,0
611,0.478029,0.736278,0.248380,0.531296,-0.397621,0.474135,-0.470464,1.324517,0.281452,1.0,1
612,0.478029,0.736278,1.245301,0.531296,-0.397621,0.390506,-0.551193,0.528288,0.281452,1.0,1


In [32]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
import numpy as np

model_df={}
def model_val(model,X,y):
    X_train,X_test,y_train,y_test=train_test_split(X,y,
                                                   test_size=0.20,
                                                   random_state=42)
    model.fit(X_train,y_train)
    y_pred=model.predict(X_test)
    print(f"{model} accuracy is {accuracy_score(y_test,y_pred)}")

    score = cross_val_score(model,X,y,cv=5)
    print(f"{model} Avg cross val score is {np.mean(score)}")
    model_df[model]=round(np.mean(score)*100,2)

Logistic Regression

In [35]:
from sklearn.linear_model import LogisticRegression
lr_model = LogisticRegression()
model_val(lr_model,X,y)

LogisticRegression() accuracy is 0.7542372881355932
LogisticRegression() Avg cross val score is 0.8054613935969869


Decision Tree

In [37]:
from sklearn.tree import DecisionTreeClassifier
dt_model = DecisionTreeClassifier()
model_val(dt_model,X,y)

DecisionTreeClassifier() accuracy is 0.6949152542372882
DecisionTreeClassifier() Avg cross val score is 0.6859481384905115


Random Forest

In [38]:
from sklearn.ensemble import RandomForestClassifier
rf_model =RandomForestClassifier()
model_val(rf_model,X,y)

RandomForestClassifier() accuracy is 0.7627118644067796
RandomForestClassifier() Avg cross val score is 0.7952339562509054


In [39]:
X = data.drop('Loan_Status',axis=1)
y = data['Loan_Status']

In [40]:
rf_model_optimized = RandomForestClassifier(n_estimators=270,
 min_samples_split=5,
 min_samples_leaf=5,
 max_features='sqrt',
 max_depth=5)
rf_model_optimized.fit(X,y)

In [41]:
import pickle

pickle.dump(rf_model_optimized , open('RF.pkl' , 'wb'))

In [42]:
loaded_model = pickle.load(open('RF.pkl' , 'rb'))

# model = joblib.load('loan_status_predict')
df = pd.DataFrame({
    'Gender':1,
    'Married':1,
    'Dependents':2,
    'Education':0,
    'Self_Employed':0,
    'ApplicantIncome':2889,
    'CoapplicantIncome':0.0,
    'LoanAmount':45,
    'Loan_Amount_Term':180,
    'Credit_History':0,
    'Property_Area':1
},index=[0])

In [43]:
result = loaded_model.predict(df)
if result==1:
    print("Loan Approved")
else:
    print("Loan Not Approved")

Loan Not Approved


In [44]:
from sklearn.preprocessing import LabelEncoder

df = pd.read_csv('./../../files/csv/dataset_2_train.csv')

le_gender = LabelEncoder()
df["Gender"] = le_gender.fit_transform(df['Gender'])

In [45]:
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,1,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,1,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,1,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,1,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,1,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [46]:
df['Gender'].unique()

array([1, 0, 2])

In [53]:
le_gender.inverse_transform([1])

array(['Male'], dtype=object)