# Importing packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
%matplotlib inline

**Reading train data**

In [2]:
df=pd.read_csv('../input/loan-prediction-problem-dataset/train_u6lujuX_CVtuZ9i.csv')

In [3]:
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


# To check null values

In [5]:
df.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [6]:
df.fillna(df.mean(),inplace=True)

In [7]:
df.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            0
Loan_Amount_Term      0
Credit_History        0
Property_Area         0
Loan_Status           0
dtype: int64

**dropping rows with null object**

In [8]:
df.dropna(how='any',inplace=True)

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 554 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            554 non-null    object 
 1   Gender             554 non-null    object 
 2   Married            554 non-null    object 
 3   Dependents         554 non-null    object 
 4   Education          554 non-null    object 
 5   Self_Employed      554 non-null    object 
 6   ApplicantIncome    554 non-null    int64  
 7   CoapplicantIncome  554 non-null    float64
 8   LoanAmount         554 non-null    float64
 9   Loan_Amount_Term   554 non-null    float64
 10  Credit_History     554 non-null    float64
 11  Property_Area      554 non-null    object 
 12  Loan_Status        554 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 60.6+ KB


In [10]:
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,146.412162,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


**Dropping ID**

In [11]:
df=df.drop(['Loan_ID'],axis=1)

# *creating dummy variables for categorial features*

In [12]:
a=df.select_dtypes('object').columns[:-1]

leaving the last column cause as it is bcoz its our dependent variable we cannot encode it

In [13]:
a

Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'Property_Area'],
      dtype='object')

**these will be encoded**

In [14]:
df1=pd.DataFrame()

In [15]:
for i in a:
    df2=pd.get_dummies(df[i],drop_first=True)
    df1=pd.concat([df2,df1],axis=1)
    df=df.drop(i,axis=1)

In [16]:
df=pd.concat([df1,df],axis=1)

In [17]:
df.head()

Unnamed: 0,Semiurban,Urban,Yes,Not Graduate,1,2,3+,Yes.1,Male,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status
0,0,1,0,0,0,0,0,0,1,5849,0.0,146.412162,360.0,1.0,Y
1,0,0,0,0,1,0,0,1,1,4583,1508.0,128.0,360.0,1.0,N
2,0,1,1,0,0,0,0,1,1,3000,0.0,66.0,360.0,1.0,Y
3,0,1,0,1,0,0,0,1,1,2583,2358.0,120.0,360.0,1.0,Y
4,0,1,0,0,0,0,0,0,1,6000,0.0,141.0,360.0,1.0,Y


In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 554 entries, 0 to 613
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Semiurban          554 non-null    uint8  
 1   Urban              554 non-null    uint8  
 2   Yes                554 non-null    uint8  
 3   Not Graduate       554 non-null    uint8  
 4   1                  554 non-null    uint8  
 5   2                  554 non-null    uint8  
 6   3+                 554 non-null    uint8  
 7   Yes                554 non-null    uint8  
 8   Male               554 non-null    uint8  
 9   ApplicantIncome    554 non-null    int64  
 10  CoapplicantIncome  554 non-null    float64
 11  LoanAmount         554 non-null    float64
 12  Loan_Amount_Term   554 non-null    float64
 13  Credit_History     554 non-null    float64
 14  Loan_Status        554 non-null    object 
dtypes: float64(4), int64(1), object(1), uint8(9)
memory usage: 35.2+ KB


# *CREATING TRAIN TEST SPLIT*

In [19]:
x=df.iloc[:,:-1]
y=df.iloc[:,-1]

In [20]:
x.head()

Unnamed: 0,Semiurban,Urban,Yes,Not Graduate,1,2,3+,Yes.1,Male,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
0,0,1,0,0,0,0,0,0,1,5849,0.0,146.412162,360.0,1.0
1,0,0,0,0,1,0,0,1,1,4583,1508.0,128.0,360.0,1.0
2,0,1,1,0,0,0,0,1,1,3000,0.0,66.0,360.0,1.0
3,0,1,0,1,0,0,0,1,1,2583,2358.0,120.0,360.0,1.0
4,0,1,0,0,0,0,0,0,1,6000,0.0,141.0,360.0,1.0


In [21]:
y.head()

0    Y
1    N
2    Y
3    Y
4    Y
Name: Loan_Status, dtype: object

In [22]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25,random_state=0)

In [23]:
model=LogisticRegression()

In [24]:
model.fit(x_train,y_train)

LogisticRegression()

In [25]:
y_pred=model.predict(x_test)

In [26]:
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

[[14 24]
 [ 2 99]]
0.8129496402877698
