# Loan Prediction Problem

In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

df= pd.read_csv('train_u6lujuX_CVtuZ9i.csv')

## Data Cleaning and Preprocesing

In [24]:
df.head(20)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
5,LP001011,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y
6,LP001013,Male,Yes,0,Not Graduate,No,2333,1516.0,95.0,360.0,1.0,Urban,Y
7,LP001014,Male,Yes,3+,Graduate,No,3036,2504.0,158.0,360.0,0.0,Semiurban,N
8,LP001018,Male,Yes,2,Graduate,No,4006,1526.0,168.0,360.0,1.0,Urban,Y
9,LP001020,Male,Yes,1,Graduate,No,12841,10968.0,349.0,360.0,1.0,Semiurban,N


In [25]:
df.shape

(614, 13)

In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            614 non-null    object 
 1   Gender             601 non-null    object 
 2   Married            611 non-null    object 
 3   Dependents         599 non-null    object 
 4   Education          614 non-null    object 
 5   Self_Employed      582 non-null    object 
 6   ApplicantIncome    614 non-null    int64  
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object 
 12  Loan_Status        614 non-null    object 
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB


### Checking for missing values

In [27]:
# looking if there is some missing values
df.isnull().sum()

Unnamed: 0,0
Loan_ID,0
Gender,13
Married,3
Dependents,15
Education,0
Self_Employed,32
ApplicantIncome,0
CoapplicantIncome,0
LoanAmount,22
Loan_Amount_Term,14


In [28]:
# cheching the missing value percentaje
df.isnull().sum()/df.shape[0]*100

Unnamed: 0,0
Loan_ID,0.0
Gender,2.117264
Married,0.488599
Dependents,2.442997
Education,0.0
Self_Employed,5.211726
ApplicantIncome,0.0
CoapplicantIncome,0.0
LoanAmount,3.583062
Loan_Amount_Term,2.28013


#### Filling missing values using mode imputation

In [29]:
for i in df.columns:
    if df[i].isnull().sum()>0:
        df[i].fillna(df[i].mode()[0], inplace=True)

df.isnull().sum()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[i].fillna(df[i].mode()[0], inplace=True)


Unnamed: 0,0
Loan_ID,0
Gender,0
Married,0
Dependents,0
Education,0
Self_Employed,0
ApplicantIncome,0
CoapplicantIncome,0
LoanAmount,0
Loan_Amount_Term,0


### Checking for outliers

### Perfmorming data spliting and Label Encoding

In [31]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

#performing label encoding only in categorical features
le = LabelEncoder()
for i in df.columns:
    if df[i].dtype == 'object':
        df[i] = le.fit_transform(df[i])

# spliting the data
x = df.drop(['Loan_Status'], axis=1)
y = df['Loan_Status']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

### Performing Starnar Scaling

In [35]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

x_train = pd.DataFrame(x_train, columns=x.columns)
x_test = pd.DataFrame(x_test, columns=x.columns)

In [36]:
x_train.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,-1.287819,0.483393,0.757442,-0.723275,-0.534173,-0.397516,0.083915,0.180667,1.357619,0.287611,0.407763,-0.046951
1,-1.248124,0.483393,0.757442,-0.723275,-0.534173,-0.397516,-0.429338,0.38543,-0.194424,0.287611,0.407763,-0.046951
2,-0.471236,0.483393,0.757442,1.260681,-0.534173,-0.397516,0.126095,0.005831,0.720586,0.287611,0.407763,-0.046951
3,0.974798,0.483393,0.757442,-0.723275,-0.534173,-0.397516,-0.57697,0.464185,-0.229171,0.287611,0.407763,-0.046951
4,0.872725,0.483393,-1.320234,-0.723275,-0.534173,-0.397516,-0.225522,-0.528127,-0.576644,0.287611,-2.452404,-0.046951
