In [173]:
# Adding libraries
from sklearn import linear_model  
from sklearn.linear_model import LinearRegression  
import csv          
import requests     
import numpy as np  
import pandas as pd 
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import os
from sklearn import metrics

In [174]:
# Accessing dataset
# The CSV is looked up in a `dataset` folder under the directory the kernel is
# running in. os.path.join picks the right separator for the host OS instead of
# hard-coding '/'.
# NOTE: despite its name, `cwd` holds the full path of the CSV file, not a
# directory; the name is kept so any other cell that reads it keeps working.
cwd = os.path.join(os.getcwd(), 'dataset', 'loan_train.csv')

my_dataset = pd.read_csv(cwd)

# Column dtypes first (plain text), then a rich preview of the first rows.
print(my_dataset.dtypes)
my_dataset.head()


Gender                 object
Married                object
Dependents             object
Education              object
Self_Employed          object
Applicant_Income        int64
Coapplicant_Income    float64
Loan_Amount             int64
Term                  float64
Credit_History        float64
Area                   object
Status                 object
dtype: object


Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,Applicant_Income,Coapplicant_Income,Loan_Amount,Term,Credit_History,Area,Status
0,Male,No,0,Graduate,No,584900,0.0,15000000,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,458300,150800.0,12800000,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,300000,0.0,6600000,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,258300,235800.0,12000000,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,600000,0.0,14100000,360.0,1.0,Urban,Y


In [175]:
# Cleaning rows with NaN
# Drop every row that has at least one missing value, then renumber the
# remaining rows from 0 (drop=True discards the old index instead of keeping it
# as a column). Always starts from `my_dataset`, so the cell can be re-run.
my_dataset_cleaned = my_dataset.dropna(axis=0).reset_index(drop=True)


# Transforming categorical data into int
# Text label -> integer code for each two-valued column.
CATEGORY_CODES = {
    'Gender': {'Male': 1, 'Female': 2},
    'Married': {'Yes': 1, 'No': 0},
    'Education': {'Graduate': 1, 'Not Graduate': 0},
    'Self_Employed': {'Yes': 1, 'No': 0},
    'Status': {'Y': 1, 'N': 0},  # loan approval
}

for column_name, codes in CATEGORY_CODES.items():
    encoded = my_dataset_cleaned[column_name].map(codes)
    # map() leaves NaN wherever a label has no entry in `codes`; report those
    # labels explicitly instead of failing later with an opaque cast error.
    unexpected = my_dataset_cleaned.loc[encoded.isna(), column_name].unique()
    if len(unexpected) > 0:
        raise ValueError(
            f"Unexpected value(s) in column {column_name!r}: {list(unexpected)}"
        )
    my_dataset_cleaned[column_name] = encoded.astype(int)

# Dependents | '3+' is stored as 4; every other value is converted as-is.
my_dataset_cleaned['Dependents'] = (
    my_dataset_cleaned['Dependents'].replace('3+', '4').astype(int)
)



In [171]:
# Preview the first 30 rows of the cleaned, integer-encoded frame.
my_dataset_cleaned.head(n=30)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,Applicant_Income,Coapplicant_Income,Loan_Amount,Term,Credit_History,Area,Status
0,1,0,0,1,0,584900,0.0,15000000,360.0,1.0,Urban,1
1,1,1,1,1,0,458300,150800.0,12800000,360.0,1.0,Rural,0
2,1,1,0,1,1,300000,0.0,6600000,360.0,1.0,Urban,1
3,1,1,0,0,0,258300,235800.0,12000000,360.0,1.0,Urban,1
4,1,0,0,1,0,600000,0.0,14100000,360.0,1.0,Urban,1
5,1,1,2,1,1,541700,419600.0,26700000,360.0,1.0,Urban,1
6,1,1,0,0,0,233300,151600.0,9500000,360.0,1.0,Urban,1
7,1,1,4,1,0,303600,250400.0,15800000,360.0,0.0,Semiurban,0
8,1,1,2,1,0,400600,152600.0,16800000,360.0,1.0,Urban,1
9,1,1,1,1,0,1284100,1096800.0,34900000,360.0,1.0,Semiurban,0


In [176]:
# Reducing dataset, getting only most important attributes
# NOTE(review): Applicant_Income, Term and Area are not in this selection --
# confirm that leaving them out is intentional.
selected_columns = [
    'Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
    'Coapplicant_Income', 'Loan_Amount', 'Credit_History', 'Status',
]
# .copy() makes the subset an independent frame, so later in-place changes to
# `reduced_dataset` do not touch (or warn about) `my_dataset_cleaned`.
reduced_dataset = my_dataset_cleaned[selected_columns].copy()

# Statistical information
print(reduced_dataset.describe())
# info() prints its report itself and returns None; wrapping it in print()
# would append a stray "None" line to the output.
reduced_dataset.info()

reduced_dataset.head(30)

           Gender     Married  Dependents   Education  Self_Employed   
count  499.000000  499.000000  499.000000  499.000000     499.000000  \
mean     1.176353    0.649299    0.865731    0.791583       0.138277   
std      0.381502    0.477668    1.227205    0.406584       0.345536   
min      1.000000    0.000000    0.000000    0.000000       0.000000   
25%      1.000000    0.000000    0.000000    1.000000       0.000000   
50%      1.000000    1.000000    0.000000    1.000000       0.000000   
75%      1.000000    1.000000    2.000000    1.000000       0.000000   
max      2.000000    1.000000    4.000000    1.000000       1.000000   

       Coapplicant_Income   Loan_Amount  Credit_History      Status  
count        4.990000e+02  4.990000e+02      499.000000  499.000000  
mean         1.566996e+05  1.395251e+07        0.851703    0.683367  
std          2.580955e+05  8.345237e+06        0.355750    0.465630  
min          0.000000e+00  0.000000e+00        0.000000    0.000000  
2

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,Coapplicant_Income,Loan_Amount,Credit_History,Status
0,1,0,0,1,0,0.0,15000000,1.0,1
1,1,1,1,1,0,150800.0,12800000,1.0,0
2,1,1,0,1,1,0.0,6600000,1.0,1
3,1,1,0,0,0,235800.0,12000000,1.0,1
4,1,0,0,1,0,0.0,14100000,1.0,1
5,1,1,2,1,1,419600.0,26700000,1.0,1
6,1,1,0,0,0,151600.0,9500000,1.0,1
7,1,1,4,1,0,250400.0,15800000,0.0,0
8,1,1,2,1,0,152600.0,16800000,1.0,1
9,1,1,1,1,0,1096800.0,34900000,1.0,0


In [None]:
# Data normalisation
