In [1]:
# Dependencies and Setup
import os
import pandas as pd
import datetime
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import requests
import json

# API keys, if necessary
#from secret import gkey

In [2]:
# Load the training data set (train.csv) from the Resources directory
csv_path = "Resources/train.csv"
train_df = pd.read_csv(filepath_or_buffer=csv_path)

In [3]:
# Load the test data set (test.csv) from the Resources directory
csv_path = "Resources/test.csv"
test_df = pd.read_csv(filepath_or_buffer=csv_path)

## Train data cleaning

In [4]:
# Preview the first five rows of the raw training data
train_df.head(n=5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [5]:
# Count the missing values in each column
train_df.isna().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [6]:
# Remove every row that has a missing value in any column;
# keeping rows with null values might lead to inaccurate results.
train_df = train_df.dropna(axis=0, how="any")

In [7]:
# Re-count missing values per column to confirm none remain after the drop
train_df.isna().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [8]:
# Inspect the dtype of each column before any type conversion
train_df.dtypes

Loan_ID               object
Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

In [9]:
# Cast the float-typed numeric columns to integers for further analysis.
# Note: astype(int) discards any fractional part instead of rounding.
train_df = train_df.astype({
    'CoapplicantIncome': int,
    'LoanAmount': int,
    'Loan_Amount_Term': int,
    'Credit_History': int,
})

In [10]:
# Rescale the monetary columns: LoanAmount x 1000, both income columns x 12.
# presumably LoanAmount is recorded in thousands and the incomes are monthly,
# so these become full amounts and yearly figures -- TODO confirm against the data dictionary
# Each column is overwritten in place, so re-running this cell scales the values again.
train_df['LoanAmount'] = train_df['LoanAmount'] * 1000
train_df['ApplicantIncome'] = train_df['ApplicantIncome'] * 12
train_df['CoapplicantIncome'] = train_df['CoapplicantIncome'] * 12

In [11]:
# Preview the first five rows of the cleaned training data
train_df.head(n=5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
1,LP001003,Male,Yes,1,Graduate,No,54996,18096,128000,360,1,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,36000,0,66000,360,1,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,30996,28296,120000,360,1,Urban,Y
4,LP001008,Male,No,0,Graduate,No,72000,0,141000,360,1,Urban,Y
5,LP001011,Male,Yes,2,Graduate,Yes,65004,50352,267000,360,1,Urban,Y


## Test data cleaning

In [12]:
# Preview the first five rows of the raw test data
test_df.head(n=5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban


In [13]:
# Count the missing values in each column
test_df.isna().sum()

Loan_ID               0
Gender               11
Married               0
Dependents           10
Education             0
Self_Employed        23
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            5
Loan_Amount_Term      6
Credit_History       29
Property_Area         0
dtype: int64

In [14]:
# Remove every row that has a missing value in any column;
# keeping rows with null values might lead to inaccurate results.
test_df = test_df.dropna(axis=0, how="any")

In [15]:
# Re-count missing values per column to confirm none remain after the drop
test_df.isna().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
dtype: int64

In [16]:
# Inspect the dtype of each column before any type conversion
test_df.dtypes

Loan_ID               object
Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome      int64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
dtype: object

In [17]:
# Cast the numeric columns to integers for further analysis.
# CoapplicantIncome is already listed as an integer column in the dtypes output;
# it is still cast here so the same four columns are converted as for the training data.
# Note: astype(int) discards any fractional part instead of rounding.
test_df = test_df.astype({
    'CoapplicantIncome': int,
    'LoanAmount': int,
    'Loan_Amount_Term': int,
    'Credit_History': int,
})

In [18]:
# Rescale the monetary columns: LoanAmount x 1000, both income columns x 12.
# presumably LoanAmount is recorded in thousands and the incomes are monthly,
# so these become full amounts and yearly figures -- TODO confirm against the data dictionary
# Each column is overwritten in place, so re-running this cell scales the values again.
test_df['LoanAmount'] = test_df['LoanAmount'] * 1000
test_df['ApplicantIncome'] = test_df['ApplicantIncome'] * 12
test_df['CoapplicantIncome'] = test_df['CoapplicantIncome'] * 12

In [19]:
# Preview the first five rows of the cleaned test data
test_df.head(n=5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,68640,0,110000,360,1,Urban
1,LP001022,Male,Yes,1,Graduate,No,36912,18000,126000,360,1,Urban
2,LP001031,Male,Yes,2,Graduate,No,60000,21600,208000,360,1,Urban
4,LP001051,Male,No,0,Not Graduate,No,39312,0,78000,360,1,Urban
5,LP001054,Male,Yes,0,Not Graduate,Yes,25980,41064,152000,360,1,Urban


In [20]:
# Save the cleaned data sets as CSV files (header row included, index column omitted).
# to_csv does not create missing parent folders, so make sure the output
# directory exists first; exist_ok=True makes this a no-op when it already does.
os.makedirs('static/data', exist_ok=True)
train_df.to_csv('static/data/train_cleaned.csv', index=False, header=True)
test_df.to_csv('static/data/test_cleaned.csv', index=False, header=True)