In [None]:
## import libraries
import numpy as np
import pandas as pd
import seaborn as sns
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas deprecation / chained-assignment warnings.
warnings.simplefilter('ignore')
# Load the loan records from the CSV file into a DataFrame.
loanInfo = pd.read_csv("loan.csv")
# Preview the first rows; n=5 is also what head() uses by default.
loanInfo.head(n=5)

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the
# numeric columns of the freshly loaded frame.
loanInfo.describe()

In [None]:
# Print a concise summary of the DataFrame: index dtype, column dtypes,
# non-null counts per column and memory usage.
loanInfo.info()

In [None]:
# (number of rows, number of columns) of the raw data.
loanInfo.shape

In [None]:
# Count the missing values in each column.
loanInfo.isna().sum()

In [None]:
# The null counts show a lot of missing data, so the first clean-up step is to
# discard the columns that are completely empty.
#   axis='columns' -> examine columns rather than rows
#   how='all'      -> drop a column only when every value in it is NA
#   inplace=True   -> modify loanInfo itself instead of returning a new frame
loanInfo.dropna(how='all', axis='columns', inplace=True)
loanInfo.head()

In [None]:
# Re-check the column summary after the all-null columns were dropped.
loanInfo.info()

In [None]:

# Remove a first batch of columns that are not used in this analysis.
# NOTE(review): the selection criterion is not recorded here — presumably these
# columns carry a single constant value; confirm before relying on that.
first_batch_to_drop = [
    'collections_12_mths_ex_med',
    'policy_code',
    'application_type',
    'acc_now_delinq',
    'delinq_amnt',
    'tax_liens',
]
loanInfo.drop(columns=first_batch_to_drop, inplace=True)
loanInfo.head()

In [None]:
# Show the column summary as it stands, then remove pub_rec_bankruptcies
# and preview the result.
loanInfo.info()
loanInfo.drop(columns=['pub_rec_bankruptcies'], inplace=True)
loanInfo.head()

In [None]:
# Show the column summary as it stands, then remove the two listed columns
# and preview the result.
loanInfo.info()
loanInfo.drop(columns=['pymnt_plan', 'initial_list_status'], inplace=True)
loanInfo.head()

In [None]:
loanInfo.info()
# Identifier, free-text and location style columns are of no use when looking
# for the factors that drive loan default before a loan is approved, so they
# are removed from the frame.
identifier_like_cols = [
    "id",
    "member_id",
    "url",
    "title",
    "emp_title",
    "zip_code",
    "last_credit_pull_d",
    "addr_state",
]
loanInfo.drop(columns=identifier_like_cols, inplace=True)
loanInfo.head()

In [None]:
# Column summary after the identifier / free-text columns were removed.
loanInfo.info()

In [None]:
# "desc" is free text that is not useful at present, so it is removed.
# "funded_amnt" vs "funded_amnt_inv": only the amount actually funded is needed,
# so funded_amnt is removed.
# "out_prncp_inv" and "total_pymnt_inv" are not required for the loan-default
# analysis, so they are removed as well.
loanInfo.drop(columns=["desc", "funded_amnt", "out_prncp_inv", "total_pymnt_inv"], inplace=True)
# info() prints its summary as a side effect, so it runs first; head() is kept
# as the final expression because only a cell's last expression is rendered.
loanInfo.info()
loanInfo.head()

In [None]:
# These columns are not required for the pre-approval analysis, so they are dropped.
not_needed_pre_approval = [
    "delinq_2yrs",
    "revol_bal",
    "out_prncp",
    "total_pymnt",
    "total_rec_prncp",
    "total_rec_int",
    "total_rec_late_fee",
    "recoveries",
    "mths_since_last_delinq",
    "mths_since_last_record",
    "collection_recovery_fee",
    "last_pymnt_d",
    "last_pymnt_amnt",
    "next_pymnt_d",
    "chargeoff_within_12_mths",
]
loanInfo.drop(columns=not_needed_pre_approval, inplace=True)
# info() prints its summary as a side effect, so it runs first; head() is kept
# as the final expression because only a cell's last expression is rendered.
loanInfo.info()
loanInfo.head()

In [None]:
# Share of each loan status as a percentage of all loans. It is printed
# explicitly: only the last expression of a cell is rendered, so a bare
# expression on this line would be computed and then discarded.
print((loanInfo.loan_status.value_counts() * 100) / len(loanInfo))
# Distinct loan_status labels present in the data.
loanInfo.loan_status.unique()


In [None]:
# Exclude loans that are still ongoing ("Current"): their repayment is in
# progress, so they cannot tell us whether the loan ends up defaulting.
# .copy() makes loanInfo own its data, so the later in-place clean-up steps
# (fillna, dropna, column reassignment) operate on an independent frame rather
# than on a filtered selection of the previous one.
loanInfo = loanInfo[loanInfo.loan_status != "Current"].copy()
loanInfo.loan_status.unique()

In [None]:
# Percentage of missing values in every column.
loanInfo.isna().sum().div(len(loanInfo)).mul(100)

In [None]:
# Report the most frequent emp_length value, then show the full frequency table.
most_common_emp_length = loanInfo["emp_length"].mode()[0]
print(f"Mode : {most_common_emp_length}")
loanInfo["emp_length"].value_counts()

In [None]:
# The mode of emp_length is far more frequent than the runner-up value, so the
# missing entries are imputed with the mode.
# The filled column is assigned back explicitly: calling fillna(inplace=True)
# on the column accessor is chained assignment, which under pandas
# Copy-on-Write updates only a temporary object and leaves loanInfo unchanged.
emp_length_mode = loanInfo["emp_length"].mode()[0]
loanInfo["emp_length"] = loanInfo["emp_length"].fillna(emp_length_mode)
# Should now be 0.
loanInfo["emp_length"].isna().sum()

In [None]:
# Percentage of missing values per column after emp_length was imputed.
loanInfo.isna().sum().div(len(loanInfo)).mul(100)

In [None]:
# revol_util (39667 non-null, object dtype) holds continuous values, so rows
# where it is missing are removed.
loanInfo.dropna(subset=['revol_util'], axis='index', inplace=True)
loanInfo['revol_util'].isna().sum()

In [None]:
# Percentage of missing values per column after the revol_util rows were dropped.
loanInfo.isna().sum().div(len(loanInfo)).mul(100)

In [None]:
def _text_before_percent(text):
    """Return the part of ``text`` that precedes the first '%' character."""
    return text.split('%')[0]


def _emp_length_token(text):
    """Reduce an emp_length label to the value handed to pd.to_numeric.

    Labels containing "<" map to 0, labels containing "+" keep the text before
    the "+", and any other label keeps its first whitespace-separated token.
    """
    if "<" in text:
        return 0
    if "+" in text:
        return text.split('+')[0]
    return text.split()[0]


# Strip the percent sign from the two percentage columns and make them numeric.
for pct_col in ("revol_util", "int_rate"):
    loanInfo[pct_col] = pd.to_numeric(loanInfo[pct_col].apply(_text_before_percent))

# Turn the employment-length labels into plain numbers.
loanInfo["emp_length"] = pd.to_numeric(loanInfo["emp_length"].apply(_emp_length_token))
loanInfo.head()

In [None]:
# Percentage share of each loan status among all remaining loans.
loanInfo["loan_status"].value_counts().mul(100).div(len(loanInfo))

In [None]:
# Percentage share of each loan purpose among all remaining loans.
loanInfo["purpose"].value_counts().mul(100).div(len(loanInfo))

Univariate Analysis -

In [None]:
# Summary statistics of the requested loan amount.
loanInfo.loan_amnt.describe()

In [None]:
# Box plot of annual income to show its spread and any extreme values.
sns.boxplot(x=loanInfo["annual_inc"])

In [None]:
# Median and upper-tail quantiles of annual income.
income_probabilities = [0.5, 0.75, 0.90, 0.95, 0.97, 0.98, 0.99]
quantile_info = loanInfo["annual_inc"].quantile(income_probabilities)
quantile_info

In [None]:
# Keep only the rows whose annual income is at or below the 95th percentile,
# then redraw the box plot on the trimmed data.
# NOTE(review): loanInfo is overwritten, so re-running this cell trims again.
per_95_annual_inc = loanInfo.annual_inc.quantile(0.95)
income_within_cap = loanInfo["annual_inc"] <= per_95_annual_inc
loanInfo = loanInfo.loc[income_within_cap]
sns.boxplot(x=loanInfo["annual_inc"])