In [63]:
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
import re

In [42]:
# Let long text cells (up to 200 characters) render without truncation.
pd.set_option('display.max_colwidth', 200)

### Load and inspect data

In [149]:
# Read the data dictionary (one row per documented column) and the loan records.
column_desc_df = pd.read_excel('Data_Dictionary.xlsx')
# low_memory=False makes pandas read the file in one pass instead of in
# chunks, so each column gets a single consistently inferred dtype.
loan_data_df = pd.read_csv('loan.csv', low_memory=False)

# Summarise row count, column count, dtypes and memory usage.
loan_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39717 entries, 0 to 39716
Columns: 111 entries, id to total_il_high_credit_limit
dtypes: float64(74), int64(13), object(24)
memory usage: 33.6+ MB


### Data Clean up

#### Delete empty and mostly empty columns (more than 90% null)

In [150]:
# Report how many columns contain no data at all.
all_null_mask = loan_data_df.isna().all()
empty_columns = all_null_mask.sum()
print(f"Number of columns where all data is empty: {empty_columns}")

# Drop the columns in which every value is null.
loan_data_df = loan_data_df.dropna(how='all', axis=1)

# Then keep only the columns whose null count is at most 90% of the row count.
null_counts = loan_data_df.isnull().sum(axis=0)
max_allowed_nulls = loan_data_df.shape[0] * 0.9
loan_data_df = loan_data_df.loc[:, null_counts <= max_allowed_nulls]

Number of columns where all data is empty: 54


#### Remove columns that hold a single constant value
These columns carry no information for the analysis: every record has the same value, so they describe the dataset as a whole rather than individual loans.
E.g. all values in the column `pymnt_plan` are `n`.

In [151]:
# Collect every column whose non-null entries all share one single value,
# printing that value for each column found.
constant_cols = []
for col in loan_data_df.columns:
    value_freq = loan_data_df[col].value_counts()
    if len(value_freq) != 1:
        continue
    constant_cols.append(col)
    print(f"Column: {col} has value: {value_freq.index[0]}")

print(f"Columns to be removed: {constant_cols}")
loan_data_df.drop(columns=constant_cols, inplace=True)

Column: pymnt_plan has value: n
Column: initial_list_status has value: f
Column: collections_12_mths_ex_med has value: 0.0
Column: policy_code has value: 1
Column: application_type has value: INDIVIDUAL
Column: acc_now_delinq has value: 0
Column: chargeoff_within_12_mths has value: 0.0
Column: delinq_amnt has value: 0
Column: tax_liens has value: 0.0
Columns to be removed: ['pymnt_plan', 'initial_list_status', 'collections_12_mths_ex_med', 'policy_code', 'application_type', 'acc_now_delinq', 'chargeoff_within_12_mths', 'delinq_amnt', 'tax_liens']


#### Dataset after removing columns

In [152]:
# Show the remaining columns with their non-null counts and dtypes.
loan_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39717 entries, 0 to 39716
Data columns (total 46 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       39717 non-null  int64  
 1   member_id                39717 non-null  int64  
 2   loan_amnt                39717 non-null  int64  
 3   funded_amnt              39717 non-null  int64  
 4   funded_amnt_inv          39717 non-null  float64
 5   term                     39717 non-null  object 
 6   int_rate                 39717 non-null  object 
 7   installment              39717 non-null  float64
 8   grade                    39717 non-null  object 
 9   sub_grade                39717 non-null  object 
 10  emp_title                37258 non-null  object 
 11  emp_length               38642 non-null  object 
 12  home_ownership           39717 non-null  object 
 13  annual_inc               39717 non-null  float64
 14  verification_status   

In [153]:
# Keep only the dictionary rows that describe columns still present in the
# loan dataset, and renumber them from 0. drop=True discards the old row
# labels instead of storing them in an extra 'index' column, so this cell can
# be re-run without accumulating leftover index columns (which would
# eventually make reset_index raise).
column_desc_df = (
    column_desc_df[column_desc_df['LoanStatNew'].isin(loan_data_df.columns)]
    .reset_index(drop=True)
)

In [154]:
# Display each remaining column name next to its description.
column_desc_df[['LoanStatNew', 'Description']]

Unnamed: 0,LoanStatNew,Description
0,addr_state,The state provided by the borrower in the loan application
1,annual_inc,The self-reported annual income provided by the borrower during registration.
2,collection_recovery_fee,post charge off collection fee
3,delinq_2yrs,The number of 30+ days past-due incidences of delinquency in the borrower's credit file for the past 2 years
4,desc,Loan description provided by the borrower
5,dti,"A ratio calculated using the borrower’s total monthly debt payments on the total debt obligations, excluding mortgage and the requested LC loan, divided by the borrower’s self-reported monthly inc..."
6,earliest_cr_line,The month the borrower's earliest reported credit line was opened
7,emp_length,Employment length in years. Possible values are between 0 and 10 where 0 means less than one year and 10 means ten or more years.
8,emp_title,The job title supplied by the Borrower when applying for the loan.*
9,funded_amnt,The total amount committed to that loan at that point in time.


#### Convert the columns to correct datatypes

In [155]:
def convert_emp_length(l):
    """Convert an employment-length label into a number of years.

    Parameters
    ----------
    l : str or number or missing
        A label such as '< 1 year', '3 years' or '10+ years'. Missing values
        (None / NaN) are accepted, as are values that are already numeric.

    Returns
    -------
    int or float
        0 for any "less than" label (starting with '<'), the leading integer
        of the label otherwise, and NaN when the input is missing.

    Raises
    ------
    ValueError
        If the label is neither a "less than" label nor starts with a number.
    """
    if pd.isna(l):
        return np.nan

    label = str(l).strip()
    if label.startswith('<'):
        return 0

    # Read the whole leading number rather than only the first character, so
    # multi-digit values ('10 years', 10.0) are not truncated to one digit.
    leading_number = re.match(r'\d+', label)
    if leading_number is None:
        raise ValueError(f"Cannot parse employment length from: {l!r}")
    return int(leading_number.group())

In [156]:
# Parse the string-encoded columns into numeric / datetime dtypes.
# Each conversion is skipped when the column already has its target dtype, so
# re-running this cell does not re-parse the converted values (which would
# silently corrupt them, e.g. by dividing int_rate by 100 a second time).
if not pd.api.types.is_numeric_dtype(loan_data_df['term']):
    # Keep the leading integer of the term label.
    loan_data_df['term'] = (
        loan_data_df['term'].str.extract(r'(\d+)', expand=False).astype(int)
    )

if not pd.api.types.is_numeric_dtype(loan_data_df['int_rate']):
    # Assumes values are percentages written with a trailing '%' (e.g. '10.65%');
    # the result is stored as a fraction.
    loan_data_df['int_rate'] = (
        loan_data_df['int_rate'].str.rstrip('%').astype(float) / 100
    )

if not pd.api.types.is_numeric_dtype(loan_data_df['emp_length']):
    loan_data_df['emp_length'] = loan_data_df['emp_length'].apply(convert_emp_length)

if not pd.api.types.is_datetime64_any_dtype(loan_data_df['issue_d']):
    # Month abbreviation plus two-digit year.
    loan_data_df['issue_d'] = pd.to_datetime(loan_data_df['issue_d'], format='%b-%y')