## Data Source
* https://resources.lendingclub.com/LoanStats_2019Q1.csv.zip
* https://resources.lendingclub.com/LoanStats_2019Q2.csv.zip
* https://resources.lendingclub.com/LoanStats_2019Q3.csv.zip
* https://resources.lendingclub.com/LoanStats_2019Q4.csv.zip
* https://resources.lendingclub.com/LoanStats_2020Q1.csv.zip

In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

## 2019 data (training data)

In [2]:
# Read the four 2019 quarterly LendingClub exports (the training period).
# skiprows=1 drops the first line of each file before the header is parsed
# (presumably a banner line above the real column names - verify against the raw CSV).
quarterly_2019_files = [
    'Resources/Data/LoanStats_2019Q1.csv.zip',
    'Resources/Data/LoanStats_2019Q2.csv.zip',
    'Resources/Data/LoanStats_2019Q3.csv.zip',
    'Resources/Data/LoanStats_2019Q4.csv.zip',
]
df1, df2, df3, df4 = [
    pd.read_csv(Path(csv_zip), skiprows=1) for csv_zip in quarterly_2019_files
]

In [3]:
# Preview the raw Q1 frame. The tail of the rendered table shows a non-loan
# footer row ("Total amount funded in policy code 1: ...") whose other fields are empty.
df1

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,,,20000.0,20000.0,20000.0,60 months,17.19%,499.10,C,C5,...,,,,N,,,,,,
1,,,21225.0,21225.0,21225.0,60 months,14.74%,502.05,C,C2,...,,,,N,,,,,,
2,,,5000.0,5000.0,5000.0,36 months,17.97%,180.69,D,D1,...,,,,N,,,,,,
3,,,20000.0,20000.0,20000.0,36 months,8.19%,628.49,A,A4,...,,,,N,,,,,,
4,,,12000.0,12000.0,12000.0,60 months,15.57%,289.09,C,C3,...,,,,N,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115672,,,16000.0,16000.0,16000.0,36 months,16.14%,563.62,C,C4,...,,,,N,,,,,,
115673,,,16000.0,16000.0,16000.0,60 months,11.31%,350.36,B,B3,...,,,,N,,,,,,
115674,,,29250.0,29250.0,29250.0,60 months,18.94%,757.80,D,D2,...,,,,N,,,,,,
115675,Total amount funded in policy code 1: 1928448350,,,,,,,,,,...,,,,,,,,,,


In [4]:
# (rows, columns) of the raw Q1 export, trailing footer rows still included.
df1.shape

(115677, 144)

In [5]:
# Cut the final two rows off every quarterly frame; they are non-loan lines at
# the end of the export (mostly NaN), not loan records.
# NOTE: this cell is not idempotent - each re-run trims two more rows, so
# reload the quarterly files before running it again.
df1, df2, df3, df4 = [quarter.iloc[:-2] for quarter in (df1, df2, df3, df4)]
df1.shape

(115675, 144)

In [6]:
# Stack the four quarters into a single 2019 frame. The per-quarter row labels
# are kept as-is, so index values repeat across quarters.
quarters_2019 = [df1, df2, df3, df4]
df = pd.concat(quarters_2019)
df

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,,,20000.0,20000.0,20000.0,60 months,17.19%,499.10,C,C5,...,,,,N,,,,,,
1,,,21225.0,21225.0,21225.0,60 months,14.74%,502.05,C,C2,...,,,,N,,,,,,
2,,,5000.0,5000.0,5000.0,36 months,17.97%,180.69,D,D1,...,,,,N,,,,,,
3,,,20000.0,20000.0,20000.0,36 months,8.19%,628.49,A,A4,...,,,,N,,,,,,
4,,,12000.0,12000.0,12000.0,60 months,15.57%,289.09,C,C3,...,,,,N,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128253,,,3000.0,3000.0,3000.0,36 months,17.74%,108.07,C,C5,...,,,,N,,,,,,
128254,,,10000.0,10000.0,10000.0,36 months,6.46%,306.31,A,A1,...,,,,N,,,,,,
128255,,,19000.0,19000.0,19000.0,36 months,6.46%,581.99,A,A1,...,,,,N,,,,,,
128256,,,10000.0,10000.0,10000.0,60 months,28.80%,316.21,D,D5,...,,,,N,,,,,,


In [7]:
# Name of the label column; it is also the last entry of `columns`.
target = "loan_status"

# Feature columns retained from the raw export, in the order they are selected.
feature_columns = [
    "loan_amnt",
    "int_rate",
    "installment",
    "home_ownership",
    "annual_inc",
    "verification_status",
    "pymnt_plan",
    "dti",
    "delinq_2yrs",
    "inq_last_6mths",
    "open_acc",
    "pub_rec",
    "revol_bal",
    "total_acc",
    "initial_list_status",
    "out_prncp",
    "out_prncp_inv",
    "total_pymnt",
    "total_pymnt_inv",
    "total_rec_prncp",
    "total_rec_int",
    "total_rec_late_fee",
    "recoveries",
    "collection_recovery_fee",
    "last_pymnt_amnt",
    "collections_12_mths_ex_med",
    "policy_code",
    "application_type",
    "acc_now_delinq",
    "tot_coll_amt",
    "tot_cur_bal",
    "open_acc_6m",
    "open_act_il",
    "open_il_12m",
    "open_il_24m",
    "mths_since_rcnt_il",
    "total_bal_il",
    "il_util",
    "open_rv_12m",
    "open_rv_24m",
    "max_bal_bc",
    "all_util",
    "total_rev_hi_lim",
    "inq_fi",
    "total_cu_tl",
    "inq_last_12m",
    "acc_open_past_24mths",
    "avg_cur_bal",
    "bc_open_to_buy",
    "bc_util",
    "chargeoff_within_12_mths",
    "delinq_amnt",
    "mo_sin_old_il_acct",
    "mo_sin_old_rev_tl_op",
    "mo_sin_rcnt_rev_tl_op",
    "mo_sin_rcnt_tl",
    "mort_acc",
    "mths_since_recent_bc",
    "mths_since_recent_inq",
    "num_accts_ever_120_pd",
    "num_actv_bc_tl",
    "num_actv_rev_tl",
    "num_bc_sats",
    "num_bc_tl",
    "num_il_tl",
    "num_op_rev_tl",
    "num_rev_accts",
    "num_rev_tl_bal_gt_0",
    "num_sats",
    "num_tl_120dpd_2m",
    "num_tl_30dpd",
    "num_tl_90g_dpd_24m",
    "num_tl_op_past_12m",
    "pct_tl_nvr_dlq",
    "percent_bc_gt_75",
    "pub_rec_bankruptcies",
    "tax_liens",
    "tot_hi_cred_lim",
    "total_bal_ex_mort",
    "total_bc_limit",
    "total_il_high_credit_limit",
    "hardship_flag",
    "debt_settlement_flag",
]

# Everything that is kept from the raw frame: the features followed by the label.
columns = [*feature_columns, target]

In [8]:
# Narrow the frame to the columns listed in `columns`; .copy() gives an
# independent frame rather than a view of the concatenated one.
df = df[columns].copy()
df

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,pymnt_plan,dti,delinq_2yrs,inq_last_6mths,...,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag,loan_status
0,20000.0,17.19%,499.10,RENT,47000.0,Source Verified,n,14.02,0.0,1.0,...,12.5,0.0,0.0,75824.0,31546.0,33800.0,21524.0,N,N,Issued
1,21225.0,14.74%,502.05,MORTGAGE,225000.0,Not Verified,n,16.80,0.0,2.0,...,50.0,1.0,0.0,747075.0,209426.0,53500.0,128175.0,N,N,Issued
2,5000.0,17.97%,180.69,MORTGAGE,62000.0,Not Verified,n,19.82,1.0,0.0,...,50.0,0.0,0.0,255738.0,31615.0,9400.0,39938.0,N,N,Issued
3,20000.0,8.19%,628.49,MORTGAGE,200000.0,Not Verified,n,22.66,1.0,0.0,...,22.2,0.0,0.0,448069.0,84744.0,49400.0,105180.0,N,N,Issued
4,12000.0,15.57%,289.09,MORTGAGE,49000.0,Source Verified,n,13.47,0.0,0.0,...,14.3,0.0,0.0,189260.0,106025.0,24400.0,68860.0,N,N,Issued
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128253,3000.0,17.74%,108.07,OWN,44000.0,Not Verified,n,30.01,0.0,1.0,...,0.0,0.0,0.0,63484.0,51462.0,4800.0,50684.0,N,N,Fully Paid
128254,10000.0,6.46%,306.31,RENT,60000.0,Not Verified,n,14.18,0.0,0.0,...,0.0,0.0,0.0,78282.0,23872.0,66700.0,11282.0,N,N,Current
128255,19000.0,6.46%,581.99,MORTGAGE,67350.0,Source Verified,n,6.00,0.0,0.0,...,33.3,0.0,0.0,47700.0,21809.0,47700.0,0.0,N,N,Fully Paid
128256,10000.0,28.80%,316.21,MORTGAGE,40000.0,Not Verified,n,2.10,0.0,0.0,...,33.3,1.0,0.0,7400.0,1947.0,4600.0,0.0,N,N,Current


In [9]:
# Count the missing values in each selected column.
df.isna().sum()

loan_amnt                         0
int_rate                          0
installment                       0
home_ownership                    0
annual_inc                        0
                              ...  
total_bc_limit                    0
total_il_high_credit_limit        0
hardship_flag                 26866
debt_settlement_flag              0
loan_status                       0
Length: 84, dtype: int64

In [10]:
# Null cleanup in two passes: first discard columns that are entirely empty,
# then discard every row that still has at least one missing value.
df = df.dropna(axis='columns', how='all').dropna()

# Re-check: every per-column missing count should now be zero.
df.isna().sum()

loan_amnt                     0
int_rate                      0
installment                   0
home_ownership                0
annual_inc                    0
                             ..
total_bc_limit                0
total_il_high_credit_limit    0
hardship_flag                 0
debt_settlement_flag          0
loan_status                   0
Length: 84, dtype: int64

In [11]:
# (rows, columns) left after the null cleanup.
df.shape

(369038, 84)

In [12]:
# int_rate arrives as text such as '17.19%'; remove the percent sign and
# rescale to a fraction (0.1719).
rate_text = df['int_rate'].str.replace('%', '')
df['int_rate'] = rate_text.astype('float') / 100

In [13]:
# Keep only loans whose status is something other than `Issued`.
not_issued = df['loan_status'].ne('Issued')
df = df[not_issued]

In [14]:
# Distribution of the raw loan statuses before they are mapped to risk labels.
df['loan_status'].value_counts()

Current               274072
Fully Paid             63199
Charged Off            11957
Late (31-120 days)      2745
In Grace Period         2584
Late (16-30 days)        709
Default                   52
Name: loan_status, dtype: int64

In [15]:
# Map raw loan statuses onto the two risk labels.
# The replacement is applied to the loan_status column only: DataFrame.replace
# on the whole frame scans every column and would also rewrite any other column
# that happened to contain one of these strings.
risk_labels = {
    'Current': 'low_risk',
    'Late (31-120 days)': 'high_risk',
    'Late (16-30 days)': 'high_risk',
    'Default': 'high_risk',
    'In Grace Period': 'high_risk',
}
# NOTE(review): 'Fully Paid' and 'Charged Off' are deliberately left unmapped
# here, whereas the 2020 section maps them to low_risk / high_risk. As written,
# the low/high filter that follows discards them for 2019 - confirm the
# asymmetry between training and test labelling is intended.
df = df.assign(loan_status=df['loan_status'].replace(risk_labels))

df['loan_status'].value_counts()

low_risk       274072
Fully Paid      63199
Charged Off     11957
high_risk        6090
Name: loan_status, dtype: int64

In [16]:
# Split out the two classes of interest; rows carrying any other status
# are not part of either subset.
status = df['loan_status']
low_risk_rows = df.loc[status.eq('low_risk')]
high_risk_rows = df.loc[status.eq('high_risk')]
low_risk_rows

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,pymnt_plan,dti,delinq_2yrs,inq_last_6mths,...,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag,loan_status
93,10500.0,0.1719,375.35,RENT,66000.0,Source Verified,n,27.24,0.0,0.0,...,100.0,0.0,0.0,65687.0,38199.0,2000.0,61987.0,N,N,low_risk
99,25000.0,0.2000,929.09,MORTGAGE,105000.0,Verified,n,20.23,0.0,0.0,...,50.0,1.0,0.0,271427.0,60641.0,41200.0,49197.0,N,N,low_risk
132,20000.0,0.2000,529.88,MORTGAGE,56000.0,Verified,n,24.26,0.0,0.0,...,50.0,0.0,0.0,60644.0,45684.0,7500.0,43144.0,N,N,low_risk
133,10000.0,0.1640,353.55,RENT,92000.0,Verified,n,31.44,0.0,1.0,...,50.0,1.0,0.0,99506.0,68784.0,19700.0,76506.0,N,N,low_risk
140,22000.0,0.1474,520.39,MORTGAGE,52000.0,Not Verified,n,18.76,0.0,1.0,...,0.0,0.0,0.0,219750.0,25919.0,27600.0,20000.0,N,N,low_risk
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128244,34025.0,0.1240,763.77,RENT,92000.0,Source Verified,n,17.57,0.0,0.0,...,25.0,0.0,0.0,85020.0,43307.0,26700.0,30999.0,N,N,low_risk
128245,1200.0,0.1171,39.70,MORTGAGE,65000.0,Verified,n,21.23,0.0,0.0,...,40.0,0.0,0.0,246116.0,42308.0,35700.0,41477.0,N,N,low_risk
128246,24000.0,0.1695,595.82,MORTGAGE,95000.0,Verified,n,22.83,0.0,1.0,...,0.0,0.0,0.0,358528.0,74346.0,31500.0,86474.0,N,N,low_risk
128247,17175.0,0.1862,626.28,MORTGAGE,115000.0,Not Verified,n,7.53,4.0,1.0,...,0.0,0.0,0.0,277357.0,9285.0,14500.0,25029.0,N,N,low_risk


In [17]:
# Under-sample: draw as many low_risk rows as there are high_risk rows
# (fixed seed so the draw is repeatable), stack the two classes and renumber.
minority_size = len(high_risk_rows)
low_risk_sample = low_risk_rows.sample(n=minority_size, random_state=42)
df = pd.concat([low_risk_sample, high_risk_rows]).reset_index(drop=True)
df

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,pymnt_plan,dti,delinq_2yrs,inq_last_6mths,...,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag,loan_status
0,7000.0,0.1894,256.38,MORTGAGE,75000.0,Not Verified,n,28.62,0.0,2.0,...,87.5,0.0,0.0,352260.0,62666.0,35000.0,10000.0,N,N,low_risk
1,40000.0,0.1614,975.71,MORTGAGE,102000.0,Source Verified,n,11.72,2.0,0.0,...,0.0,0.0,0.0,294664.0,109911.0,9000.0,71044.0,N,N,low_risk
2,11000.0,0.2055,294.81,RENT,45000.0,Verified,n,37.25,1.0,3.0,...,7.7,0.0,0.0,92228.0,36007.0,33000.0,46328.0,N,N,low_risk
3,4000.0,0.1612,140.87,MORTGAGE,38000.0,Not Verified,n,42.89,1.0,0.0,...,100.0,0.0,0.0,284273.0,52236.0,13500.0,52017.0,N,N,low_risk
4,14000.0,0.1797,505.93,MORTGAGE,43000.0,Source Verified,n,22.16,1.0,0.0,...,25.0,0.0,0.0,120280.0,88147.0,33300.0,78680.0,N,N,low_risk
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12175,19975.0,0.2565,801.09,RENT,28000.0,Not Verified,n,28.42,0.0,0.0,...,16.7,0.0,0.0,50055.0,28192.0,18700.0,19055.0,N,N,high_risk
12176,15000.0,0.1774,540.34,RENT,50000.0,Verified,n,23.43,4.0,0.0,...,11.1,0.0,0.0,70324.0,57025.0,13300.0,54824.0,N,N,high_risk
12177,3600.0,0.1862,131.28,RENT,60000.0,Not Verified,n,28.80,0.0,1.0,...,0.0,0.0,0.0,83765.0,55156.0,14800.0,53065.0,N,N,high_risk
12178,15000.0,0.0881,475.68,MORTGAGE,62000.0,Source Verified,n,11.44,0.0,0.0,...,0.0,0.0,0.0,189930.0,23748.0,7000.0,32930.0,N,N,high_risk


In [18]:
# Confirm both classes have the same number of rows after under-sampling.
df['loan_status'].value_counts()

low_risk     6090
high_risk    6090
Name: loan_status, dtype: int64

In [19]:
# Save the under-sampled 2019 training set for modelling.
# The output folder is created first so the cell also works on a fresh
# checkout; to_csv raises OSError when the parent directory does not exist.
undersampled_2019_path = Path('Resources/cleaned-data/2019loans_us.csv')
undersampled_2019_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(undersampled_2019_path, index=False)

In [42]:
# Over-sample: keep every low_risk row and draw high_risk rows with replacement
# until both classes are the same size.
# random_state pins the draw so the saved file is identical on every run
# (the original call was unseeded and produced a different sample each time).
# NOTE(review): duplicated high_risk rows will land on both sides of any later
# random train/validation split of this file - confirm how it is consumed.
df_over = pd.concat([
    low_risk_rows,
    high_risk_rows.sample(n=len(low_risk_rows), replace=True, random_state=42),
])
df_over = df_over.reset_index(drop=True)
df_over

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,pymnt_plan,dti,delinq_2yrs,inq_last_6mths,...,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag,loan_status
0,10500.0,0.1719,375.35,RENT,66000.0,Source Verified,n,27.24,0.0,0.0,...,100.0,0.0,0.0,65687.0,38199.0,2000.0,61987.0,N,N,low_risk
1,25000.0,0.2000,929.09,MORTGAGE,105000.0,Verified,n,20.23,0.0,0.0,...,50.0,1.0,0.0,271427.0,60641.0,41200.0,49197.0,N,N,low_risk
2,20000.0,0.2000,529.88,MORTGAGE,56000.0,Verified,n,24.26,0.0,0.0,...,50.0,0.0,0.0,60644.0,45684.0,7500.0,43144.0,N,N,low_risk
3,10000.0,0.1640,353.55,RENT,92000.0,Verified,n,31.44,0.0,1.0,...,50.0,1.0,0.0,99506.0,68784.0,19700.0,76506.0,N,N,low_risk
4,22000.0,0.1474,520.39,MORTGAGE,52000.0,Not Verified,n,18.76,0.0,1.0,...,0.0,0.0,0.0,219750.0,25919.0,27600.0,20000.0,N,N,low_risk
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
548139,4400.0,0.2000,163.52,RENT,41000.0,Not Verified,n,31.82,0.0,0.0,...,0.0,0.0,0.0,38014.0,27390.0,5400.0,25314.0,N,N,high_risk
548140,32000.0,0.1308,1079.44,MORTGAGE,210000.0,Not Verified,n,16.15,0.0,0.0,...,100.0,0.0,0.0,378319.0,101528.0,50700.0,59675.0,N,N,high_risk
548141,40000.0,0.1695,993.03,MORTGAGE,140000.0,Source Verified,n,27.79,0.0,2.0,...,20.0,0.0,0.0,312730.0,104873.0,16500.0,120054.0,N,N,high_risk
548142,5275.0,0.1308,177.94,RENT,19464.0,Not Verified,n,30.21,0.0,0.0,...,100.0,0.0,0.0,16500.0,11497.0,3700.0,3000.0,N,N,high_risk


In [43]:
# Confirm both classes have the same number of rows after over-sampling.
df_over['loan_status'].value_counts()

low_risk     274072
high_risk    274072
Name: loan_status, dtype: int64

In [44]:
# Save the over-sampled 2019 training set for modelling.
# The output folder is created first so the cell also works on a fresh
# checkout; to_csv raises OSError when the parent directory does not exist.
oversampled_2019_path = Path('Resources/cleaned-data/2019loans_os.csv')
oversampled_2019_path.parent.mkdir(parents=True, exist_ok=True)
df_over.to_csv(oversampled_2019_path, index=False)

## 2020 data (test data)

In [23]:
# Read the 2020 Q1 export, drop its last two rows (as was done for the 2019
# files) and keep the same column selection used for 2019.
validate_df = (
    pd.read_csv(Path('Resources/Data/LoanStats_2020Q1.csv.zip'), skiprows=1)
    .iloc[:-2]
    .loc[:, columns]
    .copy()
)


In [24]:
# Same null cleanup as for 2019: discard all-empty columns first, then any row
# that still contains a missing value.
validate_df = validate_df.dropna(axis='columns', how='all').dropna()

In [25]:
# Turn the percent-formatted int_rate text into a fractional float,
# matching the conversion applied to the 2019 frame.
validate_rate_text = validate_df['int_rate'].str.replace('%', '')
validate_df['int_rate'] = validate_rate_text.astype('float') / 100

In [26]:
# Keep only loans whose status is something other than `Issued`.
validate_not_issued = validate_df['loan_status'].ne('Issued')
validate_df = validate_df[validate_not_issued]

In [27]:
# Map raw loan statuses onto the two risk labels.
# The replacement is applied to the loan_status column only: DataFrame.replace
# on the whole frame scans every column and would also rewrite any other column
# that happened to contain one of these strings.
validation_risk_labels = {
    'Current': 'low_risk',
    'Fully Paid': 'low_risk',
    'Late (31-120 days)': 'high_risk',
    'Late (16-30 days)': 'high_risk',
    'Default': 'high_risk',
    'In Grace Period': 'high_risk',
    'Charged Off': 'high_risk',
}
validate_df = validate_df.assign(
    loan_status=validate_df['loan_status'].replace(validation_risk_labels)
)

In [36]:
# Split the 2020 frame into its two classes.
# NOTE: the under-sampling cell below rebinds validate_df, so re-running this
# cell afterwards would split the already-balanced frame instead of the full
# one. The out-of-order execution counts on these cells suggest that may have
# happened - restart the kernel and run all cells in order before trusting the outputs.
validate_status = validate_df['loan_status']
low_risk_rows1 = validate_df.loc[validate_status.eq('low_risk')]
high_risk_rows1 = validate_df.loc[validate_status.eq('high_risk')]
low_risk_rows1.shape

(2351, 84)

In [29]:
# Under-sample the 2020 data: draw as many low_risk rows as there are
# high_risk rows (seeded), stack the two classes and renumber the index.
validation_minority_size = len(high_risk_rows1)
low_risk_sample1 = low_risk_rows1.sample(n=validation_minority_size, random_state=37)
validate_df = pd.concat([low_risk_sample1, high_risk_rows1]).reset_index(drop=True)
validate_df

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,pymnt_plan,dti,delinq_2yrs,inq_last_6mths,...,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag,loan_status
0,40000.0,0.1033,856.40,RENT,128700.0,Source Verified,n,12.47,0.0,1.0,...,57.1,0.0,0.0,63915.0,49510.0,49400.0,14515.0,Y,N,low_risk
1,24450.0,0.1430,572.72,MORTGAGE,44574.0,Not Verified,n,15.05,0.0,1.0,...,0.0,0.0,0.0,136425.0,19439.0,15500.0,18925.0,N,N,low_risk
2,13500.0,0.1430,316.23,OWN,60000.0,Not Verified,n,28.72,0.0,0.0,...,0.0,0.0,0.0,82124.0,65000.0,5400.0,61724.0,Y,N,low_risk
3,10625.0,0.1774,268.31,RENT,60000.0,Verified,n,15.70,0.0,4.0,...,20.0,0.0,0.0,54855.0,50335.0,23200.0,26255.0,N,N,low_risk
4,6375.0,0.1862,232.46,RENT,60000.0,Source Verified,n,35.50,0.0,0.0,...,75.0,0.0,0.0,90445.0,56541.0,15300.0,72345.0,N,N,low_risk
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4697,30000.0,0.1240,673.42,RENT,140480.0,Source Verified,n,15.74,0.0,0.0,...,28.6,0.0,0.0,159688.0,110873.0,48400.0,107388.0,N,N,high_risk
4698,24000.0,0.0756,747.22,RENT,50000.0,Not Verified,n,26.81,0.0,0.0,...,0.0,0.0,0.0,62375.0,18928.0,13300.0,30775.0,N,N,high_risk
4699,10000.0,0.2305,387.36,RENT,33000.0,Verified,n,38.51,0.0,2.0,...,0.0,0.0,0.0,43250.0,33022.0,8500.0,29550.0,N,N,high_risk
4700,8000.0,0.1862,205.86,RENT,38000.0,Source Verified,n,16.36,0.0,1.0,...,0.0,1.0,0.0,31357.0,19595.0,1500.0,9657.0,N,N,high_risk


In [30]:
# Save the under-sampled 2020 Q1 set used for testing.
# The output folder is created first so the cell also works on a fresh
# checkout; to_csv raises OSError when the parent directory does not exist.
undersampled_2020_path = Path('Resources/cleaned-data/2020Q1loans_us.csv')
undersampled_2020_path.parent.mkdir(parents=True, exist_ok=True)
validate_df.to_csv(undersampled_2020_path, index=False)

In [38]:
# Over-sample the 2020 data: keep every low_risk row and draw high_risk rows
# with replacement until both classes are the same size.
# random_state pins the draw so the saved file is identical on every run
# (the original call was unseeded and produced a different sample each time).
# NOTE(review): this resamples the test set, so metrics computed on it count
# some high_risk loans several times - confirm that is the intent.
validate_df_over = pd.concat([
    low_risk_rows1,
    high_risk_rows1.sample(n=len(low_risk_rows1), replace=True, random_state=37),
])
validate_df_over = validate_df_over.reset_index(drop=True)
validate_df_over

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,pymnt_plan,dti,delinq_2yrs,inq_last_6mths,...,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag,loan_status
0,40000.0,0.1033,856.40,RENT,128700.0,Source Verified,n,12.47,0.0,1.0,...,57.1,0.0,0.0,63915.0,49510.0,49400.0,14515.0,Y,N,low_risk
1,24450.0,0.1430,572.72,MORTGAGE,44574.0,Not Verified,n,15.05,0.0,1.0,...,0.0,0.0,0.0,136425.0,19439.0,15500.0,18925.0,N,N,low_risk
2,13500.0,0.1430,316.23,OWN,60000.0,Not Verified,n,28.72,0.0,0.0,...,0.0,0.0,0.0,82124.0,65000.0,5400.0,61724.0,Y,N,low_risk
3,10625.0,0.1774,268.31,RENT,60000.0,Verified,n,15.70,0.0,4.0,...,20.0,0.0,0.0,54855.0,50335.0,23200.0,26255.0,N,N,low_risk
4,6375.0,0.1862,232.46,RENT,60000.0,Source Verified,n,35.50,0.0,0.0,...,75.0,0.0,0.0,90445.0,56541.0,15300.0,72345.0,N,N,low_risk
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4697,20200.0,0.2565,810.11,MORTGAGE,70000.0,Not Verified,n,18.96,2.0,0.0,...,44.4,0.0,0.0,301948.0,41050.0,24900.0,35234.0,N,N,high_risk
4698,4500.0,0.0819,141.41,RENT,30000.0,Source Verified,n,15.80,0.0,1.0,...,50.0,0.0,0.0,21379.0,9664.0,10900.0,10079.0,N,N,high_risk
4699,25000.0,0.2305,705.49,RENT,110000.0,Not Verified,n,14.12,0.0,0.0,...,12.5,0.0,0.0,69274.0,46964.0,43500.0,25774.0,N,N,high_risk
4700,5000.0,0.0819,157.13,RENT,90000.0,Not Verified,n,22.32,0.0,1.0,...,0.0,0.0,0.0,79517.0,47435.0,12800.0,63717.0,N,N,high_risk


In [39]:
# Save the over-sampled 2020 Q1 set used for testing.
# The output folder is created first so the cell also works on a fresh
# checkout; to_csv raises OSError when the parent directory does not exist.
oversampled_2020_path = Path('Resources/cleaned-data/2020Q1loans_os.csv')
oversampled_2020_path.parent.mkdir(parents=True, exist_ok=True)
validate_df_over.to_csv(oversampled_2020_path, index=False)