In [None]:
import pandas as pd
import os
import shutil

import warnings
warnings.filterwarnings("ignore")

from deltalake import DeltaTable
from deltalake.writer import write_deltalake

In [None]:
# Path of the Delta Lake table that this notebook deletes, (re)creates and
# overwrites; it is relative, so it resolves against the current working directory.
DELTA_LAKE_TABLE = "loans.delta"

In [None]:
# Start from a clean slate: delete whatever a previous run left at the table path.
# rmtree is called with its defaults, so any failure during deletion is raised
# rather than silently ignored.
table_already_present = os.path.exists(DELTA_LAKE_TABLE)
if table_already_present:
    shutil.rmtree(DELTA_LAKE_TABLE)

In [None]:
# Load the raw loan data from the CSV next to the notebook and store it,
# unmodified, as the first write to the Delta table.
df = pd.read_csv('./loan.csv')
write_deltalake(
    DELTA_LAKE_TABLE,
    df,
    overwrite_schema=True,
)

In [None]:
# Open the table that was just written and show which version it is at.
dt = DeltaTable(DELTA_LAKE_TABLE)
initial_version = dt.version()
print(initial_version)

In [None]:
### Write version 1 of the Delta Lake table using a selected subset of features from the original dataset.

In [None]:
# Remove columns in which MORE than MAX_MISSING_FRACTION of the values are missing.
#
# Why not `df.dropna(axis=1, thresh=int(0.70*len(df)))`?  `thresh` is the minimum
# number of NON-missing values a column needs in order to be kept, so that call
# discards every column with more than 30% missing values, not 70%.
# Computing the missing fraction explicitly keeps code, comment and message in agreement.
MAX_MISSING_FRACTION = 0.70

missing_fraction = df.isna().mean()                    # share of missing values per column
too_sparse = missing_fraction > MAX_MISSING_FRACTION   # NaN (empty frame) compares False -> column kept
df_1 = df.loc[:, ~too_sparse]

print(
    'The number of columns has reduced from {} to {} columns by removing columns with more than {:.0%} missing values'.
    format(len(df.columns), len(df_1.columns), MAX_MISSING_FRACTION)
)

In [None]:
# Keep only loans whose status is one of the three selected outcomes, and collapse
# those statuses into a binary label:
#   'Fully Paid'              -> 'Good Loan'
#   'Charged Off', 'Default'  -> 'Bad Loan'
selected_loan_status = ['Fully Paid', 'Charged Off', 'Default']
loan_status_labels = {
    'Fully Paid': 'Good Loan',
    'Charged Off': 'Bad Loan',
    'Default': 'Bad Loan',
}

# Filter and relabel in a single expression. `.assign` returns a new frame, so the
# relabelling never writes into a slice of df_1 (the pattern that triggers
# SettingWithCopyWarning) and df_1 itself is left untouched.
has_selected_status = df_1['loan_status'].isin(selected_loan_status)
df_2 = df_1.loc[has_selected_status].assign(
    loan_status=lambda frame: frame['loan_status'].replace(loan_status_labels)
)

print(
    'The number of rows has been reduced from {:,.0f} to {:,.0f} by filtering the data with the correlated loan status'.
    format(len(df_1), len(df_2))
)

In [None]:
#### First trial - a small number of features
# The columns below are only an initial guess; any other combination can be tried.
first_trial_columns = [
    'loan_status',
    'term',
    'int_rate',
    'installment',
    'grade',
    'annual_inc',
    'verification_status',
    'dti',
]
df_3 = df_2[first_trial_columns]
df_3.head()

In [None]:
# Summarise missing values in the chosen columns: absolute count and percentage
# of rows, displaying only the columns that actually contain missing values.
null_counts = df_3.isnull().sum()
null_percent = null_counts.mul(100).div(len(df_3)).round(2)
df_null = pd.DataFrame({'Count': null_counts, 'Percent': null_percent})
df_null.loc[df_null['Count'] != 0]

In [None]:
# Discard every row that has a missing value in any of the chosen columns.
df_clean = df_3.dropna(axis='index', how='any')

In [None]:
# Encode the categorical target variable as an integer: 'Good Loan' -> 1, 'Bad Loan' -> 0.
#
# Both labels are replaced in one pass, and the result is cast to int64 explicitly
# instead of relying on `replace` to downcast the object column implicitly
# (that implicit downcasting is deprecated in recent pandas releases).
# The cast also fails loudly if an unexpected label is still present.
# `.assign` rebinds df_clean to a new frame rather than mutating it in place.
target_encoding = {'Good Loan': 1, 'Bad Loan': 0}
df_clean = df_clean.assign(
    loan_status=df_clean['loan_status'].replace(target_encoding).astype('int64')
)
df_clean['loan_status'].unique()

In [None]:
# Overwrite the table (data and schema) with the cleaned first-trial frame,
# then re-open it and print the version it is now at.
write_deltalake(
    DELTA_LAKE_TABLE,
    df_clean,
    mode='overwrite',
    overwrite_schema=True,
)
dt = DeltaTable(DELTA_LAKE_TABLE)
print(dt.version())

In [None]:
### Write version 2 of the Delta Lake table with a few more features than version 1, again taken from the original dataset.

In [None]:
# Work on an independent copy: a plain `df_4 = df_2` only creates a second name for
# the same DataFrame, so later changes made through df_4 would also alter df_2.
df_4 = df_2.copy()

In [None]:
# Encode the categorical target variable as an integer: 'Good Loan' -> 1, 'Bad Loan' -> 0.
#
# Both labels are replaced in one pass, and the result is cast to int64 explicitly
# instead of relying on `replace` to downcast the object column implicitly
# (that implicit downcasting is deprecated in recent pandas releases).
# `.assign` builds a new frame and rebinds df_4 to it, so the frame df_4 was
# derived from is not modified by this cell.
target_encoding = {'Good Loan': 1, 'Bad Loan': 0}
df_4 = df_4.assign(
    loan_status=df_4['loan_status'].replace(target_encoding).astype('int64')
)

In [None]:
# List df_4's column names grouped by dtype.
# The grouping key must be df_4's own dtypes: grouping by another frame's dtypes
# would only report the columns the two frames happen to share.
df_4.columns.to_series().groupby(df_4.dtypes).groups

In [None]:
# First, drop the categorical (object type) features that have too many options available.
high_cardinality_columns = [
    'emp_title',
    'sub_grade',
    'issue_d',
    'last_pymnt_d',
    'last_credit_pull_d',
]
df_4 = df_4.drop(columns=high_cardinality_columns)

In [None]:
# Second, to filter numerical features, use .corr() to find the features that are
# highly correlated with the target variable.
# df_4 still contains string (object) columns, and DataFrame.corr() raises on those
# in pandas >= 2.0, so the correlation is computed on the numeric/bool columns only.
numeric_features = df_4.select_dtypes(include=['number', 'bool'])
numeric_features.corr()['loan_status']

In [None]:
# Assemble the version-2 feature set: the target first, then the categorical
# features, then the numerical ones (column order is preserved).
target_column = ['loan_status']
categorical_features = [  # object dtype
    'term', 'grade', 'home_ownership', 'verification_status',
    'pymnt_plan', 'purpose', 'initial_list_status', 'application_type',
]
numerical_features = [  # int/float dtype
    'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp', 'recoveries',
    'collection_recovery_fee', 'last_pymnt_amnt', 'int_rate',
]
df_clean = df_4[target_column + categorical_features + numerical_features]

In [None]:
# Summarise missing values in the selected columns: absolute count and percentage
# of rows, displaying only the columns that actually contain missing values.
null_counts = df_clean.isnull().sum()
null_percent = null_counts.mul(100).div(len(df_clean)).round(2)
df_null = pd.DataFrame({'Count': null_counts, 'Percent': null_percent})
df_null.loc[df_null['Count'] != 0]

In [None]:
# Overwrite the table (data and schema) with the larger feature set,
# then re-open it and print the version it is now at.
write_deltalake(
    DELTA_LAKE_TABLE,
    df_clean,
    mode='overwrite',
    overwrite_schema=True,
)
dt = DeltaTable(DELTA_LAKE_TABLE)
print(dt.version())