In [1]:
# Create spark session
import getpass
from pyspark.sql import SparkSession


# Name of the OS user running the notebook; used to build a per-user warehouse path.
username = getpass.getuser()

# Build (or reuse) a Hive-enabled Spark session that runs on YARN.
spark = (
    SparkSession.builder
    .config('spark.shuffle.useOldFetchProtocol', 'true')
    .config('spark.ui.port', '0')
    .config('spark.sql.warehouse.dir', f'/user/{username}/warehouse')
    .enableHiveSupport()
    .master('yarn')
    .getOrCreate()
)

In [3]:
# Read the main Lending Club CSV into a DataFrame.
# The first row supplies the column names and Spark infers the column types.
raw_df = (
    spark.read
    .format("csv")
    .option("header", True)
    .option("inferSchema", True)
    .load("/public/trendytech/datasets/accepted_2007_to_2018Q4.csv")
)

In [6]:
# Register raw_df as a temporary view (not a persisted table) so that it can be
# queried through spark.sql under the name "lending_club_raw_data".
raw_df.createOrReplaceTempView("lending_club_raw_data")

In [7]:
# Preview five records (all columns) from the raw view
spark.sql("Select * FROM lending_club_raw_data LIMIT 5")

id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_act_il,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,revol_bal_joint,sec_app_fico_range_low,sec_app_fico_range_high,sec_app_earliest_cr_line,sec_app_inq_last_6mths,sec_app_mort_acc,sec_app_open_acc,sec_app_revol_util,sec_app_open_act_il,sec_app_num_rev_accts,sec_app_chargeoff_within_12_mths,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,hardship_flag,hardship_type,hardship_rea
son,hardship_status,deferral_term,hardship_amount,hardship_start_date,hardship_end_date,payment_plan_start_date,hardship_length,hardship_dpd,hardship_loan_status,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
68407277,,3600.0,3600.0,3600.0,36 months,13.99,123.03,C,C4,leadman,10+ years,MORTGAGE,55000.0,Not Verified,Dec-2015,Fully Paid,n,https://lendingcl...,,debt_consolidation,Debt consolidation,190xx,PA,5.91,0.0,Aug-2003,675.0,679.0,1.0,30.0,,7.0,0.0,2765.0,29.7,13.0,w,0.0,0.0,4421.723916800001,4421.72,3600.0,821.72,0.0,0.0,0.0,Jan-2019,122.67,,Mar-2019,564.0,560.0,0.0,30.0,1.0,Individual,,,,0.0,722.0,144904.0,2.0,2.0,0.0,1.0,21.0,4981.0,36.0,3.0,3.0,722.0,34.0,9300.0,3.0,1.0,4.0,4.0,20701.0,1506.0,37.2,0.0,0.0,148.0,128.0,3.0,3.0,1.0,4.0,69.0,4.0,69.0,2.0,2.0,4.0,2.0,5.0,3.0,4.0,9.0,4.0,7.0,0.0,0.0,0.0,3.0,76.9,0.0,0.0,0.0,178050.0,7746.0,2400.0,13734.0,,,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
68355089,,24700.0,24700.0,24700.0,36 months,11.99,820.28,C,C1,Engineer,10+ years,MORTGAGE,65000.0,Not Verified,Dec-2015,Fully Paid,n,https://lendingcl...,,small_business,Business,577xx,SD,16.06,1.0,Dec-1999,715.0,719.0,4.0,6.0,,22.0,0.0,21470.0,19.2,38.0,w,0.0,0.0,25679.66,25679.66,24700.0,979.66,0.0,0.0,0.0,Jun-2016,926.35,,Mar-2019,699.0,695.0,0.0,,1.0,Individual,,,,0.0,0.0,204396.0,1.0,1.0,0.0,1.0,19.0,18005.0,73.0,2.0,3.0,6472.0,29.0,111800.0,0.0,0.0,6.0,4.0,9733.0,57830.0,27.1,0.0,0.0,113.0,192.0,2.0,2.0,4.0,2.0,,0.0,6.0,0.0,5.0,5.0,13.0,17.0,6.0,20.0,27.0,5.0,22.0,0.0,0.0,0.0,2.0,97.4,7.7,0.0,0.0,314017.0,39475.0,79300.0,24667.0,,,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
68341763,,20000.0,20000.0,20000.0,60 months,10.78,432.66,B,B4,truck driver,10+ years,MORTGAGE,63000.0,Not Verified,Dec-2015,Fully Paid,n,https://lendingcl...,,home_improvement,,605xx,IL,10.78,0.0,Aug-2000,695.0,699.0,0.0,,,6.0,0.0,7869.0,56.2,18.0,w,0.0,0.0,22705.9242938784,22705.92,20000.0,2705.92,0.0,0.0,0.0,Jun-2017,15813.3,,Mar-2019,704.0,700.0,0.0,,1.0,Joint App,71000.0,13.85,Not Verified,0.0,0.0,189699.0,0.0,1.0,0.0,4.0,19.0,10827.0,73.0,0.0,2.0,2081.0,65.0,14000.0,2.0,5.0,1.0,6.0,31617.0,2737.0,55.9,0.0,0.0,125.0,184.0,14.0,14.0,5.0,101.0,,10.0,,0.0,2.0,3.0,2.0,4.0,6.0,4.0,7.0,3.0,6.0,0.0,0.0,0.0,0.0,100.0,50.0,0.0,0.0,218418.0,18696.0,6200.0,14877.0,,,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
66310712,,35000.0,35000.0,35000.0,60 months,14.85,829.9,C,C5,Information Syste...,10+ years,MORTGAGE,110000.0,Source Verified,Dec-2015,Current,n,https://lendingcl...,,debt_consolidation,Debt consolidation,076xx,NJ,17.06,0.0,Sep-2008,785.0,789.0,0.0,,,13.0,0.0,7802.0,11.6,17.0,w,15897.65,15897.65,31464.01,31464.01,19102.35,12361.66,0.0,0.0,0.0,Feb-2019,829.9,Apr-2019,Mar-2019,679.0,675.0,0.0,,1.0,Individual,,,,0.0,0.0,301500.0,1.0,1.0,0.0,1.0,23.0,12609.0,70.0,1.0,1.0,6987.0,45.0,67300.0,0.0,1.0,0.0,2.0,23192.0,54962.0,12.1,0.0,0.0,36.0,87.0,2.0,2.0,1.0,2.0,,,,0.0,4.0,5.0,8.0,10.0,2.0,10.0,13.0,5.0,13.0,0.0,0.0,0.0,1.0,100.0,0.0,0.0,0.0,381215.0,52226.0,62500.0,18000.0,,,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,
68476807,,10400.0,10400.0,10400.0,60 months,22.45,289.91,F,F1,Contract Specialist,3 years,MORTGAGE,104433.0,Source Verified,Dec-2015,Fully Paid,n,https://lendingcl...,,major_purchase,Major purchase,174xx,PA,25.37,1.0,Jun-1998,695.0,699.0,3.0,12.0,,12.0,0.0,21929.0,64.5,35.0,w,0.0,0.0,11740.5,11740.5,10400.0,1340.5,0.0,0.0,0.0,Jul-2016,10128.96,,Mar-2018,704.0,700.0,0.0,,1.0,Individual,,,,0.0,0.0,331730.0,1.0,3.0,0.0,3.0,14.0,73839.0,84.0,4.0,7.0,9702.0,78.0,34000.0,2.0,1.0,3.0,10.0,27644.0,4567.0,77.5,0.0,0.0,128.0,210.0,4.0,4.0,6.0,4.0,12.0,1.0,12.0,0.0,4.0,6.0,5.0,9.0,10.0,7.0,19.0,6.0,12.0,0.0,0.0,0.0,4.0,96.6,60.0,0.0,0.0,439570.0,95768.0,20300.0,88097.0,,,,,,,,,,,,,,N,,,,,,,,,,,,,,,Cash,N,,,,,,


## Generate member_id for each record

In the dataset above, every entry in the member_id column is NULL. This column is important for our analysis, because we are trying to predict whether lending to a given member is risky. In order to tackle this problem, I decided to concatenate emp_title, emp_length, home_ownership, annual_inc, zip_code, addr_state, grade, sub_grade, and verification_status, and then pass the concatenated value to a hash function in order to generate a (nearly) unique value, which will be used as member_id later on.

For hashing, I used Spark's `sha2` function with a 256-bit digest (SHA-256).

In [8]:
from pyspark.sql.functions import sha2, concat_ws

In [13]:
# Columns whose values are combined into the surrogate member key.
member_key_columns = [
    "emp_title", "emp_length", "home_ownership", "annual_inc", "zip_code",
    "addr_state", "grade", "sub_grade", "verification_status",
]

# Join the key columns with "||" and hash the joined string with SHA-256;
# the hex digest is stored in the new column member_id_sha2.
# NOTE(review): records where all nine key columns are NULL end up sharing a single
# hash (see the 33-row group found during data exploration below) -- presumably
# because concat_ws skips NULL inputs; confirm against the Spark documentation.
member_key = concat_ws("||", *member_key_columns)
new_df = raw_df.withColumn("member_id_sha2", sha2(member_key, 256))

## Data Exploration

In [14]:
# Register the DataFrame that carries member_id_sha2 as a temporary view
# so the exploration queries below can reference it by name.
new_df.createOrReplaceTempView("lending_club_raw_data_new")

In [15]:
# Count the total number of records in the dataset
spark.sql("SELECT COUNT(*) FROM lending_club_raw_data_new")

count(1)
2260701


In [16]:
# Count the distinct generated member ids (member_id_sha2), used here as a
# stand-in for the number of distinct members who took a loan
spark.sql("SELECT COUNT(DISTINCT(member_id_sha2)) FROM lending_club_raw_data_new")

count(DISTINCT member_id_sha2)
2257384


In [16]:
# Rank generated member ids by the number of loan records that share them,
# keeping only ids that occur more than once (most frequent first).
repeat_borrowers_sql = """SELECT 
    member_id_sha2, 
    COUNT(*) AS total_count
FROM lending_club_raw_data_new
GROUP BY member_id_sha2
HAVING total_count > 1
ORDER BY total_count DESC"""
spark.sql(repeat_borrowers_sql)

member_id_sha2,total_count
e3b0c44298fc1c149...,33
e4c167053d5418230...,5
3f87585a20f702838...,4
76b577467eda5bdbc...,4
ad8e5d384dae17e06...,4
e7d8d16928817ec8f...,3
f54295a60946dedad...,3
f9fc581aae696e0bd...,3
066ddaa64bee66dff...,3
22593a1870543b2db...,3


### Findings 1

In the above result, we can see that the most frequent member ids appear 3-5 times. On the other hand, there is one member id that appears 33 times. This is most likely an outlier, because it is unlikely that a lender would give a single user this many loans. Also, since we created the new member id by concatenating several columns and then hashing them, the values of those columns might all have been NULL for a few records, which might have caused this issue. We will confirm this by checking the data for the outlier value.

In [17]:
# Inspect every record behind the outlier member id (the one with 33 occurrences).
# NOTE(review): the prefix e3b0c44298fc1c149... matches the SHA-256 digest of the
# empty string, which is consistent with all hashed columns being NULL for these rows.
spark.sql("SELECT * FROM lending_club_raw_data_new WHERE member_id_sha2 LIKE 'e3b0c44298fc1c149%' ")

id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,issue_d,loan_status,pymnt_plan,url,desc,purpose,title,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,out_prncp,out_prncp_inv,total_pymnt,total_pymnt_inv,total_rec_prncp,total_rec_int,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,next_pymnt_d,last_credit_pull_d,last_fico_range_high,last_fico_range_low,collections_12_mths_ex_med,mths_since_last_major_derog,policy_code,application_type,annual_inc_joint,dti_joint,verification_status_joint,acc_now_delinq,tot_coll_amt,tot_cur_bal,open_acc_6m,open_act_il,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,il_util,open_rv_12m,open_rv_24m,max_bal_bc,all_util,total_rev_hi_lim,inq_fi,total_cu_tl,inq_last_12m,acc_open_past_24mths,avg_cur_bal,bc_open_to_buy,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,revol_bal_joint,sec_app_fico_range_low,sec_app_fico_range_high,sec_app_earliest_cr_line,sec_app_inq_last_6mths,sec_app_mort_acc,sec_app_open_acc,sec_app_revol_util,sec_app_open_act_il,sec_app_num_rev_accts,sec_app_chargeoff_within_12_mths,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog,hardship_flag,hardship_type,hardship_rea
son,hardship_status,deferral_term,hardship_amount,hardship_start_date,hardship_end_date,payment_plan_start_date,hardship_length,hardship_dpd,hardship_loan_status,orig_projected_additional_accrued_interest,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term,member_id_sha2
Total amount fund...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,e3b0c44298fc1c149...
Total amount fund...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,e3b0c44298fc1c149...
Total amount fund...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,e3b0c44298fc1c149...
Total amount fund...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,e3b0c44298fc1c149...
Total amount fund...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,e3b0c44298fc1c149...
Total amount fund...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,e3b0c44298fc1c149...
Total amount fund...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,e3b0c44298fc1c149...
Total amount fund...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,e3b0c44298fc1c149...
Total amount fund...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,e3b0c44298fc1c149...
Total amount fund...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,e3b0c44298fc1c149...


As suspected above, these records have NULL values in all columns apart from id, including every column used to build the member id. In order to tackle this, I will disregard these records in my actual analytical data.

## Generate Customer File

In [22]:
# Create the customers extract from the main view and write it out as CSV.
# The output directory is built from `username` (the same way the warehouse dir is
# configured for the session) instead of hard-coding one account name, so the
# notebook also runs for other users.
customers_sql = """ SELECT
    member_id_sha2 as member_id,
    emp_title,
    emp_length,
    home_ownership,
    annual_inc,
    addr_state,
    zip_code,
    'USA' as country,
    grade,
    sub_grade,
    verification_status,
    tot_hi_cred_lim,
    application_type,
    annual_inc_joint,
    verification_status_joint
FROM lending_club_raw_data_new
"""
(
    spark.sql(customers_sql)
    .repartition(1)  # collapse to a single partition before writing
    .write
    .option("header", True)
    .format("csv")
    .mode("overwrite")  # replace any output left by a previous run
    .option("path", f"/user/{username}/lendingclubproject/raw/customers_data_csv")
    .save()
)

In [23]:
# Load the recently saved customers data back to check it.
# The path is derived from `username` rather than a hard-coded account name.
customers_df = (
    spark.read
    .format("csv")
    .option("header", True)
    .option("inferSchema", True)
    .load(f"/user/{username}/lendingclubproject/raw/customers_data_csv")
)

In [20]:
# Evaluate the reloaded customers DataFrame so the notebook renders it
customers_df

member_id,emp_title,emp_length,home_ownership,annual_inc,addr_state,zip_code,country,grade,sub_grade,verification_status,tot_hi_cred_lim,application_type,annual_inc_joint,verification_status_joint
b59d80da191f5b573...,,,RENT,50000.0,OR,973xx,USA,A,A5,Source Verified,8600.0,Individual,,
202d9f56ecb7c3bc9...,police officer,7 years,OWN,85000.0,TX,799xx,USA,A,A5,Source Verified,272384.0,Individual,,
e5a140c0922b554b9...,community living ...,6 years,RENT,48000.0,NY,146xx,USA,B,B2,Source Verified,85092.0,Individual,,
e12aefc548f750777...,Office,10+ years,OWN,33000.0,CT,067xx,USA,F,F1,Verified,7100.0,Individual,,
1b3a50d854fbbf97e...,Special Tooling I...,10+ years,MORTGAGE,81000.0,TX,791xx,USA,E,E5,Verified,190274.0,Individual,,
1c4329e5f17697127...,Mine ops tech 6,2 years,MORTGAGE,68000.0,AZ,855xx,USA,C,C3,Not Verified,182453.0,Individual,,
5026c86ad983175eb...,caregiver,4 years,RENT,76020.0,WA,993xx,USA,C,C2,Source Verified,15308.0,Individual,,
9847d8c1e9d0b2084...,,,OWN,65000.0,IL,624xx,USA,E,E3,Verified,128800.0,Individual,,
8340dbe1adea41fb4...,Vice President Re...,8 years,MORTGAGE,111000.0,CT,063xx,USA,A,A1,Not Verified,343507.0,Individual,,
d4de0de3ab7d79ad4...,FOREMAN,10+ years,MORTGAGE,67000.0,WA,992xx,USA,G,G2,Verified,211501.0,Individual,,


## Generate Loan File

In [24]:
# Create the loans extract from the main view and write it out as CSV.
# The output directory is built from `username` (the same way the warehouse dir is
# configured for the session) instead of hard-coding one account name, so the
# notebook also runs for other users.
loans_sql = """ SELECT
    id as loan_id,
    member_id_sha2 as member_id,
    loan_amnt,
    funded_amnt,
    term,
    int_rate,
    installment,
    issue_d,
    loan_status,
    purpose,
    title
FROM lending_club_raw_data_new
"""
(
    spark.sql(loans_sql)
    .repartition(1)  # collapse to a single partition before writing
    .write
    .option("header", True)
    .format("csv")
    .mode("overwrite")  # replace any output left by a previous run
    .option("path", f"/user/{username}/lendingclubproject/raw/loans_data_csv")
    .save()
)

In [25]:
# Load the recently saved loans data back to check it.
# The path is derived from `username` rather than a hard-coded account name.
loans_df = (
    spark.read
    .format("csv")
    .option("header", True)
    .option("inferSchema", True)
    .load(f"/user/{username}/lendingclubproject/raw/loans_data_csv")
)

In [26]:
# Evaluate the reloaded loans DataFrame so the notebook renders it
loans_df

loan_id,member_id,loan_amnt,funded_amnt,term,int_rate,installment,issue_d,loan_status,purpose,title
491699,961ae110ec063761f...,7000.0,7000.0,36 months,12.73,234.97,Mar-2010,Fully Paid,debt_consolidation,Noelle's debt con...
491685,291854d244748c911...,15000.0,15000.0,36 months,10.62,488.4,Mar-2010,Fully Paid,other,Education
491667,b5cf66b90193da9d2...,6400.0,6400.0,36 months,7.88,200.2,Mar-2010,Fully Paid,debt_consolidation,Make money easily
491160,7f53fd1acb7759e5b...,4000.0,4000.0,36 months,14.59,137.86,Mar-2010,Fully Paid,wedding,wedding expenses
491675,12edc8e328b993914...,20000.0,20000.0,36 months,13.85,682.08,Mar-2010,Fully Paid,other,Short Term Tax Loan
491668,e18a847f091332ff6...,6000.0,6000.0,36 months,11.36,197.47,Mar-2010,Charged Off,debt_consolidation,GREAT BORROWER --...
491663,d6fe0000e876ab76d...,5500.0,5500.0,36 months,11.36,181.02,Mar-2010,Fully Paid,credit_card,Road to Success
491632,62c38934f3a2ebcd2...,10000.0,10000.0,36 months,15.7,350.11,Mar-2010,Fully Paid,credit_card,Finish Paying off...
491618,7701762ef5d302b0d...,25000.0,25000.0,36 months,15.33,870.71,Mar-2010,Fully Paid,debt_consolidation,Pay off my high i...
491622,42a2df45b02b21b9a...,25000.0,25000.0,36 months,16.07,879.85,Mar-2010,Fully Paid,debt_consolidation,Debt Pay Off


## Generate Loan Repayments File

In [28]:
# Create the loan repayments extract from the main view and write it out as CSV.
# The output directory is built from `username` (the same way the warehouse dir is
# configured for the session) instead of hard-coding one account name, so the
# notebook also runs for other users.
loans_repayments_sql = """ SELECT
    id as loan_id,
    total_rec_prncp,
    total_rec_int,
    total_rec_late_fee,
    total_pymnt,
    last_pymnt_amnt,
    last_pymnt_d,
    next_pymnt_d
FROM lending_club_raw_data_new
"""
(
    spark.sql(loans_repayments_sql)
    .repartition(1)  # collapse to a single partition before writing
    .write
    .option("header", True)
    .format("csv")
    .mode("overwrite")  # replace any output left by a previous run
    .option("path", f"/user/{username}/lendingclubproject/raw/loans_repayments_data_csv")
    .save()
)

In [29]:
# Load the recently saved loan repayments data back to check it.
# The path is derived from `username` rather than a hard-coded account name.
loans_repayment_df = (
    spark.read
    .format("csv")
    .option("header", True)
    .option("inferSchema", True)
    .load(f"/user/{username}/lendingclubproject/raw/loans_repayments_data_csv")
)

In [30]:
# Evaluate the reloaded loan repayments DataFrame so the notebook renders it
loans_repayment_df

loan_id,total_rec_prncp,total_rec_int,total_rec_late_fee,total_pymnt,last_pymnt_amnt,last_pymnt_d,next_pymnt_d
68407277,3600.0,821.72,0.0,4421.723916800001,122.67,Jan-2019,
68355089,24700.0,979.66,0.0,25679.66,926.35,Jun-2016,
68341763,20000.0,2705.92,0.0,22705.9242938784,15813.3,Jun-2017,
66310712,19102.35,12361.66,0.0,31464.01,829.9,Feb-2019,Apr-2019
68476807,10400.0,1340.5,0.0,11740.5,10128.96,Jul-2016,
68426831,11950.0,1758.95,0.0,13708.9485297572,7653.56,May-2017,
68476668,20000.0,1393.8,0.0,21393.800000011,15681.05,Nov-2016,
67275481,20000.0,1538.51,0.0,21538.508976797,14618.23,Jan-2017,
68466926,10000.0,998.97,0.0,10998.9715749644,1814.48,Aug-2018,
68616873,8000.0,939.58,0.0,8939.5805031401,4996.24,Apr-2017,


## Generate Loan Defaulter File

In [31]:
# Create the loan defaulters extract from the main view and write it out as CSV.
# The output directory is built from `username` (the same way the warehouse dir is
# configured for the session) instead of hard-coding one account name, so the
# notebook also runs for other users.
loans_defaulters_sql = """ SELECT
    member_id_sha2 as member_id,
    delinq_2yrs,
    delinq_amnt,
    pub_rec,
    pub_rec_bankruptcies,
    inq_last_6mths,
    total_rec_late_fee,
    mths_since_last_delinq,
    mths_since_last_record 
FROM lending_club_raw_data_new
"""
(
    spark.sql(loans_defaulters_sql)
    .repartition(1)  # collapse to a single partition before writing
    .write
    .option("header", True)
    .format("csv")
    .mode("overwrite")  # replace any output left by a previous run
    .option("path", f"/user/{username}/lendingclubproject/raw/loans_defaulters_data_csv")
    .save()
)

In [32]:
# Load the recently saved loan defaulters data back to check it.
# The path is derived from `username` rather than a hard-coded account name.
loans_defaulters_df = (
    spark.read
    .format("csv")
    .option("header", True)
    .option("inferSchema", True)
    .load(f"/user/{username}/lendingclubproject/raw/loans_defaulters_data_csv")
)

In [33]:
# Evaluate the reloaded loan defaulters DataFrame so the notebook renders it
loans_defaulters_df

member_id,delinq_2yrs,delinq_amnt,pub_rec,pub_rec_bankruptcies,inq_last_6mths,total_rec_late_fee,mths_since_last_delinq,mths_since_last_record
4480925324607267c...,0.0,0.0,0.0,0.0,0.0,0.0,,
b54711d4a553ea330...,0.0,0.0,1.0,0.0,1.0,0.0,,113.0
db06b45a938f1a3b5...,0.0,0.0,0.0,0.0,2.0,0.0,38.0,
ad9d9524477e85c11...,1.0,0.0,0.0,0.0,0.0,0.0,19.0,
c67f6ac3fea6ef46d...,0.0,0.0,1.0,1.0,0.0,0.0,,71.0
bb36e2cb69517fac3...,1.0,0.0,1.0,0.0,2.0,0.0,16.0,107.0
af69a7dff814fb213...,1.0,0.0,0.0,0.0,0.0,0.0,2.0,
c9c794b5025e14a7d...,0.0,0.0,1.0,1.0,0.0,37.8,,56.0
61b48d763bd82369a...,1.0,0.0,1.0,0.0,0.0,0.0,21.0,48.0
adc390ceaa6428ba4...,0.0,0.0,1.0,1.0,1.0,0.0,58.0,115.0


In [104]:
# Stop the Spark session now that all extracts have been written and checked
spark.stop()