In [2]:
# Create spark session
import getpass
from pyspark.sql import SparkSession


# Account name used to build the per-user Hive warehouse directory.
username = "itv008299"

# Build (or reuse) a Hive-enabled Spark session that runs on YARN.
spark = (
    SparkSession.builder
    .config('spark.shuffle.useOldFetchProtocol', 'true')
    .config('spark.ui.port', '0')
    .config('spark.sql.warehouse.dir', f'/user/{username}/warehouse')
    .enableHiveSupport()
    .master('yarn')
    .getOrCreate()
)

## Requirement 1

Another team wants to use the cleaned data for analysis, so we need to create permanent tables on top of the cleaned data.

In [3]:
# Load the cleaned customers dataset (Parquet) into a DataFrame.
customers_df = spark.read.parquet(
    "/user/itv008299/lendingclubproject/cleaned/customers_parquet"
)

In [4]:
# Show the customers DataFrame as this cell's output.
customers_df

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,join_annual_income,verification_status_joint,ingest_date
ee35f3dc05bfbafbf...,feeder driver,10,MORTGAGE,96000.0,LA,707xx,USA,E,E3,Verified,205464.0,Individual,,,2023-11-08 16:31:...
7d6f6da9248a96e63...,Senior Analyst,5,RENT,75000.0,GA,303xx,USA,B,B2,Source Verified,66737.0,Individual,,,2023-11-08 16:31:...
22e3215f32f3f7b81...,Administrative As...,3,RENT,40000.0,FL,330xx,USA,B,B5,Not Verified,33169.0,Individual,,,2023-11-08 16:31:...
d359a6e5fb7ef7e10...,Restorative aide,10,RENT,30000.0,WI,532xx,USA,C,C5,Not Verified,24233.0,Individual,,,2023-11-08 16:31:...
dbda328611e10a659...,retail,4,MORTGAGE,38000.0,CA,902xx,USA,C,C3,Not Verified,17000.0,Individual,,,2023-11-08 16:31:...
2364303f424375fed...,Network Engineer,5,MORTGAGE,82400.0,CA,906xx,USA,C,C2,Not Verified,473136.0,Individual,,,2023-11-08 16:31:...
3615d2cff2237a960...,Producer,1,RENT,55000.0,CA,900xx,USA,B,B5,Source Verified,37946.0,Individual,,,2023-11-08 16:31:...
bce7f62b189966a78...,Prestige Manager,1,RENT,20400.0,NC,272xx,USA,C,C1,Source Verified,8400.0,Individual,,,2023-11-08 16:31:...
c60c8ff0ffaa0d738...,Commercial Lender,1,MORTGAGE,110000.0,CO,800xx,USA,B,B1,Verified,542500.0,Individual,,,2023-11-08 16:31:...
958be73355adb951a...,SUPERVISOR,10,MORTGAGE,75000.0,PA,195xx,USA,B,B4,Source Verified,153692.0,Individual,,,2023-11-08 16:31:...


In [5]:
# Create the database that holds all of the lending-club tables and views.
# IF NOT EXISTS keeps this cell re-runnable: a plain CREATE DATABASE raises
# an error when the database is already present (e.g. on Restart & Run All).
spark.sql("CREATE DATABASE IF NOT EXISTS itv008299_lending_club")

In [6]:
# Print the column names, types and nullability of the customers data.
customers_df.printSchema()

root
 |-- member_id: string (nullable = true)
 |-- emp_title: string (nullable = true)
 |-- emp_length: integer (nullable = true)
 |-- home_ownership: string (nullable = true)
 |-- annual_income: float (nullable = true)
 |-- address_state: string (nullable = true)
 |-- address_zipcode: string (nullable = true)
 |-- address_country: string (nullable = true)
 |-- grade: string (nullable = true)
 |-- sub_grade: string (nullable = true)
 |-- verification_status: string (nullable = true)
 |-- total_high_credit_limit: float (nullable = true)
 |-- application_type: string (nullable = true)
 |-- join_annual_income: float (nullable = true)
 |-- verification_status_joint: string (nullable = true)
 |-- ingest_date: timestamp (nullable = true)



## 1. Create customers table

In [10]:
# Remove any previous definition of the customers table (no-op if it is absent).
spark.sql("DROP TABLE IF EXISTS itv008299_lending_club.customers")

In [11]:
# DDL for an external Hive table over the cleaned customers Parquet directory.
# NOTE(review): the Parquet schema printed earlier in this notebook names the
# column `join_annual_income`, while this DDL declares `joint_annual_income`.
# If columns are resolved by name, this table column may always read as NULL.
# Confirm with the upstream cleaning job which spelling is intended.
customers_ddl = """
CREATE EXTERNAL TABLE itv008299_lending_club.customers(
    member_id string,
    emp_title string,
    emp_length int,
    home_ownership string,
    annual_income float,
    address_state string,
    address_zipcode string,
    address_country string,
    grade string,
    sub_grade string,
    verification_status string,
    total_high_credit_limit float,
    application_type string,
    joint_annual_income float,
    verification_status_joint string,
    ingest_date timestamp
) stored as parquet
LOCATION '/user/itv008299/lendingclubproject/cleaned/customers_parquet'
"""
spark.sql(customers_ddl)

In [12]:
# Query the external table to check that it returns the customer rows.
spark.sql("select * from itv008299_lending_club.customers")

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,joint_annual_income,verification_status_joint,ingest_date
ee35f3dc05bfbafbf...,feeder driver,10,MORTGAGE,96000.0,LA,707xx,USA,E,E3,Verified,205464.0,Individual,,,2023-11-08 16:31:...
7d6f6da9248a96e63...,Senior Analyst,5,RENT,75000.0,GA,303xx,USA,B,B2,Source Verified,66737.0,Individual,,,2023-11-08 16:31:...
22e3215f32f3f7b81...,Administrative As...,3,RENT,40000.0,FL,330xx,USA,B,B5,Not Verified,33169.0,Individual,,,2023-11-08 16:31:...
d359a6e5fb7ef7e10...,Restorative aide,10,RENT,30000.0,WI,532xx,USA,C,C5,Not Verified,24233.0,Individual,,,2023-11-08 16:31:...
dbda328611e10a659...,retail,4,MORTGAGE,38000.0,CA,902xx,USA,C,C3,Not Verified,17000.0,Individual,,,2023-11-08 16:31:...
2364303f424375fed...,Network Engineer,5,MORTGAGE,82400.0,CA,906xx,USA,C,C2,Not Verified,473136.0,Individual,,,2023-11-08 16:31:...
3615d2cff2237a960...,Producer,1,RENT,55000.0,CA,900xx,USA,B,B5,Source Verified,37946.0,Individual,,,2023-11-08 16:31:...
bce7f62b189966a78...,Prestige Manager,1,RENT,20400.0,NC,272xx,USA,C,C1,Source Verified,8400.0,Individual,,,2023-11-08 16:31:...
c60c8ff0ffaa0d738...,Commercial Lender,1,MORTGAGE,110000.0,CO,800xx,USA,B,B1,Verified,542500.0,Individual,,,2023-11-08 16:31:...
958be73355adb951a...,SUPERVISOR,10,MORTGAGE,75000.0,PA,195xx,USA,B,B4,Source Verified,153692.0,Individual,,,2023-11-08 16:31:...


## 2. Create loans table

In [13]:
# Load the cleaned loans dataset (Parquet) into a DataFrame.
loans_df = spark.read.parquet(
    "/user/itv008299/lendingclubproject/cleaned/loans_parquet"
)

In [14]:
# Print the column names, types and nullability of the loans data.
loans_df.printSchema()

root
 |-- loan_id: string (nullable = true)
 |-- member_id: string (nullable = true)
 |-- loan_amount: float (nullable = true)
 |-- funded_amount: float (nullable = true)
 |-- loan_term_years: integer (nullable = true)
 |-- interest_rate: float (nullable = true)
 |-- monthly_installment: float (nullable = true)
 |-- issue_date: string (nullable = true)
 |-- loan_status: string (nullable = true)
 |-- loan_purpose: string (nullable = true)
 |-- loan_title: string (nullable = true)
 |-- ingest_date: timestamp (nullable = true)



In [15]:
# Register the cleaned loans Parquet directory as an external Hive table.
# The table is dropped first (the same pattern this notebook uses for
# `customers`) so the cell can be re-run: CREATE EXTERNAL TABLE fails when the
# table already exists. Dropping an EXTERNAL table removes only the metastore
# entry; the Parquet files at LOCATION are left in place.
spark.sql("DROP TABLE IF EXISTS itv008299_lending_club.loans")

spark.sql("""
CREATE EXTERNAL TABLE itv008299_lending_club.loans(
    loan_id string,
    member_id string,
    loan_amount float,
    funded_amount float,
    loan_term_years int,
    interest_rate float,
    monthly_installment float,
    issue_date string,
    loan_status string,
    loan_purpose string,
    loan_title string,
    ingest_date timestamp
) stored as parquet
LOCATION '/user/itv008299/lendingclubproject/cleaned/loans_parquet'
""")

## 3. Create loan repayments table

In [16]:
# Load the cleaned loan repayments dataset (Parquet) into a DataFrame.
loan_repayments_df = spark.read.parquet(
    "/user/itv008299/lendingclubproject/cleaned/loans_repayments_parquet"
)

In [17]:
# Print the column names, types and nullability of the loan repayments data.
loan_repayments_df.printSchema()

root
 |-- loan_id: string (nullable = true)
 |-- total_principal_received: float (nullable = true)
 |-- total_interest_received: float (nullable = true)
 |-- total_late_fee_received: float (nullable = true)
 |-- total_payment_received: float (nullable = true)
 |-- last_payment_amount: float (nullable = true)
 |-- last_payment_date: string (nullable = true)
 |-- next_payment_date: string (nullable = true)
 |-- ingest_date: timestamp (nullable = true)



In [18]:
# Register the cleaned loan repayments Parquet directory as an external table.
# Two fixes compared with the previous version of this cell:
#   1. LOCATION now points at `loans_repayments_parquet`, the directory the
#      repayments DataFrame is read from in this notebook. The DDL previously
#      used `loan_repayments_parquet`, a different path, so the table would
#      not have been backed by the files whose schema was inspected.
#      NOTE(review): confirm the directory name written by the cleaning job.
#   2. The table is dropped first so the cell can be re-run. Dropping an
#      EXTERNAL table removes only the metastore entry, not the data files.
spark.sql("DROP TABLE IF EXISTS itv008299_lending_club.loan_repayments")

spark.sql("""
CREATE EXTERNAL TABLE itv008299_lending_club.loan_repayments(
    loan_id string,
    total_principal_received float,
    total_interest_received float,
    total_late_fee_received float,
    total_payment_received float,
    last_payment_amount float,
    last_payment_date string,
    next_payment_date string,
    ingest_date timestamp
) stored as parquet
LOCATION '/user/itv008299/lendingclubproject/cleaned/loans_repayments_parquet'
""")

## 4. Create loan defaulters delinq

In [19]:
# Load the cleaned loan-defaulters delinquency dataset (Parquet).
loan_defaulters_delinq_df = spark.read.parquet(
    "/user/itv008299/lendingclubproject/cleaned/loan_defaulters_delinq_parquet"
)

In [20]:
# Print the column names, types and nullability of the delinquency data.
loan_defaulters_delinq_df.printSchema()

root
 |-- member_id: string (nullable = true)
 |-- delinq_2yrs: integer (nullable = true)
 |-- delinq_amnt: float (nullable = true)
 |-- mths_since_last_delinq: integer (nullable = true)
 |-- ingest_date: timestamp (nullable = true)



In [21]:
# Register the cleaned delinquency Parquet directory as an external table.
# The table is dropped first (the same pattern this notebook uses for
# `customers`) so the cell can be re-run: CREATE EXTERNAL TABLE fails when the
# table already exists. Dropping an EXTERNAL table removes only the metastore
# entry; the Parquet files at LOCATION are left in place.
spark.sql("DROP TABLE IF EXISTS itv008299_lending_club.loan_def_delinq")

spark.sql("""
CREATE EXTERNAL TABLE itv008299_lending_club.loan_def_delinq(
    member_id string,
    delinq_2yrs int,
    delinq_amnt float,
    mths_since_last_delinq int,
    ingest_date timestamp
) stored as parquet
LOCATION '/user/itv008299/lendingclubproject/cleaned/loan_defaulters_delinq_parquet'
""")

## 5. Create loan defaulters detail enquiry table

In [22]:
# Load the cleaned public-record / enquiry dataset (Parquet).
loan_defaulters_detail_records_df = spark.read.parquet(
    "/user/itv008299/lendingclubproject/cleaned/loan_defaulters_detail_records_enq_parquet"
)

In [23]:
# Print the column names, types and nullability of the public-record / enquiry data.
loan_defaulters_detail_records_df.printSchema()

root
 |-- member_id: string (nullable = true)
 |-- pub_rec: integer (nullable = true)
 |-- pub_rec_bankruptcies: integer (nullable = true)
 |-- inq_last_6mths: integer (nullable = true)
 |-- ingest_date: timestamp (nullable = true)



In [25]:
# Register the cleaned public-record / enquiry Parquet directory as an
# external table. The table is dropped first (the same pattern this notebook
# uses for `customers`) so the cell can be re-run: CREATE EXTERNAL TABLE fails
# when the table already exists. Dropping an EXTERNAL table removes only the
# metastore entry; the Parquet files at LOCATION are left in place.
spark.sql("DROP TABLE IF EXISTS itv008299_lending_club.loan_def_detail_record_enq")

spark.sql("""
CREATE EXTERNAL TABLE itv008299_lending_club.loan_def_detail_record_enq(
    member_id string,
    pub_rec int,
    pub_rec_bankruptcies int,
    inq_last_6mths int,
    ingest_date timestamp
) stored as parquet
LOCATION '/user/itv008299/lendingclubproject/cleaned/loan_defaulters_detail_records_enq_parquet'
""")

## Requirement 2

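- Provide a complete picture of these 5 datasets in one single view.
- The data must be as up to date as possible.
- New data arrives every 24 hours, and the underlying tables are refreshed every 24 hours.
- As a solution, we will create a view: it is evaluated at query time, so it always reflects the current contents of the underlying tables.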

In [26]:
# View definition combining the five tables into one row set.
# `customers` is the left-most table; loans, delinquency and enquiry rows are
# attached by member_id, and repayments are attached to loans by loan_id.
# All joins are LEFT joins, so customers without matching rows are kept.
customers_loan_vw_sql = """
CREATE OR REPLACE VIEW itv008299_lending_club.customers_loan_vw AS 
    SELECT
        l.loan_id,
        c.member_id,
        c.emp_title,
        c.emp_length,
        c.home_ownership,
        c.annual_income,
        c.address_state,
        c.address_zipcode,
        c.address_country,
        c.grade,
        c.sub_grade,
        c.verification_status,
        c.total_high_credit_limit,
        c.application_type,
        c.joint_annual_income,
        c.verification_status_joint,
        l.loan_amount,
        l.funded_amount,
        l.loan_term_years,
        l.interest_rate,
        l.monthly_installment,
        l.issue_date,
        l.loan_status,
        l.loan_purpose,
        lr.total_principal_received,
        lr.total_interest_received,
        lr.total_late_fee_received,
        lr.total_payment_received,
        lr.last_payment_amount,
        lr.last_payment_date,
        lr.next_payment_date,
        ldq.delinq_2yrs,
        ldq.delinq_amnt,
        ldq.mths_since_last_delinq,
        lde.pub_rec,
        lde.pub_rec_bankruptcies,
        lde.inq_last_6mths
    FROM itv008299_lending_club.customers c
    LEFT JOIN itv008299_lending_club.loans l 
        ON c.member_id = l.member_id
    LEFT JOIN itv008299_lending_club.loan_repayments lr
        ON l.loan_id = lr.loan_id
    LEFT JOIN itv008299_lending_club.loan_def_delinq ldq
        ON c.member_id = ldq.member_id
    LEFT JOIN itv008299_lending_club.loan_def_detail_record_enq lde
        ON c.member_id = lde.member_id
"""
spark.sql(customers_loan_vw_sql)

## Requirement 3

- Another team asked for very fast access to the data exposed by the view created above.

- Solution 1: a job runs once every 7 days, performs the join of the 5 tables, and stores the result in a table.
Queries against that table are faster, but the data can be up to 7 days old. Since it is a heavy job, we cannot afford to run it daily, so we need to check with the team whether this staleness is acceptable.

In [27]:
# Materialise the five-table join into a table for fast reads.
# This is meant to run as a recurring refresh job, but a bare CREATE TABLE
# fails on the second run because the table already exists. Dropping the
# previous snapshot first makes the cell re-runnable.
# NOTE: the table is briefly absent between the DROP and the end of the CTAS.
spark.sql("DROP TABLE IF EXISTS itv008299_lending_club.customers_loan")

spark.sql("""
CREATE TABLE itv008299_lending_club.customers_loan AS 
    SELECT
        l.loan_id,
        c.member_id,
        c.emp_title,
        c.emp_length,
        c.home_ownership,
        c.annual_income,
        c.address_state,
        c.address_zipcode,
        c.address_country,
        c.grade,
        c.sub_grade,
        c.verification_status,
        c.total_high_credit_limit,
        c.application_type,
        c.joint_annual_income,
        c.verification_status_joint,
        l.loan_amount,
        l.funded_amount,
        l.loan_term_years,
        l.interest_rate,
        l.monthly_installment,
        l.issue_date,
        l.loan_status,
        l.loan_purpose,
        lr.total_principal_received,
        lr.total_interest_received,
        lr.total_late_fee_received,
        lr.total_payment_received,
        lr.last_payment_amount,
        lr.last_payment_date,
        lr.next_payment_date,
        ldq.delinq_2yrs,
        ldq.delinq_amnt,
        ldq.mths_since_last_delinq,
        lde.pub_rec,
        lde.pub_rec_bankruptcies,
        lde.inq_last_6mths
    FROM itv008299_lending_club.customers c
    LEFT JOIN itv008299_lending_club.loans l 
        ON c.member_id = l.member_id
    LEFT JOIN itv008299_lending_club.loan_repayments lr
        ON l.loan_id = lr.loan_id
    LEFT JOIN itv008299_lending_club.loan_def_delinq ldq
        ON c.member_id = ldq.member_id
    LEFT JOIN itv008299_lending_club.loan_def_detail_record_enq lde
        ON c.member_id = lde.member_id
""")

In [13]:
# Read back the materialised customers_loan table.
# NOTE(review): the saved output below lists a `tot_hi_credit_limit` column,
# whereas the CREATE TABLE statement in this notebook selects
# `total_high_credit_limit`. The output appears to come from an earlier run
# (the execution count is also out of sequence); re-run this cell to refresh it.
spark.sql("SELECT * FROM itv008299_lending_club.customers_loan")

loan_id,member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,tot_hi_credit_limit,application_type,joint_annual_income,verification_status_joint,loan_amount,funded_amount,loan_term_years,interest_rate,monthly_installment,issue_date,loan_status,loan_purpose,total_principal_received,total_interest_received,total_late_fee_received,total_payment_received,last_payment_amount,last_payment_date,next_payment_date,delinq_2yrs,delinq_amnt,mths_since_last_delinq,pub_rec,pub_rec_bankruptcies,inq_last_6mths
71175798,000c8875b71a6b47c...,Director of Gvt O...,1,MORTGAGE,100000.0,CA,920xx,USA,A,A4,Not Verified,,Individual,,,3000.0,3000.0,3,7.39,93.17,Feb-2016,Fully Paid,debt_consolidation,3000.0,114.47,0.0,3114.47,2647.45,Aug-2016,,1.0,0.0,13.0,,,
139313300,000fc98fc1ca5faa3...,coord,10,MORTGAGE,220000.0,CA,952xx,USA,A,A3,Not Verified,,Joint App,,,20000.0,20000.0,3,7.21,619.47,Aug-2018,Current,credit_card,3558.76,769.52,0.0,4328.28,619.47,Mar-2019,Apr-2019,,,,,,
143195702,0012728d9f616bdf2...,,6,OWN,35000.0,SC,294xx,USA,A,A5,Not Verified,,Individual,,,9000.0,9000.0,3,8.46,283.95,Dec-2018,Current,debt_consolidation,666.17,181.45,0.0,847.62,283.95,Mar-2019,Apr-2019,,,,,,
2284564,00151ece27c7ca280...,Northglenn Police...,8,RENT,54000.0,CO,802xx,USA,C,C1,Not Verified,,Individual,,,12800.0,12800.0,3,14.33,439.53,Nov-2012,Fully Paid,debt_consolidation,12800.0,2192.68,0.0,14992.676,7529.95,Jun-2014,,,,,0.0,0.0,1.0
3845340,002800d49886390d2...,RehabCare,2,RENT,60000.0,TX,787xx,USA,D,D3,Not Verified,,Individual,,,15000.0,15000.0,3,18.75,547.95,Mar-2013,Charged Off,debt_consolidation,12856.06,4702.54,0.0,17835.81,547.95,Dec-2015,,,,,,,
360493,003715c2aec34dd43...,Apex Technologies...,1,MORTGAGE,90000.0,PA,171xx,USA,C,C3,Not Verified,,Individual,,,12000.0,12000.0,3,12.41,400.93,Oct-2008,Fully Paid,debt_consolidation,12000.0,208.02,0.0,12208.02,1.07,Apr-2009,,,,,0.0,0.0,2.0
95004059,003769d7f54c7859e...,Trainer,10,MORTGAGE,35000.0,NC,282xx,USA,E,E1,Verified,,Individual,,,10000.0,10000.0,5,22.74,280.42,Dec-2016,Charged Off,medical,90.92,176.87,0.0,1492.05,280.42,Jan-2017,,1.0,0.0,20.0,,,
142641777,0037bb910c0a758f5...,Network Engineer,2,RENT,105000.0,IL,601xx,USA,C,C3,Source Verified,,Individual,,,2000.0,2000.0,3,15.02,69.36,Oct-2018,Current,other,227.25,116.21,0.0,343.46,69.36,Mar-2019,Apr-2019,,,,,,
91518424,003d7bee408492f11...,Flight Attendant,10,RENT,75000.0,AZ,852xx,USA,A,A5,Not Verified,,Individual,,,10000.0,10000.0,3,7.99,313.32,Oct-2016,Current,other,7864.12,1213.28,0.0,9077.4,313.32,Mar-2019,Apr-2019,,,,,,
116883875,003e1e6cbd2920bbb...,Operations Superv...,9,MORTGAGE,92000.0,LA,705xx,USA,A,A5,Source Verified,,Individual,,,10000.0,10000.0,3,7.97,313.23,Aug-2017,Current,debt_consolidation,4980.61,970.21,0.0,5950.82,313.23,Mar-2019,Apr-2019,1.0,0.0,23.0,1.0,1.0,0.0


## Requirement 4 

- Calculate the loan score
- Higher the loan score, higher the chances of getting your loan approved, and vice versa
- 3 Major criteria to calculate loan score
-- loan repayment history (last payment, total payment received) - 20% weight
-- loan defaulters history (delinq_2yrs, pub_rec, pub_rec_bankruptcies, inq_last_6mths) - 45% weight
-- financial health data (home_ownership, loan_status, funded amount, grade points) - 35% weight

For example, if a customer's credit limit is 40k and they take only a 2k loan, that is a good sign.


customers - home_ownership, grade, high credit limit

loans - monthly installment, loan_status, funded amount

loan_repayments - last payment, total payment received

loan_def_delinq - delinq_2yrs

loan_def_detail_record_enq - pub_rec, pub_rec_bankruptcies, inq_last_6mths


customers - member id should be unique
loan_def_delinq - member id unique
loan_def_detail_record_enq - member id unique

Bad data should be sent to the upstream team, asking them which record is the valid one.

In [28]:
# Count rows per member_id in customers, most-repeated member_ids first.
customers_rows_per_member_sql = """
SELECT 
    member_id, 
    COUNT(*) total 
FROM itv008299_lending_club.customers 
GROUP BY member_id 
ORDER BY total DESC"""
spark.sql(customers_rows_per_member_sql)

member_id,total
e4c167053d5418230...,5
76b577467eda5bdbc...,4
3f87585a20f702838...,4
ad8e5d384dae17e06...,4
498bb6b1f0099cb47...,3
f54295a60946dedad...,3
d9ce4046daf599732...,3
53789bea7edc660ed...,3
819453be77718d747...,3
4ab6205de571ccb7b...,3


In [29]:
# Inspect every customers row for the member_id with the highest row count.
customers_duplicate_sample_sql = """
SELECT * FROM itv008299_lending_club.customers
WHERE member_id LIKE 'e4c167053d5418230%'
"""
spark.sql(customers_duplicate_sample_sql)

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,joint_annual_income,verification_status_joint,ingest_date
e4c167053d5418230...,,6,MORTGAGE,55000.0,IL,604xx,USA,B,B5,Verified,207300.0,Individual,,,2023-11-08 16:31:...
e4c167053d5418230...,,6,MORTGAGE,55000.0,IL,604xx,USA,B,B5,Verified,171165.0,Individual,,,2023-11-08 16:31:...
e4c167053d5418230...,,6,MORTGAGE,55000.0,IL,604xx,USA,B,B5,Verified,138780.0,Individual,,,2023-11-08 16:31:...
e4c167053d5418230...,,6,MORTGAGE,55000.0,IL,604xx,USA,B,B5,Verified,110907.0,Individual,,,2023-11-08 16:31:...
e4c167053d5418230...,,6,MORTGAGE,55000.0,IL,604xx,USA,B,B5,Verified,129833.0,Individual,,,2023-11-08 16:31:...


### In the data above, a single member has several differing records. As developers we cannot know which record to keep, so we need to compile such records and send them to our clients/upstream teams for further analysis

In [30]:
# Count rows per member_id in loan_def_delinq, most-repeated member_ids first.
delinq_rows_per_member_sql = """
SELECT 
    member_id, 
    COUNT(*) total 
FROM itv008299_lending_club.loan_def_delinq 
GROUP BY member_id 
ORDER BY total DESC"""
spark.sql(delinq_rows_per_member_sql)

member_id,total
e4c167053d5418230...,3
55d55d97420671a1c...,2
4f7579700cd9d49d7...,2
ac9a3da3b89f9228e...,2
694c6cb86608015e9...,2
77db5fdf951dd04b2...,2
343344c3c65b023af...,2
c23eb88fed4893030...,2
6918b9861ba5a4c67...,2
6c2b63ff231e520d4...,2


In [31]:
# Inspect every loan_def_delinq row for one repeated member_id.
delinq_duplicate_sample_sql = """
SELECT * FROM itv008299_lending_club.loan_def_delinq
WHERE member_id LIKE 'e4c167053d5418230%'
"""
spark.sql(delinq_duplicate_sample_sql)

member_id,delinq_2yrs,delinq_amnt,mths_since_last_delinq,ingest_date
e4c167053d5418230...,1,0.0,37,2023-11-08 17:32:...
e4c167053d5418230...,3,0.0,13,2023-11-08 17:32:...
e4c167053d5418230...,1,0.0,16,2023-11-08 17:32:...


In [32]:
# Count rows per member_id in loan_def_detail_record_enq, most-repeated first.
detail_enq_rows_per_member_sql = """
SELECT 
    member_id, 
    COUNT(*) total 
FROM itv008299_lending_club.loan_def_detail_record_enq 
GROUP BY member_id 
ORDER BY total DESC"""
spark.sql(detail_enq_rows_per_member_sql)

member_id,total
c80b3e1938d2f7fad...,3
092e68fe2b907d655...,3
a3be755503ee43a84...,3
c0c09bafdbf0655d8...,3
d93573e2883e37904...,3
ee2d0dd6ad9e048b8...,3
5d47bf75f686431f9...,3
1f392553105eebc55...,3
170b46bb6a9d00bb1...,3
e4c167053d5418230...,3


In [33]:
# Inspect every loan_def_detail_record_enq row for one repeated member_id.
detail_enq_duplicate_sample_sql = """
SELECT * FROM itv008299_lending_club.loan_def_detail_record_enq
WHERE member_id LIKE 'c80b3e1938d2f7fad%'
"""
spark.sql(detail_enq_duplicate_sample_sql)

member_id,pub_rec,pub_rec_bankruptcies,inq_last_6mths,ingest_date
c80b3e1938d2f7fad...,1,1,0,2023-11-08 17:35:...
c80b3e1938d2f7fad...,0,0,1,2023-11-08 17:35:...
c80b3e1938d2f7fad...,1,1,0,2023-11-08 17:35:...


In [34]:
# member_ids that appear on more than one row of the customers table.
duplicate_customer_ids_sql = """
select member_id from(
    select 
        member_id, count(*) as total 
    from itv008299_lending_club.customers
    group by member_id 
    having total > 1
)
"""
bad_data_customers_df = spark.sql(duplicate_customer_ids_sql)

In [35]:
# Number of member_ids that occur more than once in customers.
bad_data_customers_df.count()

3157

In [36]:
# member_ids that appear on more than one row of the loan_def_delinq table.
duplicate_delinq_ids_sql = """
select member_id from(
    select 
        member_id, count(*) as total 
    from itv008299_lending_club.loan_def_delinq
    group by member_id 
    having total > 1
)
"""
bad_data_loan_def_delinq_df = spark.sql(duplicate_delinq_ids_sql)

In [37]:
# Number of member_ids that occur more than once in loan_def_delinq.
bad_data_loan_def_delinq_df.count()

144

In [38]:
# member_ids that appear on more than one row of loan_def_detail_record_enq.
duplicate_detail_enq_ids_sql = """
select member_id from(
    select 
        member_id, count(*) as total 
    from itv008299_lending_club.loan_def_detail_record_enq
    group by member_id 
    having total > 1
)
"""
bad_data_loan_def_detail_enq_df = spark.sql(duplicate_detail_enq_ids_sql)

In [39]:
# Number of member_ids that occur more than once in loan_def_detail_record_enq.
bad_data_loan_def_detail_enq_df.count()

848

Store all bad records in a folder to review later on

In [40]:
# Save the repeated customer member_ids as headered CSV from a single
# partition, replacing any previous output at this path.
(
    bad_data_customers_df
    .repartition(1)
    .write
    .format("csv")
    .mode("overwrite")
    .option("header", True)
    .save("/user/itv008299/lendingclubproject/bad_data/bad_data_customers")
)

In [41]:
# Save the repeated delinquency member_ids as headered CSV from a single
# partition, replacing any previous output at this path.
(
    bad_data_loan_def_delinq_df
    .repartition(1)
    .write
    .format("csv")
    .mode("overwrite")
    .option("header", True)
    .save("/user/itv008299/lendingclubproject/bad_data/bad_data_loan_def_delinq")
)

In [42]:
# Save the repeated public-record / enquiry member_ids as headered CSV from a
# single partition, replacing any previous output at this path.
(
    bad_data_loan_def_detail_enq_df
    .repartition(1)
    .write
    .format("csv")
    .mode("overwrite")
    .option("header", True)
    .save("/user/itv008299/lendingclubproject/bad_data/bad_data_loan_def_detail_enq")
)

### Create one consolidated file containing the unique member ids from the three lists above

In [43]:
# Stack the member_id columns of the three bad-data lists.
# De-duplication is done separately with distinct() on the result.
customer_bad_ids = bad_data_customers_df.select("member_id")
delinq_bad_ids = bad_data_loan_def_delinq_df.select("member_id")
detail_enq_bad_ids = bad_data_loan_def_detail_enq_df.select("member_id")

bad_data_distinct_customers_df = (
    customer_bad_ids
    .union(delinq_bad_ids)
    .union(detail_enq_bad_ids)
)

In [44]:
# Collapse the stacked member_ids so that each one appears only once.
bad_data_distinct_final_df = bad_data_distinct_customers_df.distinct()

In [45]:
# Save the consolidated list of bad member_ids as headered CSV,
# replacing any previous output at this path.
(
    bad_data_distinct_final_df
    .write
    .format("csv")
    .mode("overwrite")
    .option("header", True)
    .save("/user/itv008299/lendingclubproject/bad_data/bad_customer_data_final")
)

In [46]:
# Expose the consolidated bad member_ids to Spark SQL as the temp view `bad_data_customers`.
bad_data_distinct_final_df.createOrReplaceTempView("bad_data_customers")

## For our future analysis

We will remove records of all those customers who fall under bad data category for now

In [47]:
# Keep only the customers whose member_id is not on the bad-data list.
# A LEFT ANTI JOIN replaces the former `NOT IN (subquery)`: with NOT IN, a
# single NULL member_id in `bad_data_customers` makes the predicate unknown
# for every row, so the result would silently be empty. The null-safe `<=>`
# comparison also removes NULL member_ids when NULL is itself on the list.
# Behaviour change: a NULL member_id that is not on the list is now kept,
# whereas NOT IN dropped it.
# NOTE: this rebinds `customers_df`, which earlier held the raw Parquet read.
customers_df = spark.sql("""
    select c.*
    from itv008299_lending_club.customers c
    left anti join bad_data_customers b
        on c.member_id <=> b.member_id
""")

In [48]:
# Write the filtered customers as Parquet to the cleaned_new area,
# replacing any previous output at this path.
(
    customers_df
    .write
    .format("parquet")
    .mode("overwrite")
    .save("/user/itv008299/lendingclubproject/cleaned_new/customers_parquet")
)

In [49]:
# Keep only the delinquency rows whose member_id is not on the bad-data list.
# A LEFT ANTI JOIN replaces the former `NOT IN (subquery)`: with NOT IN, a
# single NULL member_id in `bad_data_customers` makes the predicate unknown
# for every row, so the result would silently be empty. The null-safe `<=>`
# comparison also removes NULL member_ids when NULL is itself on the list.
# Behaviour change: a NULL member_id that is not on the list is now kept,
# whereas NOT IN dropped it.
loan_def_delinq_df = spark.sql("""
    select d.*
    from itv008299_lending_club.loan_def_delinq d
    left anti join bad_data_customers b
        on d.member_id <=> b.member_id
""")

In [50]:
# Write the filtered delinquency rows as Parquet to the cleaned_new area,
# replacing any previous output at this path.
(
    loan_def_delinq_df
    .write
    .format("parquet")
    .mode("overwrite")
    .save("/user/itv008299/lendingclubproject/cleaned_new/loan_def_delinq_parquet")
)

In [51]:
# Keep only the public-record / enquiry rows whose member_id is not on the
# bad-data list.
# A LEFT ANTI JOIN replaces the former `NOT IN (subquery)`: with NOT IN, a
# single NULL member_id in `bad_data_customers` makes the predicate unknown
# for every row, so the result would silently be empty. The null-safe `<=>`
# comparison also removes NULL member_ids when NULL is itself on the list.
# Behaviour change: a NULL member_id that is not on the list is now kept,
# whereas NOT IN dropped it.
loan_def_detail_record_enq_df = spark.sql("""
    select e.*
    from itv008299_lending_club.loan_def_detail_record_enq e
    left anti join bad_data_customers b
        on e.member_id <=> b.member_id
""")

In [52]:
# Write the filtered public-record / enquiry rows as Parquet to the
# cleaned_new area, replacing any previous output at this path.
(
    loan_def_detail_record_enq_df
    .write
    .format("parquet")
    .mode("overwrite")
    .save("/user/itv008299/lendingclubproject/cleaned_new/loan_def_detail_record_enq_parquet")
)

In [53]:
# Register the filtered customers Parquet directory as an external table.
# The table is dropped first (the same pattern this notebook uses for
# `customers`) so the cell can be re-run: CREATE EXTERNAL TABLE fails when the
# table already exists. Dropping an EXTERNAL table removes only the metastore
# entry; the Parquet files at LOCATION are left in place.
spark.sql("DROP TABLE IF EXISTS itv008299_lending_club.customers_new")

spark.sql("""
CREATE EXTERNAL TABLE itv008299_lending_club.customers_new(
    member_id string,
    emp_title string,
    emp_length int,
    home_ownership string,
    annual_income float,
    address_state string,
    address_zipcode string,
    address_country string,
    grade string,
    sub_grade string,
    verification_status string,
    total_high_credit_limit float,
    application_type string,
    joint_annual_income float,
    verification_status_joint string,
    ingest_date timestamp
) stored as parquet
LOCATION '/user/itv008299/lendingclubproject/cleaned_new/customers_parquet'
""")

In [54]:
# Register the filtered delinquency Parquet directory as an external table.
# The table is dropped first (the same pattern this notebook uses for
# `customers`) so the cell can be re-run: CREATE EXTERNAL TABLE fails when the
# table already exists. Dropping an EXTERNAL table removes only the metastore
# entry; the Parquet files at LOCATION are left in place.
spark.sql("DROP TABLE IF EXISTS itv008299_lending_club.loan_def_delinq_new")

spark.sql("""
CREATE EXTERNAL TABLE itv008299_lending_club.loan_def_delinq_new(
    member_id string,
    delinq_2yrs int,
    delinq_amnt float,
    mths_since_last_delinq int,
    ingest_date timestamp
) stored as parquet
LOCATION '/user/itv008299/lendingclubproject/cleaned_new/loan_def_delinq_parquet'
""")

In [55]:
# Register the filtered public-record / enquiry Parquet directory as an
# external table. The table is dropped first (the same pattern this notebook
# uses for `customers`) so the cell can be re-run: CREATE EXTERNAL TABLE fails
# when the table already exists. Dropping an EXTERNAL table removes only the
# metastore entry; the Parquet files at LOCATION are left in place.
spark.sql("DROP TABLE IF EXISTS itv008299_lending_club.loan_def_detail_record_enq_new")

spark.sql("""
CREATE EXTERNAL TABLE itv008299_lending_club.loan_def_detail_record_enq_new(
    member_id string,
    pub_rec int,
    pub_rec_bankruptcies int,
    inq_last_6mths int,
    ingest_date timestamp
) stored as parquet
LOCATION '/user/itv008299/lendingclubproject/cleaned_new/loan_def_detail_record_enq_parquet'
""")

In [56]:
# Stop the Spark session at the end of the notebook.
spark.stop()