# Cleaning loan defaulters data

## Data Cleaning Actions to Take

1. Create dataframe with proper datatypes, and proper column names

2. Insert a new column named ingestion date (current time)

3. Update column delinq_2yrs to int and replace nulls with zeros

4. Store only member_id for those records having public records


In [1]:
# Create spark session
import getpass
from pyspark.sql import SparkSession


# Build (or reuse) a Hive-enabled Spark session on YARN. The warehouse
# directory is derived from the name of the user running the notebook.
username = getpass.getuser()
spark = (
    SparkSession.builder
    .master('yarn')
    .config('spark.shuffle.useOldFetchProtocol', 'true')
    .config('spark.ui.port', '0')
    .config('spark.sql.warehouse.dir', f'/user/{username}/warehouse')
    .enableHiveSupport()
    .getOrCreate()
)

In [2]:
# First pass over the raw CSV: treat the first line as a header and let
# Spark infer the column types so the resulting schema can be inspected.
loan_defaulters_raw_df = (
    spark.read
    .format("csv")
    .options(header=True, inferSchema=True)
    .load("/user/itv008299/lendingclubproject/raw/loans_defaulters_data_csv")
)

## Visualizing some columns in the data

In [3]:
# Leave the frame as the cell's last expression so the notebook renders a preview of it.
loan_defaulters_raw_df

member_id,delinq_2yrs,delinq_amnt,pub_rec,pub_rec_bankruptcies,inq_last_6mths,total_rec_late_fee,mths_since_last_delinq,mths_since_last_record
4480925324607267c...,0.0,0.0,0.0,0.0,0.0,0.0,,
b54711d4a553ea330...,0.0,0.0,1.0,0.0,1.0,0.0,,113.0
db06b45a938f1a3b5...,0.0,0.0,0.0,0.0,2.0,0.0,38.0,
ad9d9524477e85c11...,1.0,0.0,0.0,0.0,0.0,0.0,19.0,
c67f6ac3fea6ef46d...,0.0,0.0,1.0,1.0,0.0,0.0,,71.0
bb36e2cb69517fac3...,1.0,0.0,1.0,0.0,2.0,0.0,16.0,107.0
af69a7dff814fb213...,1.0,0.0,0.0,0.0,0.0,0.0,2.0,
c9c794b5025e14a7d...,0.0,0.0,1.0,1.0,0.0,37.8,,56.0
61b48d763bd82369a...,1.0,0.0,1.0,0.0,0.0,0.0,21.0,48.0
adc390ceaa6428ba4...,0.0,0.0,1.0,1.0,1.0,0.0,58.0,115.0


In [4]:
# Print the inferred schema; the output below shows several numeric-looking
# columns (e.g. delinq_2yrs, pub_rec) inferred as string.
loan_defaulters_raw_df.printSchema()

root
 |-- member_id: string (nullable = true)
 |-- delinq_2yrs: string (nullable = true)
 |-- delinq_amnt: double (nullable = true)
 |-- pub_rec: string (nullable = true)
 |-- pub_rec_bankruptcies: double (nullable = true)
 |-- inq_last_6mths: string (nullable = true)
 |-- total_rec_late_fee: string (nullable = true)
 |-- mths_since_last_delinq: string (nullable = true)
 |-- mths_since_last_record: string (nullable = true)



In [38]:
# Expose the raw frame to Spark SQL under the view name "loan_defaulters".
loan_defaulters_raw_df.createOrReplaceTempView("loan_defaulters")

In [39]:
# List the distinct values of delinq_2yrs; the rendered output contains
# free-text and zip-code-like strings alongside numeric values.
distinct_delinq_2yrs_sql = "select distinct(delinq_2yrs) from loan_defaulters"
spark.sql(distinct_delinq_2yrs_sql)

delinq_2yrs
1.0
271 monthly payme...
I bike to work on...
183xx
VISA and AMEX cre...
etc. and I feel t...
AZ
017xx
923xx
446xx


In [40]:
# Count rows per delinq_2yrs value, most frequent first.
delinq_2yrs_counts_sql = "select delinq_2yrs, count(*) as total from loan_defaulters group by delinq_2yrs order by total desc"
spark.sql(delinq_2yrs_counts_sql)

delinq_2yrs,total
0.0,1838878
1.0,281335
2.0,81285
3.0,29539
4.0,13179
5.0,6599
6.0,3717
7.0,2062
8.0,1223
9.0,818


## 1. Change to suitable datatypes

In [5]:
# Explicit DDL schema for the defaulters CSV: member_id is a string, every
# other column is read as float. The adjacent literals are concatenated by
# Python into one string, so the separators are kept exactly as written
# (including the missing space after "pub_rec_bankruptcies float,").
loan_defaulters_schema = (
    "member_id string, "
    "delinq_2yrs float, "
    "delinq_amnt float, "
    "pub_rec float, "
    "pub_rec_bankruptcies float,"
    "inq_last_6mths float, "
    "total_rec_late_fee float, "
    "mths_since_last_delinq float, "
    "mths_since_last_record float"
)

In [6]:
# Re-read the raw CSV, this time applying the explicit schema instead of
# inferring column types.
# NOTE(review): this path ends in "loans_defaulters_data.csv", whereas the
# inferSchema read earlier uses "loans_defaulters_data_csv" - confirm both
# locations are intended.
loan_default_raw_df = (
    spark.read
    .schema(loan_defaulters_schema)
    .format("csv")
    .option("header", True)
    .load("/user/itv008299/lendingclubproject/raw/loans_defaulters_data.csv")
)

## 2. Insert a new column named as ingestion date (current time )

In [11]:
from pyspark.sql.functions import current_timestamp

# Add an "ingest_date" column holding the current timestamp.
loans_default_ingested_df = loan_default_raw_df.withColumn(
    "ingest_date",
    current_timestamp(),
)

## 3. Update column delinq_2yrs to int and replace nulls with zeros

In [12]:
# Point the "loan_defaulters" view at the typed frame that now carries ingest_date.
loans_default_ingested_df.createOrReplaceTempView("loan_defaulters")

In [8]:
# Print the 40 most frequent delinq_2yrs values; the output includes a null
# bucket and some fractional values.
delinq_2yrs_frequency_sql = "select delinq_2yrs, count(*) as total from loan_defaulters group by delinq_2yrs order by total desc"
spark.sql(delinq_2yrs_frequency_sql).show(40)

+-----------+-------+
|delinq_2yrs|  total|
+-----------+-------+
|        0.0|1838878|
|        1.0| 281335|
|        2.0|  81285|
|        3.0|  29539|
|        4.0|  13179|
|        5.0|   6599|
|        6.0|   3717|
|        7.0|   2062|
|        8.0|   1223|
|        9.0|    818|
|       10.0|    556|
|       11.0|    363|
|       12.0|    264|
|       null|    261|
|       13.0|    165|
|       14.0|    120|
|       15.0|     87|
|       16.0|     55|
|       17.0|     30|
|       18.0|     30|
|       19.0|     23|
|       20.0|     17|
|       21.0|     12|
|       22.0|      5|
|       24.0|      4|
|       26.0|      3|
|       29.0|      2|
|       25.0|      2|
|       23.0|      2|
|       3.44|      2|
|       30.0|      2|
|       3.45|      1|
|      13.76|      1|
|      21.72|      1|
|       6.52|      1|
|      26.24|      1|
|       9.44|      1|
|       5.52|      1|
|      22.95|      1|
|      22.62|      1|
+-----------+-------+
only showing top 40 rows



In [13]:
from pyspark.sql.functions import col

# Cast delinq_2yrs to integer, then replace nulls in that column with 0.
loan_default_filtered_df = (
    loans_default_ingested_df
    .withColumn("delinq_2yrs", col("delinq_2yrs").cast("integer"))
    .na.fill(0, subset=["delinq_2yrs"])
)

## 3. Store only details of people who have defaulted

In [14]:
# Re-register "loan_defaulters" so the queries below see the integer, null-free delinq_2yrs.
loan_default_filtered_df.createOrReplaceTempView("loan_defaulters")

In [16]:
# Keep only rows with at least one delinquency in the last two years and a
# positive mths_since_last_delinq; mths_since_last_delinq is emitted as int.
delinquent_members_sql = "select member_id, delinq_2yrs, delinq_amnt, int(mths_since_last_delinq), ingest_date from loan_defaulters where delinq_2yrs >0 and mths_since_last_delinq > 0"
loans_def_delinq_df = spark.sql(delinquent_members_sql)

In [17]:
# Render a preview of the delinquency subset.
loans_def_delinq_df

member_id,delinq_2yrs,delinq_amnt,mths_since_last_delinq,ingest_date
202d9f56ecb7c3bc9...,1,0.0,6,2023-11-08 17:29:...
1b3a50d854fbbf97e...,1,0.0,21,2023-11-08 17:29:...
9847d8c1e9d0b2084...,1,0.0,6,2023-11-08 17:29:...
6f196952e71277fd4...,4,0.0,5,2023-11-08 17:29:...
9c617cbc6e3e3d6a1...,1,0.0,20,2023-11-08 17:29:...
8ff4d0ed17a1cab92...,1,0.0,15,2023-11-08 17:29:...
aec13cfd611b2cfea...,1,0.0,19,2023-11-08 17:29:...
af7a8a661df3318bd...,2,0.0,7,2023-11-08 17:29:...
176d6002f8f60ea33...,1,1850.0,1,2023-11-08 17:29:...
2a589bfc0ceedd44d...,1,0.0,1,2023-11-08 17:29:...


## 4. Store only member_id for those record having public records

In [18]:
# Collect the member_id of every row that has a public record, a public-record
# bankruptcy, or an inquiry in the last six months.
members_with_records_sql = "select member_id from loan_defaulters where pub_rec > 0.0 or pub_rec_bankruptcies > 0.0 or inq_last_6mths > 0.0"
loans_def_record_delinq_df = spark.sql(members_with_records_sql)

## Write cleaned data into datalake

In [23]:
# Write the delinquency subset as Parquet, overwriting any existing output at the path.
(
    loans_def_delinq_df.write
    .mode("overwrite")
    .format("parquet")
    .option("path", "/user/itv008299/lendingclubproject/cleaned/loan_defaulters_delinq_parquet")
    .save()
)

In [22]:
# Write the delinquency subset as CSV with a header row, overwriting any existing output.
(
    loans_def_delinq_df.write
    .mode("overwrite")
    .format("csv")
    .option("header", True)
    .option("path", "/user/itv008299/lendingclubproject/cleaned/loan_defaulters_delinq_csv")
    .save()
)

In [24]:
# Write the member_id list as Parquet, overwriting any existing output at the path.
(
    loans_def_record_delinq_df.write
    .mode("overwrite")
    .format("parquet")
    .option("path", "/user/itv008299/lendingclubproject/cleaned/loan_defaulters_records_enq_parquet")
    .save()
)

In [25]:
# Write the member_id list as CSV with a header row, overwriting any existing output.
(
    loans_def_record_delinq_df.write
    .mode("overwrite")
    .format("csv")
    .option("header", True)
    .option("path", "/user/itv008299/lendingclubproject/cleaned/loan_defaulters_records_enq_csv")
    .save()
)

## Client Requirement Updated Scenario

Previously, we stored only the member_id for those members who had a social inquiry of any kind. Now, based on further analysis and communication, the client wants us to add extra details regarding the enquiry, as they are seen to be useful for calculating the loan score.

In [26]:
# Same row filter as the member_id-only extract, but keep the three record /
# inquiry columns and ingest_date as well.
record_details_sql = "select member_id, pub_rec, pub_rec_bankruptcies, inq_last_6mths, ingest_date from loan_defaulters where pub_rec > 0.0 or pub_rec_bankruptcies > 0.0 or inq_last_6mths > 0.0"
loans_def_detail_record_delinq_df = spark.sql(record_details_sql)

In [27]:
# Cast pub_rec to integer, then replace nulls in that column with 0.
loans_def_processed_public_record_df = (
    loans_def_detail_record_delinq_df
    .withColumn("pub_rec", col("pub_rec").cast("integer"))
    .na.fill(0, subset=["pub_rec"])
)

In [28]:
# Cast pub_rec_bankruptcies to integer, then replace nulls in that column with 0.
loans_def_processed_public_record_bankruptcies_df = (
    loans_def_processed_public_record_df
    .withColumn("pub_rec_bankruptcies", col("pub_rec_bankruptcies").cast("integer"))
    .na.fill(0, subset=["pub_rec_bankruptcies"])
)

In [29]:
# Cast inq_last_6mths to integer, then replace nulls in that column with 0.
loans_def_processed_public_record_inq6mths_df = (
    loans_def_processed_public_record_bankruptcies_df
    .withColumn("inq_last_6mths", col("inq_last_6mths").cast("integer"))
    .na.fill(0, subset=["inq_last_6mths"])
)

In [30]:
# Register the fully cast detail frame as "loan_defaulters". This replaces the
# view of the same name registered earlier from loan_default_filtered_df, so
# queries from here on see only the filtered rows and the five selected columns.
loans_def_processed_public_record_inq6mths_df.createOrReplaceTempView("loan_defaulters")

In [31]:
# Read the detail columns back from the re-registered view to form the final frame.
final_record_details_sql = "select member_id, pub_rec, pub_rec_bankruptcies, inq_last_6mths, ingest_date from loan_defaulters"
loans_def_detail_record_final_df = spark.sql(final_record_details_sql)

In [32]:
# Write the detailed record/inquiry data as Parquet, overwriting any existing output.
(
    loans_def_detail_record_final_df.write
    .mode("overwrite")
    .format("parquet")
    .option("path", "/user/itv008299/lendingclubproject/cleaned/loan_defaulters_detail_records_enq_parquet")
    .save()
)

In [34]:
# Write the detailed record/inquiry data as CSV with a header row, overwriting any existing output.
(
    loans_def_detail_record_final_df.write
    .mode("overwrite")
    .format("csv")
    .option("header", True)
    .option("path", "/user/itv008299/lendingclubproject/cleaned/loan_defaulters_detail_records_enq_csv")
    .save()
)

In [35]:
# Stop the Spark session; this is the last cell of the notebook.
spark.stop()