# Cleaning loan data

## Data Cleaning Actions to Take

1. Create a DataFrame with proper data types and proper column names

2. Insert a new column, `ingest_date`, holding the ingestion time (current timestamp)

3. Drop rows that have nulls in the key loan columns

4. Convert the monthly loan term to a yearly loan term

5. Map any loan purpose that is not in the lookup list to `other`

In [1]:
# Create spark session
import getpass
from pyspark.sql import SparkSession


# The warehouse directory is derived from the login name of the user running
# the notebook, so each user gets their own /user/<name>/warehouse location.
username = getpass.getuser()

# Build (or reuse) a Hive-enabled session that runs on YARN.
spark = (
    SparkSession.builder
    .config('spark.shuffle.useOldFetchProtocol', 'true')
    .config('spark.ui.port', '0')
    .config('spark.sql.warehouse.dir', f'/user/{username}/warehouse')
    .enableHiveSupport()
    .master('yarn')
    .getOrCreate()
)

## 1. Create dataframe with proper datatypes, and proper column names

In [6]:
# Explicit schema for the raw loans CSV, written as a "name type" list and
# passed to the reader's .schema() call. Monetary and rate columns are read
# as float; loan_term_months and issue_date are kept as strings at this stage.
loan_schema = """loan_id string, member_id string, loan_amount float, 
funded_amount float, loan_term_months string, interest_rate float, 
monthly_installment float, issue_date string, loan_status string, 
loan_purpose string, loan_title string"""

In [7]:
# Load the raw loans CSV (with a header row) using the explicit loan_schema
# instead of letting Spark infer the column types.
loans_raw_df = (
    spark.read
    .format("csv")
    .option("header", True)
    .schema(loan_schema)
    .load("/user/itv008299/lendingclubproject/raw/loans_data_csv")
)

In [8]:
# Preview the raw DataFrame (the notebook renders the cell's last expression).
loans_raw_df

loan_id,member_id,loan_amount,funded_amount,loan_term_months,interest_rate,monthly_installment,issue_date,loan_status,loan_purpose,loan_title
491699,961ae110ec063761f...,7000.0,7000.0,36 months,12.73,234.97,Mar-2010,Fully Paid,debt_consolidation,Noelle's debt con...
491685,291854d244748c911...,15000.0,15000.0,36 months,10.62,488.4,Mar-2010,Fully Paid,other,Education
491667,b5cf66b90193da9d2...,6400.0,6400.0,36 months,7.88,200.2,Mar-2010,Fully Paid,debt_consolidation,Make money easily
491160,7f53fd1acb7759e5b...,4000.0,4000.0,36 months,14.59,137.86,Mar-2010,Fully Paid,wedding,wedding expenses
491675,12edc8e328b993914...,20000.0,20000.0,36 months,13.85,682.08,Mar-2010,Fully Paid,other,Short Term Tax Loan
491668,e18a847f091332ff6...,6000.0,6000.0,36 months,11.36,197.47,Mar-2010,Charged Off,debt_consolidation,GREAT BORROWER --...
491663,d6fe0000e876ab76d...,5500.0,5500.0,36 months,11.36,181.02,Mar-2010,Fully Paid,credit_card,Road to Success
491632,62c38934f3a2ebcd2...,10000.0,10000.0,36 months,15.7,350.11,Mar-2010,Fully Paid,credit_card,Finish Paying off...
491618,7701762ef5d302b0d...,25000.0,25000.0,36 months,15.33,870.71,Mar-2010,Fully Paid,debt_consolidation,Pay off my high i...
491622,42a2df45b02b21b9a...,25000.0,25000.0,36 months,16.07,879.85,Mar-2010,Fully Paid,debt_consolidation,Debt Pay Off


In [9]:
# Confirm that the column names and types from loan_schema were applied.
loans_raw_df.printSchema()

root
 |-- loan_id: string (nullable = true)
 |-- member_id: string (nullable = true)
 |-- loan_amount: float (nullable = true)
 |-- funded_amount: float (nullable = true)
 |-- loan_term_months: string (nullable = true)
 |-- interest_rate: float (nullable = true)
 |-- monthly_installment: float (nullable = true)
 |-- issue_date: string (nullable = true)
 |-- loan_status: string (nullable = true)
 |-- loan_purpose: string (nullable = true)
 |-- loan_title: string (nullable = true)



## 2. Insert a new column named as ingestion date (current time)

In [10]:
from pyspark.sql.functions import current_timestamp

# Stamp every row with the time of ingestion in a new "ingest_date" column.
ingestion_time = current_timestamp()
loans_ingested_df = loans_raw_df.withColumn("ingest_date", ingestion_time)

In [11]:
# Preview the DataFrame with the added ingest_date column.
loans_ingested_df

loan_id,member_id,loan_amount,funded_amount,loan_term_months,interest_rate,monthly_installment,issue_date,loan_status,loan_purpose,loan_title,ingest_date
491699,961ae110ec063761f...,7000.0,7000.0,36 months,12.73,234.97,Mar-2010,Fully Paid,debt_consolidation,Noelle's debt con...,2023-11-08 16:39:...
491685,291854d244748c911...,15000.0,15000.0,36 months,10.62,488.4,Mar-2010,Fully Paid,other,Education,2023-11-08 16:39:...
491667,b5cf66b90193da9d2...,6400.0,6400.0,36 months,7.88,200.2,Mar-2010,Fully Paid,debt_consolidation,Make money easily,2023-11-08 16:39:...
491160,7f53fd1acb7759e5b...,4000.0,4000.0,36 months,14.59,137.86,Mar-2010,Fully Paid,wedding,wedding expenses,2023-11-08 16:39:...
491675,12edc8e328b993914...,20000.0,20000.0,36 months,13.85,682.08,Mar-2010,Fully Paid,other,Short Term Tax Loan,2023-11-08 16:39:...
491668,e18a847f091332ff6...,6000.0,6000.0,36 months,11.36,197.47,Mar-2010,Charged Off,debt_consolidation,GREAT BORROWER --...,2023-11-08 16:39:...
491663,d6fe0000e876ab76d...,5500.0,5500.0,36 months,11.36,181.02,Mar-2010,Fully Paid,credit_card,Road to Success,2023-11-08 16:39:...
491632,62c38934f3a2ebcd2...,10000.0,10000.0,36 months,15.7,350.11,Mar-2010,Fully Paid,credit_card,Finish Paying off...,2023-11-08 16:39:...
491618,7701762ef5d302b0d...,25000.0,25000.0,36 months,15.33,870.71,Mar-2010,Fully Paid,debt_consolidation,Pay off my high i...,2023-11-08 16:39:...
491622,42a2df45b02b21b9a...,25000.0,25000.0,36 months,16.07,879.85,Mar-2010,Fully Paid,debt_consolidation,Debt Pay Off,2023-11-08 16:39:...


## 3. Drop the rows that have null values in the listed columns

We drop these rows because a row with a null value in any of these columns is meaningless for our analysis.

In [12]:
# Register the DataFrame as the temp view "loans" so it can be queried through spark.sql.
loans_ingested_df.createOrReplaceTempView("loans")

In [13]:
# Inspect the rows whose loan_amount is null before deciding what to drop.
spark.sql("select * from loans where loan_amount is null")

loan_id,member_id,loan_amount,funded_amount,loan_term_months,interest_rate,monthly_installment,issue_date,loan_status,loan_purpose,loan_title,ingest_date
Loans that do not...,e3b0c44298fc1c149...,,,,,,,,,,2023-11-08 16:41:...
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2023-11-08 16:41:...
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2023-11-08 16:41:...
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2023-11-08 16:41:...
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2023-11-08 16:41:...
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2023-11-08 16:41:...
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2023-11-08 16:41:...
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2023-11-08 16:41:...
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2023-11-08 16:41:...
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2023-11-08 16:41:...


In [14]:
# Columns that must be non-null for a loan row to be kept.
columns_to_check = [
    "loan_amount",
    "funded_amount",
    "loan_term_months",
    "interest_rate",
    "monthly_installment",
    "issue_date",
    "loan_status",
    "loan_purpose",
]

In [15]:
# Discard every row that has a null in any of the columns listed in columns_to_check.
loans_filtered_df = loans_ingested_df.dropna(subset=columns_to_check)

In [16]:
# Re-point the "loans" temp view at the null-filtered DataFrame.
loans_filtered_df.createOrReplaceTempView("loans")

## 4. Convert loan_term_months (string) to loan_term_years (int)

In [18]:
from pyspark.sql.functions import regexp_replace, col

# Strip the " months" suffix (e.g. "36 months" -> "36") and read the rest as an int.
term_in_months = regexp_replace(col("loan_term_months"), " months", "").cast("int")

# Months -> whole years; the division yields a fractional type, so cast back to int.
term_in_years = (term_in_months / 12).cast("int")

# Overwrite the column in place first, then rename it, so the column keeps its
# original position in the DataFrame.
loan_term_modified_df = (
    loans_filtered_df
    .withColumn("loan_term_months", term_in_years)
    .withColumnRenamed("loan_term_months", "loan_term_years")
)

In [19]:
# Preview the DataFrame after the loan term conversion.
loan_term_modified_df

loan_id,member_id,loan_amount,funded_amount,loan_term_years,interest_rate,monthly_installment,issue_date,loan_status,loan_purpose,loan_title,ingest_date
491699,961ae110ec063761f...,7000.0,7000.0,3,12.73,234.97,Mar-2010,Fully Paid,debt_consolidation,Noelle's debt con...,2023-11-08 16:46:...
491685,291854d244748c911...,15000.0,15000.0,3,10.62,488.4,Mar-2010,Fully Paid,other,Education,2023-11-08 16:46:...
491667,b5cf66b90193da9d2...,6400.0,6400.0,3,7.88,200.2,Mar-2010,Fully Paid,debt_consolidation,Make money easily,2023-11-08 16:46:...
491160,7f53fd1acb7759e5b...,4000.0,4000.0,3,14.59,137.86,Mar-2010,Fully Paid,wedding,wedding expenses,2023-11-08 16:46:...
491675,12edc8e328b993914...,20000.0,20000.0,3,13.85,682.08,Mar-2010,Fully Paid,other,Short Term Tax Loan,2023-11-08 16:46:...
491668,e18a847f091332ff6...,6000.0,6000.0,3,11.36,197.47,Mar-2010,Charged Off,debt_consolidation,GREAT BORROWER --...,2023-11-08 16:46:...
491663,d6fe0000e876ab76d...,5500.0,5500.0,3,11.36,181.02,Mar-2010,Fully Paid,credit_card,Road to Success,2023-11-08 16:46:...
491632,62c38934f3a2ebcd2...,10000.0,10000.0,3,15.7,350.11,Mar-2010,Fully Paid,credit_card,Finish Paying off...,2023-11-08 16:46:...
491618,7701762ef5d302b0d...,25000.0,25000.0,3,15.33,870.71,Mar-2010,Fully Paid,debt_consolidation,Pay off my high i...,2023-11-08 16:46:...
491622,42a2df45b02b21b9a...,25000.0,25000.0,3,16.07,879.85,Mar-2010,Fully Paid,debt_consolidation,Debt Pay Off,2023-11-08 16:46:...


In [20]:
# Check the schema: loan_term_years should now appear as an integer column.
loan_term_modified_df.printSchema()

root
 |-- loan_id: string (nullable = true)
 |-- member_id: string (nullable = true)
 |-- loan_amount: float (nullable = true)
 |-- funded_amount: float (nullable = true)
 |-- loan_term_years: integer (nullable = true)
 |-- interest_rate: float (nullable = true)
 |-- monthly_installment: float (nullable = true)
 |-- issue_date: string (nullable = true)
 |-- loan_status: string (nullable = true)
 |-- loan_purpose: string (nullable = true)
 |-- loan_title: string (nullable = true)
 |-- ingest_date: timestamp (nullable = false)



## 5. Clean the loan_purpose column

In [21]:
# Re-point the "loans" temp view at the DataFrame that carries loan_term_years.
loan_term_modified_df.createOrReplaceTempView("loans")

In [22]:
# List the distinct loan_purpose values to see what the column actually contains
# (real categories as well as any stray free-text values).
spark.sql("select distinct(loan_purpose) from loans")

loan_purpose
"guaranteed!"""
and if they are a...
never had any tro...
Bank of America c...
<br/><br/>Lending...
stocks
please feel free ...
I became his prim...
brakes
on one of the bus...


In [23]:
# Count the records per loan_purpose, most frequent first, so the genuine
# categories stand out from the rarely-occurring noise values.
spark.sql("select loan_purpose, count(*) as total from loans group by loan_purpose order by total desc")

loan_purpose,total
debt_consolidation,1277790
credit_card,516926
home_improvement,150440
other,139413
major_purchase,50429
medical,27481
small_business,24659
car,24009
vacation,15525
moving,15402


#### We can see that there are 14 meaningful loan purposes. All other values are irrelevant free text, so we map them to the `other` category as well

In [25]:
# The loan purposes treated as valid; any value outside this list is remapped later.
loan_purpose_lookup = [
    "debt_consolidation",
    "credit_card",
    "home_improvement",
    "other",
    "major_purchase",
    "medical",
    "small_business",
    "car",
    "vacation",
    "moving",
    "house",
    "wedding",
    "renewable_energy",
    "educational",
]

In [26]:
from pyspark.sql.functions import when, col

# Keep loan_purpose when it is one of the lookup values; collapse everything
# else into the "other" bucket.
purpose = col("loan_purpose")
is_known_purpose = purpose.isin(loan_purpose_lookup)

loan_purpose_modified_df = loan_term_modified_df.withColumn(
    "loan_purpose",
    when(is_known_purpose, purpose).otherwise("other"),
)

In [27]:
# Re-point the "loans" temp view at the DataFrame with the cleaned loan_purpose column.
loan_purpose_modified_df.createOrReplaceTempView("loans")

In [30]:
# Re-run the per-purpose counts on the cleaned view to verify the remapping to "other".
spark.sql("select loan_purpose, count(*) as total from loans group by loan_purpose order by total desc")

loan_purpose,total
debt_consolidation,1277790
credit_card,516926
home_improvement,150440
other,139667
major_purchase,50429
medical,27481
small_business,24659
car,24009
vacation,15525
moving,15402


## Write the cleaned loan df to the datalake

In [32]:
# Write the cleaned loans as Parquet; "overwrite" mode replaces any existing
# output at the target path.
(
    loan_purpose_modified_df.write
    .format("parquet")
    .mode("overwrite")
    .option("path", "/user/itv008299/lendingclubproject/cleaned/loans_parquet")
    .save()
)

In [None]:
# Write the same cleaned loans as CSV with a header row; "overwrite" mode
# replaces any existing output at the target path.
(
    loan_purpose_modified_df.write
    .option("header", True)
    .format("csv")
    .mode("overwrite")
    .option("path", "/user/itv008299/lendingclubproject/cleaned/loans_csv")
    .save()
)