# Lending Club Project Loans Data Cleaning

In [53]:
from pyspark.sql import SparkSession
import getpass
# The current OS user name decides where the Hive warehouse directory lives.
username = getpass.getuser()

# Build (or reuse) a Hive-enabled Spark session that runs on YARN.
spark = (
    SparkSession.builder
    .master("yarn")
    .config("spark.ui.port", "0")
    .config("spark.shuffle.useOldFetchProtocol", "true")
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .getOrCreate()
)

In [54]:
# Exploratory load of the raw loans CSV: the first line is treated as the
# header and Spark infers the column types from the data.
# The location is derived from `username` (as the warehouse dir is) instead of
# a hard-coded account name, so the notebook also runs for other users.
# NOTE(review): assumes the raw data sits under the current user's
# /user/<username>/lendingclubproject/raw directory - confirm.
loans_raw_df = (
    spark.read
    .format("csv")
    .option("header", True)
    .option("inferSchema", True)
    .load(f"/user/{username}/lendingclubproject/raw/loans_data_csv")
)

In [5]:
# Preview the frame loaded with the inferred schema (original CSV column names).
loans_raw_df

loan_id,member_id,loan_amnt,funded_amnt,term,int_rate,installment,issue_d,loan_status,purpose,title
76003861,a4ec00ba67fadf2fe...,24000.0,24000.0,60 months,15.31,574.88,Apr-2016,Charged Off,debt_consolidation,Debt consolidation
76263914,4f7a9e6ffaacd5da2...,2400.0,2400.0,36 months,11.47,79.11,Apr-2016,Fully Paid,debt_consolidation,Debt consolidation
75537401,e935a4c27fc78ae61...,12600.0,12600.0,36 months,7.39,391.31,Apr-2016,Fully Paid,other,Other
75038986,2d32004bd5e1dc3c3...,16800.0,16800.0,60 months,19.53,440.72,Apr-2016,Current,credit_card,Credit card refin...
76301424,f7116b7f7546a7952...,4300.0,4300.0,36 months,17.27,153.89,Apr-2016,Charged Off,debt_consolidation,Debt consolidation
75333198,d3aa3a3c95eca5631...,8950.0,8950.0,36 months,22.45,343.9,Apr-2016,Current,credit_card,Credit card refin...
76391453,fc8a2e046eaaba02d...,35000.0,35000.0,60 months,12.99,796.18,Apr-2016,Fully Paid,debt_consolidation,Debt consolidation
76363364,577ae670ac2ec7ed3...,15000.0,15000.0,36 months,9.16,478.12,Apr-2016,Fully Paid,house,Home buying
76272510,d3792868819649ba9...,30000.0,30000.0,60 months,16.29,734.18,Apr-2016,Current,debt_consolidation,Debt consolidation
76304116,6d3a5a422261348b3...,4800.0,4800.0,36 months,19.99,178.37,Apr-2016,Fully Paid,credit_card,Credit card refin...


## Defining Schema along with desired column names

In [55]:
# (column name, Spark SQL type) pairs, listed in the order the columns appear
# in the CSV. These names replace the raw header names (e.g. loan_amnt ->
# loan_amount, term -> loan_term_months) when the file is read with this schema.
_loans_schema_columns = [
    ("loan_id", "string"),
    ("member_id", "string"),
    ("loan_amount", "float"),
    ("funded_amount", "float"),
    ("loan_term_months", "string"),
    ("interest_rate", "float"),
    ("monthly_installment", "float"),
    ("issue_date", "string"),
    ("loan_status", "string"),
    ("loan_purpose", "string"),
    ("loan_title", "string"),
]

# DDL-formatted schema string ("name type,name type,...") for spark.read.schema().
loans_schema = ",".join(
    f"{column_name} {column_type}"
    for column_name, column_type in _loans_schema_columns
)

In [56]:
# Reload the raw loans CSV, this time enforcing `loans_schema` so the columns
# get the desired names and types instead of inferred ones.
# The location is derived from `username` (as the warehouse dir is) instead of
# a hard-coded account name, so the notebook also runs for other users.
# NOTE(review): assumes the raw data sits under the current user's
# /user/<username>/lendingclubproject/raw directory - confirm.
loans_raw_df = (
    spark.read
    .format("csv")
    .option("header", True)
    .schema(loans_schema)
    .load(f"/user/{username}/lendingclubproject/raw/loans_data_csv")
)

In [8]:
# Preview the frame loaded with the explicit schema (renamed columns).
loans_raw_df

loan_id,member_id,loan_amount,funded_amount,loan_term_months,interest_rate,monthly_installment,issue_date,loan_status,loan_purpose,loan_title
76003861,a4ec00ba67fadf2fe...,24000.0,24000.0,60 months,15.31,574.88,Apr-2016,Charged Off,debt_consolidation,Debt consolidation
76263914,4f7a9e6ffaacd5da2...,2400.0,2400.0,36 months,11.47,79.11,Apr-2016,Fully Paid,debt_consolidation,Debt consolidation
75537401,e935a4c27fc78ae61...,12600.0,12600.0,36 months,7.39,391.31,Apr-2016,Fully Paid,other,Other
75038986,2d32004bd5e1dc3c3...,16800.0,16800.0,60 months,19.53,440.72,Apr-2016,Current,credit_card,Credit card refin...
76301424,f7116b7f7546a7952...,4300.0,4300.0,36 months,17.27,153.89,Apr-2016,Charged Off,debt_consolidation,Debt consolidation
75333198,d3aa3a3c95eca5631...,8950.0,8950.0,36 months,22.45,343.9,Apr-2016,Current,credit_card,Credit card refin...
76391453,fc8a2e046eaaba02d...,35000.0,35000.0,60 months,12.99,796.18,Apr-2016,Fully Paid,debt_consolidation,Debt consolidation
76363364,577ae670ac2ec7ed3...,15000.0,15000.0,36 months,9.16,478.12,Apr-2016,Fully Paid,house,Home buying
76272510,d3792868819649ba9...,30000.0,30000.0,60 months,16.29,734.18,Apr-2016,Current,debt_consolidation,Debt consolidation
76304116,6d3a5a422261348b3...,4800.0,4800.0,36 months,19.99,178.37,Apr-2016,Fully Paid,credit_card,Credit card refin...


In [10]:
# Print column names, types and nullability to confirm the schema was applied.
loans_raw_df.printSchema()

root
 |-- loan_id: string (nullable = true)
 |-- member_id: string (nullable = true)
 |-- loan_amount: float (nullable = true)
 |-- funded_amount: float (nullable = true)
 |-- loan_term_months: string (nullable = true)
 |-- interest_rate: float (nullable = true)
 |-- monthly_installment: float (nullable = true)
 |-- issue_date: string (nullable = true)
 |-- loan_status: string (nullable = true)
 |-- loan_purpose: string (nullable = true)
 |-- loan_title: string (nullable = true)



In [57]:
from pyspark.sql.functions import current_timestamp

In [58]:
# Keep every raw column and append an `ingest_date` timestamp column.
# NOTE: the timestamp is computed when an action runs, not when this cell runs;
# the previews in this notebook show different ingest_date values.
loans_df_ingestd = loans_raw_df.select(
    "*",
    current_timestamp().alias("ingest_date"),
)

In [15]:
# Preview the frame with the added ingest_date column.
loans_df_ingestd

loan_id,member_id,loan_amount,funded_amount,loan_term_months,interest_rate,monthly_installment,issue_date,loan_status,loan_purpose,loan_title,ingest_date
76003861,a4ec00ba67fadf2fe...,24000.0,24000.0,60 months,15.31,574.88,Apr-2016,Charged Off,debt_consolidation,Debt consolidation,2023-11-05 09:23:...
76263914,4f7a9e6ffaacd5da2...,2400.0,2400.0,36 months,11.47,79.11,Apr-2016,Fully Paid,debt_consolidation,Debt consolidation,2023-11-05 09:23:...
75537401,e935a4c27fc78ae61...,12600.0,12600.0,36 months,7.39,391.31,Apr-2016,Fully Paid,other,Other,2023-11-05 09:23:...
75038986,2d32004bd5e1dc3c3...,16800.0,16800.0,60 months,19.53,440.72,Apr-2016,Current,credit_card,Credit card refin...,2023-11-05 09:23:...
76301424,f7116b7f7546a7952...,4300.0,4300.0,36 months,17.27,153.89,Apr-2016,Charged Off,debt_consolidation,Debt consolidation,2023-11-05 09:23:...
75333198,d3aa3a3c95eca5631...,8950.0,8950.0,36 months,22.45,343.9,Apr-2016,Current,credit_card,Credit card refin...,2023-11-05 09:23:...
76391453,fc8a2e046eaaba02d...,35000.0,35000.0,60 months,12.99,796.18,Apr-2016,Fully Paid,debt_consolidation,Debt consolidation,2023-11-05 09:23:...
76363364,577ae670ac2ec7ed3...,15000.0,15000.0,36 months,9.16,478.12,Apr-2016,Fully Paid,house,Home buying,2023-11-05 09:23:...
76272510,d3792868819649ba9...,30000.0,30000.0,60 months,16.29,734.18,Apr-2016,Current,debt_consolidation,Debt consolidation,2023-11-05 09:23:...
76304116,6d3a5a422261348b3...,4800.0,4800.0,36 months,19.99,178.37,Apr-2016,Fully Paid,credit_card,Credit card refin...,2023-11-05 09:23:...


In [59]:
# Expose the ingested frame to Spark SQL as the temp view "loans".
loans_df_ingestd.createOrReplaceTempView("loans")

In [46]:
# Count the rows currently exposed by the "loans" temp view.
spark.sql("select count(*) from loans")

count(1)
2260701


### Drop every row in which any of the following columns is null: columns_to_check = ["loan_amount", "funded_amount", "loan_term_months","interest_rate","monthly_installment","issue_date","loan_status","loan_purpose"]

In [60]:
# Show the rows of the "loans" view whose loan_amount is null.
spark.sql("select * from loans where loan_amount is null")

loan_id,member_id,loan_amount,funded_amount,loan_term_months,interest_rate,monthly_installment,issue_date,loan_status,loan_purpose,loan_title,ingest_date
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2023-11-05 10:01:...
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2023-11-05 10:01:...
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2023-11-05 10:01:...
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2023-11-05 10:01:...
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2023-11-05 10:01:...
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2023-11-05 10:01:...
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2023-11-05 10:01:...
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2023-11-05 10:01:...
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2023-11-05 10:01:...
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2023-11-05 10:01:...


In [63]:
# Columns that must all be non-null for a loan row to be kept.
columns_to_check = [
    "loan_amount",
    "funded_amount",
    "loan_term_months",
    "interest_rate",
    "monthly_installment",
    "issue_date",
    "loan_status",
    "loan_purpose",
]

In [64]:
# Discard every row that has a null in at least one of the required columns.
loans_filtered_df = loans_df_ingestd.dropna(
    how="any",
    subset=columns_to_check,
)

In [28]:
# Preview the frame after rows with nulls in the checked columns were dropped.
loans_filtered_df

loan_id,member_id,loan_amount,funded_amount,loan_term_months,interest_rate,monthly_installment,issue_date,loan_status,loan_purpose,loan_title,ingest_date
76003861,a4ec00ba67fadf2fe...,24000.0,24000.0,60 months,15.31,574.88,Apr-2016,Charged Off,debt_consolidation,Debt consolidation,2023-11-05 09:40:...
76263914,4f7a9e6ffaacd5da2...,2400.0,2400.0,36 months,11.47,79.11,Apr-2016,Fully Paid,debt_consolidation,Debt consolidation,2023-11-05 09:40:...
75537401,e935a4c27fc78ae61...,12600.0,12600.0,36 months,7.39,391.31,Apr-2016,Fully Paid,other,Other,2023-11-05 09:40:...
75038986,2d32004bd5e1dc3c3...,16800.0,16800.0,60 months,19.53,440.72,Apr-2016,Current,credit_card,Credit card refin...,2023-11-05 09:40:...
76301424,f7116b7f7546a7952...,4300.0,4300.0,36 months,17.27,153.89,Apr-2016,Charged Off,debt_consolidation,Debt consolidation,2023-11-05 09:40:...
75333198,d3aa3a3c95eca5631...,8950.0,8950.0,36 months,22.45,343.9,Apr-2016,Current,credit_card,Credit card refin...,2023-11-05 09:40:...
76391453,fc8a2e046eaaba02d...,35000.0,35000.0,60 months,12.99,796.18,Apr-2016,Fully Paid,debt_consolidation,Debt consolidation,2023-11-05 09:40:...
76363364,577ae670ac2ec7ed3...,15000.0,15000.0,36 months,9.16,478.12,Apr-2016,Fully Paid,house,Home buying,2023-11-05 09:40:...
76272510,d3792868819649ba9...,30000.0,30000.0,60 months,16.29,734.18,Apr-2016,Current,debt_consolidation,Debt consolidation,2023-11-05 09:40:...
76304116,6d3a5a422261348b3...,4800.0,4800.0,36 months,19.99,178.37,Apr-2016,Fully Paid,credit_card,Credit card refin...,2023-11-05 09:40:...


In [29]:
# Number of rows remaining after the null-drop.
loans_filtered_df.count()

2260667

In [65]:
# Re-point the "loans" temp view at the null-filtered frame.
loans_filtered_df.createOrReplaceTempView('loans')

In [31]:
# Preview the current contents of the "loans" temp view.
spark.sql("select * from loans")

loan_id,member_id,loan_amount,funded_amount,loan_term_months,interest_rate,monthly_installment,issue_date,loan_status,loan_purpose,loan_title,ingest_date
76003861,a4ec00ba67fadf2fe...,24000.0,24000.0,60 months,15.31,574.88,Apr-2016,Charged Off,debt_consolidation,Debt consolidation,2023-11-05 09:45:...
76263914,4f7a9e6ffaacd5da2...,2400.0,2400.0,36 months,11.47,79.11,Apr-2016,Fully Paid,debt_consolidation,Debt consolidation,2023-11-05 09:45:...
75537401,e935a4c27fc78ae61...,12600.0,12600.0,36 months,7.39,391.31,Apr-2016,Fully Paid,other,Other,2023-11-05 09:45:...
75038986,2d32004bd5e1dc3c3...,16800.0,16800.0,60 months,19.53,440.72,Apr-2016,Current,credit_card,Credit card refin...,2023-11-05 09:45:...
76301424,f7116b7f7546a7952...,4300.0,4300.0,36 months,17.27,153.89,Apr-2016,Charged Off,debt_consolidation,Debt consolidation,2023-11-05 09:45:...
75333198,d3aa3a3c95eca5631...,8950.0,8950.0,36 months,22.45,343.9,Apr-2016,Current,credit_card,Credit card refin...,2023-11-05 09:45:...
76391453,fc8a2e046eaaba02d...,35000.0,35000.0,60 months,12.99,796.18,Apr-2016,Fully Paid,debt_consolidation,Debt consolidation,2023-11-05 09:45:...
76363364,577ae670ac2ec7ed3...,15000.0,15000.0,36 months,9.16,478.12,Apr-2016,Fully Paid,house,Home buying,2023-11-05 09:45:...
76272510,d3792868819649ba9...,30000.0,30000.0,60 months,16.29,734.18,Apr-2016,Current,debt_consolidation,Debt consolidation,2023-11-05 09:45:...
76304116,6d3a5a422261348b3...,4800.0,4800.0,36 months,19.99,178.37,Apr-2016,Fully Paid,credit_card,Credit card refin...,2023-11-05 09:45:...


In [66]:
from pyspark.sql.functions import regexp_replace, col

In [67]:
# Remove the " months" suffix and read what is left as an integer month count.
term_in_months = regexp_replace(col("loan_term_months"), " months", "").cast("int")

# Convert months to years; the cast to int drops any fractional part.
term_in_years = (term_in_months / 12).cast("int")

# Overwrite the column in place (keeps its position), then rename it to match
# the new unit.
loans_term_modified_df = (
    loans_filtered_df
    .withColumn("loan_term_months", term_in_years)
    .withColumnRenamed("loan_term_months", "loan_term_years")
)

In [68]:
# Preview the frame with the term expressed in years (loan_term_years).
loans_term_modified_df

loan_id,member_id,loan_amount,funded_amount,loan_term_years,interest_rate,monthly_installment,issue_date,loan_status,loan_purpose,loan_title,ingest_date
76003861,a4ec00ba67fadf2fe...,24000.0,24000.0,5,15.31,574.88,Apr-2016,Charged Off,debt_consolidation,Debt consolidation,2023-11-05 10:01:...
76263914,4f7a9e6ffaacd5da2...,2400.0,2400.0,3,11.47,79.11,Apr-2016,Fully Paid,debt_consolidation,Debt consolidation,2023-11-05 10:01:...
75537401,e935a4c27fc78ae61...,12600.0,12600.0,3,7.39,391.31,Apr-2016,Fully Paid,other,Other,2023-11-05 10:01:...
75038986,2d32004bd5e1dc3c3...,16800.0,16800.0,5,19.53,440.72,Apr-2016,Current,credit_card,Credit card refin...,2023-11-05 10:01:...
76301424,f7116b7f7546a7952...,4300.0,4300.0,3,17.27,153.89,Apr-2016,Charged Off,debt_consolidation,Debt consolidation,2023-11-05 10:01:...
75333198,d3aa3a3c95eca5631...,8950.0,8950.0,3,22.45,343.9,Apr-2016,Current,credit_card,Credit card refin...,2023-11-05 10:01:...
76391453,fc8a2e046eaaba02d...,35000.0,35000.0,5,12.99,796.18,Apr-2016,Fully Paid,debt_consolidation,Debt consolidation,2023-11-05 10:01:...
76363364,577ae670ac2ec7ed3...,15000.0,15000.0,3,9.16,478.12,Apr-2016,Fully Paid,house,Home buying,2023-11-05 10:01:...
76272510,d3792868819649ba9...,30000.0,30000.0,5,16.29,734.18,Apr-2016,Current,debt_consolidation,Debt consolidation,2023-11-05 10:01:...
76304116,6d3a5a422261348b3...,4800.0,4800.0,3,19.99,178.37,Apr-2016,Fully Paid,credit_card,Credit card refin...,2023-11-05 10:01:...


In [69]:
# Re-point the "loans" temp view at the frame with loan_term_years.
loans_term_modified_df.createOrReplaceTempView("loans")

In [70]:
# Preview the current contents of the "loans" temp view.
spark.sql("select * from loans")

loan_id,member_id,loan_amount,funded_amount,loan_term_years,interest_rate,monthly_installment,issue_date,loan_status,loan_purpose,loan_title,ingest_date
76003861,a4ec00ba67fadf2fe...,24000.0,24000.0,5,15.31,574.88,Apr-2016,Charged Off,debt_consolidation,Debt consolidation,2023-11-05 10:03:...
76263914,4f7a9e6ffaacd5da2...,2400.0,2400.0,3,11.47,79.11,Apr-2016,Fully Paid,debt_consolidation,Debt consolidation,2023-11-05 10:03:...
75537401,e935a4c27fc78ae61...,12600.0,12600.0,3,7.39,391.31,Apr-2016,Fully Paid,other,Other,2023-11-05 10:03:...
75038986,2d32004bd5e1dc3c3...,16800.0,16800.0,5,19.53,440.72,Apr-2016,Current,credit_card,Credit card refin...,2023-11-05 10:03:...
76301424,f7116b7f7546a7952...,4300.0,4300.0,3,17.27,153.89,Apr-2016,Charged Off,debt_consolidation,Debt consolidation,2023-11-05 10:03:...
75333198,d3aa3a3c95eca5631...,8950.0,8950.0,3,22.45,343.9,Apr-2016,Current,credit_card,Credit card refin...,2023-11-05 10:03:...
76391453,fc8a2e046eaaba02d...,35000.0,35000.0,5,12.99,796.18,Apr-2016,Fully Paid,debt_consolidation,Debt consolidation,2023-11-05 10:03:...
76363364,577ae670ac2ec7ed3...,15000.0,15000.0,3,9.16,478.12,Apr-2016,Fully Paid,house,Home buying,2023-11-05 10:03:...
76272510,d3792868819649ba9...,30000.0,30000.0,5,16.29,734.18,Apr-2016,Current,debt_consolidation,Debt consolidation,2023-11-05 10:03:...
76304116,6d3a5a422261348b3...,4800.0,4800.0,3,19.99,178.37,Apr-2016,Fully Paid,credit_card,Credit card refin...,2023-11-05 10:03:...


In [71]:
# List the distinct loan_purpose values; the output includes free-text
# fragments in addition to category-style values such as "stocks".
spark.sql("select distinct(loan_purpose) from loans")

loan_purpose
"guaranteed!"""
and if they are a...
never had any tro...
Bank of America c...
<br/><br/>Lending...
stocks
please feel free ...
I became his prim...
brakes
on one of the bus...


In [74]:
# Rank loan_purpose values by how many rows carry them, most frequent first.
spark.sql("select loan_purpose, count(*) as total from loans group by loan_purpose order by total desc")

loan_purpose,total
debt_consolidation,1277790
credit_card,516926
home_improvement,150440
other,139413
major_purchase,50429
medical,27481
small_business,24659
car,24009
vacation,15525
moving,15402


### Loan purpose lookup

In [75]:
# Accepted loan_purpose categories; values not listed here are treated as
# unrecognised by the cleaning step that uses this lookup.
loan_purpose_lookup = [
    "debt_consolidation",
    "credit_card",
    "home_improvement",
    "other",
    "major_purchase",
    "medical",
    "small_business",
    "car",
    "vacation",
    "moving",
    "house",
    "wedding",
    "renewable_energy",
    "educational",
]

In [76]:
from pyspark.sql.functions import when

In [78]:
# True for rows whose purpose is one of the accepted categories.
is_known_purpose = col("loan_purpose").isin(loan_purpose_lookup)

# Keep recognised purposes unchanged and collapse everything else into "other".
loans_purpose_modified = loans_term_modified_df.withColumn(
    "loan_purpose",
    when(is_known_purpose, col("loan_purpose")).otherwise("other"),
)

In [79]:
# Preview the frame after loan_purpose was normalised against the lookup list.
loans_purpose_modified

loan_id,member_id,loan_amount,funded_amount,loan_term_years,interest_rate,monthly_installment,issue_date,loan_status,loan_purpose,loan_title,ingest_date
76003861,a4ec00ba67fadf2fe...,24000.0,24000.0,5,15.31,574.88,Apr-2016,Charged Off,debt_consolidation,Debt consolidation,2023-11-05 10:14:...
76263914,4f7a9e6ffaacd5da2...,2400.0,2400.0,3,11.47,79.11,Apr-2016,Fully Paid,debt_consolidation,Debt consolidation,2023-11-05 10:14:...
75537401,e935a4c27fc78ae61...,12600.0,12600.0,3,7.39,391.31,Apr-2016,Fully Paid,other,Other,2023-11-05 10:14:...
75038986,2d32004bd5e1dc3c3...,16800.0,16800.0,5,19.53,440.72,Apr-2016,Current,credit_card,Credit card refin...,2023-11-05 10:14:...
76301424,f7116b7f7546a7952...,4300.0,4300.0,3,17.27,153.89,Apr-2016,Charged Off,debt_consolidation,Debt consolidation,2023-11-05 10:14:...
75333198,d3aa3a3c95eca5631...,8950.0,8950.0,3,22.45,343.9,Apr-2016,Current,credit_card,Credit card refin...,2023-11-05 10:14:...
76391453,fc8a2e046eaaba02d...,35000.0,35000.0,5,12.99,796.18,Apr-2016,Fully Paid,debt_consolidation,Debt consolidation,2023-11-05 10:14:...
76363364,577ae670ac2ec7ed3...,15000.0,15000.0,3,9.16,478.12,Apr-2016,Fully Paid,house,Home buying,2023-11-05 10:14:...
76272510,d3792868819649ba9...,30000.0,30000.0,5,16.29,734.18,Apr-2016,Current,debt_consolidation,Debt consolidation,2023-11-05 10:14:...
76304116,6d3a5a422261348b3...,4800.0,4800.0,3,19.99,178.37,Apr-2016,Fully Paid,credit_card,Credit card refin...,2023-11-05 10:14:...


In [80]:
# Purpose frequencies from the "loans" view. At this point the view is still
# the one registered from loans_term_modified_df, so the counts shown do not
# yet reflect the normalised loan_purpose column.
spark.sql("select loan_purpose, count(*) as total from loans group by loan_purpose order by total desc")

loan_purpose,total
debt_consolidation,1277790
credit_card,516926
home_improvement,150440
other,139413
major_purchase,50429
medical,27481
small_business,24659
car,24009
vacation,15525
moving,15402


In [81]:
# Re-point the "loans" temp view at the frame with the normalised loan_purpose.
loans_purpose_modified.createOrReplaceTempView("loans")

In [82]:
# Purpose frequencies from the "loans" view, most frequent first; in the
# recorded output the "other" total is higher than in the earlier run of this
# query.
spark.sql("select loan_purpose, count(*) as total from loans group by loan_purpose order by total desc")

loan_purpose,total
debt_consolidation,1277790
credit_card,516926
home_improvement,150440
other,139667
major_purchase,50429
medical,27481
small_business,24659
car,24009
vacation,15525
moving,15402


In [83]:
from pyspark.sql.functions import count

In [86]:
# DataFrame-API version of the purpose frequency query: count the rows per
# loan_purpose, then show the most frequent purposes first.
purpose_totals = loans_purpose_modified.groupBy("loan_purpose").agg(
    count("*").alias("total")
)
purpose_totals.orderBy(col("total").desc())

loan_purpose,total
debt_consolidation,1277790
credit_card,516926
home_improvement,150440
other,139667
major_purchase,50429
medical,27481
small_business,24659
car,24009
vacation,15525
moving,15402


In [87]:
# Persist the cleaned loans as CSV with a header row, replacing any previous
# output at the target path.
(
    loans_purpose_modified.write
    .format("csv")
    .mode("overwrite")
    .option("header", True)
    .save("lendingclubproject/cleaned/loans_csv")
)

In [None]:
# Persist the cleaned loans as Parquet, replacing any previous output at the
# target path.
(
    loans_purpose_modified.write
    .format("parquet")
    .mode("overwrite")
    .save("lendingclubproject/cleaned/loans_parquet")
)