In [42]:
from pyspark.sql import SparkSession
import getpass
username = getpass.getuser()
spark = SparkSession. \
builder. \
config('spark.ui.port', '0'). \
config("spark.sql.warehouse.dir", f"/user/{username}/warehouse"). \
config('spark.shuffle.useOldFetchProtocol', 'true'). \
config('spark.app.name', 'itv012041_Loans_Data_Cleanup'). \
enableHiveSupport(). \
master('yarn'). \
getOrCreate()

In [43]:
spark

In [44]:
loans_schema="""loan_id string,member_id string,loan_amount float,funded_amount float,loan_term_months string,
interest_rate float,monthly_installment float,issue_date string,loan_status string,loan_purpose string,
loan_title string"""

In [45]:
loans_raw_df = spark.read \
.format("csv") \
.option("header", True) \
.schema(loans_schema) \
.load("/user/itv012041/lendingclubproject/raw/loans_data_csv")

In [16]:
loans_raw_df

loan_id,member_id,loan_amount,funded_amount,loan_term_months,interest_rate,monthly_installment,issue_date,loan_status,loan_purpose,loan_title
76003861,6f32d9a5af686d037...,24000.0,24000.0,60 months,15.31,574.88,Apr-2016,Charged Off,debt_consolidation,Debt consolidation
76263914,e9f0d508da72a2f6f...,2400.0,2400.0,36 months,11.47,79.11,Apr-2016,Fully Paid,debt_consolidation,Debt consolidation
75537401,d0e4ce54c2c9eda10...,12600.0,12600.0,36 months,7.39,391.31,Apr-2016,Fully Paid,other,Other
75038986,a6ac8fa6500b757de...,16800.0,16800.0,60 months,19.53,440.72,Apr-2016,Current,credit_card,Credit card refin...
76301424,47d6ef856ab65437e...,4300.0,4300.0,36 months,17.27,153.89,Apr-2016,Charged Off,debt_consolidation,Debt consolidation
75333198,a50a79ff1e281eae8...,8950.0,8950.0,36 months,22.45,343.9,Apr-2016,Current,credit_card,Credit card refin...
76391453,1d958a151f5710d8f...,35000.0,35000.0,60 months,12.99,796.18,Apr-2016,Fully Paid,debt_consolidation,Debt consolidation
76363364,c543fd86ac52bfa5c...,15000.0,15000.0,36 months,9.16,478.12,Apr-2016,Fully Paid,house,Home buying
76272510,146d0f5d55cc7e35c...,30000.0,30000.0,60 months,16.29,734.18,Apr-2016,Current,debt_consolidation,Debt consolidation
76304116,a21ce38328fe67c6e...,4800.0,4800.0,36 months,19.99,178.37,Apr-2016,Fully Paid,credit_card,Credit card refin...


In [18]:
loans_raw_df.printSchema()

root
 |-- loan_id: string (nullable = true)
 |-- member_id: string (nullable = true)
 |-- loan_amount: float (nullable = true)
 |-- funded_amount: float (nullable = true)
 |-- loan_term_months: string (nullable = true)
 |-- interest_rate: float (nullable = true)
 |-- monthly_installment: float (nullable = true)
 |-- issue_date: string (nullable = true)
 |-- loan_status: string (nullable = true)
 |-- loan_purpose: string (nullable = true)
 |-- loan_title: string (nullable = true)



In [46]:
from pyspark.sql.functions import current_timestamp

In [47]:
loans_df_with_ingest_date = loans_raw_df.withColumn("ingest_date", current_timestamp())

In [23]:
loans_df_with_ingest_date

loan_id,member_id,loan_amount,funded_amount,loan_term_months,interest_rate,monthly_installment,issue_date,loan_status,loan_purpose,loan_title,ingest_date
76003861,6f32d9a5af686d037...,24000.0,24000.0,60 months,15.31,574.88,Apr-2016,Charged Off,debt_consolidation,Debt consolidation,2024-12-13 02:12:...
76263914,e9f0d508da72a2f6f...,2400.0,2400.0,36 months,11.47,79.11,Apr-2016,Fully Paid,debt_consolidation,Debt consolidation,2024-12-13 02:12:...
75537401,d0e4ce54c2c9eda10...,12600.0,12600.0,36 months,7.39,391.31,Apr-2016,Fully Paid,other,Other,2024-12-13 02:12:...
75038986,a6ac8fa6500b757de...,16800.0,16800.0,60 months,19.53,440.72,Apr-2016,Current,credit_card,Credit card refin...,2024-12-13 02:12:...
76301424,47d6ef856ab65437e...,4300.0,4300.0,36 months,17.27,153.89,Apr-2016,Charged Off,debt_consolidation,Debt consolidation,2024-12-13 02:12:...
75333198,a50a79ff1e281eae8...,8950.0,8950.0,36 months,22.45,343.9,Apr-2016,Current,credit_card,Credit card refin...,2024-12-13 02:12:...
76391453,1d958a151f5710d8f...,35000.0,35000.0,60 months,12.99,796.18,Apr-2016,Fully Paid,debt_consolidation,Debt consolidation,2024-12-13 02:12:...
76363364,c543fd86ac52bfa5c...,15000.0,15000.0,36 months,9.16,478.12,Apr-2016,Fully Paid,house,Home buying,2024-12-13 02:12:...
76272510,146d0f5d55cc7e35c...,30000.0,30000.0,60 months,16.29,734.18,Apr-2016,Current,debt_consolidation,Debt consolidation,2024-12-13 02:12:...
76304116,a21ce38328fe67c6e...,4800.0,4800.0,36 months,19.99,178.37,Apr-2016,Fully Paid,credit_card,Credit card refin...,2024-12-13 02:12:...


In [24]:
loans_df_with_ingest_date.createOrReplaceTempView("loans")

In [25]:
spark.sql("select count(*) from loans")

count(1)
2260701


In [26]:
spark.sql("select * from loans")

loan_id,member_id,loan_amount,funded_amount,loan_term_months,interest_rate,monthly_installment,issue_date,loan_status,loan_purpose,loan_title,ingest_date
76003861,6f32d9a5af686d037...,24000.0,24000.0,60 months,15.31,574.88,Apr-2016,Charged Off,debt_consolidation,Debt consolidation,2024-12-13 02:13:...
76263914,e9f0d508da72a2f6f...,2400.0,2400.0,36 months,11.47,79.11,Apr-2016,Fully Paid,debt_consolidation,Debt consolidation,2024-12-13 02:13:...
75537401,d0e4ce54c2c9eda10...,12600.0,12600.0,36 months,7.39,391.31,Apr-2016,Fully Paid,other,Other,2024-12-13 02:13:...
75038986,a6ac8fa6500b757de...,16800.0,16800.0,60 months,19.53,440.72,Apr-2016,Current,credit_card,Credit card refin...,2024-12-13 02:13:...
76301424,47d6ef856ab65437e...,4300.0,4300.0,36 months,17.27,153.89,Apr-2016,Charged Off,debt_consolidation,Debt consolidation,2024-12-13 02:13:...
75333198,a50a79ff1e281eae8...,8950.0,8950.0,36 months,22.45,343.9,Apr-2016,Current,credit_card,Credit card refin...,2024-12-13 02:13:...
76391453,1d958a151f5710d8f...,35000.0,35000.0,60 months,12.99,796.18,Apr-2016,Fully Paid,debt_consolidation,Debt consolidation,2024-12-13 02:13:...
76363364,c543fd86ac52bfa5c...,15000.0,15000.0,36 months,9.16,478.12,Apr-2016,Fully Paid,house,Home buying,2024-12-13 02:13:...
76272510,146d0f5d55cc7e35c...,30000.0,30000.0,60 months,16.29,734.18,Apr-2016,Current,debt_consolidation,Debt consolidation,2024-12-13 02:13:...
76304116,a21ce38328fe67c6e...,4800.0,4800.0,36 months,19.99,178.37,Apr-2016,Fully Paid,credit_card,Credit card refin...,2024-12-13 02:13:...


In [27]:
spark.sql("select * from loans where loan_amount is null")

loan_id,member_id,loan_amount,funded_amount,loan_term_months,interest_rate,monthly_installment,issue_date,loan_status,loan_purpose,loan_title,ingest_date
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2024-12-13 02:14:...
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2024-12-13 02:14:...
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2024-12-13 02:14:...
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2024-12-13 02:14:...
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2024-12-13 02:14:...
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2024-12-13 02:14:...
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2024-12-13 02:14:...
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2024-12-13 02:14:...
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2024-12-13 02:14:...
Total amount fund...,e3b0c44298fc1c149...,,,,,,,,,,2024-12-13 02:14:...


In [28]:
spark.sql("select count(*) from loans where loan_amount is null")

count(1)
33


In [48]:
columns_to_check=["loan_amount","funded_amount","loan_term_months","interest_rate",
                  "monthly_installment","issue_date","loan_status","loan_purpose"]

In [49]:
loans_filtered_df = loans_df_with_ingest_date.na.drop(subset=columns_to_check)

In [50]:
loans_filtered_df.count()

2260667

In [33]:
loans_filtered_df.createOrReplaceTempView("loans")

In [51]:
from pyspark.sql.functions import regexp_replace, col

In [52]:
loans_term_modified_df = loans_filtered_df. \
withColumn("loan_term_months", (regexp_replace(col("loan_term_months"), " months", ""). \
cast("int") / 12). \
cast("int")). \
withColumnRenamed("loan_term_months", "loan_term_years")

In [37]:
loans_term_modified_df

loan_id,member_id,loan_amount,funded_amount,loan_term_years,interest_rate,monthly_installment,issue_date,loan_status,loan_purpose,loan_title,ingest_date
76003861,6f32d9a5af686d037...,24000.0,24000.0,5,15.31,574.88,Apr-2016,Charged Off,debt_consolidation,Debt consolidation,2024-12-13 02:29:...
76263914,e9f0d508da72a2f6f...,2400.0,2400.0,3,11.47,79.11,Apr-2016,Fully Paid,debt_consolidation,Debt consolidation,2024-12-13 02:29:...
75537401,d0e4ce54c2c9eda10...,12600.0,12600.0,3,7.39,391.31,Apr-2016,Fully Paid,other,Other,2024-12-13 02:29:...
75038986,a6ac8fa6500b757de...,16800.0,16800.0,5,19.53,440.72,Apr-2016,Current,credit_card,Credit card refin...,2024-12-13 02:29:...
76301424,47d6ef856ab65437e...,4300.0,4300.0,3,17.27,153.89,Apr-2016,Charged Off,debt_consolidation,Debt consolidation,2024-12-13 02:29:...
75333198,a50a79ff1e281eae8...,8950.0,8950.0,3,22.45,343.9,Apr-2016,Current,credit_card,Credit card refin...,2024-12-13 02:29:...
76391453,1d958a151f5710d8f...,35000.0,35000.0,5,12.99,796.18,Apr-2016,Fully Paid,debt_consolidation,Debt consolidation,2024-12-13 02:29:...
76363364,c543fd86ac52bfa5c...,15000.0,15000.0,3,9.16,478.12,Apr-2016,Fully Paid,house,Home buying,2024-12-13 02:29:...
76272510,146d0f5d55cc7e35c...,30000.0,30000.0,5,16.29,734.18,Apr-2016,Current,debt_consolidation,Debt consolidation,2024-12-13 02:29:...
76304116,a21ce38328fe67c6e...,4800.0,4800.0,3,19.99,178.37,Apr-2016,Fully Paid,credit_card,Credit card refin...,2024-12-13 02:29:...


In [38]:
loans_term_modified_df.printSchema()

root
 |-- loan_id: string (nullable = true)
 |-- member_id: string (nullable = true)
 |-- loan_amount: float (nullable = true)
 |-- funded_amount: float (nullable = true)
 |-- loan_term_years: integer (nullable = true)
 |-- interest_rate: float (nullable = true)
 |-- monthly_installment: float (nullable = true)
 |-- issue_date: string (nullable = true)
 |-- loan_status: string (nullable = true)
 |-- loan_purpose: string (nullable = true)
 |-- loan_title: string (nullable = true)
 |-- ingest_date: timestamp (nullable = false)



In [53]:
loans_term_modified_df.createOrReplaceTempView("loans")

In [54]:
spark.sql("select * from loans")

loan_id,member_id,loan_amount,funded_amount,loan_term_years,interest_rate,monthly_installment,issue_date,loan_status,loan_purpose,loan_title,ingest_date
76003861,6f32d9a5af686d037...,24000.0,24000.0,5,15.31,574.88,Apr-2016,Charged Off,debt_consolidation,Debt consolidation,2024-12-13 02:33:...
76263914,e9f0d508da72a2f6f...,2400.0,2400.0,3,11.47,79.11,Apr-2016,Fully Paid,debt_consolidation,Debt consolidation,2024-12-13 02:33:...
75537401,d0e4ce54c2c9eda10...,12600.0,12600.0,3,7.39,391.31,Apr-2016,Fully Paid,other,Other,2024-12-13 02:33:...
75038986,a6ac8fa6500b757de...,16800.0,16800.0,5,19.53,440.72,Apr-2016,Current,credit_card,Credit card refin...,2024-12-13 02:33:...
76301424,47d6ef856ab65437e...,4300.0,4300.0,3,17.27,153.89,Apr-2016,Charged Off,debt_consolidation,Debt consolidation,2024-12-13 02:33:...
75333198,a50a79ff1e281eae8...,8950.0,8950.0,3,22.45,343.9,Apr-2016,Current,credit_card,Credit card refin...,2024-12-13 02:33:...
76391453,1d958a151f5710d8f...,35000.0,35000.0,5,12.99,796.18,Apr-2016,Fully Paid,debt_consolidation,Debt consolidation,2024-12-13 02:33:...
76363364,c543fd86ac52bfa5c...,15000.0,15000.0,3,9.16,478.12,Apr-2016,Fully Paid,house,Home buying,2024-12-13 02:33:...
76272510,146d0f5d55cc7e35c...,30000.0,30000.0,5,16.29,734.18,Apr-2016,Current,debt_consolidation,Debt consolidation,2024-12-13 02:33:...
76304116,a21ce38328fe67c6e...,4800.0,4800.0,3,19.99,178.37,Apr-2016,Fully Paid,credit_card,Credit card refin...,2024-12-13 02:33:...


In [55]:
spark.sql("select distinct(loan_purpose) from loans")

loan_purpose
"guaranteed!"""
and if they are a...
never had any tro...
<br/><br/>Lending...
Bank of America c...
stocks
please feel free ...
I became his prim...
brakes
on one of the bus...


In [57]:
spark.sql("select loan_purpose, count(*) from loans group by 1 order by 2 desc")

loan_purpose,count(1)
debt_consolidation,1277790
credit_card,516926
home_improvement,150440
other,139413
major_purchase,50429
medical,27481
small_business,24659
car,24009
vacation,15525
moving,15402


In [58]:
loan_purpose_lookup=["debt_consolidation","credit_card","home_improvement","other","major_purchase",
                     "medical","small_business","car","vacation","moving","house","wedding",
                     "renewable_energy","educational"]


In [59]:
from pyspark.sql.functions import when

In [63]:
loan_purpose_modified = loans_term_modified_df.withColumn("loan_purpose", when(col("loan_purpose").isin(loan_purpose_lookup), col("loan_purpose")).otherwise("other"))

In [64]:
loan_purpose_modified

loan_id,member_id,loan_amount,funded_amount,loan_term_years,interest_rate,monthly_installment,issue_date,loan_status,loan_purpose,loan_title,ingest_date
76003861,6f32d9a5af686d037...,24000.0,24000.0,5,15.31,574.88,Apr-2016,Charged Off,debt_consolidation,Debt consolidation,2024-12-13 02:51:...
76263914,e9f0d508da72a2f6f...,2400.0,2400.0,3,11.47,79.11,Apr-2016,Fully Paid,debt_consolidation,Debt consolidation,2024-12-13 02:51:...
75537401,d0e4ce54c2c9eda10...,12600.0,12600.0,3,7.39,391.31,Apr-2016,Fully Paid,other,Other,2024-12-13 02:51:...
75038986,a6ac8fa6500b757de...,16800.0,16800.0,5,19.53,440.72,Apr-2016,Current,credit_card,Credit card refin...,2024-12-13 02:51:...
76301424,47d6ef856ab65437e...,4300.0,4300.0,3,17.27,153.89,Apr-2016,Charged Off,debt_consolidation,Debt consolidation,2024-12-13 02:51:...
75333198,a50a79ff1e281eae8...,8950.0,8950.0,3,22.45,343.9,Apr-2016,Current,credit_card,Credit card refin...,2024-12-13 02:51:...
76391453,1d958a151f5710d8f...,35000.0,35000.0,5,12.99,796.18,Apr-2016,Fully Paid,debt_consolidation,Debt consolidation,2024-12-13 02:51:...
76363364,c543fd86ac52bfa5c...,15000.0,15000.0,3,9.16,478.12,Apr-2016,Fully Paid,house,Home buying,2024-12-13 02:51:...
76272510,146d0f5d55cc7e35c...,30000.0,30000.0,5,16.29,734.18,Apr-2016,Current,debt_consolidation,Debt consolidation,2024-12-13 02:51:...
76304116,a21ce38328fe67c6e...,4800.0,4800.0,3,19.99,178.37,Apr-2016,Fully Paid,credit_card,Credit card refin...,2024-12-13 02:51:...


In [65]:
loan_purpose_modified.createOrReplaceTempView("loans")

In [66]:
spark.sql("select loan_purpose, count(*) from loans group by 1 order by 2 desc")

loan_purpose,count(1)
debt_consolidation,1277790
credit_card,516926
home_improvement,150440
other,139667
major_purchase,50429
medical,27481
small_business,24659
car,24009
vacation,15525
moving,15402


In [68]:
loan_purpose_modified.write \
.option("header", True) \
.format("parquet") \
.mode("overwrite") \
.option("path", "/user/itv012041/lendingclubproject/cleaned/loans_parquet") \
.save()

In [69]:
loan_purpose_modified.write \
.option("header", True) \
.format("csv") \
.mode("overwrite") \
.option("path", "/user/itv012041/lendingclubproject/cleaned/loans_csv") \
.save()