In [62]:
from pyspark.sql import *
import getpass
# The OS account name selects the per-user Hive warehouse directory below.
username = getpass.getuser()

# Build (or reuse) a Hive-enabled session named 'cleanLoans' that runs on YARN.
spark = (
    SparkSession.builder
    .appName('cleanLoans')
    .config('spark.ui.port', '0')
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .master('yarn')
    .getOrCreate()
)

In [61]:
# The session is deliberately NOT stopped here: every cell below reads from or
# writes through `spark`, so stopping it at this point makes a top-to-bottom
# "Restart & Run All" fail on the first read.
# Call `spark.stop()` only after the final write at the end of the notebook
# has completed.

In [63]:
# DDL-style schema applied when reading the raw loans CSV (11 columns).
# NOTE(review): the first column is spelled `loand_id` (likely meant `loan_id`);
# it is left unchanged because the name ends up in the written dataset.
loans_schema = (
    "loand_id string,member_id string, loan_amount float,funded_amount float,"
    "term string, int_rate float, installment float,issue_d string, "
    "loan_status string, purpose string,title string"
)

In [64]:
# Load the raw loans CSV with the explicit schema; "failfast" makes the read
# raise on malformed records instead of silently nulling them.
loan_df = (
    spark.read
    .option("mode", "failfast")
    .schema(loans_schema)
    .option("header", True)
    .csv("/user/itv012667/lendingclub/raw/loans_data_csv")
)

In [65]:
# Display the loaded rows to sanity-check the parse against the declared schema.
loan_df

loand_id,member_id,loan_amount,funded_amount,term,int_rate,installment,issue_d,loan_status,purpose,title
56633077,b59d80da191f5b573...,3000.0,3000.0,36 months,7.89,93.86,Aug-2015,Fully Paid,credit_card,Credit card refin...
55927518,202d9f56ecb7c3bc9...,15600.0,15600.0,36 months,7.89,488.06,Aug-2015,Fully Paid,credit_card,Credit card refin...
56473345,e5a140c0922b554b9...,20000.0,20000.0,36 months,9.17,637.58,Aug-2015,Fully Paid,debt_consolidation,Debt consolidation
56463188,e12aefc548f750777...,11200.0,11200.0,60 months,21.99,309.27,Aug-2015,Fully Paid,home_improvement,Home improvement
56473316,1b3a50d854fbbf97e...,16000.0,16000.0,60 months,20.99,432.77,Aug-2015,Charged Off,debt_consolidation,Debt consolidation
56663266,1c4329e5f17697127...,20000.0,20000.0,60 months,13.33,458.45,Aug-2015,Charged Off,debt_consolidation,Debt consolidation
56483027,5026c86ad983175eb...,10000.0,10000.0,36 months,12.69,335.45,Aug-2015,Fully Paid,other,Other
56613385,9847d8c1e9d0b2084...,23400.0,23400.0,60 months,19.19,609.46,Aug-2015,Current,small_business,Business
56643620,8340dbe1adea41fb4...,16000.0,16000.0,36 months,5.32,481.84,Jul-2015,Fully Paid,debt_consolidation,Debt consolidation
56533114,d4de0de3ab7d79ad4...,25450.0,25450.0,36 months,27.31,1043.24,Aug-2015,Charged Off,debt_consolidation,Debt consolidation


In [66]:
# Print the column names and types that were applied to the load.
loan_df.printSchema()

root
 |-- loand_id: string (nullable = true)
 |-- member_id: string (nullable = true)
 |-- loan_amount: float (nullable = true)
 |-- funded_amount: float (nullable = true)
 |-- term: string (nullable = true)
 |-- int_rate: float (nullable = true)
 |-- installment: float (nullable = true)
 |-- issue_d: string (nullable = true)
 |-- loan_status: string (nullable = true)
 |-- purpose: string (nullable = true)
 |-- title: string (nullable = true)



# Add Ingest Time Stamp

In [67]:
from pyspark.sql.functions import *

In [68]:
# Append an `ingestTimestamp` column to every row.
# NOTE(review): current_timestamp() is presumably evaluated when a query runs,
# not when this cell executes - confirm that is the intended "ingest" time.
df_ingest = loan_df.withColumn(
    "ingestTimestamp",
    current_timestamp(),
)

# Drop Nulls

In [69]:
# Columns that must be non-null for a loan row to be kept.
columns_to_check = [
    "loan_amount",
    "funded_amount",
    "term",
    "int_rate",
    "installment",
    "issue_d",
    "loan_status",
    "purpose",
]

In [70]:
# Drop every row that has a null in any of the required columns.
df = df_ingest.dropna(subset=columns_to_check)

In [71]:
# Inspect the rows that remain after the null filter.
df

loand_id,member_id,loan_amount,funded_amount,term,int_rate,installment,issue_d,loan_status,purpose,title,ingestTimestamp
56633077,b59d80da191f5b573...,3000.0,3000.0,36 months,7.89,93.86,Aug-2015,Fully Paid,credit_card,Credit card refin...,2024-05-26 19:46:...
55927518,202d9f56ecb7c3bc9...,15600.0,15600.0,36 months,7.89,488.06,Aug-2015,Fully Paid,credit_card,Credit card refin...,2024-05-26 19:46:...
56473345,e5a140c0922b554b9...,20000.0,20000.0,36 months,9.17,637.58,Aug-2015,Fully Paid,debt_consolidation,Debt consolidation,2024-05-26 19:46:...
56463188,e12aefc548f750777...,11200.0,11200.0,60 months,21.99,309.27,Aug-2015,Fully Paid,home_improvement,Home improvement,2024-05-26 19:46:...
56473316,1b3a50d854fbbf97e...,16000.0,16000.0,60 months,20.99,432.77,Aug-2015,Charged Off,debt_consolidation,Debt consolidation,2024-05-26 19:46:...
56663266,1c4329e5f17697127...,20000.0,20000.0,60 months,13.33,458.45,Aug-2015,Charged Off,debt_consolidation,Debt consolidation,2024-05-26 19:46:...
56483027,5026c86ad983175eb...,10000.0,10000.0,36 months,12.69,335.45,Aug-2015,Fully Paid,other,Other,2024-05-26 19:46:...
56613385,9847d8c1e9d0b2084...,23400.0,23400.0,60 months,19.19,609.46,Aug-2015,Current,small_business,Business,2024-05-26 19:46:...
56643620,8340dbe1adea41fb4...,16000.0,16000.0,36 months,5.32,481.84,Jul-2015,Fully Paid,debt_consolidation,Debt consolidation,2024-05-26 19:46:...
56533114,d4de0de3ab7d79ad4...,25450.0,25450.0,36 months,27.31,1043.24,Aug-2015,Charged Off,debt_consolidation,Debt consolidation,2024-05-26 19:46:...


In [72]:
# Row count before the null filter (baseline to compare with `df.count()`).
df_ingest.count()

2260701

In [73]:
# Row count after dropping rows with nulls in `columns_to_check`.
df.count()

2260667

# Convert the loan term from months to years

In [74]:
# Strip every non-digit character from `term` (e.g. "36 months" -> "36").
# The pattern is a raw string so that `\D` reaches the regex engine verbatim;
# in a plain literal it is an invalid Python escape sequence, which triggers a
# warning on current interpreters.
df_with_term = df.withColumn(
    "term",
    regexp_replace(col("term"), r"(\D)", ""),
)

In [75]:
# Check that `term` now holds only the digits (e.g. "36 months" -> "36").
df_with_term

loand_id,member_id,loan_amount,funded_amount,term,int_rate,installment,issue_d,loan_status,purpose,title,ingestTimestamp
56633077,b59d80da191f5b573...,3000.0,3000.0,36,7.89,93.86,Aug-2015,Fully Paid,credit_card,Credit card refin...,2024-05-26 19:46:...
55927518,202d9f56ecb7c3bc9...,15600.0,15600.0,36,7.89,488.06,Aug-2015,Fully Paid,credit_card,Credit card refin...,2024-05-26 19:46:...
56473345,e5a140c0922b554b9...,20000.0,20000.0,36,9.17,637.58,Aug-2015,Fully Paid,debt_consolidation,Debt consolidation,2024-05-26 19:46:...
56463188,e12aefc548f750777...,11200.0,11200.0,60,21.99,309.27,Aug-2015,Fully Paid,home_improvement,Home improvement,2024-05-26 19:46:...
56473316,1b3a50d854fbbf97e...,16000.0,16000.0,60,20.99,432.77,Aug-2015,Charged Off,debt_consolidation,Debt consolidation,2024-05-26 19:46:...
56663266,1c4329e5f17697127...,20000.0,20000.0,60,13.33,458.45,Aug-2015,Charged Off,debt_consolidation,Debt consolidation,2024-05-26 19:46:...
56483027,5026c86ad983175eb...,10000.0,10000.0,36,12.69,335.45,Aug-2015,Fully Paid,other,Other,2024-05-26 19:46:...
56613385,9847d8c1e9d0b2084...,23400.0,23400.0,60,19.19,609.46,Aug-2015,Current,small_business,Business,2024-05-26 19:46:...
56643620,8340dbe1adea41fb4...,16000.0,16000.0,36,5.32,481.84,Jul-2015,Fully Paid,debt_consolidation,Debt consolidation,2024-05-26 19:46:...
56533114,d4de0de3ab7d79ad4...,25450.0,25450.0,36,27.31,1043.24,Aug-2015,Charged Off,debt_consolidation,Debt consolidation,2024-05-26 19:46:...


In [76]:
# Cast the digit-only `term` (still in months) to an integer column.
# NOTE(review): the following cell rebuilds `df_with_term_int` from
# `df_with_term`, so this intermediate result is overwritten.
df_with_term_int = df_with_term.withColumn(
    "term",
    col("term").cast("int"),
)

In [83]:
# Turn the term from months into whole years: divide by 12, then cast to int.
df_with_term_int = df_with_term.withColumn(
    "term",
    (col("term") / 12).cast("int"),
)

In [91]:
# Confirm that `term` is now expressed in whole years.
df_with_term_int

loand_id,member_id,loan_amount,funded_amount,term,int_rate,installment,issue_d,loan_status,purpose,title,ingestTimestamp
56633077,b59d80da191f5b573...,3000.0,3000.0,3,7.89,93.86,Aug-2015,Fully Paid,credit_card,Credit card refin...,2024-05-26 20:01:...
55927518,202d9f56ecb7c3bc9...,15600.0,15600.0,3,7.89,488.06,Aug-2015,Fully Paid,credit_card,Credit card refin...,2024-05-26 20:01:...
56473345,e5a140c0922b554b9...,20000.0,20000.0,3,9.17,637.58,Aug-2015,Fully Paid,debt_consolidation,Debt consolidation,2024-05-26 20:01:...
56463188,e12aefc548f750777...,11200.0,11200.0,5,21.99,309.27,Aug-2015,Fully Paid,home_improvement,Home improvement,2024-05-26 20:01:...
56473316,1b3a50d854fbbf97e...,16000.0,16000.0,5,20.99,432.77,Aug-2015,Charged Off,debt_consolidation,Debt consolidation,2024-05-26 20:01:...
56663266,1c4329e5f17697127...,20000.0,20000.0,5,13.33,458.45,Aug-2015,Charged Off,debt_consolidation,Debt consolidation,2024-05-26 20:01:...
56483027,5026c86ad983175eb...,10000.0,10000.0,3,12.69,335.45,Aug-2015,Fully Paid,other,Other,2024-05-26 20:01:...
56613385,9847d8c1e9d0b2084...,23400.0,23400.0,5,19.19,609.46,Aug-2015,Current,small_business,Business,2024-05-26 20:01:...
56643620,8340dbe1adea41fb4...,16000.0,16000.0,3,5.32,481.84,Jul-2015,Fully Paid,debt_consolidation,Debt consolidation,2024-05-26 20:01:...
56533114,d4de0de3ab7d79ad4...,25450.0,25450.0,3,27.31,1043.24,Aug-2015,Charged Off,debt_consolidation,Debt consolidation,2024-05-26 20:01:...


# Replace `purpose` values that are not in the predefined list with "other"

In [97]:
# Purposes that are kept as-is; anything else is folded into "other".
loan_purpose_lookup = [
    "debt_consolidation",
    "credit_card",
    "home_improvement",
    "other",
    "major_purchase",
    "medical",
    "small_business",
    "car",
    "vacation",
    "moving",
    "house",
    "wedding",
    "renewable_energy",
    "educational",
]

In [101]:
# Keep recognised purposes unchanged; map every other value to "other".
loan_purpose_modifier = df_with_term_int.withColumn(
    "purpose",
    when(
        col("purpose").isin(loan_purpose_lookup),
        col("purpose"),
    ).otherwise("other"),
)

In [105]:
# Purpose distribution BEFORE the remap, largest group first.
(
    df_with_term_int
    .groupBy("purpose")
    .agg(count("purpose").alias("total"))
    .orderBy(col("total").desc())
)

purpose,total
debt_consolidation,1277790
credit_card,516926
home_improvement,150440
other,139413
major_purchase,50429
medical,27481
small_business,24659
car,24009
vacation,15525
moving,15402


In [106]:
# Purpose distribution AFTER the remap, largest group first.
(
    loan_purpose_modifier
    .groupBy("purpose")
    .agg(count("purpose").alias("total"))
    .orderBy(col("total").desc())
)

purpose,total
debt_consolidation,1277790
credit_card,516926
home_improvement,150440
other,139667
major_purchase,50429
medical,27481
small_business,24659
car,24009
vacation,15525
moving,15402


# Write the cleaned data to the cleaned-data folder

In [107]:
# Persist the cleaned loans as Parquet, replacing any previous output.
# - repartition(1) collapses the data into a single partition before writing.
# - The former `header` option was dropped: it is a CSV setting and has no
#   meaning for a Parquet write.
(
    loan_purpose_modifier
    .repartition(1)
    .write
    .format("parquet")
    .mode("overwrite")
    .save("/user/itv012667/lendingclub/cleaneddata/loans_data")
)