In [0]:
%fs ls mnt/lending_club/landing/loans_df/csv/single/


path,name,size,modificationTime
dbfs:/mnt/lending_club/landing/loans_df/csv/single/_SUCCESS,_SUCCESS,0,1702883689000
dbfs:/mnt/lending_club/landing/loans_df/csv/single/_committed_357489048282395346,_committed_357489048282395346,112,1702883689000
dbfs:/mnt/lending_club/landing/loans_df/csv/single/_started_357489048282395346,_started_357489048282395346,0,1702883681000
dbfs:/mnt/lending_club/landing/loans_df/csv/single/part-00000-tid-357489048282395346-559a0c24-f628-402f-840d-9e24c7f6b5a8-268-1-c000.csv,part-00000-tid-357489048282395346-559a0c24-f628-402f-840d-9e24c7f6b5a8-268-1-c000.csv,374481073,1702883688000


In [0]:
from pyspark.sql.types import StructType, StructField, TimestampType, IntegerType, StringType, FloatType, LongType, TimestampType
from pyspark.sql.functions import regexp_replace, concat_ws
from pyspark.sql.functions import when, col, length
from pyspark.sql.functions import current_timestamp
from pyspark.sql.functions import avg

In [0]:
loans_df_schema = StructType([
    StructField("id", StringType(), True)
    ,StructField("member_id", StringType(), True)
    ,StructField("loan_amount", FloatType(), True)
    ,StructField("funded_amount", FloatType(), True)
    ,StructField("loan_term_months", StringType(), True)
    ,StructField("interest_rate", FloatType(), True)
    ,StructField("monthly_installment", FloatType(), True)
    ,StructField("issue_date", StringType(), True)
    ,StructField("loan_status", StringType(), True)
    ,StructField("loan_purpose", StringType(), True)
    ,StructField("loan_title", StringType(), True)
])

In [0]:
loans_df_raw = spark.read\
    .format("csv")\
    .option("header", True)\
    .schema(loans_df_schema)\
    .load("/mnt/lending_club/landing/loans_df/csv/single/part-00000-tid-357489048282395346-559a0c24-f628-402f-840d-9e24c7f6b5a8-268-1-c000.csv")

In [0]:
loans_df_raw.show()

+--------+--------------------+-----------+-------------+----------------+-------------+-------------------+----------+-----------+------------------+--------------------+
|      id|           member_id|loan_amount|funded_amount|loan_term_months|interest_rate|monthly_installment|issue_date|loan_status|      loan_purpose|          loan_title|
+--------+--------------------+-----------+-------------+----------------+-------------+-------------------+----------+-----------+------------------+--------------------+
|68407277|af2c01919a67ad070...|     3600.0|       3600.0|       36 months|        13.99|             123.03|  Dec-2015| Fully Paid|debt_consolidation|  Debt consolidation|
|68355089|51cf0089ac3e1beeb...|    24700.0|      24700.0|       36 months|        11.99|             820.28|  Dec-2015| Fully Paid|    small_business|            Business|
|68341763|a03d0c087a03631e3...|    20000.0|      20000.0|       60 months|        10.78|             432.66|  Dec-2015| Fully Paid|  home_im

In [0]:
loans_df_raw.printSchema()

root
 |-- id: string (nullable = true)
 |-- member_id: string (nullable = true)
 |-- loan_amnt: float (nullable = true)
 |-- funded_amnt: float (nullable = true)
 |-- term: string (nullable = true)
 |-- int_rate: float (nullable = true)
 |-- installment: float (nullable = true)
 |-- issue_d: string (nullable = true)
 |-- loan_status: string (nullable = true)
 |-- purpose: string (nullable = true)
 |-- title: string (nullable = true)



# insert a new column named as ingestion date(current time)

In [0]:
loans_df_ingested = loans_df_raw.withColumn("ingest_date", current_timestamp())
loans_df_ingested.show()

+--------+--------------------+-----------+-------------+----------------+-------------+-------------------+----------+-----------+------------------+--------------------+--------------------+
|      id|           member_id|loan_amount|funded_amount|loan_term_months|interest_rate|monthly_installment|issue_date|loan_status|      loan_purpose|          loan_title|         ingest_date|
+--------+--------------------+-----------+-------------+----------------+-------------+-------------------+----------+-----------+------------------+--------------------+--------------------+
|68407277|af2c01919a67ad070...|     3600.0|       3600.0|       36 months|        13.99|             123.03|  Dec-2015| Fully Paid|debt_consolidation|  Debt consolidation|2023-12-19 13:56:...|
|68355089|51cf0089ac3e1beeb...|    24700.0|      24700.0|       36 months|        11.99|             820.28|  Dec-2015| Fully Paid|    small_business|            Business|2023-12-19 13:56:...|
|68341763|a03d0c087a03631e3...|    


#### Dropping the rows which has null values in the mentioned columns

In [0]:
null_count = loans_df_ingested.filter(
    (loans_df_ingested.loan_amount.isNull()) | (loans_df_ingested.funded_amount.isNull()) | (loans_df_ingested.loan_term_months.isNull()) | (loans_df_ingested.interest_rate.isNull()) | (loans_df_ingested.monthly_installment.isNull()) | \
        (loans_df_ingested.issue_date.isNull()) |(loans_df_ingested.loan_status.isNull()) |(loans_df_ingested.loan_purpose.isNull())
    ).count()

In [0]:
print(f"Number of nulls are : {null_count}")

Number of nulls are : 34


In [0]:
columns_to_check = ["loan_amount", "funded_amount", "loan_term_months", "interest_rate", "monthly_installment", "issue_date", "loan_status", "loan_purpose"]

loans_filtered = loans_df_ingested.na.drop(subset = columns_to_check)
loans_filtered.show()

+--------+--------------------+-----------+-------------+----------------+-------------+-------------------+----------+-----------+------------------+--------------------+--------------------+
|      id|           member_id|loan_amount|funded_amount|loan_term_months|interest_rate|monthly_installment|issue_date|loan_status|      loan_purpose|          loan_title|         ingest_date|
+--------+--------------------+-----------+-------------+----------------+-------------+-------------------+----------+-----------+------------------+--------------------+--------------------+
|68407277|af2c01919a67ad070...|     3600.0|       3600.0|       36 months|        13.99|             123.03|  Dec-2015| Fully Paid|debt_consolidation|  Debt consolidation|2023-12-19 14:15:...|
|68355089|51cf0089ac3e1beeb...|    24700.0|      24700.0|       36 months|        11.99|             820.28|  Dec-2015| Fully Paid|    small_business|            Business|2023-12-19 14:15:...|
|68341763|a03d0c087a03631e3...|    

In [0]:
loans_filtered.filter(
    (loans_df_ingested.loan_amount.isNull())
    ).count()

0

#### convert loan_term_months to integer

In [0]:
loans_modified = loans_filtered.withColumn("loan_term_months",
                                           regexp_replace(col("loan_term_months"), " months", ""))
                                           
loans_modified = loans_modified.withColumn("loan_term_months", loans_modified.loan_term_months.cast("int"))
loans_modified = loans_modified.withColumn("loan_term_months", loans_modified.loan_term_months/12)
loans_modified = loans_modified.withColumnRenamed("loan_term_months", "loan_term_years")
loans_modified.show()

+--------+--------------------+-----------+-------------+---------------+-------------+-------------------+----------+-----------+------------------+--------------------+--------------------+
|      id|           member_id|loan_amount|funded_amount|loan_term_years|interest_rate|monthly_installment|issue_date|loan_status|      loan_purpose|          loan_title|         ingest_date|
+--------+--------------------+-----------+-------------+---------------+-------------+-------------------+----------+-----------+------------------+--------------------+--------------------+
|68407277|af2c01919a67ad070...|     3600.0|       3600.0|            3.0|        13.99|             123.03|  Dec-2015| Fully Paid|debt_consolidation|  Debt consolidation|2023-12-19 14:28:...|
|68355089|51cf0089ac3e1beeb...|    24700.0|      24700.0|            3.0|        11.99|             820.28|  Dec-2015| Fully Paid|    small_business|            Business|2023-12-19 14:28:...|
|68341763|a03d0c087a03631e3...|    20000

#### Clean the loans_purpose column

In [0]:
dist = loans_modified.select("loan_purpose")
dist = dist.groupBy("loan_purpose").count()
dist_count_ordered = dist.orderBy("count", ascending = False)


In [0]:
display(dist_count_ordered)

loan_purpose,count
debt_consolidation,1277790
credit_card,516926
home_improvement,150440
other,139413
major_purchase,50429
medical,27481
small_business,24659
car,24009
vacation,15525
moving,15402


In [0]:
loan_purpose_lookup = ["debt_consolidation", "credit_card", "home_improvement", "other", "major_purchase", "medical", "small_business", "car", "vacation", "moving", "house", "wedding", "renewable_energy", "educational"]

loans_df_purpose = loans_modified.withColumn(
    "loan_purpose"\
    ,when(
        col("loan_purpose").isin(loan_purpose_lookup)\
        ,col("loan_purpose")
    )\
    .otherwise("other")

)

loans_df_purpose.show()

+--------+--------------------+-----------+-------------+---------------+-------------+-------------------+----------+-----------+------------------+--------------------+--------------------+
|      id|           member_id|loan_amount|funded_amount|loan_term_years|interest_rate|monthly_installment|issue_date|loan_status|      loan_purpose|          loan_title|         ingest_date|
+--------+--------------------+-----------+-------------+---------------+-------------+-------------------+----------+-----------+------------------+--------------------+--------------------+
|68407277|af2c01919a67ad070...|     3600.0|       3600.0|            3.0|        13.99|             123.03|  Dec-2015| Fully Paid|debt_consolidation|  Debt consolidation|2023-12-19 14:51:...|
|68355089|51cf0089ac3e1beeb...|    24700.0|      24700.0|            3.0|        11.99|             820.28|  Dec-2015| Fully Paid|    small_business|            Business|2023-12-19 14:51:...|
|68341763|a03d0c087a03631e3...|    20000

In [0]:
loans_df_purpose.select("loan_purpose").distinct().show()

+------------------+
|      loan_purpose|
+------------------+
|           wedding|
|       educational|
|             other|
|    small_business|
|debt_consolidation|
|       credit_card|
|            moving|
|          vacation|
|  renewable_energy|
|             house|
|               car|
|    major_purchase|
|           medical|
|  home_improvement|
+------------------+



In [0]:
loans_df_purpose.write \
.format("parquet") \
.mode("overwrite") \
.option("path", "/mnt/lending_club/staging/loans/partitioned/parquet") \
.partitionBy("loan_purpose")\
.save()

In [0]:
loans_df_purpose.write \
.format("csv") \
.mode("overwrite") \
.option("path", "/mnt/lending_club/staging/loans/partitioned/csv") \
.partitionBy("loan_purpose")\
.save()

In [0]:
loans_df_purpose.write \
.format("parquet") \
.mode("overwrite") \
.option("path", "/mnt/lending_club/staging/loans/non_partitioned/parquet") \
.save()

In [0]:
loans_df_purpose.write \
.format("csv") \
.mode("overwrite") \
.option("path", "/mnt/lending_club/staging/loans/non_partitioned/csv") \
.save()