In [0]:
# List the landing folder to see which files the upstream job produced.
display(dbutils.fs.ls("/mnt/lending_club/landing/loan_repayments_df/csv/single"))

path,name,size,modificationTime
dbfs:/mnt/lending_club/landing/loan_repayments_df/csv/single/_SUCCESS,_SUCCESS,0,1702883438000
dbfs:/mnt/lending_club/landing/loan_repayments_df/csv/single/_committed_2414998971585683794,_committed_2414998971585683794,113,1702883438000
dbfs:/mnt/lending_club/landing/loan_repayments_df/csv/single/_started_2414998971585683794,_started_2414998971585683794,0,1702883433000
dbfs:/mnt/lending_club/landing/loan_repayments_df/csv/single/part-00000-tid-2414998971585683794-11892453-bff9-4546-97cc-2a94e5f52b5f-250-1-c000.csv,part-00000-tid-2414998971585683794-11892453-bff9-4546-97cc-2a94e5f52b5f-250-1-c000.csv,138123359,1702883438000


In [0]:
from pyspark.sql.types import StructType, StructField, TimestampType, IntegerType, StringType, FloatType, LongType, TimestampType
from pyspark.sql.functions import regexp_replace, concat_ws
from pyspark.sql.functions import when, col, length
from pyspark.sql.functions import current_timestamp
from pyspark.sql.functions import avg

In [0]:
# Explicit schema for the loan-repayments landing CSV.
# Field order is: id, the five amount columns, then the two date columns.
# All fields are nullable. Amounts are read as single-precision floats and
# both dates are kept as raw strings.
# NOTE(review): FloatType is single precision; confirm that is sufficient
# for monetary amounts.
loan_repayments_schema = StructType(
    [StructField("id", StringType(), True)]
    + [
        StructField(amount_name, FloatType(), True)
        for amount_name in (
            "total_principal_recieved",
            "total_interest_recieved",
            "total_late_fee_recieved",
            "total_payment_recieved",
            "last_payment_amount",
        )
    ]
    + [
        StructField(date_name, StringType(), True)
        for date_name in ("last_payment_date", "next_payment_date")
    ]
)

In [0]:
# Read the landing CSV using the explicit schema; the first line of the file
# is treated as a header.
# The directory is loaded instead of a single part file: the part-file name
# embeds a per-write transaction id, so a hard-coded name stops resolving as
# soon as the upstream job rewrites the folder. Spark ignores the marker files
# whose names start with "_" (_SUCCESS, _started_*, _committed_*) when it
# lists a directory, so only the data file(s) are read.
loan_repayments = (
    spark.read
    .format("csv")
    .schema(loan_repayments_schema)
    .option("header", True)
    .load("dbfs:/mnt/lending_club/landing/loan_repayments_df/csv/single")
)

In [0]:
# Preview the first rows of the raw frame (20 rows, long values truncated).
loan_repayments.show(n=20, truncate=True)

+--------+------------------------+-----------------------+-----------------------+----------------------+-------------------+-----------------+-----------------+
|      id|total_principal_recieved|total_interest_recieved|total_late_fee_recieved|total_payment_recieved|last_payment_amount|last_payment_date|next_payment_date|
+--------+------------------------+-----------------------+-----------------------+----------------------+-------------------+-----------------+-----------------+
|68407277|                  3600.0|                 821.72|                    0.0|              4421.724|             122.67|         Jan-2019|             NULL|
|68355089|                 24700.0|                 979.66|                    0.0|              25679.66|             926.35|         Jun-2016|             NULL|
|68341763|                 20000.0|                2705.92|                    0.0|             22705.924|            15813.3|         Jun-2017|             NULL|
|66310712|            

In [0]:
# Keep every source column and append an ingest_date audit column holding
# the timestamp at which the query is evaluated.
loan_repayments_ingested = loan_repayments.select(
    "*",
    current_timestamp().alias("ingest_date"),
)
loan_repayments_ingested.show()

+--------+------------------------+-----------------------+-----------------------+----------------------+-------------------+-----------------+-----------------+--------------------+
|      id|total_principal_recieved|total_interest_recieved|total_late_fee_recieved|total_payment_recieved|last_payment_amount|last_payment_date|next_payment_date|         ingest_date|
+--------+------------------------+-----------------------+-----------------------+----------------------+-------------------+-----------------+-----------------+--------------------+
|68407277|                  3600.0|                 821.72|                    0.0|              4421.724|             122.67|         Jan-2019|             NULL|2023-12-20 07:22:...|
|68355089|                 24700.0|                 979.66|                    0.0|              25679.66|             926.35|         Jun-2016|             NULL|2023-12-20 07:22:...|
|68341763|                 20000.0|                2705.92|                    0

In [0]:
# How many rows report a total payment of exactly zero?
loan_repayments_ingested.where("total_payment_recieved = 0").count()

1068

In [0]:
# Inspect the inconsistent rows: zero total payment but a non-zero principal received.
loan_repayments_ingested.where(
    "total_payment_recieved = 0 and total_principal_recieved != 0.0"
).show()

+--------+------------------------+-----------------------+-----------------------+----------------------+-------------------+------------------+-----------------+--------------------+
|      id|total_principal_recieved|total_interest_recieved|total_late_fee_recieved|total_payment_recieved|last_payment_amount| last_payment_date|next_payment_date|         ingest_date|
+--------+------------------------+-----------------------+-----------------------+----------------------+-------------------+------------------+-----------------+--------------------+
|61400928|                 4053.98|                1233.28|                 814.56|                   0.0|               NULL|361.10519999940004|           228.25|2023-12-20 07:54:...|
| 1064185|                11600.98|               11600.98|                10000.0|                   0.0|                0.0|               0.0|         Dec-2014|2023-12-20 07:54:...|
|  839321|                 4523.48|                 4400.0|                

In [0]:
# Rebuild total_payment_recieved where it is recorded as zero by summing its
# components (principal + interest + late fee); all other rows keep the
# value they already have.
# NOTE(review): the zero-payment rows shown earlier carry numeric values in
# the date columns, which suggests shifted CSV fields — confirm the component
# columns are trustworthy for those rows.
payment_is_zero = col("total_payment_recieved") == 0.0
payment_components_sum = (
    col("total_principal_recieved")
    + col("total_interest_recieved")
    + col("total_late_fee_recieved")
)

loan_repayments_payments_df = loan_repayments_ingested.withColumn(
    "total_payment_recieved",
    when(payment_is_zero, payment_components_sum).otherwise(
        col("total_payment_recieved")
    ),
)

loan_repayments_payments_df.show()

+--------+------------------------+-----------------------+-----------------------+----------------------+-------------------+-----------------+-----------------+--------------------+
|      id|total_principal_recieved|total_interest_recieved|total_late_fee_recieved|total_payment_recieved|last_payment_amount|last_payment_date|next_payment_date|         ingest_date|
+--------+------------------------+-----------------------+-----------------------+----------------------+-------------------+-----------------+-----------------+--------------------+
|68407277|                  3600.0|                 821.72|                    0.0|              4421.724|             122.67|         Jan-2019|             NULL|2023-12-20 08:12:...|
|68355089|                 24700.0|                 979.66|                    0.0|              25679.66|             926.35|         Jun-2016|             NULL|2023-12-20 08:12:...|
|68341763|                 20000.0|                2705.92|                    0

In [0]:
# Spot-check two of the previously inconsistent loans after the recomputation.
loan_repayments_payments_df.where("id in (61400928, 1064185)").show()

+--------+------------------------+-----------------------+-----------------------+----------------------+-------------------+------------------+-----------------+--------------------+
|      id|total_principal_recieved|total_interest_recieved|total_late_fee_recieved|total_payment_recieved|last_payment_amount| last_payment_date|next_payment_date|         ingest_date|
+--------+------------------------+-----------------------+-----------------------+----------------------+-------------------+------------------+-----------------+--------------------+
|61400928|                 4053.98|                1233.28|                 814.56|               6101.82|               NULL|361.10519999940004|           228.25|2023-12-20 08:12:...|
| 1064185|                11600.98|               11600.98|                10000.0|              33201.96|                0.0|               0.0|         Dec-2014|2023-12-20 08:12:...|
+--------+------------------------+-----------------------+----------------

In [0]:
# Look at rows whose principal could not be read as a number (NULL after parsing).
loan_repayments_payments_df.where("total_principal_recieved is null ").show()

+--------------------+------------------------+-----------------------+-----------------------+----------------------+-------------------+-----------------+-----------------+--------------------+
|                  id|total_principal_recieved|total_interest_recieved|total_late_fee_recieved|total_payment_recieved|last_payment_amount|last_payment_date|next_payment_date|         ingest_date|
+--------------------+------------------------+-----------------------+-----------------------+----------------------+-------------------+-----------------+-----------------+--------------------+
|Total amount fund...|                    NULL|                   NULL|                   NULL|                  NULL|               NULL|             NULL|             NULL|2023-12-20 08:13:...|
|Total amount fund...|                    NULL|                   NULL|                   NULL|                  NULL|               NULL|             NULL|             NULL|2023-12-20 08:13:...|
|Total amount fund..

In [0]:
# Amount columns that must all be populated for a row to be kept.
columns_to_check = [
    "total_principal_recieved",
    "total_interest_recieved",
    "total_late_fee_recieved",
    "total_payment_recieved",
    "last_payment_amount",
]

# Drop every row that has a NULL in any of the amount columns above.
loans_repay_filtered_df = loan_repayments_payments_df.dropna(subset=columns_to_check)
loans_repay_filtered_df.show()

+--------+------------------------+-----------------------+-----------------------+----------------------+-------------------+-----------------+-----------------+--------------------+
|      id|total_principal_recieved|total_interest_recieved|total_late_fee_recieved|total_payment_recieved|last_payment_amount|last_payment_date|next_payment_date|         ingest_date|
+--------+------------------------+-----------------------+-----------------------+----------------------+-------------------+-----------------+-----------------+--------------------+
|68407277|                  3600.0|                 821.72|                    0.0|              4421.724|             122.67|         Jan-2019|             NULL|2023-12-20 08:13:...|
|68355089|                 24700.0|                 979.66|                    0.0|              25679.66|             926.35|         Jun-2016|             NULL|2023-12-20 08:13:...|
|68341763|                 20000.0|                2705.92|                    0

In [0]:
# Count the rows that still have a zero total payment after the NULL rows were dropped.
loans_repay_filtered_df.where("total_payment_recieved == 0.0").count()

949

In [0]:
# Display the remaining zero-total-payment rows.
loans_repay_filtered_df.where("total_payment_recieved == 0.0").show()

+--------+------------------------+-----------------------+-----------------------+----------------------+-------------------+-----------------+-----------------+--------------------+
|      id|total_principal_recieved|total_interest_recieved|total_late_fee_recieved|total_payment_recieved|last_payment_amount|last_payment_date|next_payment_date|         ingest_date|
+--------+------------------------+-----------------------+-----------------------+----------------------+-------------------+-----------------+-----------------+--------------------+
|68393736|                     0.0|                    0.0|                    0.0|                   0.0|                0.0|             NULL|             NULL|2023-12-20 08:13:...|
|67789810|                     0.0|                    0.0|                    0.0|                   0.0|                0.0|             NULL|             NULL|2023-12-20 08:13:...|
|67216542|                     0.0|                    0.0|                    0

In [0]:
# Validation: no row should combine a zero total payment with a non-zero principal any more.
loans_repay_filtered_df.where(
    "total_payment_recieved == 0.0 and total_principal_recieved != 0.0"
).count()

0

In [0]:
# Validation: list any row with a NULL total payment but a non-zero principal.
loans_repay_filtered_df.where(
    "total_payment_recieved is null and total_principal_recieved != 0.0"
).show()

+---+------------------------+-----------------------+-----------------------+----------------------+-------------------+-----------------+-----------------+-----------+
| id|total_principal_recieved|total_interest_recieved|total_late_fee_recieved|total_payment_recieved|last_payment_amount|last_payment_date|next_payment_date|ingest_date|
+---+------------------------+-----------------------+-----------------------+----------------------+-------------------+-----------------+-----------------+-----------+
+---+------------------------+-----------------------+-----------------------+----------------------+-------------------+-----------------+-----------------+-----------+



In [0]:
# Count rows whose last_payment_date (a string column) compares equal to the
# numeric literal 0.0, i.e. holds a zero placeholder instead of a date.
loans_repay_filtered_df.where("last_payment_date == 0.0").count()

48

In [0]:
# Replace the zero placeholder in last_payment_date with NULL and leave every
# other value untouched.
# otherwise() must be given a Column: a bare string is interpreted as a
# literal value, which would overwrite every non-placeholder row with the
# text "last_payment_date" instead of keeping the original date.
loans_last_payment = loans_repay_filtered_df.withColumn(
    "last_payment_date",
    when(
        (col("last_payment_date") == 0.0)
        ,None
    ).otherwise(col("last_payment_date"))
)

In [0]:
# Re-check: count rows of loans_last_payment that still match the zero placeholder in last_payment_date.
loans_last_payment.where("last_payment_date == 0.0").count()

0

In [0]:
# Count rows whose next_payment_date compares equal to 0.0 (same placeholder check as for last_payment_date).
loans_last_payment.where("next_payment_date == 0.0").count()

24

In [0]:
# Replace the zero placeholder with NULL in BOTH payment-date columns.
# This frame is the one written to staging. Deriving it from
# loans_repay_filtered_df while cleaning only next_payment_date would silently
# drop the last_payment_date cleanup, so both columns are handled here.
# otherwise() must be given a Column: a bare string is interpreted as a
# literal value and would overwrite every remaining row with the column's
# name instead of keeping the original date.
loans_next_payment = loans_repay_filtered_df
for date_column in ("last_payment_date", "next_payment_date"):
    loans_next_payment = loans_next_payment.withColumn(
        date_column,
        when(col(date_column) == 0.0, None).otherwise(col(date_column)),
    )

In [0]:
# Re-check: count rows of loans_next_payment that still match the zero placeholder in next_payment_date.
loans_next_payment.where("next_payment_date == 0.0").count()

0

In [0]:
# Persist the cleaned repayments to the staging area as Parquet, replacing
# whatever a previous run left at that location.
(
    loans_next_payment.write
    .format("parquet")
    .mode("overwrite")
    .save("/mnt/lending_club/staging/loans_repayments/non_partitioned/parquet")
)

In [0]:
# Persist the same frame as CSV alongside the Parquet copy, replacing any
# previous output.
# NOTE(review): no "header" option is set, so the files are written without
# a header row — confirm downstream readers expect that.
(
    loans_next_payment.write
    .format("csv")
    .mode("overwrite")
    .save("/mnt/lending_club/staging/loans_repayments/non_partitioned/csv")
)