# Lending Club Project Loans_repayments Data Cleaning

In [1]:
from pyspark.sql import SparkSession
import getpass
username=getpass.getuser()
spark=SparkSession. \
builder. \
config('spark.ui.port','0'). \
config('spark.shuffle.useOldFetchProtocol','true'). \
config("spark.sql.warehouse.dir",f"/user/{username}/warehouse"). \
enableHiveSupport(). \
master('yarn'). \
getOrCreate()

In [2]:
spark

In [5]:
loans_repayments_raw_df=spark.read \
.format("csv") \
.option("header",True) \
.option("inferSchema",True) \
.load("/user/itv007473/lendingclubproject/raw/loans_repayments_data_csv")

In [6]:
loans_repayments_raw_df

loan_id,total_rec_prncp,total_rec_int,total_rec_late_fee,total_pymnt,last_pymnt_amnt,last_pymnt_d,next_pymnt_d
56633077,3000.0,376.21,0.0,3376.205975527,93.74,Aug-2018,
55927518,15600.0,1956.32,0.0,17556.320693408998,487.9,Aug-2018,
56473345,20000.0,2408.94,0.0,22408.9398096902,9677.72,May-2017,
56463188,11200.0,5231.01,0.0,16431.0146429476,7475.86,Feb-2018,
56473316,5215.47,6513.51,0.0,13237.07,432.77,Nov-2017,
56663266,6477.26,4221.77,0.0,12544.05,458.45,Jun-2017,
56483027,10000.0,2062.03,0.0,12062.026276051,335.38,Aug-2018,
56613385,13932.74,11678.23,0.0,25610.97,609.46,Feb-2019,Apr-2019
56643620,16000.0,1031.67,0.0,17031.673055266598,8363.28,Mar-2017,
56533114,3478.66,3815.07,0.0,12460.86,1111.52,Mar-2016,


In [7]:
loans_repayments_raw_df.printSchema()

root
 |-- loan_id: string (nullable = true)
 |-- total_rec_prncp: string (nullable = true)
 |-- total_rec_int: string (nullable = true)
 |-- total_rec_late_fee: string (nullable = true)
 |-- total_pymnt: string (nullable = true)
 |-- last_pymnt_amnt: string (nullable = true)
 |-- last_pymnt_d: string (nullable = true)
 |-- next_pymnt_d: string (nullable = true)



In [8]:
loans_repay_schema='''loan_id string,total_principal_received float,total_interest_received
float,total_late_fee_received float,total_payment_received
float,last_payment_amount float,last_payment_date string,next_payment_date
string'''

In [9]:
loans_repayments_raw_df=spark.read \
.format("csv") \
.option("header",True) \
.schema(loans_repay_schema) \
.load("/user/itv007473/lendingclubproject/raw/loans_repayments_data_csv")

In [10]:
loans_repayments_raw_df

loan_id,total_principal_received,total_interest_received,total_late_fee_received,total_payment_received,last_payment_amount,last_payment_date,next_payment_date
56633077,3000.0,376.21,0.0,3376.206,93.74,Aug-2018,
55927518,15600.0,1956.32,0.0,17556.32,487.9,Aug-2018,
56473345,20000.0,2408.94,0.0,22408.94,9677.72,May-2017,
56463188,11200.0,5231.01,0.0,16431.014,7475.86,Feb-2018,
56473316,5215.47,6513.51,0.0,13237.07,432.77,Nov-2017,
56663266,6477.26,4221.77,0.0,12544.05,458.45,Jun-2017,
56483027,10000.0,2062.03,0.0,12062.026,335.38,Aug-2018,
56613385,13932.74,11678.23,0.0,25610.97,609.46,Feb-2019,Apr-2019
56643620,16000.0,1031.67,0.0,17031.674,8363.28,Mar-2017,
56533114,3478.66,3815.07,0.0,12460.86,1111.52,Mar-2016,


In [11]:
loans_repayments_raw_df.printSchema()

root
 |-- loan_id: string (nullable = true)
 |-- total_principal_received: float (nullable = true)
 |-- total_interest_received: float (nullable = true)
 |-- total_late_fee_received: float (nullable = true)
 |-- total_payment_received: float (nullable = true)
 |-- last_payment_amount: float (nullable = true)
 |-- last_payment_date: string (nullable = true)
 |-- next_payment_date: string (nullable = true)



In [12]:
from pyspark.sql.functions import current_timestamp

In [13]:
loans_repay_df_ingetd=loans_repayments_raw_df.withColumn("ingest_date",current_timestamp())

In [14]:
loans_repay_df_ingetd

loan_id,total_principal_received,total_interest_received,total_late_fee_received,total_payment_received,last_payment_amount,last_payment_date,next_payment_date,ingest_date
56633077,3000.0,376.21,0.0,3376.206,93.74,Aug-2018,,2023-11-05 11:56:...
55927518,15600.0,1956.32,0.0,17556.32,487.9,Aug-2018,,2023-11-05 11:56:...
56473345,20000.0,2408.94,0.0,22408.94,9677.72,May-2017,,2023-11-05 11:56:...
56463188,11200.0,5231.01,0.0,16431.014,7475.86,Feb-2018,,2023-11-05 11:56:...
56473316,5215.47,6513.51,0.0,13237.07,432.77,Nov-2017,,2023-11-05 11:56:...
56663266,6477.26,4221.77,0.0,12544.05,458.45,Jun-2017,,2023-11-05 11:56:...
56483027,10000.0,2062.03,0.0,12062.026,335.38,Aug-2018,,2023-11-05 11:56:...
56613385,13932.74,11678.23,0.0,25610.97,609.46,Feb-2019,Apr-2019,2023-11-05 11:56:...
56643620,16000.0,1031.67,0.0,17031.674,8363.28,Mar-2017,,2023-11-05 11:56:...
56533114,3478.66,3815.07,0.0,12460.86,1111.52,Mar-2016,,2023-11-05 11:56:...


In [15]:
loans_repayments_raw_df.createOrReplaceTempView("loan_repyaments")

In [16]:
spark.sql("select * from loan_repyaments")

loan_id,total_principal_received,total_interest_received,total_late_fee_received,total_payment_received,last_payment_amount,last_payment_date,next_payment_date
56633077,3000.0,376.21,0.0,3376.206,93.74,Aug-2018,
55927518,15600.0,1956.32,0.0,17556.32,487.9,Aug-2018,
56473345,20000.0,2408.94,0.0,22408.94,9677.72,May-2017,
56463188,11200.0,5231.01,0.0,16431.014,7475.86,Feb-2018,
56473316,5215.47,6513.51,0.0,13237.07,432.77,Nov-2017,
56663266,6477.26,4221.77,0.0,12544.05,458.45,Jun-2017,
56483027,10000.0,2062.03,0.0,12062.026,335.38,Aug-2018,
56613385,13932.74,11678.23,0.0,25610.97,609.46,Feb-2019,Apr-2019
56643620,16000.0,1031.67,0.0,17031.674,8363.28,Mar-2017,
56533114,3478.66,3815.07,0.0,12460.86,1111.52,Mar-2016,


In [19]:
columns_to_check =["total_principal_received","total_interest_received","total_late_fee_received", "total_payment_received","last_payment_amount"]

In [22]:
loans_repayments_raw_df.count()

2260701

In [23]:
loan_repay_filtered_df=loans_repayments_raw_df.na.drop(subset=columns_to_check)

In [24]:
loan_repay_filtered_df.count()

2260498

In [25]:
loan_repay_filtered_df.createOrReplaceTempView("loan_repayments")

In [27]:
spark.sql("select * from loan_repayments where total_payment_received = 0.0")

loan_id,total_principal_received,total_interest_received,total_late_fee_received,total_payment_received,last_payment_amount,last_payment_date,next_payment_date
56532281,0.0,0.0,0.0,0.0,0.0,,
56350750,0.0,0.0,0.0,0.0,0.0,,
56280418,0.0,0.0,0.0,0.0,0.0,,
55991277,0.0,0.0,0.0,0.0,0.0,,
55330278,0.0,0.0,0.0,0.0,0.0,,
55422987,0.0,0.0,0.0,0.0,0.0,,
55220751,0.0,0.0,0.0,0.0,0.0,,
55209541,0.0,0.0,0.0,0.0,0.0,,
55238282,0.0,0.0,0.0,0.0,0.0,,
54648144,0.0,0.0,0.0,0.0,0.0,,


In [28]:
spark.sql("select * from loan_repayments where total_payment_received = 0.0 and total_principal_received !=0.0")

loan_id,total_principal_received,total_interest_received,total_late_fee_received,total_payment_received,last_payment_amount,last_payment_date,next_payment_date
485818,14640.096,13388.84,13000.0,0.0,0.0,0.0,Mar-2013
485471,29620.818,29134.64,25000.0,0.0,0.0,0.0,Mar-2013
482256,8735.611,7479.87,8000.0,0.0,0.0,0.0,Feb-2011
478160,410.0,407.36,0.0,0.0,143.1,410.0,
476557,28865.18,24164.67,5692.31,0.0,6972.59,19916.78,Dec-2010
472516,25951.482,24731.76,25000.0,0.0,0.0,0.0,May-2010
472197,12048.13,12018.01,10000.0,0.0,0.0,0.0,Jan-2013
467364,29216.791,29066.19,24250.0,0.0,0.0,0.0,Dec-2012
399499,26557.729,26336.41,24000.0,0.0,0.0,0.0,Dec-2010
451482,7587.5513,7587.55,7000.0,0.0,0.0,0.0,Jan-2011


In [29]:
from pyspark.sql.functions import when, col

In [30]:
loan_payments_fixed_df=loan_repay_filtered_df.withColumn(
"total_payment_received",
    when(
    (col("total_principal_received") !=0.0) & (col("total_payment_received")==0.0),
        (col("total_principal_received")+col("total_interest_received")+col("total_late_fee_received"))
    ).otherwise(col("total_payment_received"))
)

In [31]:
loan_payments_fixed_df

loan_id,total_principal_received,total_interest_received,total_late_fee_received,total_payment_received,last_payment_amount,last_payment_date,next_payment_date
56633077,3000.0,376.21,0.0,3376.206,93.74,Aug-2018,
55927518,15600.0,1956.32,0.0,17556.32,487.9,Aug-2018,
56473345,20000.0,2408.94,0.0,22408.94,9677.72,May-2017,
56463188,11200.0,5231.01,0.0,16431.014,7475.86,Feb-2018,
56473316,5215.47,6513.51,0.0,13237.07,432.77,Nov-2017,
56663266,6477.26,4221.77,0.0,12544.05,458.45,Jun-2017,
56483027,10000.0,2062.03,0.0,12062.026,335.38,Aug-2018,
56613385,13932.74,11678.23,0.0,25610.97,609.46,Feb-2019,Apr-2019
56643620,16000.0,1031.67,0.0,17031.674,8363.28,Mar-2017,
56533114,3478.66,3815.07,0.0,12460.86,1111.52,Mar-2016,


In [32]:
loan_payments_fixed_df.filter("total_payment_received=0.0").count()

949

In [33]:
loan_payments_fixed_df.filter("total_payment_received=0.0")

loan_id,total_principal_received,total_interest_received,total_late_fee_received,total_payment_received,last_payment_amount,last_payment_date,next_payment_date
56532281,0.0,0.0,0.0,0.0,0.0,,
56350750,0.0,0.0,0.0,0.0,0.0,,
56280418,0.0,0.0,0.0,0.0,0.0,,
55991277,0.0,0.0,0.0,0.0,0.0,,
55330278,0.0,0.0,0.0,0.0,0.0,,
55422987,0.0,0.0,0.0,0.0,0.0,,
55220751,0.0,0.0,0.0,0.0,0.0,,
55209541,0.0,0.0,0.0,0.0,0.0,,
55238282,0.0,0.0,0.0,0.0,0.0,,
54648144,0.0,0.0,0.0,0.0,0.0,,


In [34]:
loan_payments_fixed2_df=loan_payments_fixed_df.filter("total_payment_received!=0.0")

In [35]:
loan_payments_fixed2_df

loan_id,total_principal_received,total_interest_received,total_late_fee_received,total_payment_received,last_payment_amount,last_payment_date,next_payment_date
56633077,3000.0,376.21,0.0,3376.206,93.74,Aug-2018,
55927518,15600.0,1956.32,0.0,17556.32,487.9,Aug-2018,
56473345,20000.0,2408.94,0.0,22408.94,9677.72,May-2017,
56463188,11200.0,5231.01,0.0,16431.014,7475.86,Feb-2018,
56473316,5215.47,6513.51,0.0,13237.07,432.77,Nov-2017,
56663266,6477.26,4221.77,0.0,12544.05,458.45,Jun-2017,
56483027,10000.0,2062.03,0.0,12062.026,335.38,Aug-2018,
56613385,13932.74,11678.23,0.0,25610.97,609.46,Feb-2019,Apr-2019
56643620,16000.0,1031.67,0.0,17031.674,8363.28,Mar-2017,
56533114,3478.66,3815.07,0.0,12460.86,1111.52,Mar-2016,


In [36]:
loan_payments_fixed2_df.filter("last_payment_date=0.0")

loan_id,total_principal_received,total_interest_received,total_late_fee_received,total_payment_received,last_payment_amount,last_payment_date,next_payment_date
485818,14640.096,13388.84,13000.0,41028.938,0.0,0.0,Mar-2013
485471,29620.818,29134.64,25000.0,83755.46,0.0,0.0,Mar-2013
482256,8735.611,7479.87,8000.0,24215.48,0.0,0.0,Feb-2011
480240,17457.0,71.3,33.0,16.0,14153.22,0.0,14153.22
472516,25951.482,24731.76,25000.0,75683.24,0.0,0.0,May-2010
472197,12048.13,12018.01,10000.0,34066.14,0.0,0.0,Jan-2013
467364,29216.791,29066.19,24250.0,82532.984,0.0,0.0,Dec-2012
399499,26557.729,26336.41,24000.0,76894.14,0.0,0.0,Dec-2010
455662,19137.0,42.5,28.0,9.0,13522.135,0.0,13353.11
451482,7587.5513,7587.55,7000.0,22175.102,0.0,0.0,Jan-2011


In [38]:
loans_payments_ldate_fixed_df = loan_payments_fixed2_df.withColumn(
"last_payment_date",
when(
(col("last_payment_date") == 0.0),
None
).otherwise(col("last_payment_date"))
)

In [39]:
loans_payments_ndate_fixed_df = loans_payments_ldate_fixed_df.withColumn(
"next_payment_date",
when(
(col("last_payment_date") == 0.0),
None
).otherwise(col("next_payment_date"))
)

In [40]:
loans_payments_ndate_fixed_df

loan_id,total_principal_received,total_interest_received,total_late_fee_received,total_payment_received,last_payment_amount,last_payment_date,next_payment_date
56633077,3000.0,376.21,0.0,3376.206,93.74,Aug-2018,
55927518,15600.0,1956.32,0.0,17556.32,487.9,Aug-2018,
56473345,20000.0,2408.94,0.0,22408.94,9677.72,May-2017,
56463188,11200.0,5231.01,0.0,16431.014,7475.86,Feb-2018,
56473316,5215.47,6513.51,0.0,13237.07,432.77,Nov-2017,
56663266,6477.26,4221.77,0.0,12544.05,458.45,Jun-2017,
56483027,10000.0,2062.03,0.0,12062.026,335.38,Aug-2018,
56613385,13932.74,11678.23,0.0,25610.97,609.46,Feb-2019,Apr-2019
56643620,16000.0,1031.67,0.0,17031.674,8363.28,Mar-2017,
56533114,3478.66,3815.07,0.0,12460.86,1111.52,Mar-2016,


In [41]:
loans_payments_ndate_fixed_df.write \
.option("header",True) \
.format("csv") \
.mode("overwrite") \
.option("path","lendingclubproject/cleaned/loans_repayment_csv") \
.save()

In [None]:
loans_payments_ndate_fixed_df.write \
.option("header",True) \
.format("pa") \
.mode("overwrite") \
.option("path","lendingclubproject/cleaned/loans_repayment_parquet") \
.save()