In [1]:
from pyspark.sql import SparkSession
spark=SparkSession.\
builder.\
config('spark.shuffle.useOldFetchProtocol','true').\
config('spark.ui.port','0').\
config("spark.sql.warehouse.dir","/user/itv012713/warehouse").\
enableHiveSupport().\
master('yarn').\
getOrCreate()



In [2]:
from pyspark.sql.functions import current_timestamp, regexp_replace, col, when, count

In [3]:
loans_schema='loan_id string,member_id string,loan_amount float,funded_amount float,loan_term_months string,interest_rate float,monthly_installment float,issue_date string,loan_status string,loan_purpose string,loan_title string'

In [4]:
loans_raw_df = spark.read.format("csv").option("header",True).schema(loans_schema).load("/user/itv012713/lendingclubproject/raw/loans_data_csv")

In [5]:
loans_raw_df

loan_id,member_id,loan_amount,funded_amount,loan_term_months,interest_rate,monthly_installment,issue_date,loan_status,loan_purpose,loan_title
118184277,707271898dcabc8b2...,4500.0,4500.0,36 months,7.07,139.1,Sep-2017,Fully Paid,debt_consolidation,Debt consolidation
117289602,8e1ea10aca3c4ad8f...,4000.0,4000.0,36 months,9.93,128.94,Sep-2017,Current,credit_card,Credit card refin...
118216186,1d6546a2cbc1fd240...,10000.0,10000.0,36 months,10.91,326.97,Sep-2017,Current,debt_consolidation,Debt consolidation
117584790,d6208beced388988f...,20000.0,20000.0,36 months,12.62,670.23,Sep-2017,Fully Paid,home_improvement,Home improvement
118208834,b4af936688c28c165...,15000.0,15000.0,36 months,7.97,469.84,Sep-2017,Current,debt_consolidation,Debt consolidation
117566588,2c04e047879ada04e...,21000.0,21000.0,60 months,13.59,484.19,Sep-2017,Current,home_improvement,Home improvement
118221134,39dfcd293cb7b2c17...,10000.0,10000.0,60 months,15.05,238.17,Sep-2017,Current,debt_consolidation,Debt consolidation
118183021,5e6e1f8ad59c71a0b...,5600.0,5600.0,36 months,5.32,168.65,Sep-2017,Current,debt_consolidation,Debt consolidation
118209005,afd3b57e55eb95ed8...,16000.0,16000.0,36 months,7.21,495.58,Sep-2017,Current,home_improvement,Home improvement
118214952,8b5eed45ac53a0238...,7000.0,7000.0,36 months,7.97,219.26,Sep-2017,Current,car,Car financing


In [8]:
loans_raw_df.printSchema()

root
 |-- loan_id: string (nullable = true)
 |-- member_id: string (nullable = true)
 |-- loan_amount: float (nullable = true)
 |-- funded_amount: float (nullable = true)
 |-- loan_term_months: string (nullable = true)
 |-- interest_rate: float (nullable = true)
 |-- monthly_installment: float (nullable = true)
 |-- issue_date: string (nullable = true)
 |-- loan_status: string (nullable = true)
 |-- loan_purpose: string (nullable = true)
 |-- loan_title: string (nullable = true)



## Inserting Ingestion Date for data processing

In [6]:
loans_df_ingest_date = loans_raw_df.withColumn("ingest_date", current_timestamp())

In [7]:
loans_df_ingest_date.createOrReplaceTempView("loans")

## Dropping rows having null values

In [8]:
spark.sql("select count(*) from loans")

count(1)
2260701


In [9]:
spark.sql("select count(*) from loans where loan_amount is null")

count(1)
33


In [10]:
columns_to_check=["loan_amount","funded_amount","loan_term_months","interest_rate","monthly_installment","issue_date","loan_status","loan_purpose"]

In [11]:
loans_filtered_df = loans_df_ingest_date.na.drop(subset = columns_to_check)

In [12]:
loans_filtered_df.count()

2260667

In [13]:
loans_filtered_df.createOrReplaceTempView("loans")

In [14]:
spark.sql("select * from loans")

loan_id,member_id,loan_amount,funded_amount,loan_term_months,interest_rate,monthly_installment,issue_date,loan_status,loan_purpose,loan_title,ingest_date
118184277,707271898dcabc8b2...,4500.0,4500.0,36 months,7.07,139.1,Sep-2017,Fully Paid,debt_consolidation,Debt consolidation,2024-08-10 15:17:...
117289602,8e1ea10aca3c4ad8f...,4000.0,4000.0,36 months,9.93,128.94,Sep-2017,Current,credit_card,Credit card refin...,2024-08-10 15:17:...
118216186,1d6546a2cbc1fd240...,10000.0,10000.0,36 months,10.91,326.97,Sep-2017,Current,debt_consolidation,Debt consolidation,2024-08-10 15:17:...
117584790,d6208beced388988f...,20000.0,20000.0,36 months,12.62,670.23,Sep-2017,Fully Paid,home_improvement,Home improvement,2024-08-10 15:17:...
118208834,b4af936688c28c165...,15000.0,15000.0,36 months,7.97,469.84,Sep-2017,Current,debt_consolidation,Debt consolidation,2024-08-10 15:17:...
117566588,2c04e047879ada04e...,21000.0,21000.0,60 months,13.59,484.19,Sep-2017,Current,home_improvement,Home improvement,2024-08-10 15:17:...
118221134,39dfcd293cb7b2c17...,10000.0,10000.0,60 months,15.05,238.17,Sep-2017,Current,debt_consolidation,Debt consolidation,2024-08-10 15:17:...
118183021,5e6e1f8ad59c71a0b...,5600.0,5600.0,36 months,5.32,168.65,Sep-2017,Current,debt_consolidation,Debt consolidation,2024-08-10 15:17:...
118209005,afd3b57e55eb95ed8...,16000.0,16000.0,36 months,7.21,495.58,Sep-2017,Current,home_improvement,Home improvement,2024-08-10 15:17:...
118214952,8b5eed45ac53a0238...,7000.0,7000.0,36 months,7.97,219.26,Sep-2017,Current,car,Car financing,2024-08-10 15:17:...


## Converting loan_term_months to loan_term_years

In [15]:
loans_term_modified_df = loans_filtered_df \
.withColumn("loan_term_months", (regexp_replace(col("loan_term_months"), "months", "") \
.cast('int')/12).cast('int')).withColumnRenamed("loan_term_months", "loan_term_years")

In [16]:
loans_term_modified_df.printSchema()

root
 |-- loan_id: string (nullable = true)
 |-- member_id: string (nullable = true)
 |-- loan_amount: float (nullable = true)
 |-- funded_amount: float (nullable = true)
 |-- loan_term_years: integer (nullable = true)
 |-- interest_rate: float (nullable = true)
 |-- monthly_installment: float (nullable = true)
 |-- issue_date: string (nullable = true)
 |-- loan_status: string (nullable = true)
 |-- loan_purpose: string (nullable = true)
 |-- loan_title: string (nullable = true)
 |-- ingest_date: timestamp (nullable = false)



In [17]:
loans_term_modified_df.createOrReplaceTempView("loans")

In [18]:
spark.sql("select * from loans")

loan_id,member_id,loan_amount,funded_amount,loan_term_years,interest_rate,monthly_installment,issue_date,loan_status,loan_purpose,loan_title,ingest_date
118184277,707271898dcabc8b2...,4500.0,4500.0,3,7.07,139.1,Sep-2017,Fully Paid,debt_consolidation,Debt consolidation,2024-08-10 15:18:...
117289602,8e1ea10aca3c4ad8f...,4000.0,4000.0,3,9.93,128.94,Sep-2017,Current,credit_card,Credit card refin...,2024-08-10 15:18:...
118216186,1d6546a2cbc1fd240...,10000.0,10000.0,3,10.91,326.97,Sep-2017,Current,debt_consolidation,Debt consolidation,2024-08-10 15:18:...
117584790,d6208beced388988f...,20000.0,20000.0,3,12.62,670.23,Sep-2017,Fully Paid,home_improvement,Home improvement,2024-08-10 15:18:...
118208834,b4af936688c28c165...,15000.0,15000.0,3,7.97,469.84,Sep-2017,Current,debt_consolidation,Debt consolidation,2024-08-10 15:18:...
117566588,2c04e047879ada04e...,21000.0,21000.0,5,13.59,484.19,Sep-2017,Current,home_improvement,Home improvement,2024-08-10 15:18:...
118221134,39dfcd293cb7b2c17...,10000.0,10000.0,5,15.05,238.17,Sep-2017,Current,debt_consolidation,Debt consolidation,2024-08-10 15:18:...
118183021,5e6e1f8ad59c71a0b...,5600.0,5600.0,3,5.32,168.65,Sep-2017,Current,debt_consolidation,Debt consolidation,2024-08-10 15:18:...
118209005,afd3b57e55eb95ed8...,16000.0,16000.0,3,7.21,495.58,Sep-2017,Current,home_improvement,Home improvement,2024-08-10 15:18:...
118214952,8b5eed45ac53a0238...,7000.0,7000.0,3,7.97,219.26,Sep-2017,Current,car,Car financing,2024-08-10 15:18:...


## Classifying important loan purposes and remaining ones as other

In [19]:
spark.sql("select loan_purpose, count(*) as total from loans group by loan_purpose order by total desc")

loan_purpose,total
debt_consolidation,1277790
credit_card,516926
home_improvement,150440
other,139413
major_purchase,50429
medical,27481
small_business,24659
car,24009
vacation,15525
moving,15402


In [20]:
loan_purpose_lookup=["debt_consolidation","credit_card","home_improvement","other","major_purchase","medical","small_business",
                     "car","vacation","moving","house","wedding","renewable_energy","educational"]

In [21]:
loans_purpose_modified = loans_term_modified_df.withColumn("loan_purpose", when(col("loan_purpose") \
                        .isin(loan_purpose_lookup), col("loan_purpose")).otherwise("other"))

In [22]:
loans_purpose_modified

loan_id,member_id,loan_amount,funded_amount,loan_term_years,interest_rate,monthly_installment,issue_date,loan_status,loan_purpose,loan_title,ingest_date
118184277,707271898dcabc8b2...,4500.0,4500.0,3,7.07,139.1,Sep-2017,Fully Paid,debt_consolidation,Debt consolidation,2024-08-10 15:18:...
117289602,8e1ea10aca3c4ad8f...,4000.0,4000.0,3,9.93,128.94,Sep-2017,Current,credit_card,Credit card refin...,2024-08-10 15:18:...
118216186,1d6546a2cbc1fd240...,10000.0,10000.0,3,10.91,326.97,Sep-2017,Current,debt_consolidation,Debt consolidation,2024-08-10 15:18:...
117584790,d6208beced388988f...,20000.0,20000.0,3,12.62,670.23,Sep-2017,Fully Paid,home_improvement,Home improvement,2024-08-10 15:18:...
118208834,b4af936688c28c165...,15000.0,15000.0,3,7.97,469.84,Sep-2017,Current,debt_consolidation,Debt consolidation,2024-08-10 15:18:...
117566588,2c04e047879ada04e...,21000.0,21000.0,5,13.59,484.19,Sep-2017,Current,home_improvement,Home improvement,2024-08-10 15:18:...
118221134,39dfcd293cb7b2c17...,10000.0,10000.0,5,15.05,238.17,Sep-2017,Current,debt_consolidation,Debt consolidation,2024-08-10 15:18:...
118183021,5e6e1f8ad59c71a0b...,5600.0,5600.0,3,5.32,168.65,Sep-2017,Current,debt_consolidation,Debt consolidation,2024-08-10 15:18:...
118209005,afd3b57e55eb95ed8...,16000.0,16000.0,3,7.21,495.58,Sep-2017,Current,home_improvement,Home improvement,2024-08-10 15:18:...
118214952,8b5eed45ac53a0238...,7000.0,7000.0,3,7.97,219.26,Sep-2017,Current,car,Car financing,2024-08-10 15:18:...


In [23]:
loans_purpose_modified.createOrReplaceTempView("loans")

In [25]:
spark.sql("select loan_purpose, count(*) as total from loans group by loan_purpose order by total desc")

loan_purpose,total
debt_consolidation,1277790
credit_card,516926
home_improvement,150440
other,139667
major_purchase,50429
medical,27481
small_business,24659
car,24009
vacation,15525
moving,15402


In [26]:
loans_purpose_modified.groupBy("loan_purpose").agg(count("*").alias("total")).orderBy(col("total").desc())

loan_purpose,total
debt_consolidation,1277790
credit_card,516926
home_improvement,150440
other,139667
major_purchase,50429
medical,27481
small_business,24659
car,24009
vacation,15525
moving,15402


## Writing the data back to hdfs in parquet and csv format for further use

In [28]:
loans_purpose_modified.write \
.option("header", True) \
.format("csv") \
.mode("overwrite") \
.option("path", "/user/itv012713/lendingclubproject/clean_data/loans_cleaned_csv").save()

In [29]:
loans_purpose_modified.write \
.option("header", True) \
.format("parquet") \
.mode("overwrite") \
.option("path", "/user/itv012713/lendingclubproject/clean_data/loans_cleaned").save()