In [44]:
from pyspark.sql import *
import getpass
# Current OS user; used below to build the per-user warehouse directory.
username = getpass.getuser()

# Create (or reuse) a Hive-enabled Spark session that runs on YARN.
spark = (
    SparkSession.builder
    .appName('aggregations')
    .config('spark.ui.port', '0')
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .master('yarn')
    .getOrCreate()
)

In [43]:
# The Spark session is deliberately NOT stopped here: every cell below uses `spark`,
# so stopping it at this point makes a top-to-bottom run fail on the first read.
# Stop the session only after the notebook's final write has completed.

# Cleaning customer data

In [45]:
# DDL-style schema for the raw customers CSV, one column per literal.
# Adjacent string literals are concatenated into a single schema string.
customer_schema = (
    "member_id string, "
    "emp_title string, "
    "emp_length string, "
    "home_ownership string, "
    "annual_inc float, "
    "addr_state string,"
    "zip_code string, "
    "country string, "
    "grade string, "
    "sub_grade string, "
    "verification_status string, "
    "tot_hi_cred_lim float, "
    "application_type string, "
    "annual_inc_joint float, "
    "verification_status_joint string"
)

In [46]:
# Load the raw customers CSV with the explicit schema; the first line of each file is a header.
customers_raw_df = (
    spark.read
    .schema(customer_schema)
    .option("header", True)
    .csv("/user/itv012667/lendingclub/raw/customers_data_csv")
)

In [47]:
# Preview the raw customers DataFrame as this cell's output.
customers_raw_df

member_id,emp_title,emp_length,home_ownership,annual_inc,addr_state,zip_code,country,grade,sub_grade,verification_status,tot_hi_cred_lim,application_type,annual_inc_joint,verification_status_joint
b59d80da191f5b573...,,,RENT,50000.0,OR,973xx,USA,A,A5,Source Verified,8600.0,Individual,,
202d9f56ecb7c3bc9...,police officer,7 years,OWN,85000.0,TX,799xx,USA,A,A5,Source Verified,272384.0,Individual,,
e5a140c0922b554b9...,community living ...,6 years,RENT,48000.0,NY,146xx,USA,B,B2,Source Verified,85092.0,Individual,,
e12aefc548f750777...,Office,10+ years,OWN,33000.0,CT,067xx,USA,F,F1,Verified,7100.0,Individual,,
1b3a50d854fbbf97e...,Special Tooling I...,10+ years,MORTGAGE,81000.0,TX,791xx,USA,E,E5,Verified,190274.0,Individual,,
1c4329e5f17697127...,Mine ops tech 6,2 years,MORTGAGE,68000.0,AZ,855xx,USA,C,C3,Not Verified,182453.0,Individual,,
5026c86ad983175eb...,caregiver,4 years,RENT,76020.0,WA,993xx,USA,C,C2,Source Verified,15308.0,Individual,,
9847d8c1e9d0b2084...,,,OWN,65000.0,IL,624xx,USA,E,E3,Verified,128800.0,Individual,,
8340dbe1adea41fb4...,Vice President Re...,8 years,MORTGAGE,111000.0,CT,063xx,USA,A,A1,Not Verified,343507.0,Individual,,
d4de0de3ab7d79ad4...,FOREMAN,10+ years,MORTGAGE,67000.0,WA,992xx,USA,G,G2,Verified,211501.0,Individual,,


In [48]:
# Print the column names and types of the raw DataFrame.
customers_raw_df.printSchema()

root
 |-- member_id: string (nullable = true)
 |-- emp_title: string (nullable = true)
 |-- emp_length: string (nullable = true)
 |-- home_ownership: string (nullable = true)
 |-- annual_inc: float (nullable = true)
 |-- addr_state: string (nullable = true)
 |-- zip_code: string (nullable = true)
 |-- country: string (nullable = true)
 |-- grade: string (nullable = true)
 |-- sub_grade: string (nullable = true)
 |-- verification_status: string (nullable = true)
 |-- tot_hi_cred_lim: float (nullable = true)
 |-- application_type: string (nullable = true)
 |-- annual_inc_joint: float (nullable = true)
 |-- verification_status_joint: string (nullable = true)



# Renaming the Columns

In [52]:
# Replace the abbreviated source column names with descriptive ones.
customers_cleaned_columns = (
    customers_raw_df
    .withColumnRenamed("annual_inc", "annual_income")
    .withColumnRenamed("addr_state", "address_state")
    .withColumnRenamed("zip_code", "address_zipcode")
    .withColumnRenamed("country", "address_country")
    .withColumnRenamed("tot_hi_cred_lim", "total_high_credit_limit")
    .withColumnRenamed("annual_inc_joint", "joint_annual_income")
)

In [53]:
# Preview the DataFrame after the column renames.
customers_cleaned_columns

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,joint_annual_income,verification_status_joint
b59d80da191f5b573...,,,RENT,50000.0,OR,973xx,USA,A,A5,Source Verified,8600.0,Individual,,
202d9f56ecb7c3bc9...,police officer,7 years,OWN,85000.0,TX,799xx,USA,A,A5,Source Verified,272384.0,Individual,,
e5a140c0922b554b9...,community living ...,6 years,RENT,48000.0,NY,146xx,USA,B,B2,Source Verified,85092.0,Individual,,
e12aefc548f750777...,Office,10+ years,OWN,33000.0,CT,067xx,USA,F,F1,Verified,7100.0,Individual,,
1b3a50d854fbbf97e...,Special Tooling I...,10+ years,MORTGAGE,81000.0,TX,791xx,USA,E,E5,Verified,190274.0,Individual,,
1c4329e5f17697127...,Mine ops tech 6,2 years,MORTGAGE,68000.0,AZ,855xx,USA,C,C3,Not Verified,182453.0,Individual,,
5026c86ad983175eb...,caregiver,4 years,RENT,76020.0,WA,993xx,USA,C,C2,Source Verified,15308.0,Individual,,
9847d8c1e9d0b2084...,,,OWN,65000.0,IL,624xx,USA,E,E3,Verified,128800.0,Individual,,
8340dbe1adea41fb4...,Vice President Re...,8 years,MORTGAGE,111000.0,CT,063xx,USA,A,A1,Not Verified,343507.0,Individual,,
d4de0de3ab7d79ad4...,FOREMAN,10+ years,MORTGAGE,67000.0,WA,992xx,USA,G,G2,Verified,211501.0,Individual,,


# Add ingestion timestamp

In [54]:
from pyspark.sql.functions import *

In [55]:
# Append an `ingest_timestamp` column holding the current timestamp.
customers_with_ingestion_time = customers_cleaned_columns.withColumn(
    "ingest_timestamp",
    current_timestamp(),
)

In [56]:
# Preview the DataFrame with the new ingest_timestamp column.
customers_with_ingestion_time

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,joint_annual_income,verification_status_joint,ingest_timestamp
b59d80da191f5b573...,,,RENT,50000.0,OR,973xx,USA,A,A5,Source Verified,8600.0,Individual,,,2024-05-26 15:47:...
202d9f56ecb7c3bc9...,police officer,7 years,OWN,85000.0,TX,799xx,USA,A,A5,Source Verified,272384.0,Individual,,,2024-05-26 15:47:...
e5a140c0922b554b9...,community living ...,6 years,RENT,48000.0,NY,146xx,USA,B,B2,Source Verified,85092.0,Individual,,,2024-05-26 15:47:...
e12aefc548f750777...,Office,10+ years,OWN,33000.0,CT,067xx,USA,F,F1,Verified,7100.0,Individual,,,2024-05-26 15:47:...
1b3a50d854fbbf97e...,Special Tooling I...,10+ years,MORTGAGE,81000.0,TX,791xx,USA,E,E5,Verified,190274.0,Individual,,,2024-05-26 15:47:...
1c4329e5f17697127...,Mine ops tech 6,2 years,MORTGAGE,68000.0,AZ,855xx,USA,C,C3,Not Verified,182453.0,Individual,,,2024-05-26 15:47:...
5026c86ad983175eb...,caregiver,4 years,RENT,76020.0,WA,993xx,USA,C,C2,Source Verified,15308.0,Individual,,,2024-05-26 15:47:...
9847d8c1e9d0b2084...,,,OWN,65000.0,IL,624xx,USA,E,E3,Verified,128800.0,Individual,,,2024-05-26 15:47:...
8340dbe1adea41fb4...,Vice President Re...,8 years,MORTGAGE,111000.0,CT,063xx,USA,A,A1,Not Verified,343507.0,Individual,,,2024-05-26 15:47:...
d4de0de3ab7d79ad4...,FOREMAN,10+ years,MORTGAGE,67000.0,WA,992xx,USA,G,G2,Verified,211501.0,Individual,,,2024-05-26 15:47:...


# Drop duplicate rows

In [57]:
# Keep a single copy of each fully identical row.
distinct_data = customers_with_ingestion_time.dropDuplicates()

In [58]:
# Number of rows remaining after de-duplication.
distinct_data.count()

2260638

# Remove rows where annual income is null

In [59]:
# Discard rows that have no annual income.
customer_with_annuual_income = distinct_data.where("annual_income is not null")

In [60]:
# Preview the rows that have a non-null annual income.
customer_with_annuual_income

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,joint_annual_income,verification_status_joint,ingest_timestamp
7138261c576d8d781...,Intelligence Spec...,5 years,MORTGAGE,132000.0,GA,309xx,USA,C,C3,Not Verified,280896.0,Individual,,,2024-05-26 15:48:...
bf310134bf4932d6b...,Andministrative C...,10+ years,MORTGAGE,60000.0,TX,770xx,USA,B,B2,Source Verified,122537.0,Individual,,,2024-05-26 15:48:...
52817b840418f1c32...,Bus Operator/Mech...,10+ years,MORTGAGE,49000.0,WV,258xx,USA,A,A4,Source Verified,208200.0,Individual,,,2024-05-26 15:48:...
138b92a6f81bac215...,Security Director,5 years,RENT,35000.0,FL,331xx,USA,B,B1,Not Verified,27346.0,Individual,,,2024-05-26 15:48:...
c35a61457dbd9013a...,Space Simulation ...,5 years,RENT,75000.0,CA,907xx,USA,D,D1,Verified,39760.0,Individual,,,2024-05-26 15:48:...
02a178bfaf83462c2...,corrections officer,4 years,MORTGAGE,54500.0,FL,329xx,USA,B,B3,Not Verified,198361.0,Individual,,,2024-05-26 15:48:...
88fd6f08092f2a778...,Senior Specialist,2 years,RENT,82000.0,DC,200xx,USA,A,A3,Not Verified,97320.0,Individual,,,2024-05-26 15:48:...
e080bb01c1979ed4c...,Team leader,10+ years,MORTGAGE,100000.0,OH,435xx,USA,D,D4,Verified,298994.0,Individual,,,2024-05-26 15:48:...
72701aca16ac1939b...,Office Manager,4 years,RENT,27000.0,CA,928xx,USA,D,D3,Not Verified,38650.0,Individual,,,2024-05-26 15:48:...
59100f5816491d64f...,supervisor,4 years,MORTGAGE,46000.0,MT,597xx,USA,C,C3,Verified,27799.0,Individual,,,2024-05-26 15:48:...


# Convert emp_length to an integer and fill null values with the floored mean

In [61]:
# Strip every non-digit character from emp_length, e.g. "10+ years" becomes "10".
# Values containing no digits at all end up as an empty string.
# The pattern is a raw string: in a normal literal "\D" is an invalid escape
# sequence, which Python reports with a warning and plans to reject.
customers_with_emp_length = customer_with_annuual_income.withColumn(
    "emp_length",
    regexp_replace(col("emp_length"), r"(\D)", ""),
)

In [80]:
# Cast the digits-only emp_length string to an integer column.
customers_with_emp_length_cleaned = customers_with_emp_length.withColumn(
    "emp_length",
    col("emp_length").cast("int"),
)

In [83]:
# Print the schema to confirm emp_length is now an integer column.
customers_with_emp_length_cleaned.printSchema()

root
 |-- member_id: string (nullable = true)
 |-- emp_title: string (nullable = true)
 |-- emp_length: integer (nullable = true)
 |-- home_ownership: string (nullable = true)
 |-- annual_income: float (nullable = true)
 |-- address_state: string (nullable = true)
 |-- address_zipcode: string (nullable = true)
 |-- address_country: string (nullable = true)
 |-- grade: string (nullable = true)
 |-- sub_grade: string (nullable = true)
 |-- verification_status: string (nullable = true)
 |-- total_high_credit_limit: float (nullable = true)
 |-- application_type: string (nullable = true)
 |-- joint_annual_income: float (nullable = true)
 |-- verification_status_joint: string (nullable = true)
 |-- ingest_timestamp: timestamp (nullable = false)



In [84]:
# One-row DataFrame holding the floored average of emp_length.
mean = (
    customers_with_emp_length_cleaned
    .agg(floor(avg("emp_length")))
)

In [85]:
# Collect the floored average employment length as a plain Python value.
# It is recomputed from the cleaned frame rather than read from the previous
# `mean`, so this cell can be re-run safely: `mean` is rebound from a DataFrame
# to a scalar here, and calling .collect() on that scalar would raise
# AttributeError on a second run.
mean = (
    customers_with_emp_length_cleaned
    .agg(floor(avg("emp_length")))
    .collect()[0][0]
)

In [86]:
# Show the Python type of the collected mean.
print(type(mean))

<class 'int'>


In [96]:
# Replace null emp_length values with the floored mean computed above.
customers_filled_emp_length = customers_with_emp_length_cleaned.na.fill(
    mean,
    subset=["emp_length"],
)

In [97]:
# Print the schema after filling the null emp_length values.
customers_filled_emp_length.printSchema()

root
 |-- member_id: string (nullable = true)
 |-- emp_title: string (nullable = true)
 |-- emp_length: integer (nullable = true)
 |-- home_ownership: string (nullable = true)
 |-- annual_income: float (nullable = true)
 |-- address_state: string (nullable = true)
 |-- address_zipcode: string (nullable = true)
 |-- address_country: string (nullable = true)
 |-- grade: string (nullable = true)
 |-- sub_grade: string (nullable = true)
 |-- verification_status: string (nullable = true)
 |-- total_high_credit_limit: float (nullable = true)
 |-- application_type: string (nullable = true)
 |-- joint_annual_income: float (nullable = true)
 |-- verification_status_joint: string (nullable = true)
 |-- ingest_timestamp: timestamp (nullable = false)



In [98]:
# Sanity check: count the rows whose emp_length is still null after the fill.
(
    customers_filled_emp_length
    .where("emp_length is null")
    .count()
)

0

# Replace state values longer than 2 characters with "NA"

In [102]:
# Any address_state longer than two characters is replaced with "NA";
# all other values are kept unchanged.
customer_state_cleaned = customers_filled_emp_length.withColumn(
    "address_state",
    when(
        length(col("address_state")) > 2,
        "NA",
    ).otherwise(col("address_state")),
)

# Save the cleaned data on disk

In [105]:
# Collapse the cleaned data to a single partition and write it as parquet,
# replacing whatever already exists at the target path.
(
    customer_state_cleaned
    .repartition(1)
    .write
    .format("parquet")
    .mode("overwrite")
    .option("path", "/user/itv012667/lendingclub/cleaneddata")
    .save()
)