In [1]:
from pyspark.sql import SparkSession
import getpass
username = getpass.getuser()
spark = SparkSession. \
builder. \
config('spark.ui.port', '0'). \
config("spark.sql.warehouse.dir", f"/user/{username}/warehouse"). \
config('spark.shuffle.useOldFetchProtocol', 'true'). \
config('spark.app.name', 'itv012041_Customers_Data_Cleanup'). \
enableHiveSupport(). \
master('yarn'). \
getOrCreate()

In [2]:
spark

In [3]:
customer_schema = """member_id string, emp_title string, emp_length string, 
home_ownership string, annual_inc float, addr_state string, 
zip_code string, country string, grade string, 
sub_grade string, verification_status string, 
tot_hi_cred_lim float, application_type string, 
annual_inc_joint float, verification_status_joint string"""

In [4]:
customers_raw_df = spark.read \
.format("csv") \
.option("header", True) \
.schema(customer_schema) \
.load("/user/itv012041/lendingclubproject/raw/customers_data_csv")

In [5]:
customers_raw_df

member_id,emp_title,emp_length,home_ownership,annual_inc,addr_state,zip_code,country,grade,sub_grade,verification_status,tot_hi_cred_lim,application_type,annual_inc_joint,verification_status_joint
bbf87639f02df9e0f...,Crothall Services...,3 years,RENT,28685.0,NY,104xx,USA,C,C1,Not Verified,,Individual,,
1166edd4433e2c5e6...,PharMerica,1 year,RENT,111000.0,TX,757xx,USA,B,B3,Not Verified,,Individual,,
85ac408940a41c483...,Robert Half Inter...,3 years,RENT,38000.0,CA,956xx,USA,A,A5,Not Verified,,Individual,,
b967e90649c13df8c...,walmart,2 years,RENT,17000.0,NY,130xx,USA,D,D1,Not Verified,,Individual,,
cf201bf3b979cc53f...,Delloite,10+ years,MORTGAGE,500000.0,GA,300xx,USA,C,C4,Not Verified,,Individual,,
fc5352bf3df1141df...,Hightower Investm...,3 years,OWN,65000.0,NY,100xx,USA,B,B5,Not Verified,,Individual,,
cf5f3e25690264e2f...,double barrel env...,2 years,MORTGAGE,48000.0,CA,933xx,USA,B,B5,Not Verified,,Individual,,
c3c8ed21681fdf8c1...,e-Dialog,3 years,MORTGAGE,68500.0,MA,021xx,USA,D,D4,Not Verified,,Individual,,
d01624b43a8b3d161...,City of Hialeah,10+ years,MORTGAGE,80000.0,FL,331xx,USA,D,D3,Not Verified,,Individual,,
ab0496cfe8100126c...,AARP Services,4 years,MORTGAGE,120000.0,VA,201xx,USA,D,D5,Verified,,Individual,,


In [6]:
customers_raw_df.printSchema()

root
 |-- member_id: string (nullable = true)
 |-- emp_title: string (nullable = true)
 |-- emp_length: string (nullable = true)
 |-- home_ownership: string (nullable = true)
 |-- annual_inc: float (nullable = true)
 |-- addr_state: string (nullable = true)
 |-- zip_code: string (nullable = true)
 |-- country: string (nullable = true)
 |-- grade: string (nullable = true)
 |-- sub_grade: string (nullable = true)
 |-- verification_status: string (nullable = true)
 |-- tot_hi_cred_lim: float (nullable = true)
 |-- application_type: string (nullable = true)
 |-- annual_inc_joint: float (nullable = true)
 |-- verification_status_joint: string (nullable = true)



In [7]:
customers_renamed_df = customers_raw_df.withColumnRenamed("annual_inc", "annual_income") \
.withColumnRenamed("addr_state", "address_state") \
.withColumnRenamed("zip_code", "address_zipcode") \
.withColumnRenamed("country", "address_country") \
.withColumnRenamed("tot_hi_credit_lim", "total_high_credit_limit") \
.withColumnRenamed("annual_inc_joint", "joint_annual_income")

In [8]:
customers_renamed_df

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,tot_hi_cred_lim,application_type,joint_annual_income,verification_status_joint
bbf87639f02df9e0f...,Crothall Services...,3 years,RENT,28685.0,NY,104xx,USA,C,C1,Not Verified,,Individual,,
1166edd4433e2c5e6...,PharMerica,1 year,RENT,111000.0,TX,757xx,USA,B,B3,Not Verified,,Individual,,
85ac408940a41c483...,Robert Half Inter...,3 years,RENT,38000.0,CA,956xx,USA,A,A5,Not Verified,,Individual,,
b967e90649c13df8c...,walmart,2 years,RENT,17000.0,NY,130xx,USA,D,D1,Not Verified,,Individual,,
cf201bf3b979cc53f...,Delloite,10+ years,MORTGAGE,500000.0,GA,300xx,USA,C,C4,Not Verified,,Individual,,
fc5352bf3df1141df...,Hightower Investm...,3 years,OWN,65000.0,NY,100xx,USA,B,B5,Not Verified,,Individual,,
cf5f3e25690264e2f...,double barrel env...,2 years,MORTGAGE,48000.0,CA,933xx,USA,B,B5,Not Verified,,Individual,,
c3c8ed21681fdf8c1...,e-Dialog,3 years,MORTGAGE,68500.0,MA,021xx,USA,D,D4,Not Verified,,Individual,,
d01624b43a8b3d161...,City of Hialeah,10+ years,MORTGAGE,80000.0,FL,331xx,USA,D,D3,Not Verified,,Individual,,
ab0496cfe8100126c...,AARP Services,4 years,MORTGAGE,120000.0,VA,201xx,USA,D,D5,Verified,,Individual,,


In [9]:
from pyspark.sql.functions import current_timestamp
customers_df_with_ingest_date = customers_renamed_df.withColumn("ingest_date", current_timestamp())

In [10]:
customers_df_with_ingest_date

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,tot_hi_cred_lim,application_type,joint_annual_income,verification_status_joint,ingest_date
bbf87639f02df9e0f...,Crothall Services...,3 years,RENT,28685.0,NY,104xx,USA,C,C1,Not Verified,,Individual,,,2024-12-12 10:14:...
1166edd4433e2c5e6...,PharMerica,1 year,RENT,111000.0,TX,757xx,USA,B,B3,Not Verified,,Individual,,,2024-12-12 10:14:...
85ac408940a41c483...,Robert Half Inter...,3 years,RENT,38000.0,CA,956xx,USA,A,A5,Not Verified,,Individual,,,2024-12-12 10:14:...
b967e90649c13df8c...,walmart,2 years,RENT,17000.0,NY,130xx,USA,D,D1,Not Verified,,Individual,,,2024-12-12 10:14:...
cf201bf3b979cc53f...,Delloite,10+ years,MORTGAGE,500000.0,GA,300xx,USA,C,C4,Not Verified,,Individual,,,2024-12-12 10:14:...
fc5352bf3df1141df...,Hightower Investm...,3 years,OWN,65000.0,NY,100xx,USA,B,B5,Not Verified,,Individual,,,2024-12-12 10:14:...
cf5f3e25690264e2f...,double barrel env...,2 years,MORTGAGE,48000.0,CA,933xx,USA,B,B5,Not Verified,,Individual,,,2024-12-12 10:14:...
c3c8ed21681fdf8c1...,e-Dialog,3 years,MORTGAGE,68500.0,MA,021xx,USA,D,D4,Not Verified,,Individual,,,2024-12-12 10:14:...
d01624b43a8b3d161...,City of Hialeah,10+ years,MORTGAGE,80000.0,FL,331xx,USA,D,D3,Not Verified,,Individual,,,2024-12-12 10:14:...
ab0496cfe8100126c...,AARP Services,4 years,MORTGAGE,120000.0,VA,201xx,USA,D,D5,Verified,,Individual,,,2024-12-12 10:14:...


In [11]:
customers_df_with_ingest_date.count()

2260701

In [12]:
customers_distinct = customers_df_with_ingest_date.distinct()

In [13]:
customers_distinct.createOrReplaceTempView("customers")

In [14]:
spark.sql("select * from customers")

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,tot_hi_cred_lim,application_type,joint_annual_income,verification_status_joint,ingest_date
3cf9329afc524db70...,The Medicine Shoppe,4 years,OWN,15000.0,CA,930xx,USA,B,B4,Not Verified,,Individual,,,2024-12-12 10:14:...
1a4da9f27d4b476e5...,GTS,< 1 year,MORTGAGE,43680.0,IL,601xx,USA,C,C2,Not Verified,,Individual,,,2024-12-12 10:14:...
9985708558f0dd08b...,Lanai High & Elem...,8 years,RENT,15000.0,HI,967xx,USA,C,C2,Not Verified,,Individual,,,2024-12-12 10:14:...
5d12019fc2f2e29dc...,Sensis Corporation,2 years,RENT,103241.0,NY,132xx,USA,B,B1,Not Verified,,Individual,,,2024-12-12 10:14:...
65edbeffa728c7912...,Temecula Creek Ch...,3 years,RENT,42000.0,CA,925xx,USA,C,C3,Not Verified,,Individual,,,2024-12-12 10:14:...
59c539d9f9252c012...,,< 1 year,MORTGAGE,67500.0,324xx,Trying P2P Borrowing,USA,A,A2,Not Verified,0.0,1.0,,,2024-12-12 10:14:...
33d891bc01647cc23...,Brandon Harley Da...,4 years,RENT,384000.0,FL,335xx,USA,E,E2,Not Verified,,Individual,,,2024-12-12 10:14:...
8b38830b3b0fe63ac...,BuckleySandler LLP,4 years,OWN,170000.0,MD,208xx,USA,B,B5,Not Verified,,Individual,,,2024-12-12 10:14:...
24a3fbb044a3ff34a...,American Public M...,< 1 year,RENT,65000.0,MN,554xx,USA,F,F3,Not Verified,,Individual,,,2024-12-12 10:14:...
2bd27580b51910f20...,,1 year,RENT,32000.0,TX,775xx,USA,A,A4,Not Verified,,Individual,,,2024-12-12 10:14:...


In [15]:
spark.sql("select * from customers where annual_income is null")

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,tot_hi_cred_lim,application_type,joint_annual_income,verification_status_joint,ingest_date
392650c3bcd78274d...,,< 1 year,NONE,,NY,100xx,USA,A,A3,Not Verified,,Individual,,,2024-12-12 10:15:...
6653dca72da571005...,,< 1 year,NONE,,NY,100xx,USA,A,A5,Not Verified,,Individual,,,2024-12-12 10:15:...
77f5f41c3ddd56543...,"""Coil Winder """"B""""","reactors""",2 years,,531xx,Other,USA,B,B4,38000.0,0.0,1.0,,,2024-12-12 10:15:...
e3b0c44298fc1c149...,,,,,,,USA,,,,,,,,2024-12-12 10:15:...
6332caa680e7500fb...,,< 1 year,NONE,,NY,100xx,USA,A,A2,Not Verified,,Individual,,,2024-12-12 10:15:...


In [16]:
customers_income_filtered = spark.sql("select * from customers where annual_income is not null")

In [17]:
customers_income_filtered.createOrReplaceTempView("customers_income_filtered")
spark.sql("select count(*) from customers_income_filtered where annual_income is null")

count(1)
0


In [18]:
spark.sql("select distinct(emp_length) from customers_income_filtered")

emp_length
5 years
9 years
""
1 year
2 years
7 years
8 years
4 years
6 years
3 years


In [19]:
from pyspark.sql.functions import regexp_replace, col

In [20]:
customers_emp_length_cleaned = customers_income_filtered.withColumn("emp_length", 
                                                                    regexp_replace(
                                                                        col("emp_length"), 
                                                                        "(\D)", ""))

In [21]:
customers_emp_length_cleaned

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,tot_hi_cred_lim,application_type,joint_annual_income,verification_status_joint,ingest_date
9b4c6e1fa91564fd1...,Senior Customer S...,10,MORTGAGE,118000.0,TX,773xx,USA,C,C1,Source Verified,228695.0,Individual,,,2024-12-12 10:15:...
b5f6c0714525c3263...,insulator,1,MORTGAGE,90000.0,RI,028xx,USA,C,C1,Verified,246593.0,Individual,,,2024-12-12 10:15:...
c328f514f0d675e51...,Medical Records/P...,4,OWN,75000.0,AR,722xx,USA,B,B2,Source Verified,70500.0,Individual,,,2024-12-12 10:15:...
ff255a474a2342099...,Director of Schoo...,8,RENT,70000.0,CA,917xx,USA,B,B3,Source Verified,14000.0,Individual,,,2024-12-12 10:15:...
f0a2f3491dcb2fe9f...,SECURITY PROFESSI...,8,RENT,38000.0,CA,940xx,USA,E,E1,Verified,47579.0,Individual,,,2024-12-12 10:15:...
f2e0145b714f72de6...,Teacher,4,RENT,38500.0,VA,234xx,USA,B,B4,Source Verified,60534.0,Individual,,,2024-12-12 10:15:...
54ba039a1cc9f9d1f...,Lpn,10,MORTGAGE,53800.0,MN,553xx,USA,C,C1,Not Verified,157479.0,Individual,,,2024-12-12 10:15:...
d3e0c3a19e947c54a...,Administrative As...,10,MORTGAGE,43000.0,GA,310xx,USA,B,B2,Source Verified,186608.0,Individual,,,2024-12-12 10:15:...
d64d07a5d175eb270...,Sales manager,1,OWN,126000.0,ND,585xx,USA,C,C4,Source Verified,79548.0,Individual,,,2024-12-12 10:15:...
c083be2f70001a747...,machine operator,7,MORTGAGE,52860.0,NY,146xx,USA,B,B2,Not Verified,138368.0,Individual,,,2024-12-12 10:15:...


In [22]:
customers_emp_length_cleaned.printSchema()

root
 |-- member_id: string (nullable = true)
 |-- emp_title: string (nullable = true)
 |-- emp_length: string (nullable = true)
 |-- home_ownership: string (nullable = true)
 |-- annual_income: float (nullable = true)
 |-- address_state: string (nullable = true)
 |-- address_zipcode: string (nullable = true)
 |-- address_country: string (nullable = true)
 |-- grade: string (nullable = true)
 |-- sub_grade: string (nullable = true)
 |-- verification_status: string (nullable = true)
 |-- tot_hi_cred_lim: float (nullable = true)
 |-- application_type: string (nullable = true)
 |-- joint_annual_income: float (nullable = true)
 |-- verification_status_joint: string (nullable = true)
 |-- ingest_date: timestamp (nullable = false)



In [23]:
customers_emp_length_casted = customers_emp_length_cleaned.withColumn("emp_length", customers_emp_length_cleaned.emp_length.cast('int'))

In [24]:
customers_emp_length_casted

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,tot_hi_cred_lim,application_type,joint_annual_income,verification_status_joint,ingest_date
553df42851a40d185...,Spivak Lipton LLP,10,OWN,66150.0,NY,112xx,USA,C,C1,Not Verified,,Individual,,,2024-12-12 10:15:...
67b033264ca0263a4...,E2 Inc.,4,RENT,62500.0,VA,229xx,USA,B,B2,Not Verified,,Individual,,,2024-12-12 10:15:...
9db01b50290493c94...,Matrix Resources,9,MORTGAGE,100000.0,GA,300xx,USA,A,A3,Not Verified,,Individual,,,2024-12-12 10:15:...
be36d0cb69bb6a768...,British Motors,4,RENT,72000.0,CA,949xx,USA,C,C4,Not Verified,,Individual,,,2024-12-12 10:15:...
9c8ade856ec9e7a2f...,"Soldier Capital, LLC",1,RENT,84000.0,NJ,070xx,USA,D,D2,Not Verified,,Individual,,,2024-12-12 10:15:...
7cfacd9a93884dfe7...,Boeing,10,MORTGAGE,92000.0,AL,357xx,USA,E,E2,Verified,,Individual,,,2024-12-12 10:15:...
f7cead1c1282f92e4...,,1,OWN,200000.0,MA,024xx,USA,D,D2,Verified,,Individual,,,2024-12-12 10:15:...
89d65d245751401c5...,Howard Dental Group,1,RENT,120000.0,GA,314xx,USA,A,A2,Not Verified,,Individual,,,2024-12-12 10:15:...
cbae860ce12b719c7...,Pelham School Dis...,1,RENT,55000.0,NY,104xx,USA,B,B5,Not Verified,,Individual,,,2024-12-12 10:15:...
86846d1901bf9cf7a...,Highland Theatres,10,RENT,83200.0,CA,902xx,USA,D,D3,Verified,,Individual,,,2024-12-12 10:15:...


In [25]:
customers_emp_length_casted.printSchema()

root
 |-- member_id: string (nullable = true)
 |-- emp_title: string (nullable = true)
 |-- emp_length: integer (nullable = true)
 |-- home_ownership: string (nullable = true)
 |-- annual_income: float (nullable = true)
 |-- address_state: string (nullable = true)
 |-- address_zipcode: string (nullable = true)
 |-- address_country: string (nullable = true)
 |-- grade: string (nullable = true)
 |-- sub_grade: string (nullable = true)
 |-- verification_status: string (nullable = true)
 |-- tot_hi_cred_lim: float (nullable = true)
 |-- application_type: string (nullable = true)
 |-- joint_annual_income: float (nullable = true)
 |-- verification_status_joint: string (nullable = true)
 |-- ingest_date: timestamp (nullable = false)



In [26]:
customers_emp_length_casted.filter("emp_length is null").count()

146903

In [27]:
customers_emp_length_casted.createOrReplaceTempView("customers_emp_length_casted")

In [28]:
avg_emp_length = spark.sql("select floor(avg(emp_length)) as avg_emp_length from customers_emp_length_casted").collect()

In [29]:
print(avg_emp_length)

[Row(avg_emp_length=6)]


In [30]:
avg_employment_length = avg_emp_length[0][0]

In [31]:
avg_employment_length

6

In [32]:
customers_emp_length_replaced = customers_emp_length_casted.na.fill(avg_employment_length, subset=['emp_length'])

In [33]:
customers_emp_length_replaced

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,tot_hi_cred_lim,application_type,joint_annual_income,verification_status_joint,ingest_date
3ef0c414327b234b8...,Producer,8,MORTGAGE,45000.0,FL,322xx,USA,E,E5,Source Verified,98500.0,Individual,,,2024-12-12 10:16:...
a2c38280e0b8330ef...,Seamstress,2,RENT,25000.0,OH,432xx,USA,C,C1,Verified,92734.0,Individual,,,2024-12-12 10:16:...
7a7b53896eb082ff1...,Service Manger,1,MORTGAGE,87000.0,IL,604xx,USA,C,C3,Source Verified,230086.0,Individual,,,2024-12-12 10:16:...
5484abfabd3c11221...,Operation Manager,1,MORTGAGE,70000.0,TX,785xx,USA,D,D1,Source Verified,9286.0,Individual,,,2024-12-12 10:16:...
6a51e670937a8e03e...,Packer/order puller,1,RENT,21200.0,CA,913xx,USA,D,D5,Verified,31167.0,Individual,,,2024-12-12 10:16:...
326ef319ec5a2928b...,Doorman,6,RENT,82440.0,NY,104xx,USA,B,B5,Verified,21200.0,Individual,,,2024-12-12 10:16:...
8fabcdf5dd46c92f1...,Electrical appren...,3,RENT,40320.0,MD,208xx,USA,C,C3,Source Verified,14071.0,Individual,,,2024-12-12 10:16:...
65526144d655c3118...,Dispatcher,4,RENT,40000.0,FL,327xx,USA,D,D2,Source Verified,59103.0,Individual,,,2024-12-12 10:16:...
5236ac81e883c8e18...,registered nurse,7,MORTGAGE,95000.0,CA,917xx,USA,D,D2,Source Verified,525749.0,Individual,,,2024-12-12 10:16:...
55968feb4db0e2a11...,Professor,3,RENT,75000.0,IL,606xx,USA,E,E1,Source Verified,192648.0,Individual,,,2024-12-12 10:16:...


In [34]:
customers_emp_length_replaced.filter("emp_length is null").count()

0

In [35]:
customers_emp_length_replaced.createOrReplaceTempView("customers_emp_length_replaced")

In [37]:
spark.sql("select count(address_state) from customers_emp_length_replaced where length(address_state) > 2")

count(address_state)
254


In [38]:
from pyspark.sql.functions import when, col, length

In [40]:
customers_state_cleaned = customers_emp_length_replaced.withColumn(
    "address_state", 
    when(length(col("address_state")) > 2, "NA").otherwise(col("address_state"))
)

In [41]:
customers_state_cleaned

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,tot_hi_cred_lim,application_type,joint_annual_income,verification_status_joint,ingest_date
446196d7b12508af7...,,6,MORTGAGE,60000.0,MN,550xx,USA,C,C1,Verified,211000.0,Individual,,,2024-12-12 10:30:...
a04d3bd8853ee9091...,Fireman's Fund In...,1,OWN,35000.0,MO,633xx,USA,E,E5,Verified,42926.0,Individual,,,2024-12-12 10:30:...
1e015ea585e9061b3...,Tufts Health Plan,1,MORTGAGE,65000.0,MA,021xx,USA,B,B1,Not Verified,267883.0,Individual,,,2024-12-12 10:30:...
af4afe11742622395...,Dunkin Donuts,10,RENT,56160.0,CT,060xx,USA,C,C5,Not Verified,39759.0,Individual,,,2024-12-12 10:30:...
7daa0b03228588cdc...,,6,RENT,15995.0,OH,437xx,USA,D,D5,Verified,13900.0,Individual,,,2024-12-12 10:30:...
12887da8b12f8a721...,Concur Technologies,3,MORTGAGE,150000.0,WA,980xx,USA,B,B2,Verified,293801.0,Individual,,,2024-12-12 10:30:...
2fa324bbfec2e4a14...,Intersil,10,MORTGAGE,75000.0,CA,945xx,USA,B,B2,Verified,284680.0,Individual,,,2024-12-12 10:30:...
32b5a4c4b2dce7c32...,Mile High Organics,3,RENT,33000.0,CO,803xx,USA,C,C2,Not Verified,59897.0,Individual,,,2024-12-12 10:30:...
4da10e92478b8bf22...,valley hospital a...,4,MORTGAGE,75000.0,NV,891xx,USA,A,A5,Not Verified,214394.0,Individual,,,2024-12-12 10:30:...
8db97fcd1784116a4...,Mashburn Trans. S...,10,RENT,70000.0,CA,932xx,USA,C,C1,Verified,59355.0,Individual,,,2024-12-12 10:30:...


In [42]:
customers_state_cleaned.select("address_state").distinct

<bound method DataFrame.distinct of +-------------+
|address_state|
+-------------+
|           GA|
|           TX|
|           CT|
|           CA|
|           CO|
|           MI|
|           IL|
|           VA|
|           TX|
|           AZ|
|           NY|
|           NV|
|           TN|
|           NJ|
|           AL|
|           OH|
|           MS|
|           NY|
|           NY|
|           FL|
+-------------+
only showing top 20 rows
>

In [45]:
customers_state_cleaned.write \
.option("header", True) \
.format("parquet") \
.mode("overwrite") \
.option("path", "/user/itv012041/lendingclubproject/cleaned/customers_parquet") \
.save()