# Lending Club Project Customers Data Cleaning

In [85]:
from pyspark.sql import SparkSession
import getpass
username=getpass.getuser()
spark=SparkSession. \
builder. \
config('spark.ui.port','0'). \
config('spark.shuffle.useOldFetchProtocol','true'). \
config("spark.sql.warehouse.dir",f"/user/{username}/warehouse"). \
enableHiveSupport(). \
master('yarn'). \
getOrCreate()

In [86]:
spark

In [87]:
customers_raw_df=spark.read \
.format("csv") \
.option("header",True) \
.option("inferSchema",True) \
.load("/user/itv007473/lendingclubproject/raw/customers_data_csv")

In [32]:
customers_raw_df

member_id,emp_title,emp_length,home_ownership,annual_inc,addr_state,zip_code,country,grade,sub_grade,verification_status,tot_hi_cred_lim,application_type,annual_inc_joint,verification_status_joint
4480925324607267c...,IT,3 years,RENT,92000.0,NJ,088xx,USA,E,E3,Verified,132523.0,Individual,,
b54711d4a553ea330...,District Manager,7 years,RENT,45000.0,TX,787xx,USA,C,C4,Verified,33247.0,Individual,,
db06b45a938f1a3b5...,Case Management,6 years,RENT,33000.0,NJ,070xx,USA,D,D3,Not Verified,34825.0,Individual,,
ad9d9524477e85c11...,senior financial ...,< 1 year,RENT,80000.0,NJ,079xx,USA,A,A4,Source Verified,81859.0,Individual,,
c67f6ac3fea6ef46d...,Senior Analyst H....,6 years,RENT,30000.0,FL,328xx,USA,B,B3,Source Verified,30825.0,Individual,,
bb36e2cb69517fac3...,Owner Realtor,1 year,MORTGAGE,40000.0,WI,549xx,USA,E,E3,Not Verified,186722.0,Individual,,
af69a7dff814fb213...,nurse,10+ years,MORTGAGE,45000.0,TX,791xx,USA,C,C5,Not Verified,261804.0,Individual,,
c9c794b5025e14a7d...,Assistant IT Dire...,2 years,MORTGAGE,76500.0,UT,840xx,USA,D,D5,Source Verified,67822.0,Individual,,
61b48d763bd82369a...,Network Provision...,3 years,RENT,98000.0,CA,950xx,USA,B,B2,Source Verified,100609.0,Individual,,
adc390ceaa6428ba4...,,,MORTGAGE,38964.0,MO,650xx,USA,C,C2,Source Verified,18400.0,Individual,,


In [33]:
customers_raw_df.printSchema()

root
 |-- member_id: string (nullable = true)
 |-- emp_title: string (nullable = true)
 |-- emp_length: string (nullable = true)
 |-- home_ownership: string (nullable = true)
 |-- annual_inc: string (nullable = true)
 |-- addr_state: string (nullable = true)
 |-- zip_code: string (nullable = true)
 |-- country: string (nullable = true)
 |-- grade: string (nullable = true)
 |-- sub_grade: string (nullable = true)
 |-- verification_status: string (nullable = true)
 |-- tot_hi_cred_lim: double (nullable = true)
 |-- application_type: string (nullable = true)
 |-- annual_inc_joint: string (nullable = true)
 |-- verification_status_joint: string (nullable = true)



### Providing our own Schema

In [88]:
customers_schema="member_id string, emp_title string, emp_length string, home_ownership string, annual_inc float, addr_state string,zip_code string, country string, grade string, sub_grade string,verification_status string,tot_hi_cred_lim float,application_type string,annual_inc_joint float,verification_status_joint string"

In [89]:
customers_raw_df=spark.read \
.format("csv") \
.option("header",True) \
.schema(customers_schema) \
.load("/user/itv007473/lendingclubproject/raw/customers_data_csv")

In [36]:
customers_raw_df.printSchema()

root
 |-- member_id: string (nullable = true)
 |-- emp_title: string (nullable = true)
 |-- emp_length: string (nullable = true)
 |-- home_ownership: string (nullable = true)
 |-- annual_inc: float (nullable = true)
 |-- addr_state: string (nullable = true)
 |-- zip_code: string (nullable = true)
 |-- country: string (nullable = true)
 |-- grade: string (nullable = true)
 |-- sub_grade: string (nullable = true)
 |-- verification_status: string (nullable = true)
 |-- tot_hi_cred_lim: float (nullable = true)
 |-- application_type: string (nullable = true)
 |-- annual_inc_joint: float (nullable = true)
 |-- verification_status_joint: string (nullable = true)



In [90]:
 customers_df_renamed=customers_raw_df.withColumnRenamed("annual_inc","annual_income") \
.withColumnRenamed("addr_state","address_state") \
.withColumnRenamed("zip_code","address_zipcode") \
.withColumnRenamed("country","address_country") \
.withColumnRenamed("tot_hi_cred_lim","total_high_credit_limit") \
.withColumnRenamed("annual_inc_joint","join_annual_income")

In [38]:
customers_df_renamed

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,join_annual_income,verification_status_joint
4480925324607267c...,IT,3 years,RENT,92000.0,NJ,088xx,USA,E,E3,Verified,132523.0,Individual,,
b54711d4a553ea330...,District Manager,7 years,RENT,45000.0,TX,787xx,USA,C,C4,Verified,33247.0,Individual,,
db06b45a938f1a3b5...,Case Management,6 years,RENT,33000.0,NJ,070xx,USA,D,D3,Not Verified,34825.0,Individual,,
ad9d9524477e85c11...,senior financial ...,< 1 year,RENT,80000.0,NJ,079xx,USA,A,A4,Source Verified,81859.0,Individual,,
c67f6ac3fea6ef46d...,Senior Analyst H....,6 years,RENT,30000.0,FL,328xx,USA,B,B3,Source Verified,30825.0,Individual,,
bb36e2cb69517fac3...,Owner Realtor,1 year,MORTGAGE,40000.0,WI,549xx,USA,E,E3,Not Verified,186722.0,Individual,,
af69a7dff814fb213...,nurse,10+ years,MORTGAGE,45000.0,TX,791xx,USA,C,C5,Not Verified,261804.0,Individual,,
c9c794b5025e14a7d...,Assistant IT Dire...,2 years,MORTGAGE,76500.0,UT,840xx,USA,D,D5,Source Verified,67822.0,Individual,,
61b48d763bd82369a...,Network Provision...,3 years,RENT,98000.0,CA,950xx,USA,B,B2,Source Verified,100609.0,Individual,,
adc390ceaa6428ba4...,,,MORTGAGE,38964.0,MO,650xx,USA,C,C2,Source Verified,18400.0,Individual,,


### Add a Column

In [91]:
from pyspark.sql.functions import current_timestamp

In [92]:
customers_df_ingestd=customers_df_renamed.withColumn("ingest_date",current_timestamp())

In [42]:
customers_df_ingestd

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,join_annual_income,verification_status_joint,ingest_date
4480925324607267c...,IT,3 years,RENT,92000.0,NJ,088xx,USA,E,E3,Verified,132523.0,Individual,,,2023-11-05 05:04:...
b54711d4a553ea330...,District Manager,7 years,RENT,45000.0,TX,787xx,USA,C,C4,Verified,33247.0,Individual,,,2023-11-05 05:04:...
db06b45a938f1a3b5...,Case Management,6 years,RENT,33000.0,NJ,070xx,USA,D,D3,Not Verified,34825.0,Individual,,,2023-11-05 05:04:...
ad9d9524477e85c11...,senior financial ...,< 1 year,RENT,80000.0,NJ,079xx,USA,A,A4,Source Verified,81859.0,Individual,,,2023-11-05 05:04:...
c67f6ac3fea6ef46d...,Senior Analyst H....,6 years,RENT,30000.0,FL,328xx,USA,B,B3,Source Verified,30825.0,Individual,,,2023-11-05 05:04:...
bb36e2cb69517fac3...,Owner Realtor,1 year,MORTGAGE,40000.0,WI,549xx,USA,E,E3,Not Verified,186722.0,Individual,,,2023-11-05 05:04:...
af69a7dff814fb213...,nurse,10+ years,MORTGAGE,45000.0,TX,791xx,USA,C,C5,Not Verified,261804.0,Individual,,,2023-11-05 05:04:...
c9c794b5025e14a7d...,Assistant IT Dire...,2 years,MORTGAGE,76500.0,UT,840xx,USA,D,D5,Source Verified,67822.0,Individual,,,2023-11-05 05:04:...
61b48d763bd82369a...,Network Provision...,3 years,RENT,98000.0,CA,950xx,USA,B,B2,Source Verified,100609.0,Individual,,,2023-11-05 05:04:...
adc390ceaa6428ba4...,,,MORTGAGE,38964.0,MO,650xx,USA,C,C2,Source Verified,18400.0,Individual,,,2023-11-05 05:04:...


### Removing Duplicates

In [93]:
customers_df_ingestd.count()

2260701

In [94]:
customers_distinct=customers_df_ingestd.distinct()

In [95]:
customers_distinct.count()

2260638

### Create Spark Sql Table

In [96]:
customers_distinct.createOrReplaceTempView("customers")

In [97]:
spark.sql("select * from customers")

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,join_annual_income,verification_status_joint,ingest_date
3fbac57a2ba1bfcae...,Advocacy Officer,4 years,RENT,70000.0,NY,113xx,USA,B,B3,Verified,157658.0,Individual,,,2023-11-05 05:47:...
49c1064ca72f95720...,Receiving supervisor,10+ years,OWN,43000.0,OK,730xx,USA,C,C3,Source Verified,162587.0,Individual,,,2023-11-05 05:47:...
817c9fe0c5c8e6119...,Fitness Professional,2 years,RENT,65000.0,NY,115xx,USA,C,C3,Source Verified,21700.0,Individual,,,2023-11-05 05:47:...
28c92ace7cb3cd670...,Appointment Cente...,2 years,MORTGAGE,32000.0,MN,550xx,USA,B,B2,Source Verified,152255.0,Individual,,,2023-11-05 05:47:...
674b5fd76cdecf666...,Linguist,10+ years,OWN,80000.0,TX,761xx,USA,A,A1,Not Verified,86534.0,Individual,,,2023-11-05 05:47:...
22e8cb697453736ff...,maint.mech/indust...,6 years,MORTGAGE,50000.0,OH,446xx,USA,C,C3,Source Verified,37667.0,Individual,,,2023-11-05 05:47:...
79377392eb6baa3fd...,Regional Sales Ma...,10+ years,MORTGAGE,130000.0,NC,282xx,USA,B,B5,Not Verified,436574.0,Individual,,,2023-11-05 05:47:...
ebaea37d84d95f03e...,Server/Manager,6 years,RENT,35442.0,CA,910xx,USA,D,D2,Source Verified,26644.0,Individual,,,2023-11-05 05:47:...
8c11fde5c57d0a921...,Machine Operator,< 1 year,RENT,52000.0,MA,016xx,USA,C,C5,Source Verified,51452.0,Individual,,,2023-11-05 05:47:...
0839ffefa9f5e7e9f...,,,MORTGAGE,24000.0,IL,629xx,USA,E,E1,Verified,8673.0,Individual,,,2023-11-05 05:47:...


### Remove the rows where annual income is null

In [98]:
spark.sql("select count(*) from customers where annual_income is null")

count(1)
5


In [99]:
customers_income_filtered=spark.sql("select * from customers where annual_income is not null")

In [51]:
customers_income_filtered

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,join_annual_income,verification_status_joint,ingest_date
9db1b6a319edcceba...,United Food and C...,10+ years,MORTGAGE,86044.0,FL,335xx,USA,A,A3,Not Verified,,Individual,,,2023-11-05 05:11:...
96998a6be3dfda426...,Creighton School ...,3 years,MORTGAGE,45000.0,AZ,853xx,USA,A,A4,Not Verified,,Individual,,,2023-11-05 05:11:...
aee5dcf9f22fb8668...,S. C. Johnson & S...,9 years,RENT,69000.0,WI,534xx,USA,C,C4,Not Verified,,Individual,,,2023-11-05 05:11:...
a4594bad6a582a57e...,USDA,7 years,MORTGAGE,108000.0,NY,130xx,USA,C,C3,Verified,,Individual,,,2023-11-05 05:11:...
e2f77c99640cc917d...,Bargain Beachwear,2 years,MORTGAGE,27000.0,SC,295xx,USA,B,B4,Verified,,Individual,,,2023-11-05 05:11:...
e92488d4b1495a739...,CareerBuilder.com,2 years,RENT,75000.0,IL,606xx,USA,B,B3,Verified,,Individual,,,2023-11-05 05:11:...
40cecd807c162b0c5...,Nature Publishing...,4 years,RENT,54000.0,NY,112xx,USA,A,A4,Not Verified,,Individual,,,2023-11-05 05:11:...
cfe8a666052c69bc1...,US Govt,10+ years,MORTGAGE,130000.0,IL,601xx,USA,C,C3,Verified,,Individual,,,2023-11-05 05:11:...
b39886a37f766ac60...,411 Signs & Graphics,4 years,RENT,20000.0,NJ,080xx,USA,C,C2,Not Verified,,Individual,,,2023-11-05 05:11:...
e33981d8eb35b5388...,Charter Township ...,5 years,MORTGAGE,68004.0,MI,481xx,USA,B,B2,Source Verified,,Individual,,,2023-11-05 05:11:...


In [100]:
customers_income_filtered.createOrReplaceTempView("customers")

In [53]:
spark.sql("select * from customers")

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,join_annual_income,verification_status_joint,ingest_date
a7a482cf17ff6ce11...,Colorado Ballet,5 years,RENT,47004.0,debt_consolidation,hopefully without...,USA,B,B5,Not Verified,,0.0,,Individual,2023-11-05 05:12:...
f6b275ff35a9084d9...,Progressive Insur...,3 years,MORTGAGE,51000.0,NJ,089xx,USA,D,D3,Not Verified,,Individual,,,2023-11-05 05:12:...
919fc600fad7f4fec...,Boscov's Departme...,10+ years,MORTGAGE,60500.0,PA,196xx,USA,A,A5,Source Verified,,Individual,,,2023-11-05 05:12:...
8f917d2960999d61f...,HEALTH CARE SERVI...,< 1 year,RENT,70000.0,IL,606xx,USA,B,B3,Not Verified,,Individual,,,2023-11-05 05:12:...
709a15d4c3ea54a82...,TM Floyd & Company,1 year,MORTGAGE,108000.0,FL,327xx,USA,A,A3,Verified,,Individual,,,2023-11-05 05:12:...
93f70919c60cda11f...,Richfield Public ...,6 years,RENT,54924.0,MN,554xx,USA,B,B1,Not Verified,,Individual,,,2023-11-05 05:12:...
3e30ba8f0df169108...,,< 1 year,OTHER,35000.0,CA,917xx,USA,B,B3,Verified,,Individual,,,2023-11-05 05:12:...
2660d62ce57ee47fe...,Oak Patch Gifts,10+ years,MORTGAGE,40000.0,OK,731xx,USA,B,B5,Not Verified,,Individual,,,2023-11-05 05:12:...
eedad14031a4d9d77...,"eScreen, Inc.",2 years,RENT,42000.0,MI,482xx,USA,C,C3,Not Verified,,Individual,,,2023-11-05 05:12:...
6d3b250959883cce5...,Service Managemen...,3 years,MORTGAGE,69000.0,MO,640xx,USA,D,D2,Not Verified,,Individual,,,2023-11-05 05:12:...


### Checking emp_length column and removing the characters and keeping only digits

In [101]:
spark.sql("select distinct(emp_length) from customers").show()

+----------+
|emp_length|
+----------+
|   9 years|
|   5 years|
|      null|
|    1 year|
|   2 years|
|   7 years|
|   8 years|
|   4 years|
|   6 years|
|   3 years|
| 10+ years|
|  < 1 year|
+----------+



In [102]:
from pyspark.sql.functions import regexp_replace, col 

In [103]:
customers_emplength_cleanded=customers_income_filtered.withColumn("emp_length", regexp_replace(col("emp_length"),"(\D)",""))

In [60]:
customers_emplength_cleanded

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,join_annual_income,verification_status_joint,ingest_date
bcead26174c709d34...,MARKETING,3,MORTGAGE,68000.0,CA,944xx,USA,A,A3,Not Verified,136601.0,Individual,,,2023-11-05 05:24:...
abc1d7e82ebc6b4b7...,Analyst,10,MORTGAGE,63000.0,CA,934xx,USA,D,D2,Verified,84719.0,Individual,,,2023-11-05 05:24:...
89eab9cb0a978638c...,Operations Manager,6,MORTGAGE,38000.0,FL,324xx,USA,B,B5,Not Verified,95152.0,Individual,,,2023-11-05 05:24:...
be1bcb54c57893b5f...,Sales Manager,3,RENT,78000.0,NJ,074xx,USA,C,C1,Verified,62500.0,Individual,,,2023-11-05 05:24:...
0c93eb5d799e16770...,supervisor,10,MORTGAGE,64000.0,MI,481xx,USA,C,C4,Verified,115952.0,Individual,,,2023-11-05 05:24:...
5e884d808cb2a7e16...,Front Office Support,3,OWN,32000.0,OR,972xx,USA,D,D1,Source Verified,20170.0,Individual,,,2023-11-05 05:24:...
4f36604b9c7b860a7...,Installation Depl...,2,RENT,45000.0,OK,735xx,USA,D,D3,Source Verified,82113.0,Individual,,,2023-11-05 05:24:...
213ec4504d57f0e88...,Machine Operator,10,OWN,40000.0,SC,294xx,USA,A,A4,Not Verified,111407.0,Individual,,,2023-11-05 05:24:...
939ab1fada1a36310...,Paramedic,10,MORTGAGE,58545.0,DE,199xx,USA,C,C3,Not Verified,419539.0,Individual,,,2023-11-05 05:24:...
f315159247c71db70...,Executive Director,10,RENT,52000.0,CA,957xx,USA,B,B2,Verified,27211.0,Individual,,,2023-11-05 05:24:...


In [104]:
customers_emplength_casted= customers_emplength_cleanded.withColumn("emp_length",customers_emplength_cleanded.emp_length.cast('int'))

In [62]:
customers_emplength_casted.printSchema()

root
 |-- member_id: string (nullable = true)
 |-- emp_title: string (nullable = true)
 |-- emp_length: integer (nullable = true)
 |-- home_ownership: string (nullable = true)
 |-- annual_income: float (nullable = true)
 |-- address_state: string (nullable = true)
 |-- address_zipcode: string (nullable = true)
 |-- address_country: string (nullable = true)
 |-- grade: string (nullable = true)
 |-- sub_grade: string (nullable = true)
 |-- verification_status: string (nullable = true)
 |-- total_high_credit_limit: float (nullable = true)
 |-- application_type: string (nullable = true)
 |-- join_annual_income: float (nullable = true)
 |-- verification_status_joint: string (nullable = true)
 |-- ingest_date: timestamp (nullable = false)



In [105]:
customers_emplength_casted.filter("emp_length is null").count()

146903

### Replace the nulls with avg of the emp_length column

In [106]:
customers_emplength_casted.createOrReplaceTempView("customers")

In [67]:
spark.sql("select * from customers")

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,join_annual_income,verification_status_joint,ingest_date
809731e41fc6493e4...,Investment Bankin...,1.0,RENT,120000.0,NJ,073xx,USA,C,C1,Verified,82602.0,Individual,,,2023-11-05 05:32:...
9c9fd069b759d39e0...,Medical resident,3.0,RENT,60000.0,OH,452xx,USA,F,F4,Source Verified,81634.0,Individual,,,2023-11-05 05:32:...
6be1b2015ea4ea3b2...,LPN,2.0,RENT,40000.0,CT,067xx,USA,D,D4,Source Verified,40261.0,Individual,,,2023-11-05 05:32:...
6a2c42af661cdfc85...,Supervisor,10.0,MORTGAGE,70000.0,OH,450xx,USA,B,B2,Not Verified,193674.0,Individual,,,2023-11-05 05:32:...
f89c4d5afb7b3bf3c...,supervisor,10.0,RENT,90000.0,UT,841xx,USA,C,C1,Source Verified,54848.0,Individual,,,2023-11-05 05:32:...
be4b34601923b54d0...,Teacher,10.0,RENT,75000.0,CA,902xx,USA,E,E4,Verified,114769.0,Individual,,,2023-11-05 05:32:...
85779692f6fd492f4...,Sales,1.0,RENT,80000.0,NC,276xx,USA,E,E3,Source Verified,35600.0,Individual,,,2023-11-05 05:32:...
dc1dde480fa48b517...,CSR,5.0,OWN,35000.0,MN,557xx,USA,E,E1,Not Verified,39743.0,Individual,,,2023-11-05 05:32:...
ed407599c1517a4bf...,Senior Pastor,10.0,MORTGAGE,153000.0,AR,725xx,USA,D,D2,Source Verified,673302.0,Individual,,,2023-11-05 05:32:...
0990f8e25ffe374a8...,preowned sales,1.0,MORTGAGE,65000.0,LA,701xx,USA,A,A2,Source Verified,319241.0,Individual,,,2023-11-05 05:32:...


In [107]:
avg_emp_length=spark.sql("select floor(avg(emp_length)) as avg_emp_length from customers").collect()

In [74]:
print(avg_emp_length)

[Row(avg_emp_length=6)]


In [108]:
avg_emp_duration=avg_emp_length[0][0]

In [76]:
avg_emp_duration

6

In [109]:
customers_emplength_replaced=customers_emplength_casted.na.fill(avg_emp_duration,subset=['emp_length'])

In [78]:
customers_emplength_replaced

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,join_annual_income,verification_status_joint,ingest_date
3adcb5aa46ce8fb9e...,Asst to Director,10,MORTGAGE,88000.0,SC,292xx,USA,A,A4,Verified,331200.0,Individual,,,2023-11-05 05:41:...
2433b70ca21663225...,Registered Nurse,10,MORTGAGE,70000.0,TX,774xx,USA,C,C1,Verified,323649.0,Joint App,210000.0,Source Verified,2023-11-05 05:41:...
86b24bbcd76340cab...,vice president,10,RENT,210000.0,IL,606xx,USA,A,A2,Not Verified,216985.0,Individual,,,2023-11-05 05:41:...
a8b02e2eea51b6f7f...,porter,6,RENT,58000.0,NY,104xx,USA,B,B2,Not Verified,48000.0,Individual,,,2023-11-05 05:41:...
97a3e5a1b43b784db...,Health Informatio...,10,MORTGAGE,45000.0,CT,060xx,USA,C,C5,Verified,200034.0,Individual,,,2023-11-05 05:41:...
f4ed0559d964c2584...,,6,MORTGAGE,0.0,CA,945xx,USA,A,A3,Not Verified,555806.0,Joint App,170000.0,Not Verified,2023-11-05 05:41:...
7594975a70260b374...,Mailroom Specialist,1,RENT,23000.0,KS,672xx,USA,A,A5,Source Verified,44128.0,Individual,,,2023-11-05 05:41:...
fbb6b74469915f8ed...,Senior Administra...,10,MORTGAGE,73000.0,TX,770xx,USA,D,D2,Verified,11462.0,Individual,,,2023-11-05 05:41:...
cbe7ea1d899bf6bab...,Mechanic,2,OWN,70000.0,VA,238xx,USA,A,A4,Not Verified,99843.0,Individual,,,2023-11-05 05:41:...
518dc8a1c31234a8d...,Custodian,1,RENT,60000.0,MD,217xx,USA,A,A5,Not Verified,72900.0,Individual,,,2023-11-05 05:41:...


In [110]:
customers_emplength_replaced.filter("emp_length is null").count()

0

In [111]:
customers_emplength_replaced.createOrReplaceTempView("customers")

In [112]:
spark.sql("select distinct(address_state) from customers")

address_state
Helping Kenya's D...
223xx
175 (total projec...
SC
AZ
"so Plan """"C"""" is ..."
I am 56 yrs. old ...
financially I mad...
but no one will l...
LA


In [113]:
spark.sql("select count(address_state) from customers where length(address_state)>2")

count(address_state)
254


In [114]:
from pyspark.sql.functions import when, col, length

In [116]:
customers_state_cleaned=customers_emplength_replaced.withColumn(
"address_state",
    when(length(col("address_state"))>2, "NA").otherwise(col("address_state"))
)

In [117]:
customers_state_cleaned

member_id,emp_title,emp_length,home_ownership,annual_income,address_state,address_zipcode,address_country,grade,sub_grade,verification_status,total_high_credit_limit,application_type,join_annual_income,verification_status_joint,ingest_date
c72d77107216d1eb8...,foreman,10,MORTGAGE,37500.0,AR,726xx,USA,C,C5,Not Verified,71208.0,Joint App,52000.0,Not Verified,2023-11-05 05:56:...
09deadc5e8eb0654e...,Transportation,6,MORTGAGE,65000.0,TX,760xx,USA,B,B2,Verified,188848.0,Individual,,,2023-11-05 05:56:...
0c880e940eb9a9519...,Tutor,1,MORTGAGE,10000.0,FL,324xx,USA,C,C1,Not Verified,307761.0,Joint App,135000.0,Not Verified,2023-11-05 05:56:...
85c5304ec95e3974e...,Brand Specialist,3,MORTGAGE,83000.0,MD,207xx,USA,C,C4,Source Verified,65069.0,Individual,,,2023-11-05 05:56:...
3c3892cda6eaa00f2...,pharmacist,3,MORTGAGE,212000.0,CO,810xx,USA,B,B2,Source Verified,656131.0,Individual,,,2023-11-05 05:56:...
e900e6093290ea6e1...,Financial services,6,MORTGAGE,154500.0,CO,804xx,USA,B,B4,Source Verified,512001.0,Individual,,,2023-11-05 05:56:...
916d22beaa6e464de...,Instructor,8,MORTGAGE,85000.0,MI,499xx,USA,B,B2,Source Verified,300818.0,Individual,,,2023-11-05 05:56:...
264708b38c4bd0eb5...,,6,RENT,25000.0,WV,249xx,USA,C,C4,Not Verified,26998.0,Individual,,,2023-11-05 05:56:...
7f27d410408e51405...,Finance Manager,10,RENT,485000.0,CA,941xx,USA,C,C1,Verified,319002.0,Individual,,,2023-11-05 05:56:...
9de010e60b1eb05fd...,Receptioist,10,RENT,39000.0,LA,701xx,USA,C,C3,Source Verified,73454.0,Individual,,,2023-11-05 05:56:...


In [118]:
customers_state_cleaned.select("address_state").distinct()

address_state
AZ
SC
LA
MN
NJ
DC
OR
""
VA
""


In [120]:
customers_state_cleaned.write \
.format("parquet") \
.mode("overwrite") \
.option("path","/user/itv007473/lendingclubproject/cleaned/customers_parquet").save()

In [121]:
customers_state_cleaned.write \
.option("header",True) \
.format("csv") \
.mode("overwrite") \
.option("path","/user/itv007473/lendingclubproject/cleaned/customers_csv").save()