In [56]:
from pyspark.sql import SparkSession
# Read data from a CSV file
input_path = 'gs://capstone-g4/customers.csv'
customers_df = spark.read.csv(input_path, header=True, inferSchema=True)

# Show the first few rows of the DataFrame
customers_df.show()


+-----------+--------------------+--------------------+----------+--------------+
|Customer_id|           Full_Name|      Customer_Email|       dob|Customer_Phone|
+-----------+--------------------+--------------------+----------+--------------+
| V0H084B3FA|         Mahika Gill|mahika.gill@gmail...| 6/23/1993|     785877743|
| 7SI1YKSDMB|        Elakshi Gour|elakshi.gour@outl...| 6/14/1981|    9638371516|
| H0L4L7VHXT|    Tanya Srinivasan|tanyasrinivasan@y...| 3/22/2003|     559420656|
| 6I324KGK08|        Kashvi Saraf|kashvisaraf@yahoo...| 6/27/2001|    4985572504|
| WKJ0VXQFIY|            Piya Ram| piyaram@hotmail.com| 3/31/1994|    5549849574|
| HT4P3N5R38|    Aniruddh Kothari|aniruddhkothari@h...| 4/20/1994|    8609260186|
| Y4EHCD6U3G|           Riya Gala|riya.gala@hotmail...| 8/10/1989|    6947567136|
| KTYHYNJG3Q|    Tarini Sabharwal|tarini.sabharwal@...| 12/3/1973|    7701533431|
| 2N2119NGLF|     Shanaya Sampath|shanaya_sampath@g...|10/29/1988|    4587205438|
| CT2JZ4HB0I|   

In [57]:
customers_df.printSchema()

root
 |-- Customer_id: string (nullable = true)
 |-- Full_Name: string (nullable = true)
 |-- Customer_Email: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- Customer_Phone: long (nullable = true)



In [58]:
dumb_saish = customers_df.withColumn('dob', to_date('dob', 'M/d/yyyy'))

In [59]:
dumb_saish.printSchema()

root
 |-- Customer_id: string (nullable = true)
 |-- Full_Name: string (nullable = true)
 |-- Customer_Email: string (nullable = true)
 |-- dob: date (nullable = true)
 |-- Customer_Phone: long (nullable = true)



In [60]:
dumb_saish.show()

+-----------+--------------------+--------------------+----------+--------------+
|Customer_id|           Full_Name|      Customer_Email|       dob|Customer_Phone|
+-----------+--------------------+--------------------+----------+--------------+
| V0H084B3FA|         Mahika Gill|mahika.gill@gmail...|1993-06-23|     785877743|
| 7SI1YKSDMB|        Elakshi Gour|elakshi.gour@outl...|1981-06-14|    9638371516|
| H0L4L7VHXT|    Tanya Srinivasan|tanyasrinivasan@y...|2003-03-22|     559420656|
| 6I324KGK08|        Kashvi Saraf|kashvisaraf@yahoo...|2001-06-27|    4985572504|
| WKJ0VXQFIY|            Piya Ram| piyaram@hotmail.com|1994-03-31|    5549849574|
| HT4P3N5R38|    Aniruddh Kothari|aniruddhkothari@h...|1994-04-20|    8609260186|
| Y4EHCD6U3G|           Riya Gala|riya.gala@hotmail...|1989-08-10|    6947567136|
| KTYHYNJG3Q|    Tarini Sabharwal|tarini.sabharwal@...|1973-12-03|    7701533431|
| 2N2119NGLF|     Shanaya Sampath|shanaya_sampath@g...|1988-10-29|    4587205438|
| CT2JZ4HB0I|   

In [61]:
# Handle missing values
cleaned_customer_data = dumb_saish.fillna({'Full_Name': 'Unknown', 'Customer_Email': 'Unknown','Customer_Phone': 0})

In [62]:
# Handle duplicates
cleaned_customer_data = dumb_saish.dropDuplicates(["Customer_id"])

In [66]:
# Replace invalid phone numbers with null values
cleaned_customer_data = dumb_saish.withColumn('Customer_Phone', when(length(col('Customer_Phone')) == 10, col('Customer_Phone')).otherwise(None))
cleaned_customer_data.show()

+-----------+--------------------+--------------------+----------+--------------+
|Customer_id|           Full_Name|      Customer_Email|       dob|Customer_Phone|
+-----------+--------------------+--------------------+----------+--------------+
| V0H084B3FA|         Mahika Gill|mahika.gill@gmail...|1993-06-23|          null|
| 7SI1YKSDMB|        Elakshi Gour|elakshi.gour@outl...|1981-06-14|    9638371516|
| H0L4L7VHXT|    Tanya Srinivasan|tanyasrinivasan@y...|2003-03-22|          null|
| 6I324KGK08|        Kashvi Saraf|kashvisaraf@yahoo...|2001-06-27|    4985572504|
| WKJ0VXQFIY|            Piya Ram| piyaram@hotmail.com|1994-03-31|    5549849574|
| HT4P3N5R38|    Aniruddh Kothari|aniruddhkothari@h...|1994-04-20|    8609260186|
| Y4EHCD6U3G|           Riya Gala|riya.gala@hotmail...|1989-08-10|    6947567136|
| KTYHYNJG3Q|    Tarini Sabharwal|tarini.sabharwal@...|1973-12-03|    7701533431|
| 2N2119NGLF|     Shanaya Sampath|shanaya_sampath@g...|1988-10-29|    4587205438|
| CT2JZ4HB0I|   

In [75]:
cleaned_customer_data1 = cleaned_customer_data.withColumn('Full_Name', initcap('Full_Name'))
cleaned_customer_data1.show()

+-----------+--------------------+--------------------+----------+--------------+
|Customer_id|           Full_Name|      Customer_Email|       dob|Customer_Phone|
+-----------+--------------------+--------------------+----------+--------------+
| V0H084B3FA|         Mahika Gill|mahika.gill@gmail...|1993-06-23|          null|
| 7SI1YKSDMB|        Elakshi Gour|elakshi.gour@outl...|1981-06-14|    9638371516|
| H0L4L7VHXT|    Tanya Srinivasan|tanyasrinivasan@y...|2003-03-22|          null|
| 6I324KGK08|        Kashvi Saraf|kashvisaraf@yahoo...|2001-06-27|    4985572504|
| WKJ0VXQFIY|            Piya Ram| piyaram@hotmail.com|1994-03-31|    5549849574|
| HT4P3N5R38|    Aniruddh Kothari|aniruddhkothari@h...|1994-04-20|    8609260186|
| Y4EHCD6U3G|           Riya Gala|riya.gala@hotmail...|1989-08-10|    6947567136|
| KTYHYNJG3Q|    Tarini Sabharwal|tarini.sabharwal@...|1973-12-03|    7701533431|
| 2N2119NGLF|     Shanaya Sampath|shanaya_sampath@g...|1988-10-29|    4587205438|
| CT2JZ4HB0I|   

In [76]:
cleaned_customer_data2 = cleaned_customer_data1.withColumn("email_check", regexp_extract(col("Customer_Email"), r'^\S+@\S+\.\S+', 0))


In [77]:
cleaned_customer_data2 = cleaned_customer_data2.withColumn("Customer_Email", when(col("email_check") != "", col("Customer_Email")).otherwise(None))


In [78]:
cleaned_customer_data2.show()

+-----------+--------------------+--------------------+----------+--------------+--------------------+
|Customer_id|           Full_Name|      Customer_Email|       dob|Customer_Phone|         email_check|
+-----------+--------------------+--------------------+----------+--------------+--------------------+
| V0H084B3FA|         Mahika Gill|mahika.gill@gmail...|1993-06-23|          null|mahika.gill@gmail...|
| 7SI1YKSDMB|        Elakshi Gour|elakshi.gour@outl...|1981-06-14|    9638371516|elakshi.gour@outl...|
| H0L4L7VHXT|    Tanya Srinivasan|tanyasrinivasan@y...|2003-03-22|          null|tanyasrinivasan@y...|
| 6I324KGK08|        Kashvi Saraf|kashvisaraf@yahoo...|2001-06-27|    4985572504|kashvisaraf@yahoo...|
| WKJ0VXQFIY|            Piya Ram| piyaram@hotmail.com|1994-03-31|    5549849574| piyaram@hotmail.com|
| HT4P3N5R38|    Aniruddh Kothari|aniruddhkothari@h...|1994-04-20|    8609260186|aniruddhkothari@h...|
| Y4EHCD6U3G|           Riya Gala|riya.gala@hotmail...|1989-08-10|    694

In [79]:
# Drop the temporary 'email_check' column
cleaned_customer_data2 = cleaned_customer_data1.drop("email_check")

# Display the DataFrame with null values for invalid email addresses
cleaned_customer_data2.show()

+-----------+--------------------+--------------------+----------+--------------+
|Customer_id|           Full_Name|      Customer_Email|       dob|Customer_Phone|
+-----------+--------------------+--------------------+----------+--------------+
| V0H084B3FA|         Mahika Gill|mahika.gill@gmail...|1993-06-23|          null|
| 7SI1YKSDMB|        Elakshi Gour|elakshi.gour@outl...|1981-06-14|    9638371516|
| H0L4L7VHXT|    Tanya Srinivasan|tanyasrinivasan@y...|2003-03-22|          null|
| 6I324KGK08|        Kashvi Saraf|kashvisaraf@yahoo...|2001-06-27|    4985572504|
| WKJ0VXQFIY|            Piya Ram| piyaram@hotmail.com|1994-03-31|    5549849574|
| HT4P3N5R38|    Aniruddh Kothari|aniruddhkothari@h...|1994-04-20|    8609260186|
| Y4EHCD6U3G|           Riya Gala|riya.gala@hotmail...|1989-08-10|    6947567136|
| KTYHYNJG3Q|    Tarini Sabharwal|tarini.sabharwal@...|1973-12-03|    7701533431|
| 2N2119NGLF|     Shanaya Sampath|shanaya_sampath@g...|1988-10-29|    4587205438|
| CT2JZ4HB0I|   