In [1]:
# Run the shared utilities notebook. The helpers called in later cells
# (initialize_spark_session, initialize_logger, load_data_files, ...) are not
# defined in this notebook, so they presumably come from here — TODO confirm.
%run /spark-data/CRM/utilities/common_utility.ipynb

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/17 05:17:25 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/09/17 05:17:25 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
24/09/17 05:17:25 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
24/09/17 05:17:25 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
24/09/17 05:17:25 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.
24/09/17 05:17:25 WARN Utils: Service 'SparkUI' could not bind on port 4044. Attempting port 4045.


# Initialize Spark Session

In [2]:
# Obtain the Spark handle used by the rest of the notebook; the string argument is
# presumably the application name — TODO confirm against the helper.
# NOTE(review): the recorded output warns that an existing Spark session is reused
# and only runtime SQL configurations take effect — confirm that this is intended.
spark = initialize_spark_session("Customers Cleaning")

24/09/17 05:17:27 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


# Logs Configuration

In [3]:
# Destination of this notebook's log file.
# NOTE(review): the path is relative, so it presumably resolves against the
# kernel's working directory — confirm how initialize_logger opens it.
log_file_path = 'logs/customer_cleaning.log'
# initialize_logger is not defined in this notebook; `logger` is used by every later cell.
logger = initialize_logger(log_file_path)

# Emit one record straight away so a misconfigured logger is noticed early.
logger.info("Logger initialized with dynamic path!")

2024-09-17 05:17:27,150 - logger - INFO - [92mLogger initialized with dynamic path![0m


In [4]:

# Location of the raw customers extract.
customers_file_path = "/spark-data/CRM/Dataset/customers.csv"
# Load the raw data; customers_df is the untouched input that the cleaning cell starts from.
customers_df = load_data_files(customers_file_path)
# Preview the data (the recorded log reports the first 5 records being displayed).
display_dataframes(customers_df)

2024-09-17 05:17:31,107 - logger - INFO - [92mDisplayed first 5 records of Spark DataFrame.[0m


+------------------------------------+-----------------+----------------------+-------------+----------------+
|Customer_ID                         |Name             |Email                 |Phone        |Country         |
+------------------------------------+-----------------+----------------------+-------------+----------------+
|a85e6a90-78d5-490c-a53f-c58b2e57c59b|Shannon Deleon   |NULL                  |5878628895   |Japan           |
|babec972-ffb3-4c56-99c3-e8e3855adf0f|Christina Sanchez|craigprice@example.org|4832368495   |Haiti           |
|d74c33bd-69d9-4718-9e00-d1895a41ddac|Thomas Brown     |vjohnson@example.org  |(276)903-7065|Pakistan        |
|ff05ceba-f459-4714-a252-e03198d9934c|Lindsey Bradford |kathryn50@example.net |NULL         |Marshall Islands|
|f20755f6-8481-4904-afe6-504451ceded5|John Boyer       |jennifer15@example.org|(749)644-5721|New Caledonia   |
+------------------------------------+-----------------+----------------------+-------------+----------------+
only showing top 5 rows

In [5]:

# Customer cleaning pipeline: profile the raw data, clean phones and emails,
# de-duplicate on 'Customer_ID', fill gaps, then re-profile to verify the result.
# The helpers called below are not defined in this notebook; presumably they are
# provided by the utilities notebook run in the first cell — TODO confirm.

# The export is off by default, matching the save call that was previously
# commented out. Set to True to write the cleaned data to CLEANED_CUSTOMERS_PATH.
EXPORT_CLEANED_DATA = False
CLEANED_CUSTOMERS_PATH = "Cleaned_data/cleaned_customers.csv"

# Step 1: Count missing (null) values for each column before filling them
logger.info("Step 1: Counting missing values in each column before filling them...")
missing_values_before = count_missing_values(customers_df)
missing_values_before.show()

# Step 2: Count duplicates before dropping them
logger.info("Step 2: Checking for duplicates in each column before dropping them...")
duplicate_count = count_duplicates_per_column(customers_df)
duplicate_count.show()

# Step 3: Format and clean phone numbers
# `countries` is the path of a countries CSV handed to the phone helper
# (presumably the source of the dialing codes seen in the output — TODO confirm).
logger.info("Step 3: Formatting and cleaning phone numbers...")
countries = "/spark-data/CRM/Dataset/countries.csv"
cleaned_customers_df = process_phone_numbers(customers_df,countries)
logger.info("Completed.")

# Step 4: Drop duplicates based on 'Customer_ID' if any are found
logger.info("Step 4: Checking and removing duplicate records based on 'Customer_ID'...")
cleaned_customers_df = drop_duplicates(cleaned_customers_df, "Customer_ID")

# Step 5: Format and clean Email
logger.info("Step 5: Formatting and cleaning Email...")
cleaned_customers_df = validate_emails(cleaned_customers_df, "Email")
logger.info("Completed.")

# Step 6: Fill missing values in 'Email' and 'Phone' columns
logger.info("Step 6: Filling missing values in 'Email' and 'Phone' columns...")
cleaned_customers_df = fill_missing_values(cleaned_customers_df, {'Email': 'unknown', 'Phone': 'unknown'})
cleaned_customers_df.show(5, truncate=False)

# Step 7: Capitalize the first letter of each word in the 'Name' and 'Country' columns
logger.info("Step 7: Capitalizing the first letter of each word in the 'Name' and 'Country' columns...")
cleaned_customers_df = capitalize_columns(cleaned_customers_df, ["Name", "Country"])
logger.info("Completed.")

# Step 8: Cross-validation - Count missing values again after filling them
logger.info("Step 8: Counting missing values in each column after filling them...")
missing_values_after = count_missing_values(cleaned_customers_df)
missing_values_after.show()

# Step 9: Count duplicates after dropping them
logger.info("Step 9: Checking for duplicates in each column after dropping them...")
duplicate_count_after = count_duplicates_per_column(cleaned_customers_df)
duplicate_count_after.show()

# Step 10: Export the cleaned data to a CSV file (only when enabled above).
# The log must say what actually happened, so a skipped export is reported as skipped.
if EXPORT_CLEANED_DATA:
    logger.info("Step 10: Exporting the cleaned data to 'cleaned_customers.csv'...")
    # NOTE(review): save_df_to_csv is assumed to be provided by the utilities
    # notebook — confirm it exists before enabling the export.
    save_df_to_csv(cleaned_customers_df, CLEANED_CUSTOMERS_PATH)
else:
    logger.info("Step 10: Export skipped (EXPORT_CLEANED_DATA is False).")

# Record count of the fully cleaned DataFrame (after every step above, not just phones)
record_count_after_cleaning = cleaned_customers_df.count()
logger.info(f"Number of records after cleaning: {record_count_after_cleaning}")

if EXPORT_CLEANED_DATA:
    logger.info("Data cleaning and export completed successfully.")
else:
    logger.info("Data cleaning completed successfully (export skipped).")

2024-09-17 05:17:31,119 - logger - INFO - [92mStep 1: Counting missing values in each column before filling them...[0m
2024-09-17 05:17:31,833 - logger - INFO - [92mStep 2: Checking for duplicates in each column before dropping them...[0m


+-----------+----+-----+-----+-------+
|Customer_ID|Name|Email|Phone|Country|
+-----------+----+-----+-----+-------+
|          0|   0|   52|   52|      0|
+-----------+----+-----+-----+-------+



2024-09-17 05:17:34,839 - logger - INFO - [92mStep 3: Formatting and cleaning phone numbers...[0m


+-----------+---------------+
|     Column|Duplicate_Count|
+-----------+---------------+
|Customer_ID|             25|
|       Name|             26|
|      Email|             22|
|      Phone|             23|
|    Country|            157|
+-----------+---------------+



2024-09-17 05:17:35,213 - logger - INFO - [92mCompleted.[0m
2024-09-17 05:17:35,215 - logger - INFO - [92mStep 4: Checking and removing duplicate records based on 'Customer_ID'...[0m
2024-09-17 05:17:35,837 - logger - INFO - [92mNumber of duplicate records before dropping: 25[0m
2024-09-17 05:17:36,181 - logger - INFO - [92mNumber of duplicate records after dropping: 0[0m
2024-09-17 05:17:36,185 - logger - INFO - [92mStep 5: Formatting and cleaning Email...[0m
2024-09-17 05:17:37,308 - logger - INFO - [92mAll non-null emails are valid.[0m
2024-09-17 05:17:37,310 - logger - INFO - [92mCompleted.[0m
2024-09-17 05:17:37,311 - logger - INFO - [92mStep 6: Filling missing values in 'Email' and 'Phone' columns...[0m
2024-09-17 05:17:37,757 - logger - INFO - [92mStep 7: Capitalizing the first letter of each word in the 'Name' and 'Country' columns...[0m
2024-09-17 05:17:37,781 - logger - INFO - [92mCompleted.[0m
2024-09-17 05:17:37,784 - logger - INFO - [92mStep 8: Counting missing values in each column after filling them...[0m

+------------------------------------+-----------------+----------------------+---------------+-----------+
|Customer_ID                         |Name             |Email                 |Phone          |Country    |
+------------------------------------+-----------------+----------------------+---------------+-----------+
|003ca69a-991c-4c11-899a-51bb7365499d|Erica Diaz       |hurleyanna@example.com|+964-8545044941|Iraq       |
|006af455-013b-4c09-a6df-15ca3d41010f|Jason Jackson    |dylanduran@example.com|+63-2484285464 |Philippines|
|00fc38f7-b5c7-465c-839b-a55185f2635f|Heather Schneider|larajohn@example.org  |+264-7935879728|Namibia    |
|015cc4e1-5cf8-441c-80ad-ca3536c53e9a|Matthew Wilson   |timothyho@example.org |+357-8883554148|Cyprus     |
|01d13428-d511-4c3d-90c7-b5624931ff48|Robert Contreras |lydia12@example.com   |unknown        |Swaziland  |
+------------------------------------+-----------------+----------------------+---------------+-----------+
only showing top 5 rows



2024-09-17 05:17:38,119 - logger - INFO - [92mStep 9: Checking for duplicates in each column after dropping them...[0m


+-----------+----+-----+-----+-------+
|Customer_ID|Name|Email|Phone|Country|
+-----------+----+-----+-----+-------+
|          0|   0|    0|    0|      0|
+-----------+----+-----+-----+-------+



2024-09-17 05:17:40,112 - logger - INFO - [92mStep 10: Exporting the cleaned data to 'cleaned_customers.csv'...[0m
2024-09-17 05:17:40,253 - logger - INFO - [92mNumber of records after cleaning: 500[0m
2024-09-17 05:17:40,255 - logger - INFO - [92mData cleaning and export completed successfully.[0m


+-----------+---------------+
|     Column|Duplicate_Count|
+-----------+---------------+
|Customer_ID|              0|
|       Name|              1|
|      Email|              1|
|      Phone|              1|
|    Country|            151|
+-----------+---------------+



In [6]:
# Stop the Spark session now that the cleaning run is finished.
spark.stop()