In [15]:
%run /spark-data/CRM/utility/common_utility.ipynb

# Logs Configuration

In [16]:
# Set up the logger used by every later cell in this notebook and point it at
# a log file dedicated to the products cleaning run.
# NOTE(review): `initialize_logger` is not defined in this notebook; presumably
# it is provided by common_utility.ipynb via the %run above — confirm.
log_file_path = 'logs/products_cleaning.log'  # relative path, resolved against the kernel's working directory
logger = initialize_logger(log_file_path)

# Emit one INFO record right away so a misconfigured logger is noticed here
# rather than in the middle of the cleaning steps.
logger.info("Logger initialized with dynamic path!")

2024-09-09 05:07:44,725 - logger - INFO - [92mLogger initialized with dynamic path![0m


# Dataset Load

In [17]:
# Load the raw products CSV; `products_df` is the untouched input that the
# preprocessing cell below reads from.
# NOTE(review): `load_data_files` and `display_dataframes` are not defined in
# this notebook; presumably they come from common_utility.ipynb — confirm.
products_df = load_data_files("/spark-data/CRM/Dataset/products.csv")
# Preview the loaded data (the recorded output shows the first 5 rows of a Spark DataFrame).
display_dataframes(products_df)

2024-09-09 05:07:45,074 - logger - INFO - [92mDisplayed first 5 records of Spark DataFrame.[0m


+----------+--------------+-----------+------+
|Product_ID|Product_Name  |Category   |Price |
+----------+--------------+-----------+------+
|1         |Sofa Set      |Home       |411.0 |
|2         |Laptop        |Electronics|333.0 |
|3         |Dining Table  |Home       |645.0 |
|4         |Vacuum Cleaner|NULL       |290.0 |
|5         |Mobile Phone  |Electronics|1738.0|
+----------+--------------+-----------+------+
only showing top 5 rows



# Data Preprocessing

In [18]:
# Clean the products dataset end to end: profile nulls and duplicates,
# de-duplicate on 'Product_ID', fill missing categories, capitalize text
# columns, repair non-positive prices, then export the result to CSV.
# `products_df` (the raw input) is never reassigned; every transformation
# produces or updates `cleaned_products_df`.
# NOTE(review): the helper functions used below, as well as `when` and `col`,
# are not defined in this notebook; presumably they are brought into scope by
# common_utility.ipynb via %run — confirm.

# Step 1: Count missing (null) values for each column before filling them
logger.info("Step 1: Counting missing values in each column before filling them...")
missing_values_before = count_missing_values(products_df)
missing_values_before.show()

# Step 2: Count duplicates before dropping them
logger.info("Step 2: Checking for duplicates in each column before dropping them...")
duplicate_count = count_duplicates_per_column(products_df)
duplicate_count.show()

# Step 3: Drop duplicates based on 'Product_ID' if any are found
logger.info("Step 3: Checking and removing duplicate records based on 'Product_ID'...")
cleaned_products_df = drop_duplicates(products_df, "Product_ID")

# Step 4: Fill missing values in the 'Category' column with a placeholder label
logger.info("Step 4: Filling missing values in the 'Category' column...")
cleaned_products_df = fill_missing_values(cleaned_products_df, {"Category": "Uncategorized"})
cleaned_products_df.show(5, truncate=False)

# Step 5: Capitalize the first letter of each word in the 'Product_Name' and 'Category' columns
logger.info("Step 5: Capitalizing the first letter of each word in the 'Product_Name' and 'Category' columns...")
cleaned_products_df = capitalize_columns(cleaned_products_df, ["Product_Name", "Category"])
print("Completed.")

# Step 6: Cross-validation - Count missing values again after filling them
logger.info("Step 6: Counting missing values in each column after filling them...")
missing_values_after = count_missing_values(cleaned_products_df)
missing_values_after.show()

# Step 7: Count duplicates after dropping them
logger.info("Step 7: Checking for duplicates in each column after dropping them...")
duplicate_count_after = count_duplicates_per_column(cleaned_products_df)
duplicate_count_after.show()

# Step 8: Handle negative or zero prices by replacing with average price
logger.info("Step 8: Replacing negative or zero prices with the average price...")
# Average only the valid (positive) prices of the de-duplicated data. Using the
# raw frame would let duplicate rows and the invalid prices themselves skew the
# replacement value.
avg_price = (
    cleaned_products_df
    .filter(col("Price") > 0)
    .agg({"Price": "avg"})
    .collect()[0][0]
)
if avg_price is None:
    # The aggregate is null when no row has a positive price; substituting it
    # would null out every non-positive price, so leave the column untouched.
    logger.warning("Step 8: No positive prices found; leaving 'Price' unchanged.")
else:
    cleaned_products_df = cleaned_products_df.withColumn(
        "Price",
        when(col("Price") <= 0, avg_price).otherwise(col("Price")),
    )
print("completed.")

# Step 9: Export the cleaned data to a CSV file
logger.info("Step 9: Exporting the cleaned data to 'cleaned_products.csv'...")
save_df_to_csv(cleaned_products_df, "/spark-data/CRM/Cleaned_data/cleaned_products.csv")

# show duplicates
# get_duplicate_data_per_column(cleaned_products_df)

# Display the count of records after cleaning
record_count_after_cleaning = cleaned_products_df.count()
print(f"Number of records after cleaning: {record_count_after_cleaning}")

logger.info("Data cleaning and export completed successfully.")

2024-09-09 05:07:45,093 - logger - INFO - [92mStep 1: Counting missing values in each column before filling them...[0m
2024-09-09 05:07:45,254 - logger - INFO - [92mStep 2: Checking for duplicates in each column before dropping them...[0m


+----------+------------+--------+-----+
|Product_ID|Product_Name|Category|Price|
+----------+------------+--------+-----+
|         0|           0|       3|    0|
+----------+------------+--------+-----+



2024-09-09 05:07:46,841 - logger - INFO - [92mStep 3: Checking and removing duplicate records based on 'Customer_ID'...[0m
2024-09-09 05:07:46,983 - logger - INFO - [92mNumber of duplicate records before dropping: 1[0m


+------------+---------------+
|      Column|Duplicate_Count|
+------------+---------------+
|  Product_ID|              1|
|Product_Name|              1|
|    Category|              5|
|       Price|              2|
+------------+---------------+



2024-09-09 05:07:47,183 - logger - INFO - [92mNumber of duplicate records after dropping: 0[0m
2024-09-09 05:07:47,185 - logger - INFO - [92mStep 4: Filling missing values in 'Email' and 'Phone' columns...[0m
2024-09-09 05:07:47,340 - logger - INFO - [92mStep 5: Capitalizing the first letter of each word in the 'Product_Name' and 'Category' columns...[0m
2024-09-09 05:07:47,358 - logger - INFO - [92mStep 6: Counting missing values in each column after filling them...[0m


+----------+--------------+-------------+------+
|Product_ID|Product_Name  |Category     |Price |
+----------+--------------+-------------+------+
|1         |Sofa Set      |Home         |411.0 |
|2         |Laptop        |Electronics  |333.0 |
|3         |Dining Table  |Home         |645.0 |
|4         |Vacuum Cleaner|Uncategorized|290.0 |
|5         |Mobile Phone  |Electronics  |1738.0|
+----------+--------------+-------------+------+
only showing top 5 rows

Completed.


2024-09-09 05:07:47,589 - logger - INFO - [92mStep 7: Checking for duplicates in each column after dropping them...[0m


+----------+------------+--------+-----+
|Product_ID|Product_Name|Category|Price|
+----------+------------+--------+-----+
|         0|           0|       0|    0|
+----------+------------+--------+-----+



2024-09-09 05:07:49,062 - logger - INFO - [92mStep 8: Replacing negative or zero prices with the average price...[0m
2024-09-09 05:07:49,167 - logger - INFO - [92mStep 9: Exporting the cleaned data to 'cleaned_products.csv'...[0m


+------------+---------------+
|      Column|Duplicate_Count|
+------------+---------------+
|  Product_ID|              0|
|Product_Name|              0|
|    Category|              5|
|       Price|              1|
+------------+---------------+

completed.


2024-09-09 05:07:49,410 - logger - INFO - [92mData cleaning and export completed successfully.[0m


Number of records after cleaning: 50
