In [2]:
# Execute the shared utilities notebook. The cells below call helpers that this notebook
# never defines (initialize_spark_session, initialize_logger, load_data_files,
# display_dataframes, count_missing_values, drop_duplicates, capitalize_columns,
# date_validation, validate_boolean_values, save_df_to_csv) and use col/when/lit without
# importing them, so all of these are presumably provided by this notebook — TODO confirm.
%run /spark-data/CRM/utilities/common_utility.ipynb

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/13 06:18:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Initialize Spark Session

In [3]:
# Obtain the Spark session used by this notebook. The string argument is presumably the
# Spark application name — TODO confirm against initialize_spark_session.
spark = initialize_spark_session("Interactions Cleaning")

24/09/13 06:18:40 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


# Logs Configuration

In [4]:
# Log file for this notebook. The path is relative, so it resolves against the kernel's
# current working directory rather than a fixed location.
log_file_path = 'logs/interactions_cleaning.log'
# `logger` is used by every later cell for step-by-step progress messages.
logger = initialize_logger(log_file_path)

logger.info("Logger initialized with dynamic path!")

2024-09-13 06:18:40,761 - logger - INFO - [92mLogger initialized with dynamic path![0m


# Dataset Load

In [5]:
# Location of the raw interactions extract.
interactions_file_path = "/spark-data/CRM/Dataset/interactions.csv"
# Load the file into `interactions_df`, the unmodified input that the cleaning cell reads.
interactions_df = load_data_files(interactions_file_path)
# Preview the loaded data (the helper logs that it shows the first 5 records).
display_dataframes(interactions_df)

2024-09-13 06:18:43,729 - logger - INFO - [92mDisplayed first 5 records of Spark DataFrame.[0m


+------------------------------------+------------------------------------+----------------+----------------+--------------+
|Interaction_ID                      |Customer_ID                         |Interaction_Date|Interaction_Type|Issue_Resolved|
+------------------------------------+------------------------------------+----------------+----------------+--------------+
|5a367006-c47a-4728-a042-c4c6ddd9ee3e|4166b61a-cb6d-4190-ac0b-df812c0308ff|2024-08-07      |NULL            |true          |
|544d6348-cc9c-4c36-9fbe-eaf7e845e682|cc9a2115-b197-4a2b-9a6a-0fb2da5a0c73|2024-01-30      |Email           |true          |
|b1739e10-0690-4ec6-9a6b-147127c0f388|1f22e566-13a0-4758-b42c-b0f6e997d46c|2024-06-27      |Chat            |true          |
|3d9dcd53-7edd-4672-bfcf-a03b430b2935|dae0689d-0c38-440c-b921-fe2413c3df3b|2024-07-20      |Email           |false         |
|2861521b-43ba-45d9-a0bd-9ee5c2e3edf4|c1e7a24a-0cc4-4a3e-8cf1-1e3487076ef4|2024-05-31      |Email           |false         |


# Data Preprocessing

In [6]:
# Cleaning pipeline for the interactions dataset.
# Reads `interactions_df` (left untouched) and produces `cleaned_interactions_df`, which is
# mode-imputed, de-duplicated, normalised, validated and finally written to CSV.

# Step 1: Identify missing values in each column before filling them
logger.info("Step 1: Identifying missing values in each column before filling them...")
missing_values_before = count_missing_values(interactions_df)
missing_values_before.show()

# Step 2: Count occurrences of each 'Interaction_Type' and identify the most occurring type
logger.info("Step 2: Counting occurrences of each 'Interaction_Type' to find the most occurring type...")
# Nulls are filtered out first: groupBy() puts them in a group of their own, and if that group
# were the largest the "mode" would be None and Step 3 would silently fill nothing.
# The secondary sort on the type name makes the choice deterministic when counts tie.
# NOTE(review): the mode is computed before de-duplication (Step 4), so duplicate rows
# contribute to the counts.
type_counts = (
    interactions_df
    .filter(col("Interaction_Type").isNotNull())
    .groupBy("Interaction_Type")
    .count()
    .orderBy(col("count").desc(), col("Interaction_Type").asc())
    .first()
)

# Extract the most occurring 'Interaction_Type' (None when the column has no non-null value)
most_occurring_type = type_counts["Interaction_Type"] if type_counts else None
print(f"The most occurring 'Interaction_Type' is: {most_occurring_type}")

# Step 3: Replace only null values in 'Interaction_Type' with the most occurring type
logger.info("Step 3: Replacing null values in 'Interaction_Type' with the most occurring type...")
if most_occurring_type is None:
    # Nothing to impute with; keep the column as-is and make that visible in the log.
    logger.warning("No non-null 'Interaction_Type' values found; null values are left unchanged.")
    cleaned_interactions_df = interactions_df
else:
    cleaned_interactions_df = interactions_df.withColumn(
        "Interaction_Type",
        when(col("Interaction_Type").isNull(), lit(most_occurring_type))
        .otherwise(col("Interaction_Type"))
    )

# Step 4: Check for duplicate records based on 'Interaction_ID'
logger.info("Step 4: Checking for duplicate records based on 'Interaction_ID'...")
cleaned_interactions_df = drop_duplicates(cleaned_interactions_df, "Interaction_ID")

# Step 5: Capitalize the first letter of each word in the 'Interaction_Type' column
# ('Issue_Resolved' is not capitalized here; it is checked in Step 7.)
logger.info("Step 5: Capitalizing the first letter of each word in the 'Interaction_Type' column...")
cleaned_interactions_df = capitalize_columns(cleaned_interactions_df, ["Interaction_Type"])
print("Completed.")

# Step 6: Date validation
logger.info("Step 6: Validating dates present in Interaction_Date column...")
cleaned_interactions_df = date_validation(cleaned_interactions_df, "Interaction_Date")

# Step 7: Boolean validation
logger.info("Step 7: Validating booleans present in Issue_Resolved column...")
cleaned_interactions_df = validate_boolean_values(cleaned_interactions_df, "Issue_Resolved")

# Step 8: Cross-verification of missing values in each column after filling them
logger.info("Step 8: Identifying missing values in each column after filling them...")
missing_values_after = count_missing_values(cleaned_interactions_df)
missing_values_after.show()

# Step 9: Display the cleaned 'interactions_df' DataFrame
logger.info("Step 9: Displaying the cleaned 'interactions_df' DataFrame...")
cleaned_interactions_df.show(5, truncate=False)

# Step 10: Save the cleaned data to a new CSV
logger.info("Step 10: Saving the cleaned data to 'cleaned_interactions.csv'...")
save_df_to_csv(cleaned_interactions_df, "/spark-data/CRM/cleaned_data/cleaned_interactions.csv")

# Display the count of records remaining after cleaning
record_count_after_cleaning = cleaned_interactions_df.count()
print(f"Number of records after cleaning: {record_count_after_cleaning}")
logger.info("Data cleaning and export completed successfully.")

2024-09-13 06:18:43,744 - logger - INFO - [92mStep 1: Identifying missing values in each column before filling them...[0m
2024-09-13 06:18:44,369 - logger - INFO - [92mStep 2: Counting occurrences of each 'Interaction_Type' to find the most occurring type...[0m


+--------------+-----------+----------------+----------------+--------------+
|Interaction_ID|Customer_ID|Interaction_Date|Interaction_Type|Issue_Resolved|
+--------------+-----------+----------------+----------------+--------------+
|             0|          0|               0|              67|             0|
+--------------+-----------+----------------+----------------+--------------+



2024-09-13 06:18:44,922 - logger - INFO - [92mStep 3: Replacing null values in 'Interaction_Type' with the most occurring type...[0m
2024-09-13 06:18:44,950 - logger - INFO - [92mStep 4: Checking for duplicate records based on 'Interaction_ID'...[0m


The most occurring 'Interaction_Type' is: Chat


2024-09-13 06:18:45,489 - logger - INFO - [92mNumber of duplicate records before dropping: 32[0m
2024-09-13 06:18:45,912 - logger - INFO - [92mNumber of duplicate records after dropping: 0[0m
2024-09-13 06:18:45,913 - logger - INFO - [92mStep 5: Capitalizing the first letter of each word in the and 'Interaction_Type' columns...[0m
2024-09-13 06:18:45,925 - logger - INFO - [92mStep 6: Validating dates present in Interaction_Date column...[0m
2024-09-13 06:18:45,925 - logger - INFO - [92mStep 1: Identifying future dates in 'Interaction_Date'...[0m


Completed.


2024-09-13 06:18:46,306 - logger - INFO - [92mNo future dates found.[0m
2024-09-13 06:18:46,309 - logger - INFO - [92mStep 7: Validating booleans present in Issue_Resolved column...[0m
2024-09-13 06:18:46,310 - logger - INFO - [92mStep 1: Identifying non-boolean values.. [0m
2024-09-13 06:18:46,700 - logger - INFO - [92mAll values in 'Issue_Resolved' are valid booleans.[0m
2024-09-13 06:18:46,701 - logger - INFO - [92mStep 8: Identifying missing values in each column after filling them...[0m
2024-09-13 06:18:47,194 - logger - INFO - [92mStep 9: Displaying the cleaned 'interactions_df' DataFrame...[0m


+--------------+-----------+----------------+----------------+--------------+
|Interaction_ID|Customer_ID|Interaction_Date|Interaction_Type|Issue_Resolved|
+--------------+-----------+----------------+----------------+--------------+
|             0|          0|               0|               0|             0|
+--------------+-----------+----------------+----------------+--------------+



2024-09-13 06:18:47,409 - logger - INFO - [92mStep 10: Saving the cleaned data to 'cleaned_interactions.csv'...[0m


+------------------------------------+------------------------------------+----------------+----------------+--------------+
|Interaction_ID                      |Customer_ID                         |Interaction_Date|Interaction_Type|Issue_Resolved|
+------------------------------------+------------------------------------+----------------+----------------+--------------+
|002debb8-7e9e-476c-85bc-dd3e8b7210ce|dfce9c24-71c4-4cd8-9b59-f34e48842979|2024-01-30      |Chat            |true          |
|00571a5f-03ff-4615-85cf-cd540f2ccdad|bd56525d-e899-4312-9763-e79d3f6fa54c|2024-02-08      |Chat            |false         |
|00960c9f-02c5-407c-8bb4-5d81747be2ea|f963092b-fa72-4017-bfa5-a7daec18bc46|2024-02-15      |Chat            |false         |
|00f8180f-66fe-46a5-9d8c-caf7b97c23d1|0a1fae74-7af1-4032-8d72-0b0a14f9bd1b|2024-06-29      |Chat            |false         |
|016b4c4f-4800-43d4-b6e8-f493fb01746d|b552b0ab-faf1-452b-a2e6-7b37a31aa881|2024-01-25      |Email           |true          |


2024-09-13 06:18:47,684 - logger - INFO - [92mData cleaning and export completed successfully.[0m


Number of records after cleaning: 800


In [7]:
# Stop the Spark session now that cleaning and export are finished. Cells that use `spark`
# need the session to be initialized again before they can be re-run.
spark.stop()