In [1]:
%run /spark-data/CRM/utility/common_utility.ipynb

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/09 05:23:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/09/09 05:23:59 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


# Logging Configuration

In [2]:
# Create the logger that the remaining cells in this notebook write to.
# `initialize_logger` is not defined in this notebook; presumably it is
# provided by the %run'd common_utility notebook -- confirm there.
# NOTE(review): the path is relative, so it presumably resolves against the
# kernel's working directory; confirm initialize_logger creates 'logs/' if it
# does not exist.
log_file_path = 'logs/sales_team_cleaning.log'
logger = initialize_logger(log_file_path)

# Emit one record immediately so a misconfigured logger is noticed here
# rather than in a later cell.
logger.info("Logger initialized with dynamic path!")

2024-09-09 05:24:00,424 - logger - INFO - [92mLogger initialized with dynamic path![0m


# Dataset Load

In [3]:
# Load the raw sales-team CSV; `sales_team_df` is the untouched input that the
# preprocessing cell reads from (it is never reassigned there).
# `load_data_files` / `display_dataframes` are not defined in this notebook;
# presumably they come from the %run'd common_utility notebook -- confirm.
sales_team_df = load_data_files("/spark-data/CRM/Dataset/sales_team.csv")
# Preview the loaded data; in the recorded run this printed the first 5 rows
# and logged "Displayed first 5 records of Spark DataFrame."
display_dataframes(sales_team_df)

2024-09-09 05:24:20,677 - logger - INFO - [92mDisplayed first 5 records of Spark DataFrame.[0m


+------------------------------------+-----------------+-------------+------------+--------------+
|Sales_Rep_ID                        |Name             |Region       |Sales_Target|Sales_Achieved|
+------------------------------------+-----------------+-------------+------------+--------------+
|0437b05a-9628-43f9-ac07-0b9a0dc96dcd|Brittany Taylor  |California   |41135       |14037.0       |
|4daeb6af-d7e9-4f99-91b3-6c912f45b740|Mitchell Williams|New Hampshire|32996       |21461.0       |
|f243144e-485f-4382-81ef-2a9a3c63f172|John Terry       |Kansas       |10385       |NULL          |
|9c44ee81-8254-45e1-af23-a4608ceb126c|Carolyn Miller   |Arizona      |23754       |17149.0       |
|3e97b5d8-933a-4860-bce7-2398af6c5613|Antonio Sparks   |Washington   |27101       |36413.0       |
+------------------------------------+-----------------+-------------+------------+--------------+
only showing top 5 rows



# Data Preprocessing

In [5]:
# Sales-team cleaning pipeline: profile nulls, impute 'Sales_Achieved' with the
# column mean, de-duplicate on 'Sales_Rep_ID', normalise the text columns, then
# re-profile, preview and (optionally) export the result.
#
# The helpers called here (count_missing_values, fill_missing_values,
# count_duplicates_per_column, drop_duplicates, capitalize_columns) and
# `avg` / `col` are not defined in this notebook; presumably they come from the
# %run'd common_utility notebook -- confirm their exact semantics there.
#
# The step labels in the log messages skip "Step 5"; they are left unchanged so
# log lines stay comparable with earlier runs.

# Export switch: the CSV write was previously commented out while the log
# still announced a save. Set to True to actually write the file in Step 9.
SAVE_CLEANED_CSV = False
CLEANED_CSV_PATH = "Cleaned_data/cleaned_sales_team.csv"

# Step 1: Identify Missing Values
logger.info("Step 1: Identifying missing values in each column before filling them...")
missing_values_before = count_missing_values(sales_team_df)
missing_values_before.show()

# Step 2: Handle Missing Values
logger.info("Step 2: Handling missing values by filling with averages...")
# Calculate average Sales_Achieved
# NOTE(review): the mean is taken before duplicates are dropped, so duplicated
# reps are counted more than once in it -- confirm this ordering is intended.
avg_sales_achieved = sales_team_df.select(avg(col("Sales_Achieved"))).first()[0]
if avg_sales_achieved is None:
    # Spark's avg yields NULL when there are no non-null inputs; round(None, 2)
    # would otherwise fail with an unhelpful TypeError.
    raise ValueError(
        "Cannot impute 'Sales_Achieved': the column has no non-null values to average."
    )
print(f"Average value of sales achieved column is {avg_sales_achieved}")

# Fill missing Sales_Achieved with the average (rounded to 2 decimals)
sales_team_cleaned_df = fill_missing_values(
    sales_team_df, {"Sales_Achieved": round(avg_sales_achieved, 2)}
)

# Step 3: Check for Duplicate Values
logger.info("Step 3: Checking for duplicate records based on 'Sales_Rep_ID'...")
# NOTE(review): the log message says the check is keyed on 'Sales_Rep_ID', but
# the helper is called without a column argument -- verify what it counts.
duplicate_count_before = count_duplicates_per_column(sales_team_cleaned_df)

# Step 4: Drop Duplicates if they exist
sales_team_cleaned_df = drop_duplicates(sales_team_cleaned_df, "Sales_Rep_ID")

# Step 6: Standardize Formats
logger.info("Step 6: Standardizing the format of 'Name' and 'Region' columns...")
sales_team_cleaned_df = capitalize_columns(sales_team_cleaned_df, ["Name", "Region"])
print("Completed.")

# Step 7: Cross-verification of missing values in each column after filling them
logger.info("Step 7: Identifying missing values in each column after filling them...")
missing_values_after = count_missing_values(sales_team_cleaned_df)
missing_values_after.show()

# Step 8: Display the cleaned 'sales_team_df' DataFrame
logger.info("Step 8: Displaying the cleaned 'sales_team_df' DataFrame...")
sales_team_cleaned_df.show(5, truncate=False)

# Step 9: Save the cleaned data to a new CSV (only when enabled above)
if SAVE_CLEANED_CSV:
    logger.info("Step 9: Saving the cleaned data to 'cleaned_sales_team.csv'...")
    # NOTE(review): save_df_to_csv is presumably provided by the utility
    # notebook -- confirm it exists before enabling the export.
    save_df_to_csv(sales_team_cleaned_df, CLEANED_CSV_PATH)
else:
    logger.info("Step 9: Skipping CSV export (SAVE_CLEANED_CSV is False).")

# Display the count of records after cleaning
record_count_after_cleaning = sales_team_cleaned_df.count()
print(f"Number of records after cleaning: {record_count_after_cleaning}")
if SAVE_CLEANED_CSV:
    logger.info("Data cleaning and export completed successfully.")
else:
    logger.info("Data cleaning completed successfully (export skipped).")

2024-09-09 05:24:49,940 - logger - INFO - [92mStep 1: Identifying missing values in each column before filling them...[0m
2024-09-09 05:24:50,149 - logger - INFO - [92mStep 2: Handling missing values by filling with averages...[0m


+------------+----+------+------------+--------------+
|Sales_Rep_ID|Name|Region|Sales_Target|Sales_Achieved|
+------------+----+------+------------+--------------+
|           0|   0|     0|           0|             5|
+------------+----+------+------------+--------------+

Average value of sales achieved column is 21912.425531914894


2024-09-09 05:24:50,366 - logger - INFO - [92mStep 3: Checking for duplicate records based on 'Sales_Rep_ID'...[0m
2024-09-09 05:24:51,657 - logger - INFO - [92mNumber of duplicate records before dropping: 2[0m
2024-09-09 05:24:51,848 - logger - INFO - [92mNumber of duplicate records after dropping: 0[0m
2024-09-09 05:24:51,850 - logger - INFO - [92mStep 6: Standardizing the format of 'Name' and 'Region' columns...[0m
2024-09-09 05:24:51,872 - logger - INFO - [92mStep 7: Identifying missing values in each column after filling them...[0m


Completed.


2024-09-09 05:24:52,114 - logger - INFO - [92mStep 8: Displaying the cleaned 'sales_team_df' DataFrame...[0m
2024-09-09 05:24:52,284 - logger - INFO - [92mStep 9: Saving the cleaned data to 'cleaned_sales_team.csv'...[0m


+------------+----+------+------------+--------------+
|Sales_Rep_ID|Name|Region|Sales_Target|Sales_Achieved|
+------------+----+------+------------+--------------+
|           0|   0|     0|           0|             0|
+------------+----+------+------------+--------------+

+------------------------------------+---------------------+------------+------------+--------------+
|Sales_Rep_ID                        |Name                 |Region      |Sales_Target|Sales_Achieved|
+------------------------------------+---------------------+------------+------------+--------------+
|02acdbab-d149-4053-8c7a-871ba76f003e|Daniel Barber        |Arizona     |13651       |7296.0        |
|0437b05a-9628-43f9-ac07-0b9a0dc96dcd|Brittany Taylor      |California  |41135       |14037.0       |
|0468e32e-644b-491f-a3e1-741306f2c3f2|Dr. Wayne Spencer Dvm|Pennsylvania|15593       |37653.0       |
|05cc8513-8a8a-4598-862a-218d1093ae26|Pamela Pennington    |Idaho       |38399       |11569.0       |
|06a9a95f-

2024-09-09 05:24:52,437 - logger - INFO - [92mData cleaning and export completed successfully.[0m


Number of records after cleaning: 50
