In [13]:
# Importing necessary libraries
from pyspark.sql import SparkSession
from pyspark.sql import DataFrame

## Problem Statement 1: Ensuring Data Accuracy
**Objective: Ensure that data across different files is accurate and correctly linked.**

Description:

1.	Load Data: Load data from all files into data frames using PySpark.
2.	Initial Validation: Check that data has been ingested correctly into data frames.
3.	Verify Data Accuracy:
    -	Confirm that Customer_ID in transactions.csv, interactions.csv, and customers.csv matches correctly.
    -	Check that Product_ID in transactions.csv is valid according to products.csv.
    -	Ensure that Sales_Rep_ID in transactions.csv matches entries in sales_team.csv.

In [14]:
import logging
from pyspark.sql import SparkSession, DataFrame

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("logs/data_validation.log"),
        logging.StreamHandler()
    ]
)

logger = logging.getLogger()

def print_green(text: str):
    logger.info(f"\033[92m{text}\033[0m")

def validate_ids(df1: DataFrame, df2: DataFrame, df1_col: str, df2_col: str, id_name: str):
    """
    Validate the presence of IDs from df1 in df2 and display any missing IDs.
    """
    missing_ids = df1.join(df2, df1[df1_col] == df2[df2_col], "left_anti") \
                     .select(df1[df1_col].alias(f"Missing_{id_name}"))
    
    if missing_ids.count() > 0:
        print_green(f"{id_name}s missing in {df2_col.split('_')[0]}.csv from {df1_col.split('_')[0]}.csv:")
        missing_ids.show()
        logger.info(f"{id_name}s missing in {df2_col.split('_')[0]}.csv from {df1_col.split('_')[0]}.csv:")
    else:
        logger.info(f"All {id_name}s from {df1_col.split('_')[0]}.csv are present in {df2_col.split('_')[0]}.csv.")

# Initialize Spark Session
spark = SparkSession.builder.appName("CRM_Data_Validation").getOrCreate()

# Load data from CSV files into DataFrames
data_files = {
    "customers": "Dataset/customers.csv",
    "products": "Dataset/products.csv",
    "transactions": "Dataset/transactions.csv",
    "interactions": "Dataset/interactions.csv",
    "sales_team": "Dataset/sales_team.csv"
}

dfs = {name: spark.read.csv(file, header=True, inferSchema=True) for name, file in data_files.items()}

# Display initial records for each DataFrame
for name, df in dfs.items():
    print_green(f"{name.capitalize()} DataFrame:")
    df.show(5, truncate=False)
    logger.info(f"Displayed first 5 records for {name.capitalize()} DataFrame.")

# Verify Data Accuracy
validations = [
    ("transactions", "customers", "Customer_ID"),
    ("interactions", "customers", "Customer_ID"),
    ("transactions", "products", "Product_ID"),
    ("transactions", "sales_team", "Sales_Rep_ID")
]

for df1_name, df2_name, id_name in validations:
    print_green(f"Verifying {id_name} matches between {df1_name}.csv and {df2_name}.csv...")
    logger.info(f"Starting validation of {id_name} between {df1_name}.csv and {df2_name}.csv.")
    validate_ids(dfs[df1_name], dfs[df2_name], id_name, id_name, id_name)

print_green("Data validation completed.")
logger.info("Data validation completed.")


2024-09-02 16:29:54,318 - INFO - [92mCustomers DataFrame:[0m
2024-09-02 16:29:54,392 - INFO - Displayed first 5 records for Customers DataFrame.
2024-09-02 16:29:54,394 - INFO - [92mProducts DataFrame:[0m
2024-09-02 16:29:54,445 - INFO - Displayed first 5 records for Products DataFrame.
2024-09-02 16:29:54,446 - INFO - [92mTransactions DataFrame:[0m
2024-09-02 16:29:54,503 - INFO - Displayed first 5 records for Transactions DataFrame.
2024-09-02 16:29:54,506 - INFO - [92mInteractions DataFrame:[0m
2024-09-02 16:29:54,580 - INFO - Displayed first 5 records for Interactions DataFrame.
2024-09-02 16:29:54,581 - INFO - [92mSales_team DataFrame:[0m


+------------------------------------+-----------------+----------------------+-------------+----------------+
|Customer_ID                         |Name             |Email                 |Phone        |Country         |
+------------------------------------+-----------------+----------------------+-------------+----------------+
|a85e6a90-78d5-490c-a53f-c58b2e57c59b|Shannon Deleon   |NULL                  |5878628895   |Japan           |
|babec972-ffb3-4c56-99c3-e8e3855adf0f|Christina Sanchez|craigprice@example.org|4832368495   |Haiti           |
|d74c33bd-69d9-4718-9e00-d1895a41ddac|Thomas Brown     |vjohnson@example.org  |(276)903-7065|Pakistan        |
|ff05ceba-f459-4714-a252-e03198d9934c|Lindsey Bradford |kathryn50@example.net |NULL         |Marshall Islands|
|f20755f6-8481-4904-afe6-504451ceded5|John Boyer       |jennifer15@example.org|(749)644-5721|New Caledonia   |
+------------------------------------+-----------------+----------------------+-------------+----------------+
o

2024-09-02 16:29:54,640 - INFO - Displayed first 5 records for Sales_team DataFrame.
2024-09-02 16:29:54,642 - INFO - [92mVerifying Customer_ID matches between transactions.csv and customers.csv...[0m
2024-09-02 16:29:54,644 - INFO - Starting validation of Customer_ID between transactions.csv and customers.csv.
2024-09-02 16:29:54,787 - INFO - All Customer_IDs from Customer.csv are present in Customer.csv.
2024-09-02 16:29:54,788 - INFO - [92mVerifying Customer_ID matches between interactions.csv and customers.csv...[0m
2024-09-02 16:29:54,789 - INFO - Starting validation of Customer_ID between interactions.csv and customers.csv.


+------------------------------------+-----------------+-------------+------------+--------------+
|Sales_Rep_ID                        |Name             |Region       |Sales_Target|Sales_Achieved|
+------------------------------------+-----------------+-------------+------------+--------------+
|0437b05a-9628-43f9-ac07-0b9a0dc96dcd|Brittany Taylor  |California   |41135       |14037.0       |
|4daeb6af-d7e9-4f99-91b3-6c912f45b740|Mitchell Williams|New Hampshire|32996       |21461.0       |
|f243144e-485f-4382-81ef-2a9a3c63f172|John Terry       |Kansas       |10385       |NULL          |
|9c44ee81-8254-45e1-af23-a4608ceb126c|Carolyn Miller   |Arizona      |23754       |17149.0       |
|3e97b5d8-933a-4860-bce7-2398af6c5613|Antonio Sparks   |Washington   |27101       |36413.0       |
+------------------------------------+-----------------+-------------+------------+--------------+
only showing top 5 rows



2024-09-02 16:29:54,930 - INFO - All Customer_IDs from Customer.csv are present in Customer.csv.
2024-09-02 16:29:54,931 - INFO - [92mVerifying Product_ID matches between transactions.csv and products.csv...[0m
2024-09-02 16:29:54,932 - INFO - Starting validation of Product_ID between transactions.csv and products.csv.
2024-09-02 16:29:55,068 - INFO - All Product_IDs from Product.csv are present in Product.csv.
2024-09-02 16:29:55,069 - INFO - [92mVerifying Sales_Rep_ID matches between transactions.csv and sales_team.csv...[0m
2024-09-02 16:29:55,070 - INFO - Starting validation of Sales_Rep_ID between transactions.csv and sales_team.csv.
2024-09-02 16:29:55,187 - INFO - All Sales_Rep_IDs from Sales.csv are present in Sales.csv.
2024-09-02 16:29:55,188 - INFO - [92mData validation completed.[0m
2024-09-02 16:29:55,189 - INFO - Data validation completed.
