#### 1. Load Business Rules Conf file (NO NEED TO UPDATE)

In [0]:
# Inline copy of the business-rules configuration as a JSON document.
# A raw string (r'''...''') is used so that the JSON escape sequences such as
# "\\d" reach the JSON parser unchanged and decode to the regex token \d.
# NOTE: "fields" lists only four field names, while "rules" also carries
# entries (phone numbers, zip codes) that are not listed in "fields".
jsonFileString = r'''
{
    "fields": [
        "Policy_ID",
        "Holder_Age",
        "Email_Address",
        "Policy_Start_Date"
    ],
    "rules": {
        "Policy_ID": {
            "completeness": true,
            "validity": {
                "regex": "^POL\\d{4}$"
            }
        },
        "Holder_Age": {
            "completeness": true,
            "validity": {
                "type": "integer",
                "min": 18,
                "max": 99
            }
        },
        "Email_Address": {
            "completeness": true,
            "validity": {
                "regex": "^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$"
            },
            "accuracy": null
        },
        "Policy_Start_Date": {
            "completeness": true,
            "validity": {
                "date_format": "%Y-%m-%d"
            },
            "accuracy": null
        },
        "Phone_Number_JP": {
            "completeness": true,
            "validity": {
                "regex": "^[0-9]{1,4}-[0-9]{1,4}-[0-9]{4}$"
            }
        },
        "Phone_Number_IN": {
            "completeness": true,
            "validity": {
                "regex": "^[6-9]\\d{9}$"
            }
        },
        "Phone_Number_CN": {
            "completeness": true,
            "validity": {
                "regex": "^1[3-9]\\d{9}$"
            }
        },
        "Phone_Number_ID": {
            "completeness": true,
            "validity": {
                "regex": "^\\+62[ -]?[2-9]\\d{2}[ -]?\\d{3,8}$"
            }
        },
        "Phone_Number_MY": {
            "completeness": true,
            "validity": {
                "regex": "^\\+60[ -]?[1-9]\\d{1,2}[ -]?\\d{6,7}$"
            }
        },
        "Zip_Code_JP": {
            "completeness": true,
            "validity": {
                "regex": "^\\d{3}-\\d{4}$"
            }
        },
        "Zip_Code_AU": {
            "completeness": true,
            "validity": {
                "regex": "^\\d{4}$"
            }
        }
    }
}
'''

# Assuming load_business_rules() parses a JSON string:
import json


def load_business_rules(json_string):
    """Decode a JSON document held in a string.

    Args:
        json_string: Text containing a JSON document.

    Returns:
        The Python object produced by ``json.loads`` for that document.

    Raises:
        json.JSONDecodeError: If the text is not valid JSON.
    """
    parsed_rules = json.loads(json_string)
    return parsed_rules

# Parse the inline JSON document above into the config_json object.
config_json = load_business_rules(jsonFileString)




# Progress marker printed when this cell runs.
print("load business rules conf")
def load_business_rules(file_path):
    """Load business rules from a JSON file.

    Note that this definition rebinds the name ``load_business_rules``;
    after it runs, the function expects a file path rather than a JSON
    string.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The Python object decoded from the file's JSON content.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON text is UTF-8; decode it explicitly instead of relying on the
    # platform's default encoding, which differs between environments.
    with open(file_path, 'r', encoding='utf-8') as file:
        rules = json.load(file)
    return rules


load business rules conf


#### 2. Load DQ functions

In [0]:
import pandas as pd
from datetime import datetime
import re


# Step 2: Create a Sample Configuration JSON Object
# This is a simplified version of the rules that would be in the config file.


def load_business_rules(file_path):
    """Load business rules from a JSON file.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The Python object decoded from the file's JSON content.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON text is UTF-8; decode it explicitly instead of relying on the
    # platform's default encoding, which differs between environments.
    with open(file_path, 'r', encoding='utf-8') as file:
        rules = json.load(file)
    return rules


# Path of the rules library JSON file that is read below.
business_rules_path = '/Workspace/20231103.ADA.DQ/data-quality-rules-library.json'
# Load the rules from that file. This rebinds config_json, so any value
# assigned to config_json earlier in the notebook is replaced.
config_json = load_business_rules(business_rules_path)


# Define the functions for DQ checks

def check_completeness(value):
    """Report whether a value counts as present (not missing).

    Strings are compared after trimming surrounding whitespace and
    lower-casing; a string equal to one of the placeholder tokens below is
    treated as missing. Any other value is missing only when
    ``pd.isnull`` says so.

    Args:
        value: The cell value to inspect.

    Returns:
        True when the value is considered complete, False otherwise.
    """
    if not isinstance(value, str):
        return not pd.isnull(value)

    placeholder_tokens = ("", "null", "nil", "n/a", "na", "-", "--")
    normalized = value.strip().lower()
    return normalized not in placeholder_tokens


def check_validity(field, value, rules):
    """Check a single value against the validity rules of its field.

    Supported rule keys (all optional, all applied when present):
      * ``"type": "integer"`` with optional ``"min"`` / ``"max"`` bounds
      * ``"regex"``: pattern applied with ``re.match`` to ``str(value)``
      * ``"date_format"``: ``strptime`` format applied to ``str(value)``

    Args:
        field: Name of the field being checked (not used by the checks).
        value: The cell value to validate.
        rules: Dict of validity rules for the field; ``None`` or an empty
            dict means no constraints beyond "not missing".

    Returns:
        True if the value is present and satisfies every rule, else False.
    """
    # An explicit null in the configuration means "no validity rules".
    rules = rules or {}

    # Incomplete data is also invalid. Strings are trimmed and lower-cased
    # first so that variants such as " NULL " or "N/A" are recognised.
    if pd.isnull(value):
        return False
    if isinstance(value, str) and value.strip().lower() in (
        "", "null", "nil", "n/a", "na", "-", "--", "nan",
    ):
        return False

    if rules.get("type") == "integer":
        # int() would silently truncate 25.7 to 25; a float is only an
        # acceptable integer when it has no fractional part (inf fails too).
        if isinstance(value, float) and not value.is_integer():
            return False
        try:
            int_value = int(value)  # Attempt to convert to integer
        except (ValueError, TypeError, OverflowError):
            return False  # Non-integer value
        if "min" in rules and int_value < rules["min"]:
            return False
        if "max" in rules and int_value > rules["max"]:
            return False

    if "regex" in rules:
        if not re.match(rules["regex"], str(value)):
            return False

    if "date_format" in rules:
        try:
            datetime.strptime(str(value), rules["date_format"])
        except ValueError:
            return False

    return True


def check_accuracy(field, value, rules):
    """Placeholder accuracy check that accepts every value.

    No accuracy logic is implemented yet, so none of the arguments are
    inspected; the function exists so callers have a stable hook for a
    future rule-based accuracy check.

    Args:
        field: Name of the field being checked (currently unused).
        value: The cell value (currently unused).
        rules: Accuracy rules for the field (currently unused).

    Returns:
        Always True.
    """
    return True

# Update the calculation of the Data Quality metrics to use the correct formula

def calculate_data_quality_metrics(df, dq_flags):
    """Aggregate per-row DQ flag columns into per-field and overall metrics.

    Per field:
      * Completeness: share of all rows flagged complete.
      * Validity: share of *complete* rows flagged valid.
      * Accuracy: share of *valid* rows flagged accurate.
      * Data Quality: Completeness * Validity * Accuracy.

    The "Overall" entry averages the flag columns over all rows and all
    fields (it is not conditioned on completeness/validity the way the
    per-field Validity and Accuracy are).

    Args:
        df: DataFrame holding the boolean flag columns named in ``dq_flags``.
        dq_flags: Mapping of field name to a
            ``(completeness_col, validity_col, accuracy_col)`` tuple.

    Returns:
        Dict mapping each field name, plus ``"Overall"``, to a dict with the
        keys ``"Completeness"``, ``"Validity"``, ``"Accuracy"`` and
        ``"Data Quality"``.
    """
    # Calculate DQ metrics for each field
    dq_metrics = {}
    for field, (comp, valid, acc) in dq_flags.items():
        # Coerce to bool so the columns work as masks even when they were
        # stored with object dtype.
        complete_mask = df[comp].astype(bool)
        valid_mask = df[valid].astype(bool)

        completeness = complete_mask.mean()
        validity = valid_mask[complete_mask].mean()  # % Valid among complete
        accuracy = df.loc[valid_mask, acc].mean()  # % Accurate among valid

        # When no row is complete (or no complete row is valid) the
        # conditional ratios further down the chain are NaN because they
        # are means over an empty selection. The product is still
        # well-defined: no row passes all checks, so the score is 0.
        if completeness == 0 or validity == 0:
            data_quality = 0.0
        else:
            data_quality = completeness * validity * accuracy

        dq_metrics[field] = {
            "Completeness": completeness,
            "Validity": validity,
            "Accuracy": accuracy,
            "Data Quality": data_quality
        }

    # Calculate overall DQ metrics
    overall_completeness = df[[comp for comp, _, _ in dq_flags.values()]].mean().mean()
    overall_validity = df[[valid for _, valid, _ in dq_flags.values()]].mean().mean()
    overall_accuracy = df[[acc for _, _, acc in dq_flags.values()]].mean().mean()
    overall_dq = overall_completeness * overall_validity * overall_accuracy

    dq_metrics["Overall"] = {
        "Completeness": overall_completeness,
        "Validity": overall_validity,
        "Accuracy": overall_accuracy,
        "Data Quality": overall_dq
    }

    return dq_metrics

# Update the main function to include the correct DQ metric calculation
def calculate_data_quality(df, config):
    """Flag every configured field row by row and summarise the results.

    For each field listed in ``config['fields']`` three boolean columns are
    added to ``df`` in place: ``<field>__Completeness``,
    ``<field>__Validity`` and ``<field>__Accuracy``. The flags are then
    aggregated with ``calculate_data_quality_metrics``.

    Args:
        df: DataFrame containing one column per configured field. It is
            modified in place (flag columns are added) and also returned.
        config: Dict with a ``'fields'`` list and a ``'rules'`` mapping of
            field name to ``{'validity': {...}, 'accuracy': {...}}``.
            Missing or null rule entries are treated as "no rules".

    Returns:
        A ``(df, dq_summary)`` tuple where ``dq_summary`` is a DataFrame
        with one row per field plus an ``"Overall"`` row.

    Raises:
        KeyError: If ``config`` lacks ``'fields'``/``'rules'`` or ``df``
            lacks a configured field column.
    """
    dq_flags = {}

    for field in config['fields']:
        completeness_col = f"{field}__Completeness"
        validity_col = f"{field}__Validity"
        accuracy_col = f"{field}__Accuracy"

        # Resolve the rule sets once per field. `or {}` also covers entries
        # that are present but explicitly null in the JSON configuration.
        field_rules = config['rules'].get(field) or {}
        validity_rules = field_rules.get('validity') or {}
        accuracy_rules = field_rules.get('accuracy') or {}

        # Apply the checks column-wise. A row-wise DataFrame.apply(axis=1)
        # builds a Series per row, which can upcast values across columns
        # (e.g. int -> float) before they reach the checks, and is slower.
        column = df[field]
        df[completeness_col] = column.apply(check_completeness)
        df[validity_col] = column.apply(
            lambda value: check_validity(field, value, validity_rules)
        )
        df[accuracy_col] = column.apply(
            lambda value: check_accuracy(field, value, accuracy_rules)
        )

        dq_flags[field] = (completeness_col, validity_col, accuracy_col)

    dq_metrics = calculate_data_quality_metrics(df, dq_flags)

    # Create a summary dataframe for the DQ metrics
    dq_summary = pd.DataFrame.from_dict(dq_metrics, orient='index')

    return df, dq_summary
