In [13]:
%run "_dq-library.ipynb"


In [14]:
import numpy as np
from datetime import datetime
import pandas as pd

# Set the randomness seed for reproducibility.
# This seeds NumPy's global generator, which every np.random.* call below draws from.
# NOTE: random_date() also offsets from datetime.now(), so generated dates still
# depend on the day the notebook is run even with a fixed seed.
np.random.seed(0)

# Function to generate a random name, with a chance for invalid names
def random_name(validity_chance=0.8):
    """Return a "<first> <last>" string that is either plausible or deliberately malformed.

    Args:
        validity_chance: Probability (0-1) that both parts are taken from the
            plausible name pools; otherwise both come from the junk pools.

    Returns:
        str: The two chosen parts joined by one space. A junk part may be the
        empty string, so a malformed result can begin or end with that space.
    """
    plausible_first = ["James", "Mary", "John", "Patricia", "Robert", "Jennifer"]
    junk_first = ["", "1234", "Invalid_Name"]
    plausible_last = ["Smith", "Johnson", "Williams", "Brown", "Jones", "Garcia"]
    junk_last = ["!", "@@@@", ""]

    # A single uniform draw decides validity for BOTH parts of the name.
    if np.random.random() < validity_chance:
        first_pool, last_pool = plausible_first, plausible_last
    else:
        first_pool, last_pool = junk_first, junk_last

    # First name is drawn before last name to keep the random stream order fixed.
    first = np.random.choice(first_pool)
    last = np.random.choice(last_pool)
    return f"{first} {last}"

# Function to generate a random email address, with a chance for invalid emails
def random_email(validity_chance=0.8):
    """Return a random e-mail-like string, sometimes deliberately malformed.

    The local part is 5 to 9 random lowercase ASCII letters. A well-formed
    result is "<local>@<domain>" with one of three fixed domains; a malformed
    result has no "@" and is the local part followed by either
    "invalid_domain" or nothing at all.

    Args:
        validity_chance: Probability (0-1) of producing the well-formed variant.

    Returns:
        str: The generated address text.
    """
    alphabet = list("abcdefghijklmnopqrstuvwxyz")
    good_domains = ["example.com", "test.com", "demo.com"]
    bad_suffixes = ["invalid_domain", ""]

    # Draw order is: length, letters, validity, domain/suffix.
    prefix_length = np.random.randint(5, 10)  # upper bound is exclusive -> 5..9
    prefix = "".join(np.random.choice(alphabet, size=prefix_length))

    if np.random.random() < validity_chance:
        return f"{prefix}@{np.random.choice(good_domains)}"
    return f"{prefix}{np.random.choice(bad_suffixes)}"

# Function to generate a random date with multiple formats, with a chance for invalid dates
def random_date(validity_chance=0.8, base_date=None):
    """Return a random future date rendered in one of several formats, or a junk marker.

    A well-formed result is ``base_date`` plus 1 to 364 days, formatted with one
    of "%Y-%m-%d", "%d/%m/%Y" or "%m-%d-%Y". A malformed result is the literal
    string "invalid_date".

    Args:
        validity_chance: Probability (0-1) of producing a well-formed date.
        base_date: Reference date the random offset is added to. Defaults to
            ``datetime.now()`` at call time. Pass a fixed value to make the
            output fully reproducible under a fixed NumPy seed.

    Returns:
        str: The formatted date, or "invalid_date".
    """
    valid_formats = ["%Y-%m-%d", "%d/%m/%Y", "%m-%d-%Y"]

    # Draw order (validity, format if valid, day offset) is kept fixed, and the
    # offset is drawn even for a malformed result, so the seeded random stream
    # seen by later calls does not depend on which branch was taken here.
    is_valid = np.random.random() < validity_chance
    chosen_format = np.random.choice(valid_formats) if is_valid else None
    offset_days = np.random.randint(1, 365)  # upper bound is exclusive -> 1..364

    if not is_valid:
        return "invalid_date"

    if base_date is None:
        base_date = datetime.now()
    date = base_date + pd.Timedelta(days=offset_days)
    return date.strftime(chosen_format)

# Function to generate a random policy status, with a chance for invalid statuses
def random_policy_status(validity_chance=0.8):
    """Return a random single-letter policy status code, or the junk value "NOTVALID".

    Args:
        validity_chance: Probability (0-1) of returning one of the 25
            single-letter codes this generator treats as well-formed.

    Returns:
        str: A one-letter code, or "NOTVALID" for the malformed case.
    """
    valid_codes = [
        "B", "E", "H", "I", "N", "P", "Q", "R", "U", "W", "Y", "Z", "A",
        "C", "D", "F", "G", "J", "K", "L", "M", "S", "T", "V", "X",
    ]
    invalid_code = "NOTVALID"

    # The second random draw (the pick) only happens on the well-formed branch.
    if np.random.random() < validity_chance:
        return np.random.choice(valid_codes)
    return invalid_code

# Function to generate a random policy holder sex, with a chance for an invalid value
def random_policy_holder_sex(validity_chance=0.8):
    """Return a random policy-holder sex value, or "-" as the malformed placeholder.

    Args:
        validity_chance: Probability (0-1) of returning one of
            "M", "F", "MALE" or "FEMALE".

    Returns:
        str: One of the four accepted spellings, or "-".
    """
    accepted_values = ["M", "F", "MALE", "FEMALE"]
    placeholder = "-"

    draw = np.random.random()
    if draw < validity_chance:
        return np.random.choice(accepted_values)
    return placeholder

# Function to generate a random policy holder marital status, with a chance for invalid statuses
def random_holder_marital_status(validity_chance=0.8):
    """Return a random marital-status code, or "-" as the malformed placeholder.

    Args:
        validity_chance: Probability (0-1) of returning one of the codes
            "M", "D", "S", "Y" or "N".

    Returns:
        str: One of the five single-letter codes, or "-".
    """
    accepted_codes = ["M", "D", "S", "Y", "N"]
    placeholder = "-"

    use_accepted = np.random.random() < validity_chance
    if not use_accepted:
        return placeholder
    return np.random.choice(accepted_codes)

# Function to generate a random entity code, with a chance for an invalid code
def random_entity(validity_chance=0.8):
    """Return a random entity code, or the junk code "111AA".

    Args:
        validity_chance: Probability (0-1) of returning one of
            "ZGIMB", "ZGTMB", "ZLIMB" or "ZTMB".

    Returns:
        str: One of the four entity codes, or "111AA" for the malformed case.
    """
    entity_codes = ["ZGIMB", "ZGTMB", "ZLIMB", "ZTMB"]
    junk_code = "111AA"

    if np.random.random() < validity_chance:
        picked = np.random.choice(entity_codes)
    else:
        picked = junk_code
    return picked

# Function to generate a random product code, with a chance for invalid codes
def random_product_code(validity_chance=0.8):
    """Return a random product code: a letter A/B/C followed by a number.

    The well-formed variant carries a three-digit number, the malformed variant
    a four-digit one. NumPy's ``randint`` excludes its upper bound, so the
    numbers actually produced are 100..998 and 1000..9998 respectively.

    Args:
        validity_chance: Probability (0-1) of producing the three-digit variant.

    Returns:
        str: The letter immediately followed by the number.
    """
    # Draw order is: validity, letter, number (same for both variants).
    is_valid = np.random.random() < validity_chance
    letter = np.random.choice(['A', 'B', 'C'])
    if is_valid:
        number = np.random.randint(100, 999)
    else:
        number = np.random.randint(1000, 9999)
    return f"{letter}{number}"

# Function to generate random data for df_policies
def generate_random_policies(num_records, validity_chance=0.8):
    """Build a DataFrame of synthetic policy records seeded with deliberate data-quality errors.

    Every cell is generated independently: a uniform draw below
    ``validity_chance`` selects the well-formed generator for that column,
    otherwise the malformed one. Columns are filled one after another (all rows
    of "Entity", then all rows of "Policy_ID", ...), which fixes the order in
    which NumPy's global random state is consumed.

    Args:
        num_records: Number of rows to generate.
        validity_chance: Per-cell probability (0-1) of a well-formed value.

    Returns:
        pandas.DataFrame: ``num_records`` rows and 16 columns, in this order:
        Entity, Policy_ID, NRIC, Holder_Name, Holder_Age, Holder_Sex,
        Holder_Marital_Status, Email_Address, Mobile_Numbers, PostCode,
        Policy_Start_Date, Policy_Status, Premium_Amount_IDR, Product_Code,
        Commission_Percent, Vehicle_Age.
    """
    rand_int = np.random.randint  # NOTE: upper bound is exclusive

    def from_helper(make_value):
        # Column whose validity handling lives inside a dedicated random_* helper.
        return [make_value(validity_chance) for _ in range(num_records)]

    def from_pair(make_valid, make_invalid):
        # Validity is drawn first; only the selected generator then consumes randomness.
        return [
            make_valid() if np.random.random() < validity_chance else make_invalid()
            for _ in range(num_records)
        ]

    policy_data = {
        "Entity": from_helper(random_entity),
        "Policy_ID": from_pair(
            lambda: f"800875{rand_int(1000, 9999)}",
            lambda: f"ERROR{rand_int(1000, 9999)}",
        ),
        "NRIC": from_pair(
            lambda: f"{rand_int(100000, 999999)}-{rand_int(10, 99)}-{rand_int(1000, 9999)}",
            lambda: f"ERROR{rand_int(1000, 9999)}",
        ),
        "Holder_Name": from_helper(random_name),
        # Malformed ages are 100..149.
        "Holder_Age": from_pair(
            lambda: rand_int(18, 100),
            lambda: rand_int(100, 150),
        ),
        "Holder_Sex": from_helper(random_policy_holder_sex),
        "Holder_Marital_Status": from_helper(random_holder_marital_status),
        "Email_Address": from_helper(random_email),
        # Malformed numbers/postcodes start with the LETTER "O", not the digit zero.
        "Mobile_Numbers": from_pair(
            lambda: f"601{rand_int(100,999)}{rand_int(10000, 99999)}",
            lambda: f"O{rand_int(10000, 99999)}",
        ),
        "PostCode": from_pair(
            lambda: f"{rand_int(10000, 99999)}",
            lambda: f"O{rand_int(100, 999)}",
        ),
        "Policy_Start_Date": from_helper(random_date),
        "Policy_Status": from_helper(random_policy_status),
        # Malformed premiums are small amounts below 10000.
        "Premium_Amount_IDR": from_pair(
            lambda: rand_int(10000, 1000000),
            lambda: rand_int(1, 9999),
        ),
        "Product_Code": from_helper(random_product_code),
        # Malformed commissions are drawn from (100, 200]. The previous
        # random() * 200 covered [0, 200), so roughly half of the intended-bad
        # values landed inside the well-formed 0-100 range. Still one draw per value.
        "Commission_Percent": from_pair(
            lambda: np.random.random() * 100,
            lambda: 200 - np.random.random() * 100,
        ),
        # Malformed vehicle ages are 50..99.
        "Vehicle_Age": from_pair(
            lambda: rand_int(0, 50),
            lambda: rand_int(50, 100),
        ),
    }

    return pd.DataFrame(policy_data)

# Generate a random df_policies dataframe with 10 records for demonstration
# (validity_chance is left at the generator's default of 0.8).
random_df_policies = generate_random_policies(10)

# df_policies is the name read by the data-quality cell further down; this binds a
# second name to the same DataFrame object, it does not make a copy.
df_policies = random_df_policies


# 
# /!\ /!\  NO UPDATES BEYOND THIS POINT   /!\ /!\
# 

In [15]:
# Apply the updated completeness check and DQ metric calculation to the dataframe.
# NOTE(review): calculate_data_quality and config_with_uniqueness are not defined in
# the cells visible here; presumably they are provided by the %run of
# "_dq-library.ipynb" at the top of the notebook. Confirm.
df_processed, dq_summary = calculate_data_quality(df_policies, config_with_uniqueness)
# Bare last expression so the notebook renders the summary table as the cell output.
dq_summary

Unnamed: 0,Completeness,Validity,Accuracy,Uniqueness,Data Quality
Entity,100.00%,60.00%,100.00%,100.00%,60.00%
Policy_ID,100.00%,70.00%,100.00%,100.00%,70.00%
NRIC,100.00%,90.00%,100.00%,100.00%,90.00%
Holder_Name,100.00%,60.00%,100.00%,100.00%,60.00%
Holder_Age,100.00%,70.00%,100.00%,100.00%,70.00%
Holder_Sex,100.00%,80.00%,100.00%,100.00%,80.00%
Holder_Marital_Status,100.00%,100.00%,100.00%,100.00%,100.00%
PostCode,100.00%,70.00%,100.00%,100.00%,70.00%
Vehicle_Age,100.00%,20.00%,100.00%,100.00%,20.00%
Email_Address,100.00%,80.00%,100.00%,100.00%,80.00%


In [16]:
# Display only the columns of df_processed whose names end in
# __Validity, __Completeness or __Accuracy.
df_processed[df_processed.filter(regex='(__Validity|__Completeness|__Accuracy)$').columns]

Unnamed: 0,Entity__Completeness,Entity__Validity,Entity__Accuracy,Policy_ID__Completeness,Policy_ID__Validity,Policy_ID__Accuracy,NRIC__Completeness,NRIC__Validity,NRIC__Accuracy,Holder_Name__Completeness,...,Email_Address__Accuracy,Policy_Start_Date__Completeness,Policy_Start_Date__Validity,Policy_Start_Date__Accuracy,Policy_Status__Completeness,Policy_Status__Validity,Policy_Status__Accuracy,Mobile_Numbers__Completeness,Mobile_Numbers__Validity,Mobile_Numbers__Accuracy
0,True,True,True,True,False,True,True,True,True,True,...,True,True,True,True,True,False,True,True,True,True
1,True,False,True,True,True,True,True,True,True,True,...,True,True,True,True,True,False,True,True,True,True
2,True,False,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
3,True,False,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
4,True,True,True,True,False,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
5,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,False,True,True,True,True
6,True,True,True,True,True,True,True,False,True,True,...,True,True,True,True,True,True,True,True,True,True
7,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
8,True,False,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
9,True,True,True,True,False,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
