## Data Quality Framework Implementation

**Description**: Implement a simple data quality measurement framework using ISO 8000 principles to assess key dimensions in a dataset.

In [1]:
# Write a conceptual framework described in Python pseudo-code:
import pandas as pd
import numpy as np

# Define the class for Data Quality Framework
class DataQualityFramework:
    def __init__(self, df: pd.DataFrame):
        self.df = df
    
    # Accuracy: Measures how close the data is to true values.
    def accuracy(self, reference_df):
        """
        Assumes that the reference DataFrame provides the true values for comparison.
        Returns a score (0-100) based on matching values.
        """
        matches = (self.df == reference_df).sum().sum()
        total = self.df.size
        accuracy_score = (matches / total) * 100
        return accuracy_score
    
    # Consistency: Measures if data is consistent across different sources (in our case, just check duplicates).
    def consistency(self):
        """
        Checks for duplicate rows and returns a score based on the proportion of duplicates.
        """
        duplicates = self.df.duplicated().sum()
        total = len(self.df)
        consistency_score = max(0, (1 - (duplicates / total)) * 100)
        return consistency_score
    
    # Completeness: Measures how much of the data is missing.
    def completeness(self):
        """
        Measures the percentage of missing values in the dataset.
        """
        missing_data = self.df.isnull().sum().sum()
        total = self.df.size
        completeness_score = max(0, (1 - (missing_data / total)) * 100)
        return completeness_score
    
    # Timeliness: Measures if the data is up-to-date. Here, we'll simulate a check for timeliness.
    def timeliness(self, last_updated_date):
        """
        Compares the dataset's last updated date with the current date to calculate the timeliness.
        Assumes `last_updated_date` is a string in 'YYYY-MM-DD' format.
        """
        today = pd.to_datetime("today")
        last_updated = pd.to_datetime(last_updated_date)
        days_diff = (today - last_updated).days
        
        # If updated within the last 30 days, considered timely.
        if days_diff <= 30:
            timeliness_score = 100
        elif days_diff <= 60:
            timeliness_score = 80
        elif days_diff <= 90:
            timeliness_score = 60
        else:
            timeliness_score = 40
            
        return timeliness_score
    
    # Uniqueness: Measures how many unique records exist.
    def uniqueness(self):
        """
        Measures the percentage of unique rows in the dataset.
        """
        unique_records = len(self.df.drop_duplicates())
        total = len(self.df)
        uniqueness_score = (unique_records / total) * 100
        return uniqueness_score
    
    # Integrity: Measures the data integrity based on missing foreign keys or relational constraints.
    def integrity(self, foreign_key_column=None, reference_column=None):
        """
        Ensures there are no missing or invalid references in the foreign key column.
        If a foreign key column is provided, it checks for missing values and invalid references.
        """
        if foreign_key_column and reference_column:
            invalid_references = self.df[foreign_key_column].isin(self.df[reference_column]).sum()
            total = len(self.df)
            integrity_score = (invalid_references / total) * 100
            return integrity_score
        return 100  # If no foreign key check is needed, return perfect score
    
    # Compute the overall data quality score by averaging all dimensions
    def overall_quality_score(self, reference_df=None, last_updated_date=None, foreign_key_column=None, reference_column=None):
        scores = [
            self.accuracy(reference_df) if reference_df is not None else 100,
            self.consistency(),
            self.completeness(),
            self.timeliness(last_updated_date) if last_updated_date else 100,
            self.uniqueness(),
            self.integrity(foreign_key_column, reference_column)
        ]
        overall_score = np.mean(scores)
        return overall_score

# Example usage of DataQualityFramework

# Sample DataFrame
data = {
    'Product_ID': [101, 102, 103, 104, 105, 101],  # Duplicate ID
    'Product_Name': ['Product A', 'Product B', 'Product C', np.nan, 'Product E', 'Product A'],  # Missing name
    'Price': [10, 20, 15, 25, 30, np.nan],  # Missing price
}

df = pd.DataFrame(data)

# Simulating reference DataFrame for accuracy comparison
reference_data = {
    'Product_ID': [101, 102, 103, 104, 105],
    'Product_Name': ['Product A', 'Product B', 'Product C', 'Product D', 'Product E'],
    'Price': [10, 20, 15, 25, 30],
}
reference_df = pd.DataFrame(reference_data)

# Initialize Data Quality Framework
dq = DataQualityFramework(df)

# Calculate scores
accuracy_score = dq.accuracy(reference_df)
consistency_score = dq.consistency()
completeness_score = dq.completeness()
uniqueness_score = dq.uniqueness()
integrity_score = dq.integrity()

# Assume the data was last updated on '2023-09-01'
timeliness_score = dq.timeliness('2023-09-01')

# Overall Data Quality Score
overall_score = dq.overall_quality_score(reference_df, '2023-09-01')

print(f"Accuracy: {accuracy_score}%")
print(f"Consistency: {consistency_score}%")
print(f"Completeness: {completeness_score}%")
print(f"Uniqueness: {uniqueness_score}%")
print(f"Timeliness: {timeliness_score}%")
print(f"Integrity: {integrity_score}%")
print(f"Overall Data Quality Score: {overall_score}%")

ValueError: Can only compare identically-labeled (both index and columns) DataFrame objects