In [4]:
import pandas as pd
import numpy as np
import re

# Small in-memory customer table used to exercise the column-level scores below.
# Things worth noticing in the sample values:
#   - 'Age' has one missing entry (np.nan)
#   - 'Email' has one entry without a dot-suffixed domain ('bob@example')
#     and one missing entry (np.nan)
#   - 'Address' repeats values ('Texas' and 'California' each appear twice)
data = {
    'CustomerID': [101, 102, 103, 104, 105],
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    'Age': [25, 30, np.nan, 22, 27],
    'Email': [
        'alice@example.com',
        'bob@example',
        'charlie@example.com',
        'david@example.com',
        np.nan,
    ],
    'Address': ['New York', 'California', 'Texas', 'Texas', 'California'],
}

# One row per customer, one column per key of `data`.
df_customers = pd.DataFrame(data=data)

# 1. Completeness Score (Percentage of non-missing values)
def completeness_score(column):
    """Return the percentage of entries in ``column`` that are not null.

    Parameters
    ----------
    column : pandas.Series
        The column to inspect.

    Returns
    -------
    float
        ``(1 - nulls / len(column)) * 100`` -- 100 when nothing is missing,
        0 when every entry is missing.
    """
    n_rows = len(column)
    n_missing = column.isnull().sum()
    return (1 - n_missing / n_rows) * 100

# 2. Uniqueness Score (Percentage of unique values)
def uniqueness_score(column):
    """Return the number of distinct values as a percentage of the row count.

    ``Series.nunique()`` leaves NaN out of the distinct count by default,
    while ``len(column)`` counts every row, so a missing entry lowers the
    score in the same way a duplicate does.

    Parameters
    ----------
    column : pandas.Series
        The column to inspect.

    Returns
    -------
    float
        ``nunique / len(column) * 100``.
    """
    n_rows = len(column)
    n_distinct = column.nunique()
    return (n_distinct / n_rows) * 100

# 3. Consistency Score (Email format and Age range check)
def consistency_score(column, column_name):
    """Return the percentage of entries in ``column`` that satisfy its rule.

    Rules are selected by ``column_name``:

    * ``'Email'`` -- the whole value must match a simple
      ``local@domain.suffix`` pattern.
    * ``'Age'`` -- the value must lie in the inclusive range 18..100.
    * anything else -- no rule is applied and 100 is returned.

    Null entries never count as valid, so they lower the Email and Age
    scores.

    Parameters
    ----------
    column : pandas.Series
        The column to inspect.
    column_name : str
        Name used to pick the rule.

    Returns
    -------
    float or int
        Valid entries divided by ``len(column)``, times 100 (the literal
        ``100`` for columns without a rule).
    """
    if column_name == 'Email':
        # Simple email format check
        email_pattern = r'^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$'
        # fullmatch (not match): with re.match the trailing `$` also matches
        # just before a final newline, so 'user@example.com\n' used to be
        # accepted as well-formed.
        valid_emails = column.apply(
            lambda x: bool(re.fullmatch(email_pattern, str(x))) if pd.notnull(x) else False
        )
        consistency = (valid_emails.sum() / len(column)) * 100
    elif column_name == 'Age':
        # Age should be between 18 and 100 (both bounds included)
        valid_ages = column.apply(lambda x: 18 <= x <= 100 if pd.notnull(x) else False)
        consistency = (valid_ages.sum() / len(column)) * 100
    else:
        consistency = 100  # For other columns, assume they are consistent

    return consistency

# Calculate scores for each column.
# DataFrame.apply hands each column to the scorer as a Series; the consistency
# scorer additionally needs the column name to pick its rule.
completeness_scores = df_customers.apply(completeness_score)
uniqueness_scores = df_customers.apply(uniqueness_score)
consistency_scores = df_customers.apply(lambda col: consistency_score(col, col.name))

# Calculate overall score for the dataset: each dimension is averaged across
# columns, then the three averages are combined with the weights below.
# Example weightings for completeness, uniqueness, and consistency (can be adjusted)
weights = {'Completeness': 0.4, 'Uniqueness': 0.3, 'Consistency': 0.3}
overall_score = (
    completeness_scores.mean() * weights['Completeness'] +
    uniqueness_scores.mean() * weights['Uniqueness'] +
    consistency_scores.mean() * weights['Consistency']
)

# Print the results.
# sep="" stops print() from inserting its default single space between the
# heading (which ends in a newline) and the Series, which otherwise pushes the
# first row of every table one column to the right.
print("Completeness Scores:\n", completeness_scores, sep="")
print("\nUniqueness Scores:\n", uniqueness_scores, sep="")
print("\nConsistency Scores:\n", consistency_scores, sep="")
print("\nOverall Data Quality Score: {:.2f}".format(overall_score))








   


             
        
                          
                         


                     

                                # 1. Accuracy Score: Check for prices and quantities within realistic ranges

                                    # Product price should be in a reasonable range (example: 50 to 5000)
                
                                    
                                                
              

Completeness Scores:
 CustomerID    100.0
Name          100.0
Age            80.0
Email          80.0
Address       100.0
dtype: float64

Uniqueness Scores:
 CustomerID    100.0
Name          100.0
Age            80.0
Email          80.0
Address        60.0
dtype: float64

Consistency Scores:
 CustomerID    100.0
Name          100.0
Age            80.0
Email          60.0
Address       100.0
dtype: float64

Overall Data Quality Score: 88.40


In [5]:
import pandas as pd
import numpy as np
import datetime

# Small in-memory order table for an online shop, used by the dataset-level
# scores below. Dates are kept as 'YYYY-MM-DD' strings.
data_shop = {
    'OrderID': [101, 102, 103, 104, 105],
    # Missing customer ID for order 105
    'CustomerID': [1, 2, 3, 4, np.nan],
    'ProductID': [201, 202, 203, 204, 205],
    'ProductName': ['Laptop', 'Headphones', 'Smartphone', 'Keyboard', 'Monitor'],
    'Price': [1000, 200, 300, 150, 250],
    'Quantity': [2, 1, 3, 4, 5],
    'OrderDate': [
        '2024-04-01', '2024-04-03', '2024-04-02', '2024-04-01', '2024-04-02',
    ],
    'LastUpdated': [
        '2024-04-01', '2024-04-03', '2024-04-01', '2024-04-02', '2024-04-02',
    ],
}

# One row per order, one column per key of `data_shop`.
df_shop = pd.DataFrame(data=data_shop)

# 1. Accuracy Score: Check for prices and quantities within realistic ranges
def accuracy_score(df, price_range=(50, 5000), quantity_range=(1, 100)):
    """Return the percentage of price and quantity values inside their ranges.

    Every row contributes two checks (its ``Price`` and its ``Quantity``), so
    the score is the number of passing checks divided by ``2 * len(df)``.
    ``Series.between`` is inclusive on both ends and treats missing values as
    failing.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Price`` and ``Quantity`` columns.
    price_range : tuple, optional
        ``(low, high)`` bounds accepted for ``Price``. Defaults to
        ``(50, 5000)``.
    quantity_range : tuple, optional
        ``(low, high)`` bounds accepted for ``Quantity``. Defaults to
        ``(1, 100)``.

    Returns
    -------
    float
        Passing checks as a percentage of all checks.
    """
    price_low, price_high = price_range
    quantity_low, quantity_high = quantity_range

    price_valid = df['Price'].between(price_low, price_high).sum()
    quantity_valid = df['Quantity'].between(quantity_low, quantity_high).sum()

    # Accuracy score based on valid price and quantity
    accuracy = (price_valid + quantity_valid) / (2 * len(df)) * 100
    return accuracy

# 2. Timeliness Score: Share of orders whose OrderDate falls within the last 30 days (LastUpdated does not enter the score)
def timeliness_score(df, reference_date=None, max_age_days=30):
    """Return the percentage of rows whose ``OrderDate`` is recent.

    A row counts as timely when its ``OrderDate`` is strictly later than
    ``reference_date - max_age_days``. There is no upper bound, so dates after
    ``reference_date`` also count as timely. Only ``OrderDate`` takes part in
    the score; ``LastUpdated`` is not consulted.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``OrderDate`` column that ``pd.to_datetime`` can
        parse. The frame is left unmodified.
    reference_date : datetime-like, optional
        The point in time the window is measured back from. Defaults to the
        current local time; pass a fixed value to get a reproducible score.
    max_age_days : int, optional
        Length of the window in days. Defaults to 30.

    Returns
    -------
    float
        Timely rows divided by ``len(df)``, times 100.
    """
    if reference_date is None:
        reference_date = datetime.datetime.today()
    cutoff = pd.Timestamp(reference_date) - pd.Timedelta(days=max_age_days)

    # Parse into a local Series rather than assigning back into df, so the
    # caller's columns keep whatever dtype they had before the call.
    order_dates = pd.to_datetime(df['OrderDate'])

    timeliness = ((order_dates > cutoff).sum() / len(df)) * 100
    return timeliness

# 3. Integrity Score: Share of non-missing cells across the whole table (e.g., a missing CustomerID lowers it)
def integrity_score(df):
    """Return the percentage of cells in ``df`` that are not null.

    Parameters
    ----------
    df : pandas.DataFrame
        The table to inspect; every column is included.

    Returns
    -------
    float
        ``(cells - null cells) / cells * 100``.
    """
    cell_count = df.size
    # First sum gives nulls per column, second sum totals them for the frame.
    null_count = df.isnull().sum().sum()

    populated_count = cell_count - null_count
    return (populated_count / cell_count) * 100

# Run the three dataset-level checks on the shop table, in this order.
accuracy = accuracy_score(df_shop)
timeliness = timeliness_score(df_shop)
integrity = integrity_score(df_shop)

# Combine them into one figure as a weighted sum (example weights).
weights = {'Accuracy': 0.4, 'Timeliness': 0.3, 'Integrity': 0.3}
overall_score = accuracy * weights['Accuracy']
overall_score += timeliness * weights['Timeliness']
overall_score += integrity * weights['Integrity']

# Report each score, then the combined one, to two decimal places.
print(f"Accuracy Score: {accuracy:.2f}%")
print(f"Timeliness Score: {timeliness:.2f}%")
print(f"Integrity Score: {integrity:.2f}%")
print(f"Overall Data Quality Score: {overall_score:.2f}%")


Accuracy Score: 100.00%
Timeliness Score: 0.00%
Integrity Score: 97.50%
Overall Data Quality Score: 69.25%
