In [1]:
import pandas as pd
import numpy as np
import hashlib
import os

# --- Configuration (point the workbook path at your own environment) ---
fn = r"Uganda_MFI_Credit_Scoring_Synthetic_With_IDs.xlsx"
pii_columns = ["national_id", "phone_number", "applicant_name", "applicant_id"]
categorical_cols_to_check = ["gender", "age_band", "home_ownership", "loan_status"]

# Read the "Applicants" sheet, then cast every PII / categorical column that is
# actually present to pandas' string dtype and trim surrounding whitespace.
df = pd.read_excel(fn, sheet_name="Applicants", engine="openpyxl")
for c in (*pii_columns, *categorical_cols_to_check):
    if c not in df.columns:
        continue  # column absent from this workbook; nothing to normalise
    df[c] = df[c].astype("string").str.strip()


def _case_variant_groups(series: pd.Series) -> list:
    """Return groups of distinct values differing only by case or outer whitespace.

    Every distinct non-null value is keyed by ``str(value).strip().casefold()``.
    Only keys shared by two or more distinct values are returned, each as a
    list of the original values.
    """
    groups: dict = {}
    for value in series.dropna().unique():
        groups.setdefault(str(value).strip().casefold(), []).append(value)
    return [members for members in groups.values() if len(members) > 1]


def document_data_issues(
    dataframe: pd.DataFrame,
    pii_cols: list,
    categorical_cols: list,
    *,
    high_cardinality_threshold: int = 10,
) -> None:
    """
    Print a structured report documenting data volume, inconsistencies, and privacy risks.

    The report is written to stdout in three sections:

    1. Volume: row count, column count and deep memory usage in MB.
    2. Inconsistencies: exact duplicate rows, per-column null counts, and for
       each categorical column its number of unique values, a warning when
       that number exceeds ``high_cardinality_threshold``, and a warning for
       every group of values that differ only by case or surrounding
       whitespace (e.g. ``'Male'`` vs ``'male'``).
    3. Privacy: which of the listed PII columns are present, plus the shape
       the frame would have with those columns dropped.

    Args:
        dataframe: The data to describe. It is not modified.
        pii_cols: Column names considered personally identifiable. Names not
            present in ``dataframe`` are skipped.
        categorical_cols: Column names to examine for cardinality and value
            variants. Names not present in ``dataframe`` are skipped.
        high_cardinality_threshold: A categorical column with strictly more
            unique values than this gets a high-cardinality warning.

    Returns:
        None. All results are printed.
    """
    banner = "=" * 80
    rule = "-" * 40

    print(banner)
    print("           DATA QUALITY AND RISK DOCUMENTATION REPORT           ")
    print(banner)

    # --- 1. Documenting Volume ---
    print("\n--- 1. Data Volume and Dimensions ---")
    print(f"Total Rows (Applicants): {dataframe.shape[0]}")
    print(f"Total Columns (Features): {dataframe.shape[1]}")
    print(f"Estimated Memory Usage: {dataframe.memory_usage(index=True, deep=True).sum() / (1024**2):.2f} MB")
    print(rule)

    # --- 2. Documenting Inconsistencies ---
    print("\n--- 2. Data Inconsistencies and Quality Issues ---")

    # A. Duplicates (rows identical across every column)
    duplicates_count = dataframe.duplicated().sum()
    print(f"  > Exact Duplicate Rows Found: {duplicates_count}")

    # B. Missing Values (Nulls) -- only columns with at least one null are listed
    missing_data_summary = dataframe.isnull().sum()
    cols_with_missing = missing_data_summary[missing_data_summary > 0]
    print(f"  > Columns with Missing Values: {len(cols_with_missing)} found")
    if not cols_with_missing.empty:
        print(cols_with_missing.to_string())

    # C. Categorical Inconsistencies (Variations/Typos)
    print("\n  > Categorical Feature Examination (Unique values count):")
    for col in categorical_cols:
        if col not in dataframe.columns:
            continue
        unique_counts = dataframe[col].nunique()
        print(f"    - '{col}': {unique_counts} unique values found.")
        if unique_counts > high_cardinality_threshold:
            # More distinct values than expected for a categorical feature.
            print("      [!] Warning: High cardinality might indicate input inconsistencies or typos.")
        # A raw unique count cannot tell 'Male' from 'male'; surface such
        # near-duplicates explicitly so they can be normalised.
        for variants in _case_variant_groups(dataframe[col]):
            print(f"      [!] Warning: values differing only by case/whitespace: {variants}")

    print(rule)

    # --- 3. Documenting Privacy Risks ---
    print("\n--- 3. Privacy Risks (PII - Personally Identifiable Information) ---")

    print(f"  > Identified PII Columns: {pii_cols}")

    # Only PII columns that actually exist in the dataframe are reported.
    present_pii = [c for c in pii_cols if c in dataframe.columns]
    for col in present_pii:
        print(f"    - Risk Column '{col}': Exists in current dataset.")

    print("\n  > Mitigation Strategy Used:")
    print("    - These columns must be dropped or hashed before model training to comply with privacy regulations.")
    # Show the shape of an example frame with the present PII columns removed.
    df_safe_example = dataframe.drop(columns=present_pii)
    print(f"    - Example safe dataframe shape (PII dropped): {df_safe_example.shape}")

    print(banner)


# Print the data-quality / privacy report for the loaded applicants frame.
document_data_issues(
    dataframe=df,
    pii_cols=pii_columns,
    categorical_cols=categorical_cols_to_check,
)


           DATA QUALITY AND RISK DOCUMENTATION REPORT           

--- 1. Data Volume and Dimensions ---
Total Rows (Applicants): 5000
Total Columns (Features): 15
Estimated Memory Usage: 2.47 MB
----------------------------------------

--- 2. Data Inconsistencies and Quality Issues ---
  > Exact Duplicate Rows Found: 0
  > Columns with Missing Values: 0 found

  > Categorical Feature Examination (Unique values count):
    - 'gender': 2 unique values found.
    - 'age_band': 2 unique values found.
    - 'home_ownership': 3 unique values found.
    - 'loan_status': 2 unique values found.
----------------------------------------

--- 3. Privacy Risks (PII - Personally Identifiable Information) ---
  > Identified PII Columns: ['national_id', 'phone_number', 'applicant_name', 'applicant_id']
    - Risk Column 'national_id': Exists in current dataset.
    - Risk Column 'phone_number': Exists in current dataset.
    - Risk Column 'applicant_name': Exists in current dataset.
    - Risk Column