# NovaCred Data Quality Pipeline

## Purpose
This notebook is intentionally limited to three objectives:

1. Identify and document all data quality issues.
2. Quantify the extent of each issue.
3. Demonstrate remediation steps and their impact.

The core downstream outputs are kept: `applications_analysis.csv` and `spending_items_clean.csv`. Governance-heavy reference artifacts are removed from the default pipeline.

In [1]:
# Setup imports and notebook configuration
from pathlib import Path
import shutil
import sys

import pandas as pd

PROJECT_ROOT = Path.cwd().resolve().parent
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

from src import clean, config, flatten, io_utils, privacy, quality, schema

pd.set_option('display.max_columns', 30)
pd.set_option('display.width', 160)


## Load and flatten

In [2]:
# Load raw JSON, confirm top-level list semantics, and flatten the nested records
records = io_utils.load_raw_json(config.RAW_JSON_PATH)
assert isinstance(records, list), 'Raw JSON must be a top-level list.'

print('Raw records:', len(records))
print('Redacted raw sample:')
privacy.redact_record(records[0])

applications_df = flatten.flatten_applications(records)
spending_df = flatten.flatten_spending_items(records)

print('Application rows:', len(applications_df))
print('Spending rows:', len(spending_df))
privacy.safe_preview_df(applications_df, config.DIRECT_PII_COLUMNS, n=3)


Raw records: 502
Redacted raw sample:
Application rows: 502
Spending rows: 827


Unnamed: 0,application_row_id,application_id,raw_processing_timestamp,raw_applicant_full_name,raw_applicant_email,raw_applicant_ssn,raw_applicant_ip_address,raw_applicant_gender,raw_applicant_date_of_birth,raw_applicant_zip_code,raw_financial_annual_income,raw_financial_annual_salary,raw_financial_credit_history_months,raw_financial_debt_to_income,raw_financial_savings_balance,raw_decision_loan_approved,raw_decision_interest_rate,raw_decision_approved_amount,raw_decision_rejection_reason,raw_loan_purpose,raw_notes
0,0,app_200,2024-01-15T00:00:00Z,[REDACTED_NAME],j***@hotmail.com,***-**-4340,[REDACTED_IP],Male,2001-**-**,10036,73000,,23,0.2,31212,False,,,algorithm_risk_score,,
1,1,app_037,,[REDACTED_NAME],b***@yahoo.com,***-**-4784,[REDACTED_IP],M,1992-**-**,10032,78000,,51,0.18,17915,False,,,algorithm_risk_score,,
2,2,app_215,,[REDACTED_NAME],s***@mail.com,***-**-5178,[REDACTED_IP],Male,1989-**-**,10075,61000,,41,0.21,37909,True,3.7,59000.0,,vacation,


## Duplicate handling

In [3]:
# Classify duplicate application IDs and compute canonical analysis rows
duplicate_report_df, duplicate_metadata_df = quality.analyze_duplicate_ids(applications_df)

print('Unique application_id values:', applications_df['application_id'].nunique())
print('Duplicate application_id values:', len(duplicate_report_df))
print('Canonical rows for analysis:', int(duplicate_metadata_df['is_canonical_for_analysis'].sum()))
duplicate_report_df


Unique application_id values: 500
Duplicate application_id values: 2
Canonical rows for analysis: 500


Unnamed: 0,application_id,dup_count,classification,canonical_row_id,canonical_reason,example_differences
0,app_001,2,conflict,455,missing_or_unparseable_timestamp_fallback_max_...,raw_applicant_ssn|raw_applicant_ip_address|raw...
1,app_042,2,versioned,354,missing_or_unparseable_timestamp_fallback_max_...,raw_notes


## Pre-clean issues

In [4]:
# Build the minimal rule catalog and pre-clean quality report
rule_catalog_df = schema.build_rule_catalog()

application_flags_pre = schema.validate_applications_preclean(applications_df)
spending_flags_pre = schema.validate_spending_preclean(spending_df)

pre_quality_report_df = quality.build_data_quality_report(
    applications_df=applications_df,
    application_flags=application_flags_pre,
    duplicate_report=duplicate_report_df,
    duplicate_metadata=duplicate_metadata_df,
    spending_df=spending_df,
    spending_flags=spending_flags_pre,
    stage='pre',
    rule_catalog=rule_catalog_df,
)

pre_summary_df = (
    pre_quality_report_df.groupby('issue_group', as_index=False)[['count']]
    .sum()
    .sort_values('count', ascending=False)
)
print('Pre-clean issue groups:')
pre_summary_df


Pre-clean issue groups:


Unnamed: 0,issue_group,count
3,Privacy,497
0,Completeness,460
1,Consistency,312
2,Cross-field logic,29
4,Uniqueness,14
5,Validity,8


## Cleaning and standardisation

In [5]:
# Apply deterministic cleaning rules and assemble the restricted curated dataset
applications_clean_df = clean.clean_applications(applications_df)
spending_clean_df = clean.clean_spending_items(spending_df)

applications_curated_full_df = applications_clean_df.merge(
    duplicate_metadata_df,
    on=['application_row_id', 'application_id'],
    how='left',
    validate='one_to_one',
)

allowed_app_flags = {
    'gender_standardized_flag',
    'dob_parse_failed_flag',
    'dob_ambiguous_flag',
    'annual_income_from_salary_flag',
    'credit_history_nullified_flag',
    'dti_nullified_flag',
    'savings_nullified_flag',
    'approved_missing_terms_flag',
    'rejected_missing_reason_flag',
}
actual_app_flags = {column for column in applications_curated_full_df.columns if column.endswith('_flag')}
assert actual_app_flags == allowed_app_flags, f'Unexpected curated application flags: {sorted(actual_app_flags - allowed_app_flags)}'

required_duplicate_metadata = {'is_duplicate_id', 'is_canonical_for_analysis', 'has_conflict'}
assert required_duplicate_metadata.issubset(applications_curated_full_df.columns), 'Missing duplicate metadata columns in curated output.'

allowed_spending_flags = {'category_missing_flag', 'amount_invalid_flag', 'amount_negative_flag'}
actual_spending_flags = {column for column in spending_clean_df.columns if column.endswith('_flag')}
assert actual_spending_flags == allowed_spending_flags, f'Unexpected spending flags: {sorted(actual_spending_flags - allowed_spending_flags)}'

print('Restricted curated rows:', len(applications_curated_full_df))
privacy.safe_preview_df(applications_curated_full_df, config.DIRECT_PII_COLUMNS, n=3)


Restricted curated rows: 502


Unnamed: 0,application_row_id,application_id,raw_processing_timestamp,raw_applicant_full_name,raw_applicant_email,raw_applicant_ssn,raw_applicant_ip_address,raw_applicant_gender,raw_applicant_date_of_birth,raw_applicant_zip_code,raw_financial_annual_income,raw_financial_annual_salary,raw_financial_credit_history_months,raw_financial_debt_to_income,raw_financial_savings_balance,...,credit_history_nullified_flag,clean_credit_history_months,dti_nullified_flag,clean_debt_to_income,savings_nullified_flag,clean_savings_balance,clean_loan_approved,clean_interest_rate,clean_approved_amount,clean_rejection_reason,approved_missing_terms_flag,rejected_missing_reason_flag,is_duplicate_id,is_canonical_for_analysis,has_conflict
0,0,app_200,2024-01-15T00:00:00Z,[REDACTED_NAME],j***@hotmail.com,***-**-4340,[REDACTED_IP],Male,2001-**-**,10036,73000,,23,0.2,31212,...,False,23,False,0.2,False,31212.0,False,,,algorithm_risk_score,False,False,False,True,False
1,1,app_037,,[REDACTED_NAME],b***@yahoo.com,***-**-4784,[REDACTED_IP],M,1992-**-**,10032,78000,,51,0.18,17915,...,False,51,False,0.18,False,17915.0,False,,,algorithm_risk_score,False,False,False,True,False
2,2,app_215,,[REDACTED_NAME],s***@mail.com,***-**-5178,[REDACTED_IP],Male,1989-**-**,10075,61000,,41,0.21,37909,...,False,41,False,0.21,False,37909.0,True,3.7,59000.0,,False,False,False,True,False


## Post-clean remediation evidence

In [6]:
# Build the post-clean quality report and compact before-versus-after evidence table
application_flags_post = schema.validate_applications_postclean(applications_curated_full_df)
spending_flags_post = schema.validate_spending_postclean(spending_clean_df)

post_quality_report_df = quality.build_data_quality_report(
    applications_df=applications_curated_full_df,
    application_flags=application_flags_post,
    duplicate_report=duplicate_report_df,
    duplicate_metadata=duplicate_metadata_df,
    spending_df=spending_clean_df,
    spending_flags=spending_flags_post,
    stage='post',
    rule_catalog=rule_catalog_df,
)

data_quality_report_df = pd.concat([pre_quality_report_df, post_quality_report_df], ignore_index=True)

before_after_comparison_df = quality.build_before_after_comparison(
    quality_report=data_quality_report_df,
    duplicate_report=duplicate_report_df,
    duplicate_metadata=duplicate_metadata_df,
    total_records=len(applications_df),
    canonical_count=int(duplicate_metadata_df['is_canonical_for_analysis'].sum()),
)

post_summary_df = (
    post_quality_report_df.groupby('issue_group', as_index=False)[['count']]
    .sum()
    .sort_values('count', ascending=False)
)
print('Post-clean issue groups:')
post_summary_df


Post-clean issue groups:


Unnamed: 0,issue_group,count
3,Privacy,497
0,Completeness,460
1,Consistency,44
2,Cross-field logic,28
4,Uniqueness,14
5,Validity,4


In [7]:
# Display the compact before-versus-after remediation evidence table
before_after_comparison_df


Unnamed: 0,issue_group,rule_id,metric_label,pre_count,post_count,delta_count,pre_percent,post_percent,delta_percent
0,Completeness,R_APP_002,Missing required applicant fields,8,8,0,1.59,1.59,0.0
1,Validity,R_APP_005,Invalid email format,4,4,0,0.8,0.8,0.0
2,Consistency,R_APP_006,Gender requires standardisation,111,0,-111,22.11,0.0,-22.11
3,Consistency,R_APP_008,DOB not in ISO format,157,0,-157,31.27,0.0,-31.27
4,Consistency,R_APP_009,DOB ambiguity,39,39,0,7.77,7.77,0.0
5,,R_APP_010,Annual income coercion issue,0,0,0,0.0,0.0,0.0
6,Consistency,R_APP_011,Annual salary field drift,5,5,0,1.0,1.0,0.0
7,Validity,R_APP_012,Negative credit history months,2,0,-2,0.4,0.0,-0.4
8,Validity,R_APP_013,Negative savings balance,1,0,-1,0.2,0.0,-0.2
9,Validity,R_APP_014,Debt-to-income out of range,1,0,-1,0.2,0.0,-0.2


## Analysis outputs

In [8]:
# Build the PII-safe application analysis dataset, the cleaned spending dataset, and the minimal PII inventory
applications_analysis_df = privacy.build_analysis_dataset(applications_curated_full_df)
pii_inventory_df = privacy.generate_pii_inventory(
    curated_full_df=applications_curated_full_df,
    analysis_df=applications_analysis_df,
)

print('Analysis rows:', len(applications_analysis_df))
print('Analysis columns:', applications_analysis_df.columns.tolist())
privacy.safe_preview_df(applications_analysis_df, config.DIRECT_PII_COLUMNS, n=3)


Analysis rows: 500
Analysis columns: ['application_id', 'applicant_pseudo_id', 'pseudo_id_source', 'pseudo_id_fallback_used_flag', 'age_band', 'age_band_missing_flag', 'clean_gender', 'clean_zip_code', 'clean_annual_income', 'clean_credit_history_months', 'clean_debt_to_income', 'clean_savings_balance', 'clean_loan_approved', 'clean_interest_rate', 'clean_approved_amount', 'clean_rejection_reason']


Unnamed: 0,application_id,applicant_pseudo_id,pseudo_id_source,pseudo_id_fallback_used_flag,age_band,age_band_missing_flag,clean_gender,clean_zip_code,clean_annual_income,clean_credit_history_months,clean_debt_to_income,clean_savings_balance,clean_loan_approved,clean_interest_rate,clean_approved_amount,clean_rejection_reason
0,app_001,fc4fb76803a008529455aa4130a4c9f4a5f72f06f7ad43...,email_fallback,True,,True,,,102000.0,37,0.42,0.0,False,,,high_dti_ratio
1,app_002,7fa4238022da5aed441f8c48a907a8f3cbe88186049c8e...,ssn,False,25-34,False,Male,10020.0,41000.0,5,0.36,18200.0,False,,,algorithm_risk_score
2,app_003,e626311f310f7fb80415229777be761b877b33d42ddcc1...,ssn,False,35-44,False,Female,90213.0,65000.0,74,0.43,7090.0,True,3.4,76000.0,


In [9]:
# Preview the cleaned spending analysis table with its reduced flag set
print('Spending analysis rows:', len(spending_clean_df))
print('Spending columns:', spending_clean_df.columns.tolist())
spending_clean_df.head(5)


Spending analysis rows: 827
Spending columns: ['application_row_id', 'application_id', 'spending_index', 'raw_category', 'raw_amount', 'category_clean', 'category_missing_flag', 'amount_invalid_flag', 'amount_negative_flag', 'amount_clean']


Unnamed: 0,application_row_id,application_id,spending_index,raw_category,raw_amount,category_clean,category_missing_flag,amount_invalid_flag,amount_negative_flag,amount_clean
0,0,app_200,0,Shopping,480,Shopping,False,False,False,480
1,0,app_200,1,Rent,790,Rent,False,False,False,790
2,0,app_200,2,Alcohol,247,Alcohol,False,False,False,247
3,1,app_037,0,Rent,608,Rent,False,False,False,608
4,1,app_037,1,Dining,96,Dining,False,False,False,96


## Export and checks

In [10]:
# Write the reduced output set, remove deprecated directories on a best-effort basis, and run acceptance checks
io_utils.ensure_output_dirs()

io_utils.write_csv(applications_curated_full_df, config.APPLICATIONS_CURATED_FULL_PATH)
io_utils.write_csv(applications_analysis_df, config.APPLICATIONS_ANALYSIS_PATH)
io_utils.write_csv(spending_clean_df, config.SPENDING_ITEMS_CLEAN_PATH)
io_utils.write_csv(data_quality_report_df, config.DATA_QUALITY_REPORT_PATH)
io_utils.write_csv(before_after_comparison_df, config.BEFORE_AFTER_COMPARISON_PATH)
io_utils.write_csv(duplicate_report_df, config.DUPLICATE_ID_REPORT_PATH)
io_utils.write_csv(rule_catalog_df, config.RULE_CATALOG_PATH)
io_utils.write_csv(pii_inventory_df, config.PII_INVENTORY_PATH)

for path in [config.QUALITY_DIR / 'reports', config.QUALITY_DIR / 'catalogs', config.QUALITY_DIR / 'duplicates']:
    if path.is_dir():
        shutil.rmtree(path, ignore_errors=True)

expected_outputs = [
    config.APPLICATIONS_CURATED_FULL_PATH,
    config.APPLICATIONS_ANALYSIS_PATH,
    config.SPENDING_ITEMS_CLEAN_PATH,
    config.DATA_QUALITY_REPORT_PATH,
    config.BEFORE_AFTER_COMPARISON_PATH,
    config.DUPLICATE_ID_REPORT_PATH,
    config.RULE_CATALOG_PATH,
    config.PII_INVENTORY_PATH,
]
for output_path in expected_outputs:
    assert output_path.exists(), f'Missing output: {output_path}'

assert set(data_quality_report_df['stage']) == {'pre', 'post'}, 'Combined quality report must contain pre and post stages.'
assert 'issue_group' in data_quality_report_df.columns, 'Combined quality report must include issue_group.'
assert not before_after_comparison_df.empty, 'Before/after comparison must be non-empty.'
assert duplicate_report_df is not None, 'Duplicate report must be produced.'
assert applications_analysis_df['application_id'].is_unique, 'Analysis dataset must have unique application_id.'
assert 'applicant_pseudo_id' in applications_analysis_df.columns, 'Missing applicant_pseudo_id in analysis dataset.'
assert not any(column.startswith('raw_') for column in applications_analysis_df.columns), 'Analysis dataset must not contain raw_* columns.'
allowed_analysis_flags = {'age_band_missing_flag', 'pseudo_id_fallback_used_flag'}
actual_analysis_flags = {column for column in applications_analysis_df.columns if column.endswith('_flag')}
assert actual_analysis_flags == allowed_analysis_flags, f'Unexpected analysis flags: {sorted(actual_analysis_flags - allowed_analysis_flags)}'
for forbidden_column in config.DIRECT_PII_COLUMNS:
    assert forbidden_column not in applications_analysis_df.columns, f'Forbidden PII column present: {forbidden_column}'

assert {'category_missing_flag', 'amount_invalid_flag', 'amount_negative_flag'} == {column for column in spending_clean_df.columns if column.endswith('_flag')}, 'Unexpected spending flags in cleaned spending dataset.'

for _, row in duplicate_report_df.iterrows():
    app_rows = applications_df.loc[applications_df['application_id'] == row['application_id']].copy()
    parsed_ts = pd.to_datetime(app_rows['raw_processing_timestamp'], errors='coerce', utc=True)
    if parsed_ts.notna().any():
        latest_ts = parsed_ts.max()
        expected_row_id = int(app_rows.loc[parsed_ts == latest_ts, 'application_row_id'].max())
    else:
        expected_row_id = int(app_rows['application_row_id'].max())
    assert int(row['canonical_row_id']) == expected_row_id, f'Canonical selection mismatch for {row["application_id"]}'

print('Outputs written:')
for output_path in expected_outputs:
    print('-', output_path)


Outputs written:
- C:\Users\conno\dev\db\nova_dego\DEGO_Project_Group03-1\data\curated\applications_curated_full.csv
- C:\Users\conno\dev\db\nova_dego\DEGO_Project_Group03-1\data\curated\applications_analysis.csv
- C:\Users\conno\dev\db\nova_dego\DEGO_Project_Group03-1\data\curated\spending_items_clean.csv
- C:\Users\conno\dev\db\nova_dego\DEGO_Project_Group03-1\data\quality\data_quality_report.csv
- C:\Users\conno\dev\db\nova_dego\DEGO_Project_Group03-1\data\quality\before_after_comparison.csv
- C:\Users\conno\dev\db\nova_dego\DEGO_Project_Group03-1\data\quality\duplicate_id_report.csv
- C:\Users\conno\dev\db\nova_dego\DEGO_Project_Group03-1\data\quality\rule_catalog.csv
- C:\Users\conno\dev\db\nova_dego\DEGO_Project_Group03-1\data\quality\pii_inventory.csv
