In [None]:
"""
================================================================================
SODA CORE - FIXED INSTALLATION FOR COLAB (CLEAN VERSION)
================================================================================
Auto-save to Google Drive | Fixed NumPy incompatibility
================================================================================
"""

# Banner for the setup cell.
print("="*80)
print("üîß SODA CORE - SETUP & INSTALLATION")
print("="*80)

# ============================================
# STEP 1: MOUNT GOOGLE DRIVE
# ============================================
print("\nüìÇ Mounting Google Drive...")
print("-" * 80)

# `google.colab` is provided by the Colab runtime; `os` is used by the
# directory and symlink setup that follows in this cell.
from google.colab import drive
import os

# force_remount=False: do not force a remount when /content/drive is
# already mounted.
drive.mount('/content/drive', force_remount=False)
print("‚úÖ Google Drive mounted!")

# Setup project paths: the project directory lives on Google Drive and
# LOCAL_PROJECT_PATH is a short local alias (symlink) pointing at it.
DRIVE_PROJECT_PATH = '/content/drive/MyDrive/soda_project'
LOCAL_PROJECT_PATH = '/content/soda_project'

# Create directories in Google Drive (exist_ok=True makes re-runs harmless)
dirs_to_create = ['data', 'checks', 'reports', 'config']
for dir_name in dirs_to_create:
    os.makedirs(f"{DRIVE_PROJECT_PATH}/{dir_name}", exist_ok=True)
    print(f"‚úì Created: {DRIVE_PROJECT_PATH}/{dir_name}")

# Create symlink for easy access.
# Whatever currently occupies LOCAL_PROJECT_PATH is removed first.
# os.path.exists() follows symlinks, so it reports False for a dangling link
# and os.symlink() would then fail with FileExistsError; lexists()/islink()
# inspect the path itself instead.
import shutil

if os.path.isdir(LOCAL_PROJECT_PATH) and not os.path.islink(LOCAL_PROJECT_PATH):
    # A real directory: remove it recursively.
    shutil.rmtree(LOCAL_PROJECT_PATH)
elif os.path.lexists(LOCAL_PROJECT_PATH):
    # A symlink (valid or dangling) or a plain file: remove only that entry,
    # never the target it points to.
    os.unlink(LOCAL_PROJECT_PATH)
os.symlink(DRIVE_PROJECT_PATH, LOCAL_PROJECT_PATH)
print(f"\n‚úÖ Project linked: {LOCAL_PROJECT_PATH} ‚Üí {DRIVE_PROJECT_PATH}")

# ============================================
# STEP 2: UNINSTALL CONFLICTING PACKAGES
# ============================================
print("\nüóëÔ∏è  Cleaning up conflicting packages...")
print("-" * 80)

# NOTE: lines starting with `!` are IPython/Colab shell escapes, not Python,
# so this cell only runs inside a notebook kernel.
# `2>/dev/null` discards pip's stderr; the success message below is printed
# unconditionally, whatever pip's exit status was.
!pip uninstall -y numpy pandas soda-pandas-dask 2>/dev/null
print("‚úÖ Old packages removed")

# ============================================
# STEP 3: INSTALL COMPATIBLE VERSIONS
# ============================================
print("\nüì¶ Installing compatible versions...")
print("-" * 80)

# Pinned NumPy and pandas versions are installed first; soda-pandas-dask is
# then installed from the Soda package index given via `-i`.
!pip install -q numpy==1.26.4
!pip install -q pandas==2.2.3
!pip install -q -i https://pypi.cloud.soda.io soda-pandas-dask

# Printed unconditionally; the import check in the next step is what
# actually verifies the installation.
print("‚úÖ Compatible versions installed!")

# ============================================
# STEP 4: VERIFY INSTALLATION
# ============================================
print("\nüîç Verifying installation...")
print("-" * 80)

import importlib
import sys

# Forget already-imported numpy/pandas/soda modules so the imports below
# load the freshly installed versions instead of the cached ones.
_stale_prefixes = ('numpy', 'pandas', 'soda')
_stale_modules = [
    loaded for loaded in list(sys.modules.keys())
    if loaded.startswith(_stale_prefixes)
]
for stale_name in _stale_modules:
    del sys.modules[stale_name]

try:
    import pandas as pd
    import numpy as np
    from soda.scan import Scan

    numpy_version = np.__version__
    pandas_version = pd.__version__
    print(f"‚úÖ NumPy: {numpy_version}")
    print(f"‚úÖ Pandas: {pandas_version}")
    print("‚úÖ Soda Core: Imported successfully!")

    # Smoke test: build a tiny frame to confirm pandas works end to end
    test_df = pd.DataFrame({'test': [1, 2, 3]})
    print(f"‚úÖ Test DataFrame created: {test_df.shape}")

except Exception as import_error:
    # Report the problem, show the recovery steps, then stop the cell.
    print(f"‚ùå Error: {import_error}")
    for hint in (
        "\n‚ö†Ô∏è  If error persists:",
        "   1. Runtime ‚Üí Restart runtime",
        "   2. Run this cell again",
    ):
        print(hint)
    sys.exit(1)

# ============================================
# STEP 5: LOAD MBG DATA
# ============================================
print("\nüìä Loading MBG data...")
print("-" * 80)

df = None
# Candidate locations, tried in order; the first readable file wins.
possible_paths = [
    '/content/drive/MyDrive/Magister SI TelU/Semester 1/Analisis Data dan Perusahaan/Tugas Besar/Progress Week 9/mbg_data_clean.csv',
    '/content/drive/MyDrive/mbg_data_clean.csv',
    '/content/mbg_data_clean.csv',
]

for candidate in possible_paths:
    if not os.path.exists(candidate):
        continue
    try:
        df = pd.read_csv(candidate, encoding='utf-8-sig')
        row_total, column_total = df.shape
        print(f"\n‚úÖ Data loaded from: {candidate}")
        print(f"   üìä Shape: {row_total:,} rows √ó {column_total} columns")
        break
    except Exception as read_error:
        # An unreadable file is reported and the next candidate is tried.
        print(f"‚ö†Ô∏è  Could not read {candidate}: {read_error}")

# Fallback: none of the candidates produced a frame, so ask for an upload.
if df is None:
    print("\n‚ö†Ô∏è  Data file not found")
    print("üì§ Please upload your CSV file:")
    from google.colab import files
    uploaded = files.upload()
    if uploaded:
        # Only the first uploaded file is used.
        filename = next(iter(uploaded.keys()))
        df = pd.read_csv(filename, encoding='utf-8-sig')
        print(f"\n‚úÖ Loaded: {df.shape[0]:,} rows")

if df is not None:
    # Derived length metrics; missing text is counted as an empty string
    content_text = df['content'].fillna('')
    df['content_length'] = content_text.str.len()
    df['title_length'] = df['title'].fillna('').str.len()
    df['word_count'] = content_text.str.split().str.len()

    # Persist the frame (including the new metric columns) to Google Drive
    data_path = os.path.join(DRIVE_PROJECT_PATH, 'data', 'mbg_data.csv')
    df.to_csv(data_path, index=False, encoding='utf-8-sig')
    print(f"\nüíæ Data saved to Google Drive: {data_path}")

# ============================================
# STEP 6: CREATE CONFIGURATION
# ============================================
print("\n‚öôÔ∏è  Creating Soda configuration...")
print("-" * 80)

# Minimal configuration text declaring one pandas data source, `mbg_data`.
config = (
    "# Soda Configuration for MBG Data\n"
    "data_source mbg_data:\n"
    "  type: pandas\n"
)

config_path = os.path.join(DRIVE_PROJECT_PATH, 'config', 'configuration.yml')
with open(config_path, 'w') as config_file:
    config_file.write(config)
print(f"‚úÖ Config saved: {config_path}")

# ============================================
# STEP 7: DATA OVERVIEW
# ============================================
if df is not None:
    row_total = len(df)
    memory_mb = df.memory_usage(deep=True).sum() / 1024**2

    print("\n" + "="*80)
    print("üìä DATA OVERVIEW")
    print("="*80)

    print("\nüìã Dataset:")
    print(f"   ‚Ä¢ Rows: {row_total:,}")
    print(f"   ‚Ä¢ Columns: {len(df.columns)}")
    print(f"   ‚Ä¢ Memory: {memory_mb:.1f} MB")

    # One line per column: position, name, dtype and share of non-null values
    print("\nüìë Columns:")
    for position, column_name in enumerate(df.columns, 1):
        series = df[column_name]
        if row_total > 0:
            filled_pct = series.count() / row_total * 100
        else:
            filled_pct = 0
        dtype_label = str(series.dtype)
        print(f"   {position:2d}. {column_name:25s} {dtype_label:10s} "
              f"{filled_pct:5.1f}% complete")

    # Sample
    print("\nüìÑ Sample (first 3 rows):")
    print(df.head(3).to_string(index=False))

# ============================================
# COMPLETION
# ============================================
print("\n" + "="*80)
print("‚úÖ SETUP COMPLETE!")
print("="*80)

# Checklist of what the cell has set up.
for ready_line in (
    "\nüìã Ready:",
    "   ‚úì Soda Core installed (compatible versions)",
    "   ‚úì NumPy incompatibility fixed",
    "   ‚úì Project saved to Google Drive",
    f"   ‚úì Location: {DRIVE_PROJECT_PATH}",
):
    print(ready_line)
if df is not None:
    print(f"   ‚úì Data loaded ({len(df):,} rows)")

print("\nüéØ Next: Run Step 2 to define quality checks!")
print("="*80)

üîß SODA CORE - SETUP & INSTALLATION

üìÇ Mounting Google Drive...
--------------------------------------------------------------------------------
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
‚úÖ Google Drive mounted!
‚úì Created: /content/drive/MyDrive/soda_project/data
‚úì Created: /content/drive/MyDrive/soda_project/checks
‚úì Created: /content/drive/MyDrive/soda_project/reports
‚úì Created: /content/drive/MyDrive/soda_project/config

‚úÖ Project linked: /content/soda_project ‚Üí /content/drive/MyDrive/soda_project

üóëÔ∏è  Cleaning up conflicting packages...
--------------------------------------------------------------------------------
Found existing installation: numpy 1.26.4
Uninstalling numpy-1.26.4:
  Successfully uninstalled numpy-1.26.4
Found existing installation: pandas 2.2.3
Uninstalling pandas-2.2.3:
  Successfully uninstalled pandas-2.2.3
Found existing installation: soda-pandas-das

  __import__(_dependency)


‚úÖ NumPy: 1.26.4
‚úÖ Pandas: 2.2.3
‚úÖ Soda Core: Imported successfully!
‚úÖ Test DataFrame created: (3, 1)

üìä Loading MBG data...
--------------------------------------------------------------------------------

‚úÖ Data loaded from: /content/drive/MyDrive/Magister SI TelU/Semester 1/Analisis Data dan Perusahaan/Tugas Besar/Progress Week 9/mbg_data_clean.csv
   üìä Shape: 244 rows √ó 13 columns

üíæ Data saved to Google Drive: /content/drive/MyDrive/soda_project/data/mbg_data.csv

‚öôÔ∏è  Creating Soda configuration...
--------------------------------------------------------------------------------
‚úÖ Config saved: /content/drive/MyDrive/soda_project/config/configuration.yml

üìä DATA OVERVIEW

üìã Dataset:
   ‚Ä¢ Rows: 244
   ‚Ä¢ Columns: 13
   ‚Ä¢ Memory: 1.1 MB

üìë Columns:
    1. url                       object     100.0% complete
    2. title                     object     100.0% complete
    3. date                      object     100.0% complete
    4. author       

In [None]:
"""
================================================================================
STEP 2: DEFINE DATA QUALITY CHECKS (REVISED - STRICTER VERSION)
================================================================================
Purpose: Create more realistic quality check rules with tighter thresholds
Auto-saves to Google Drive
================================================================================
"""

import pandas as pd
import os
from datetime import datetime

print("="*80)
print("üìù STEP 2: DEFINING DATA QUALITY CHECKS (STRICT MODE)")
print("="*80)

# ============================================
# 2.1 LOAD DATA
# ============================================
print("\nüìä Loading MBG data...")
print("-" * 80)

PROJECT_PATH = '/content/soda_project'
data_path = f'{PROJECT_PATH}/data/mbg_data.csv'

# Guard clause: without the CSV written by Step 1 there is nothing to analyse.
if not os.path.exists(data_path):
    print("‚ùå Data file not found! Please run Step 1 first.")
    raise FileNotFoundError("mbg_data.csv not found")

df = pd.read_csv(data_path, encoding='utf-8-sig')
print(f"‚úÖ Data loaded: {len(df)} rows √ó {len(df.columns)} columns")

# ============================================
# 2.2 DEEP DATA ANALYSIS
# ============================================
print("\nüîç Performing deep data analysis...")
print("-" * 80)

# Length metrics (missing text counts as empty) and presence flags
content_text = df['content'].fillna('')
df['content_length'] = content_text.str.len()
df['title_length'] = df['title'].fillna('').str.len()
df['word_count'] = content_text.str.split().str.len()
df['has_date'] = df['date'].notna()
df['has_category'] = df['category'].notna()

# URL validation: starts with http:// or https://, and first occurrence only
if 'url' in df.columns:
    url_column = df['url']
    df['url_valid'] = url_column.fillna('').str.contains(r'^https?://', regex=True)
    df['url_unique'] = ~url_column.duplicated()

# Content quality score: one point for each criterion a row satisfies (0-3)
df['content_quality_score'] = (
    (df['content_length'] >= 1000).astype(int)
    + (df['word_count'] >= 150).astype(int)
    + (df['title_length'] >= 30).astype(int)
)

row_total = len(df)
memory_mb = df.memory_usage(deep=True).sum() / 1024**2

print("\nüìä Data Quality Metrics:")
print(f"   ‚Ä¢ Rows: {row_total:,}")
print(f"   ‚Ä¢ Columns: {len(df.columns)}")
print(f"   ‚Ä¢ Memory: {memory_mb:.2f} MB")

print("\nüìã Column Completeness:")
for col in ['title', 'content', 'url', 'date', 'category']:
    if col not in df.columns:
        continue
    present = df[col].notna().sum()
    completeness = (present / len(df) * 100)
    print(f"   ‚Ä¢ {col:15s}: {completeness:6.2f}% complete ({present:,}/{len(df):,})")

print("\nüìê Content Statistics:")
for stat_label, stat_column in (
    ("Content length", 'content_length'),
    ("Title length", 'title_length'),
    ("Word count", 'word_count'),
):
    values = df[stat_column]
    print(f"   ‚Ä¢ {stat_label} - Min: {values.min()}, Max: {values.max()}, Avg: {values.mean():.0f}")

if 'url' in df.columns:
    valid_total = df['url_valid'].sum()
    url_present = df['url'].notna().sum()
    unique_total = df['url_unique'].sum()
    print("\nüîó URL Quality:")
    print(f"   ‚Ä¢ Valid URLs: {valid_total:,}/{url_present:,} ({valid_total/url_present*100:.1f}%)")
    print(f"   ‚Ä¢ Unique URLs: {unique_total:,}/{len(df):,} ({unique_total/len(df)*100:.1f}%)")
    print(f"   ‚Ä¢ Duplicate URLs: {df['url'].duplicated().sum():,}")

# ============================================
# 2.3 CREATE STRICT QUALITY CHECKS
# ============================================
print("\n" + "="*80)
print("‚öôÔ∏è  CREATING STRICT QUALITY CHECK DEFINITIONS")
print("="*80)

# Distribution reference points for the length metrics.
# NOTE(review): these values are NOT interpolated into the YAML below; every
# threshold in the checks file is hard-coded.
content_p25 = df['content_length'].quantile(0.25)
content_p75 = df['content_length'].quantile(0.75)
title_p10 = df['title_length'].quantile(0.10)

# SodaCL checks file. Changes relative to the previous revision:
#   * Checks that carried both an inline threshold and a `warn:` alert now use
#     the documented alert-configuration form (no inline threshold, explicit
#     `warn`/`fail` levels); `fail` keeps the old pass boundary.
#   * The category rule used the reference-check syntax
#     (`values in (...) must exist in ...`), which compares against a column of
#     another dataset and cannot take a list literal; it is now an
#     `invalid_count` check with `valid values`, still at warning level.
# NOTE(review): `valid format: url` and `valid format: date` are left as
# written; confirm both names against SodaCL's list of supported valid formats
# for the pandas data source (a `valid regex` may be required instead).
checks_yaml = f"""# Data Quality Checks for MBG Dataset (STRICT MODE)
# Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
# Dataset: {len(df):,} rows √ó {len(df.columns)} columns
# Note: Stricter thresholds to identify real quality issues

checks for mbg_data:

  # ============================================================================
  # DIMENSION 1: COMPLETENESS (100% for critical fields)
  # ============================================================================

  - row_count > 0:
      name: Dataset must not be empty

  - missing_count(title) = 0:
      name: Title must be 100% complete (CRITICAL)

  - missing_count(content) = 0:
      name: Content must be 100% complete (CRITICAL)

  - missing_percent(url) = 0:
      name: URL must be 100% complete (STRICT)

  - missing_percent(date):
      name: Date must be 95%+ complete
      warn: when > 2%
      fail: when >= 5%

  - missing_percent(category):
      name: Category must be 90%+ complete
      warn: when > 5%
      fail: when >= 10%

  # ============================================================================
  # DIMENSION 2: VALIDITY (Format correctness)
  # ============================================================================

  - invalid_count(url) = 0:
      valid format: url
      name: All URLs must have valid format (STRICT)

  - invalid_percent(date):
      valid format: date
      name: Date format must be 95%+ valid
      warn: when > 2%
      fail: when >= 5%

  # ============================================================================
  # DIMENSION 3: ACCURACY (Realistic content ranges)
  # ============================================================================

  - min(content_length):
      fail: when < 500
      warn: when < 800
      name: Content minimum 500 characters (STRICT)

  - max(content_length):
      fail: when > 30000
      warn: when > 20000
      name: Content maximum 30k characters (reasonable limit)

  - avg(content_length):
      fail: when < 1500
      warn: when < 2000
      name: Average content must be 1500+ characters (quality threshold)

  - min(title_length):
      fail: when < 20
      warn: when < 30
      name: Title minimum 20 characters (STRICT)

  - max(title_length):
      fail: when > 200
      warn: when > 150
      name: Title maximum 200 characters

  - avg(title_length):
      fail: when < 40
      warn: when < 50
      name: Average title 40+ characters (quality threshold)

  - min(word_count):
      fail: when < 50
      warn: when < 100
      name: Minimum 50 words per article

  - avg(word_count):
      fail: when < 200
      warn: when < 300
      name: Average 200+ words per article (quality standard)

  # ============================================================================
  # DIMENSION 4: UNIQUENESS (No duplicates allowed)
  # ============================================================================

  - duplicate_count(url) = 0:
      name: URLs must be 100% unique (STRICT)

  - duplicate_percent(content) = 0:
      name: Content must be 100% unique (STRICT)

  - duplicate_percent(title):
      name: Title duplication must be under 2%
      warn: when > 1%
      fail: when >= 2%

  # ============================================================================
  # DIMENSION 5: CONSISTENCY (Cross-field validation)
  # ============================================================================

  - invalid_count(category):
      valid values: ['Politik', 'Ekonomi', 'Pendidikan', 'Kesehatan', 'Sosial']
      name: Category values must be from valid list
      warn: when > 0

  # ============================================================================
  # DIMENSION 6: SCHEMA VALIDATION
  # ============================================================================

  - schema:
      name: Schema validation (STRICT)
      fail:
        when required column missing:
          - title
          - content
          - url
        when wrong column type:
          title: text
          content: text
          url: text
"""

# Save checks. The text contains non-ASCII characters, so the encoding is
# pinned instead of relying on the platform default.
checks_path = f'{PROJECT_PATH}/checks/mbg_quality_checks_strict.yml'
with open(checks_path, 'w', encoding='utf-8') as f:
    f.write(checks_yaml)

print(f"‚úÖ Strict quality checks saved!")
print(f"   üìÅ {checks_path}")

# Write the frame, now carrying every derived metric column, next to the
# Step 1 data file.
enhanced_path = os.path.join(PROJECT_PATH, 'data', 'mbg_data_with_metrics.csv')
df.to_csv(enhanced_path, encoding='utf-8-sig', index=False)
print("\n‚úÖ Enhanced data with metrics saved!")
print(f"   üìÅ {enhanced_path}")

# ============================================
# SUMMARY
# ============================================
print("\n" + "="*80)
print("‚úÖ STRICT QUALITY CHECKS DEFINED!")
print("="*80)

# Static recap of the checks file; the text is fixed, not computed from it.
for summary_line in (
    "\nüìã Quality Dimensions Covered:",
    "   1. ‚úì COMPLETENESS - 6 strict checks (100% for critical fields)",
    "   2. ‚úì VALIDITY - 2 format checks",
    "   3. ‚úì ACCURACY - 8 range checks (realistic thresholds)",
    "   4. ‚úì UNIQUENESS - 3 duplication checks (0% tolerance)",
    "   5. ‚úì CONSISTENCY - 1 cross-field check",
    "   6. ‚úì SCHEMA - 1 structure check",
    "\n   üìä Total: 21 quality checks (vs 14 in original)",
):
    print(summary_line)

for change_line in (
    "\n‚ö†Ô∏è  Changes from Original:",
    "   ‚Ä¢ URL completeness: 95% ‚Üí 100% (stricter)",
    "   ‚Ä¢ Content min: 100 ‚Üí 500 chars (more realistic)",
    "   ‚Ä¢ Content avg: 500 ‚Üí 1500 chars (quality threshold)",
    "   ‚Ä¢ Title min: 10 ‚Üí 20 chars (stricter)",
    "   ‚Ä¢ Added: title length checks",
    "   ‚Ä¢ Added: word count checks",
    "   ‚Ä¢ Added: consistency checks",
    "   ‚Ä¢ Content duplication: 5% ‚Üí 0% (zero tolerance)",
):
    print(change_line)

for outcome_line in (
    "\nüéØ Expected Outcome:",
    "   ‚ö†Ô∏è  Some checks will FAIL ‚Üí realistic quality assessment",
    "   ‚úÖ Identifies actual data quality issues",
    "   üìä Provides actionable improvement insights",
):
    print(outcome_line)

print("\nüéØ Next: Run Step 3 to execute strict validation!")
print("="*80)

üìù STEP 2: DEFINING DATA QUALITY CHECKS (STRICT MODE)

üìä Loading MBG data...
--------------------------------------------------------------------------------
‚úÖ Data loaded: 244 rows √ó 13 columns

üîç Performing deep data analysis...
--------------------------------------------------------------------------------

üìä Data Quality Metrics:
   ‚Ä¢ Rows: 244
   ‚Ä¢ Columns: 18
   ‚Ä¢ Memory: 1.08 MB

üìã Column Completeness:
   ‚Ä¢ title          : 100.00% complete (244/244)
   ‚Ä¢ content        : 100.00% complete (244/244)
   ‚Ä¢ url            : 100.00% complete (244/244)
   ‚Ä¢ date           : 100.00% complete (244/244)
   ‚Ä¢ category       : 100.00% complete (244/244)

üìê Content Statistics:
   ‚Ä¢ Content length - Min: 296, Max: 6378, Avg: 2211
   ‚Ä¢ Title length - Min: 27, Max: 100, Avg: 70
   ‚Ä¢ Word count - Min: 42, Max: 837, Avg: 298

üîó URL Quality:
   ‚Ä¢ Valid URLs: 244/244 (100.0%)
   ‚Ä¢ Unique URLs: 244/244 (100.0%)
   ‚Ä¢ Duplicate URLs: 0

‚öôÔ∏è  CREA

In [None]:
"""
================================================================================
STEP 3 PART 1: SODA CORE VALIDATION - SETUP & COMPLETENESS
================================================================================
"""

import pandas as pd
import os
import json
from datetime import datetime

print("="*80)
print("üî¨ STEP 3 PART 1: SODA CORE VALIDATION - SETUP")
print("="*80)

# ============================================
# 3.1 LOAD DATA
# ============================================
print("\nüìä Loading data with metrics...")
print("-" * 80)

PROJECT_PATH = '/content/soda_project'
data_path = f'{PROJECT_PATH}/data/mbg_data_with_metrics.csv'

df = pd.read_csv(data_path, encoding='utf-8-sig')
print(f"‚úÖ Data loaded: {len(df):,} rows √ó {len(df.columns)} columns")

# Boolean indicator columns used by the checks further down
required_fields = ['title', 'content', 'url', 'date', 'category']
df['has_long_content'] = df['content_length'].ge(2000)
df['has_quality_title'] = df['title_length'].ge(50)
df['is_complete_record'] = df[required_fields].notna().all(axis=1)
if 'url' in df.columns:
    df['has_https'] = df['url'].str.startswith('https://')
else:
    df['has_https'] = False

long_total = df['has_long_content'].sum()
quality_title_total = df['has_quality_title'].sum()
complete_total = df['is_complete_record'].sum()

print("\nüìä Quick Quality Overview:")
print(f"   ‚Ä¢ Long content (‚â•2000 chars): {long_total:,} ({long_total/len(df)*100:.1f}%)")
print(f"   ‚Ä¢ Quality titles (‚â•50 chars): {quality_title_total:,} ({quality_title_total/len(df)*100:.1f}%)")
print(f"   ‚Ä¢ Complete records: {complete_total:,} ({complete_total/len(df)*100:.1f}%)")

# ============================================
# 3.2 VALIDATION SETUP
# ============================================
# Shared accumulator: one dict is appended per executed check.
validation_results = []

def add_result(dimension, check_name, passed, actual, expected, severity="error", details=""):
    """Record one check outcome and return its status string.

    Appends a dict with the keys ``dimension``, ``check``, ``status``,
    ``actual``, ``expected``, ``severity`` and ``details`` to the
    module-level ``validation_results`` list.

    Returns:
        ``"PASSED"`` when ``passed`` is truthy, otherwise ``"FAILED"``.
    """
    outcome = "FAILED"
    if passed:
        outcome = "PASSED"
    record = dict(
        dimension=dimension,
        check=check_name,
        status=outcome,
        actual=actual,
        expected=expected,
        severity=severity,
        details=details,
    )
    validation_results.append(record)
    return outcome

print("\n" + "="*80)
print("üîç RUNNING COMPREHENSIVE VALIDATION (ADJUSTED THRESHOLDS)")
print("="*80)

# ============================================================================
# DIMENSION 1: COMPLETENESS (Critical = 100%, Others realistic)
# ============================================================================
print("\n1Ô∏è‚É£  COMPLETENESS VALIDATION")
print("-" * 80)

# Status markers: error-level checks print the fail mark when they do not
# pass, warn-level checks print the warn mark.
_PASS_MARK = '‚úÖ'
_FAIL_MARK = '‚ùå'
_WARN_MARK = '‚ö†Ô∏è'

# 1.1 Dataset not empty
status = add_result(
    dimension='Completeness',
    check_name='Dataset not empty',
    passed=len(df) > 0,
    actual=f"{len(df):,} rows",
    expected="> 0 rows",
)
print(f"   {_PASS_MARK if status == 'PASSED' else _FAIL_MARK} Dataset not empty: {len(df):,} rows")

# 1.2 Title 100% complete (CRITICAL)
title_missing = df['title'].isna().sum()
title_complete = df['title'].notna().sum()
title_pct = title_complete / len(df) * 100
status = add_result(
    dimension='Completeness',
    check_name='Title 100% complete',
    passed=title_missing == 0,
    actual=f"{title_complete:,}/{len(df):,} ({title_pct:.2f}%)",
    expected="100%",
    details=f"{title_missing} missing titles",
)
print(f"   {_PASS_MARK if status == 'PASSED' else _FAIL_MARK} Title: {title_pct:.2f}%")

# 1.3 Content 100% complete (CRITICAL)
content_missing = df['content'].isna().sum()
content_complete = df['content'].notna().sum()
content_pct = content_complete / len(df) * 100
status = add_result(
    dimension='Completeness',
    check_name='Content 100% complete',
    passed=content_missing == 0,
    actual=f"{content_complete:,}/{len(df):,} ({content_pct:.2f}%)",
    expected="100%",
    details=f"{content_missing} missing content",
)
print(f"   {_PASS_MARK if status == 'PASSED' else _FAIL_MARK} Content: {content_pct:.2f}%")

# 1.4 URL 98%+ complete (STRICT)
if 'url' in df.columns:
    url_complete = df['url'].notna().sum()
    url_pct = url_complete / len(df) * 100
    # Severity is "error" only when the check fails; a passing result is
    # recorded with severity "warn".
    url_severity = "error" if url_pct < 98 else "warn"
    status = add_result(
        dimension='Completeness',
        check_name='URL 98%+ complete',
        passed=url_pct >= 98,
        actual=f"{url_complete:,}/{len(df):,} ({url_pct:.2f}%)",
        expected="‚â•98%",
        severity=url_severity,
        details=f"{df['url'].isna().sum()} missing URLs",
    )
    print(f"   {_PASS_MARK if status == 'PASSED' else _FAIL_MARK} URL: {url_pct:.2f}%")

# 1.5 Date 95%+ complete
if 'date' in df.columns:
    date_complete = df['date'].notna().sum()
    date_pct = date_complete / len(df) * 100
    status = add_result(
        dimension='Completeness',
        check_name='Date 95%+ complete',
        passed=date_pct >= 95,
        actual=f"{date_complete:,}/{len(df):,} ({date_pct:.2f}%)",
        expected="‚â•95%",
        severity="warn",
        details=f"{df['date'].isna().sum()} missing dates",
    )
    print(f"   {_PASS_MARK if status == 'PASSED' else _WARN_MARK} Date: {date_pct:.2f}%")

# 1.6 Category 90%+ complete
if 'category' in df.columns:
    cat_complete = df['category'].notna().sum()
    cat_pct = cat_complete / len(df) * 100
    status = add_result(
        dimension='Completeness',
        check_name='Category 90%+ complete',
        passed=cat_pct >= 90,
        actual=f"{cat_complete:,}/{len(df):,} ({cat_pct:.2f}%)",
        expected="‚â•90%",
        severity="warn",
        details=f"{df['category'].isna().sum()} missing categories",
    )
    print(f"   {_PASS_MARK if status == 'PASSED' else _WARN_MARK} Category: {cat_pct:.2f}%")

# 1.7 Complete records (all fields) ‚â•85%
complete_records = df['is_complete_record'].sum()
complete_pct = complete_records / len(df) * 100
status = add_result(
    dimension='Completeness',
    check_name='Complete records ‚â•85%',
    passed=complete_pct >= 85,
    actual=f"{complete_records:,}/{len(df):,} ({complete_pct:.2f}%)",
    expected="‚â•85%",
    severity="warn",
    details=f"{len(df) - complete_records:,} incomplete records",
)
print(f"   {_PASS_MARK if status == 'PASSED' else _WARN_MARK} Complete records: {complete_pct:.2f}%")

print("\n‚úÖ Part 1 complete! Run Part 2 next...")
print("="*80)

"""
================================================================================
STEP 3 PART 2: VALIDITY, ACCURACY & UNIQUENESS VALIDATION (ADJUSTED)
================================================================================
Adjusted thresholds:
- Content min: 100 chars (was 800)
- Title min: 10 chars (was 40)
- Word count min: 50 words (was 100)
- Word count avg: 250 words (was 300)
- Category: add valid categories that match the MBG data
================================================================================
"""

print("\n" + "="*80)
print("üîç PART 2: VALIDITY, ACCURACY & UNIQUENESS (ADJUSTED)")
print("="*80)

# ============================================================================
# DIMENSION 2: VALIDITY (Format & Type Correctness)
# ============================================================================
print("\n2Ô∏è‚É£  VALIDITY VALIDATION")
print("-" * 80)

_PASS_MARK = '‚úÖ'
_FAIL_MARK = '‚ùå'
_WARN_MARK = '‚ö†Ô∏è'

if 'url' in df.columns:
    url_values = df['url'].dropna()
    total_urls = df['url'].notna().sum()

    # 2.1 URL format validation (95%+ valid): must start with http(s)://
    valid_urls = url_values.str.contains(r'^https?://', regex=True, na=False).sum()
    url_validity_pct = (valid_urls / total_urls * 100) if total_urls > 0 else 0

    status = add_result(
        dimension='Validity',
        check_name='URL format 95%+ valid',
        passed=url_validity_pct >= 95,
        actual=f"{valid_urls:,}/{total_urls:,} ({url_validity_pct:.2f}%)",
        expected="‚â•95%",
        details=f"{total_urls - valid_urls} invalid URL formats",
    )
    print(f"   {_PASS_MARK if status == 'PASSED' else _FAIL_MARK} URL format: {url_validity_pct:.2f}%")

    # 2.2 HTTPS usage (70%+ - realistic threshold)
    https_urls = url_values.str.contains(r'^https://', regex=True, na=False).sum()
    https_pct = (https_urls / total_urls * 100) if total_urls > 0 else 0

    status = add_result(
        dimension='Validity',
        check_name='HTTPS usage ‚â•70%',
        passed=https_pct >= 70,
        actual=f"{https_urls:,}/{total_urls:,} ({https_pct:.2f}%)",
        expected="‚â•70%",
        severity="warn",
        details=f"{total_urls - https_urls} non-HTTPS URLs",
    )
    print(f"   {_PASS_MARK if status == 'PASSED' else _WARN_MARK} HTTPS: {https_pct:.2f}%")

# 2.3 Content not whitespace: every non-null content must have visible text
content_present = df['content'].notna().sum()
valid_content = df['content'].dropna().str.strip().str.len() > 0
valid_content_count = valid_content.sum()
# The percentage is relative to all rows, while the pass condition compares
# against the non-null rows only.
valid_content_pct = valid_content_count / len(df) * 100

status = add_result(
    dimension='Validity',
    check_name='Content not empty/whitespace',
    passed=valid_content_count == content_present,
    actual=f"{valid_content_count:,}/{content_present:,}",
    expected="100%",
    details=f"{content_present - valid_content_count} empty/whitespace",
)
print(f"   {_PASS_MARK if status == 'PASSED' else _FAIL_MARK} Valid content: {valid_content_pct:.2f}%")

# ============================================================================
# DIMENSION 3: ACCURACY (ADJUSTED THRESHOLDS)
# ============================================================================
print("\n3Ô∏è‚É£  ACCURACY VALIDATION (ADJUSTED THRESHOLDS)")
print("-" * 80)

_PASS_MARK = '‚úÖ'
_FAIL_MARK = '‚ùå'
_WARN_MARK = '‚ö†Ô∏è'

# Summary statistics, truncated to whole numbers by int()
content_lengths = df['content_length']
content_min = int(content_lengths.min())
content_max = int(content_lengths.max())
content_avg = int(content_lengths.mean())
content_median = int(content_lengths.median())

title_lengths = df['title_length']
title_min = int(title_lengths.min())
title_max = int(title_lengths.max())
title_avg = int(title_lengths.mean())

word_counts = df['word_count']
word_min = int(word_counts.min())
word_max = int(word_counts.max())
word_avg = int(word_counts.mean())

# 3.1 Content minimum (ADJUSTED: 100 chars - was 800)
status = add_result(
    dimension='Accuracy',
    check_name='Content min ‚â•100 chars',
    passed=content_min >= 100,
    actual=f"{content_min} chars",
    expected="‚â•100 chars",
    details=f"Shortest article: {content_min} chars",
)
print(f"   {_PASS_MARK if status == 'PASSED' else _FAIL_MARK} Content min: {content_min} chars")

# 3.2 Content maximum (reasonable limit)
status = add_result(
    dimension='Accuracy',
    check_name='Content max ‚â§25,000 chars',
    passed=content_max <= 25000,
    actual=f"{content_max:,} chars",
    expected="‚â§25,000",
    details=f"Longest article: {content_max:,} chars",
)
print(f"   {_PASS_MARK if status == 'PASSED' else _FAIL_MARK} Content max: {content_max:,} chars")

# 3.3 Content average (realistic: 1500 chars)
status = add_result(
    dimension='Accuracy',
    check_name='Content avg ‚â•1,500 chars',
    passed=content_avg >= 1500,
    actual=f"{content_avg:,} chars",
    expected="‚â•1,500",
    severity="warn",
    details=f"Average: {content_avg:,} chars",
)
print(f"   {_PASS_MARK if status == 'PASSED' else _WARN_MARK} Content avg: {content_avg:,} chars")

# 3.4 Content median
status = add_result(
    dimension='Accuracy',
    check_name='Content median ‚â•1,200 chars',
    passed=content_median >= 1200,
    actual=f"{content_median:,} chars",
    expected="‚â•1,200",
    severity="warn",
)
print(f"   {_PASS_MARK if status == 'PASSED' else _WARN_MARK} Content median: {content_median:,} chars")

# 3.5 Title minimum (ADJUSTED: 10 chars - was 40)
status = add_result(
    dimension='Accuracy',
    check_name='Title min ‚â•10 chars',
    passed=title_min >= 10,
    actual=f"{title_min} chars",
    expected="‚â•10",
    details=f"Shortest title: {title_min} chars",
)
print(f"   {_PASS_MARK if status == 'PASSED' else _FAIL_MARK} Title min: {title_min} chars")

# 3.6 Title maximum
status = add_result(
    dimension='Accuracy',
    check_name='Title max ‚â§150 chars',
    passed=title_max <= 150,
    actual=f"{title_max} chars",
    expected="‚â§150",
    severity="warn",
)
print(f"   {_PASS_MARK if status == 'PASSED' else _WARN_MARK} Title max: {title_max} chars")

# 3.7 Title average (realistic: 50 chars)
status = add_result(
    dimension='Accuracy',
    check_name='Title avg ‚â•50 chars',
    passed=title_avg >= 50,
    actual=f"{title_avg} chars",
    expected="‚â•50",
    severity="warn",
    details=f"Average title length: {title_avg} chars",
)
print(f"   {_PASS_MARK if status == 'PASSED' else _WARN_MARK} Title avg: {title_avg} chars")

# 3.8 Word count minimum (ADJUSTED: 50 words - was 100)
status = add_result(
    dimension='Accuracy',
    check_name='Word count min ‚â•50 words',
    passed=word_min >= 50,
    actual=f"{word_min} words",
    expected="‚â•50",
    details=f"Shortest article: {word_min} words",
)
print(f"   {_PASS_MARK if status == 'PASSED' else _FAIL_MARK} Word min: {word_min} words")

# 3.9 Word count average (ADJUSTED: 250 words - was 300)
status = add_result(
    dimension='Accuracy',
    check_name='Word count avg ‚â•250 words',
    passed=word_avg >= 250,
    actual=f"{word_avg} words",
    expected="‚â•250",
    details=f"Average word count: {word_avg} words",
)
print(f"   {_PASS_MARK if status == 'PASSED' else _FAIL_MARK} Word avg: {word_avg} words")

# 3.10 Quality content ratio (40% - adjusted)
long_content_total = df['has_long_content'].sum()
long_content_pct = long_content_total / len(df) * 100
status = add_result(
    dimension='Accuracy',
    check_name='Quality content ‚â•40% (‚â•2000 chars)',
    passed=long_content_pct >= 40,
    actual=f"{long_content_total:,}/{len(df):,} ({long_content_pct:.1f}%)",
    expected="‚â•40%",
    severity="warn",
    details=f"{long_content_pct:.1f}% meet long-form standard",
)
print(f"   {_PASS_MARK if status == 'PASSED' else _WARN_MARK} Quality content: {long_content_pct:.1f}%")

# ============================================================================
# DIMENSION 4: UNIQUENESS (ZERO TOLERANCE)
# ============================================================================
print("\n4Ô∏è‚É£  UNIQUENESS VALIDATION")
print("-" * 80)

# 4.1 URL uniqueness (100% - STRICT); only evaluated when a 'url' column exists
if 'url' in df.columns:
    dup_urls = df['url'].duplicated().sum()
    status = add_result(
        'Uniqueness', 'URL 100% unique',
        dup_urls == 0,
        f"{dup_urls:,} duplicates",
        "0",
        details=f"Found {dup_urls:,} duplicate URLs",
    )
    status_icon = '‚úÖ' if status == 'PASSED' else '‚ùå'
    print(f"   {status_icon} URL duplicates: {dup_urls:,}")

# 4.2 Content uniqueness (100% - STRICT): exact duplicates of the 'content' text
content_dups = df['content'].duplicated().sum()
content_dup_pct = content_dups / len(df) * 100
status = add_result(
    'Uniqueness', 'Content 100% unique',
    content_dups == 0,
    f"{content_dups:,} duplicates ({content_dup_pct:.2f}%)",
    "0",
    details=f"Found {content_dups:,} exact content duplicates",
)
status_icon = '‚úÖ' if status == 'PASSED' else '‚ùå'
print(f"   {status_icon} Content duplicates: {content_dups:,}")

# 4.3 Title duplication (<1%), reported as a warning rather than an error
title_dups = df['title'].duplicated().sum()
title_dup_pct = title_dups / len(df) * 100
status = add_result(
    'Uniqueness', 'Title duplication <1%',
    title_dup_pct < 1.0,
    f"{title_dups:,} duplicates ({title_dup_pct:.2f}%)",
    "<1%",
    severity="warn",
    details=f"Found {title_dups:,} duplicate titles",
)
status_icon = '‚úÖ' if status == 'PASSED' else '‚ö†Ô∏è'
print(f"   {status_icon} Title duplicates: {title_dup_pct:.2f}%")

# 4.4 Near-duplicates: titles that share their first 50 characters
if len(df) > 0:
    title_first_50 = df['title'].str.slice(stop=50)
    near_dups = title_first_50.duplicated().sum()
    near_dup_pct = near_dups / len(df) * 100
    status = add_result(
        'Uniqueness', 'Near-duplicate titles <3%',
        near_dup_pct < 3.0,
        f"{near_dups:,} similar ({near_dup_pct:.2f}%)",
        "<3%",
        severity="warn",
        details=f"Titles with same first 50 chars: {near_dups:,}",
    )
    status_icon = '‚úÖ' if status == 'PASSED' else '‚ö†Ô∏è'
    print(f"   {status_icon} Near-duplicates: {near_dup_pct:.2f}%")

print("\n‚úÖ Part 2 complete! Run Part 3 next...")
print("="*80)


"""
================================================================================
STEP 3 PART 3: CONSISTENCY & COMPREHENSIVE REPORT GENERATION
================================================================================
Kategori valid disesuaikan untuk data MBG:
- Politik, Ekonomi, Pendidikan, Kesehatan, Sosial, Teknologi, Olahraga,
  Hiburan, Internasional, Daerah
- PLUS: News, Rejabar, Ekonomi Syariah, Islam Digest, Visual
================================================================================
"""

print("\n" + "="*80)
print("üîç PART 3: CONSISTENCY & REPORT GENERATION")
print("="*80)

# ============================================================================
# DIMENSION 5: CONSISTENCY (ADJUSTED CATEGORIES)
# ============================================================================
print("\n5Ô∏è‚É£  CONSISTENCY VALIDATION")
print("-" * 80)

# 5.1 Category values (updated for MBG); only evaluated when a 'category' column exists
if 'category' in df.columns:
    # Standard categories plus the MBG-specific ones
    valid_categories = [
        'Politik', 'Ekonomi', 'Pendidikan', 'Kesehatan', 'Sosial',
        'Teknologi', 'Olahraga', 'Hiburan', 'Internasional', 'Daerah',
        # MBG-specific categories
        'News', 'Rejabar', 'Ekonomi Syariah', 'Islam Digest', 'Visual'
    ]

    # Drop missing values once and reuse the result for every lookup below.
    known_cats = df['category'].dropna()
    invalid_cats = known_cats[~known_cats.isin(valid_categories)]
    invalid_cat_count = len(invalid_cats)

    # List every distinct category found, with its row count and a valid/invalid mark.
    unique_cats = known_cats.unique()
    # One counting pass instead of one full-column comparison per category.
    cat_counts = known_cats.value_counts()
    print(f"   üìã Kategori ditemukan: {len(unique_cats)}")
    for cat in sorted(unique_cats):
        count = cat_counts[cat]
        valid_mark = '‚úì' if cat in valid_categories else '‚úó'
        print(f"      {valid_mark} {cat}: {count}")

    status = add_result('Consistency', 'Category values valid',
                       invalid_cat_count == 0,
                       f"{invalid_cat_count} invalid",
                       "0",
                       severity="warn",
                       details=f"Invalid: {invalid_cats.unique().tolist()[:10] if invalid_cat_count > 0 else 'None'}")
    print(f"\n   {'‚úÖ' if status == 'PASSED' else '‚ö†Ô∏è'} Invalid categories: {invalid_cat_count}")

# 5.2 Title-Content correlation (adjusted: >0.0 is more realistic)
title_content_corr = df[['title_length', 'content_length']].corr().iloc[0, 1]
# A correlation of exactly 0 or NaN (e.g. a constant column) is neither positive
# nor negative, so it gets its own label instead of being reported as "negative".
if title_content_corr > 0:
    corr_direction = 'positive'
elif title_content_corr < 0:
    corr_direction = 'negative'
else:
    corr_direction = 'no measurable'
status = add_result('Consistency', 'Title-Content correlation >0.0',
                   title_content_corr > 0.0,
                   f"{title_content_corr:.3f}",
                   ">0.0",
                   severity="warn",
                   details=f"Correlation shows {corr_direction} relationship")
print(f"   {'‚úÖ' if status == 'PASSED' else '‚ö†Ô∏è'} Title-Content corr: {title_content_corr:.3f}")

# ============================================
# 3.3 CALCULATE SUMMARY
# ============================================
print("\n" + "="*80)
print("üìä VALIDATION SUMMARY")
print("="*80)

# Tally outcomes purely from each result's 'status' field.
passed_checks = sum(1 for r in validation_results if r['status'] == 'PASSED')
failed_checks = sum(1 for r in validation_results if r['status'] == 'FAILED')
total_checks = len(validation_results)

# Every percentage falls back to 0 when no checks were recorded, so the summary
# prints instead of raising ZeroDivisionError.
passed_pct = (passed_checks / total_checks * 100) if total_checks > 0 else 0
failed_pct = (failed_checks / total_checks * 100) if total_checks > 0 else 0
quality_score = passed_pct

print(f"\nüìä Overall Results:")
print(f"   ‚Ä¢ Total Checks: {total_checks}")
print(f"   ‚Ä¢ ‚úÖ PASSED: {passed_checks} ({passed_pct:.1f}%)")
print(f"   ‚Ä¢ ‚ùå FAILED: {failed_checks} ({failed_pct:.1f}%)")
print(f"   ‚Ä¢ Quality Score: {quality_score:.1f}/100")

# Map the score to a grade label and the colour used for the report's score banner.
if quality_score >= 90:
    quality_grade = "EXCELLENT üåü"
    grade_color = "#27ae60"
elif quality_score >= 75:
    quality_grade = "GOOD ‚úÖ"
    grade_color = "#2ecc71"
elif quality_score >= 60:
    quality_grade = "ACCEPTABLE ‚ö†Ô∏è"
    grade_color = "#f39c12"
else:
    quality_grade = "NEEDS IMPROVEMENT ‚ùå"
    grade_color = "#e74c3c"

print(f"   ‚Ä¢ Grade: {quality_grade}")

# Failed checks by dimension, most failures first
if failed_checks > 0:
    print(f"\n‚ö†Ô∏è  Failed Checks by Dimension:")
    failed_by_dim = {}
    for r in validation_results:
        if r['status'] == 'FAILED':
            dim = r['dimension']
            failed_by_dim[dim] = failed_by_dim.get(dim, 0) + 1

    for dim, count in sorted(failed_by_dim.items(), key=lambda x: x[1], reverse=True):
        print(f"   ‚Ä¢ {dim}: {count} failed")

# Dimension scores: passed/total per dimension (also used when the JSON report is built)
print(f"\nüìà Quality by Dimension:")
dimensions = {}
for r in validation_results:
    dim_stats = dimensions.setdefault(r['dimension'], {'passed': 0, 'total': 0})
    dim_stats['total'] += 1
    if r['status'] == 'PASSED':
        dim_stats['passed'] += 1

for dim, stats in sorted(dimensions.items()):
    dim_score = (stats['passed'] / stats['total'] * 100) if stats['total'] > 0 else 0
    icon = '‚úÖ' if dim_score == 100 else '‚ö†Ô∏è' if dim_score >= 70 else '‚ùå'
    print(f"   {icon} {dim:20s}: {stats['passed']}/{stats['total']} ({dim_score:.1f}%)")

# ============================================
# 3.4 GENERATE HTML REPORT
# ============================================
print("\nüíæ Generating comprehensive HTML report...")

# Helper function for dimension cards
def generate_dimension_cards(results):
    """Render one HTML card per quality dimension, listing that dimension's checks.

    Args:
        results: Iterable of check dicts. Each must provide 'dimension',
            'check', 'status', 'actual' and 'expected'; 'severity' and
            'details' are optional and are skipped when missing or empty.

    Returns:
        The concatenated HTML of all cards, in order of each dimension's first
        appearance in ``results``. An empty string when ``results`` is empty.

    All interpolated values are HTML-escaped, because check names and expected
    values contain characters such as ``<`` (e.g. "Title duplication <1%") and
    details can carry text taken from the dataset.
    """
    # Local import; the function's old accumulator was itself named `html`,
    # so the accumulator is now `parts` to avoid shadowing the module.
    from html import escape

    # Group checks by dimension, keeping first-seen order.
    by_dimension = {}
    for r in results:
        by_dimension.setdefault(r['dimension'], []).append(r)

    parts = []
    for dim, checks in by_dimension.items():
        passed_count = sum(1 for c in checks if c['status'] == 'PASSED')
        total_count = len(checks)
        dim_score = (passed_count / total_count * 100) if total_count > 0 else 0

        # Card styling: all checks passed / at least 70% passed / otherwise.
        if dim_score == 100:
            icon = '‚úÖ'
            color = '#27ae60'
            card_class = 'passed'
        elif dim_score >= 70:
            icon = '‚ö†Ô∏è'
            color = '#f39c12'
            card_class = 'partial'
        else:
            icon = '‚ùå'
            color = '#e74c3c'
            card_class = 'failed'

        dim_name = escape(str(dim))
        parts.append(f'''
        <div class="dimension-card {card_class}" style="border-left-color: {color};">
            <div class="dim-header">
                <h3>{icon} {dim_name}</h3>
                <div class="dim-score" style="color: {color};">
                    {passed_count}/{total_count} ({dim_score:.0f}%)
                </div>
            </div>
            <div class="checks-grid">
        ''')

        for check in checks:
            status_text = escape(str(check['status']))
            # The lower-cased status doubles as the CSS class of the item and badge.
            status_class = escape(check['status'].lower())
            status_icon = '‚úÖ' if check['status'] == 'PASSED' else '‚ùå'

            severity = check.get('severity')
            if severity:
                severity_badge = f'<span class="severity-badge {escape(severity)}">{escape(severity.upper())}</span>'
            else:
                severity_badge = ''

            details = check.get('details')
            if details:
                details_html = f'<div class="check-info">{escape(str(details))}</div>'
            else:
                details_html = ''

            check_name = escape(str(check['check']))
            actual_text = escape(str(check['actual']))
            expected_text = escape(str(check['expected']))

            parts.append(f'''
                <div class="check-item {status_class}">
                    <div class="check-header">
                        <span class="check-name">{check_name}</span>
                        <div class="check-badges">
                            {severity_badge}
                            <span class="badge {status_class}">{status_icon} {status_text}</span>
                        </div>
                    </div>
                    <div class="check-details">
                        <div><span class="detail-label">Actual:</span> <span class="detail-value">{actual_text}</span></div>
                        <div><span class="detail-label">Expected:</span> <span class="detail-value">{expected_text}</span></div>
                    </div>
                    {details_html}
                </div>
            ''')

        parts.append('''
            </div>
        </div>
        ''')

    return "".join(parts)

dimension_cards_html = generate_dimension_cards(validation_results)

# Generate recommendations: up to five failed checks per severity, or a success
# banner when nothing failed.
from html import escape as _escape_html  # check text contains '<' (e.g. "Title duplication <1%")

recommendations_html = ""
if failed_checks > 0:
    critical_failures = [r for r in validation_results if r['status'] == 'FAILED' and r.get('severity') == 'error']
    warn_failures = [r for r in validation_results if r['status'] == 'FAILED' and r.get('severity') == 'warn']

    recommendations_html = '<div class="recommendations">'

    if critical_failures:
        recommendations_html += '<h3>üö® Critical Issues (Must Fix)</h3><ul class="critical-list">'
        for r in critical_failures[:5]:
            # Fall back to an actual-vs-expected sentence when the check has no details.
            # Built outside the <li> f-string so no nested same-quote f-strings are
            # needed (those only parse on Python 3.12+).
            message = r.get('details') or f"Current {r['actual']}, requires {r['expected']}"
            recommendations_html += f'<li><strong>{_escape_html(str(r["check"]))}</strong>: {_escape_html(str(message))}</li>'
        recommendations_html += '</ul>'

    if warn_failures:
        recommendations_html += '<h3>‚ö†Ô∏è Warnings (Should Improve)</h3><ul class="warn-list">'
        for r in warn_failures[:5]:
            message = r.get('details') or f"Current {r['actual']}, target {r['expected']}"
            recommendations_html += f'<li><strong>{_escape_html(str(r["check"]))}</strong>: {_escape_html(str(message))}</li>'
        recommendations_html += '</ul>'

    recommendations_html += '</div>'
else:
    recommendations_html = f'''<div class="success-banner">
        <h3>‚úÖ Excellent Data Quality!</h3>
        <p>All {total_checks} validation checks passed successfully. Data meets production quality standards.</p>
    </div>'''

# Full HTML report (shortened so it fits in the artifact).
# The timestamp is reused below as the report ID in the HTML footer and as the
# suffix of the saved JSON/HTML/CSV report file names.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
html_report = f"""<!DOCTYPE html>
<html>
<head>
    <title>Soda Core Quality Report - MBG Data (Adjusted)</title>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <style>
        * {{ margin: 0; padding: 0; box-sizing: border-box; }}
        body {{ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); padding: 30px; line-height: 1.6; color: #2c3e50; }}
        .container {{ max-width: 1400px; margin: 0 auto; background: white; border-radius: 20px; box-shadow: 0 20px 60px rgba(0,0,0,0.3); overflow: hidden; }}
        .header {{ background: linear-gradient(135deg, #3498db 0%, #2ecc71 100%); color: white; padding: 50px; text-align: center; }}
        .header h1 {{ font-size: 2.5em; margin-bottom: 15px; text-shadow: 2px 2px 4px rgba(0,0,0,0.2); }}
        .header .subtitle {{ font-size: 1.1em; opacity: 0.95; margin-bottom: 10px; }}
        .header .meta {{ margin-top: 15px; opacity: 0.85; font-size: 0.9em; }}
        .content {{ padding: 40px; }}
        .main-score {{ background: linear-gradient(135deg, {grade_color} 0%, {grade_color}dd 100%); color: white; padding: 50px; border-radius: 20px; text-align: center; margin: 30px 0; box-shadow: 0 10px 30px rgba(0,0,0,0.2); }}
        .main-score h2 {{ font-size: 1.5em; margin-bottom: 15px; opacity: 0.95; }}
        .main-score .score {{ font-size: 5em; font-weight: bold; margin: 20px 0; text-shadow: 3px 3px 6px rgba(0,0,0,0.3); }}
        .main-score .grade {{ font-size: 1.8em; font-weight: 600; text-transform: uppercase; letter-spacing: 2px; }}
        .score-dashboard {{ display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 20px; margin: 30px 0; }}
        .score-card {{ background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%); border-radius: 15px; padding: 25px; text-align: center; transition: transform 0.3s ease; box-shadow: 0 5px 15px rgba(0,0,0,0.1); }}
        .score-card:hover {{ transform: translateY(-5px); box-shadow: 0 10px 25px rgba(0,0,0,0.15); }}
        .score-card h3 {{ color: #7f8c8d; font-size: 0.85em; text-transform: uppercase; letter-spacing: 1px; margin-bottom: 10px; }}
        .score-card .value {{ font-size: 2.5em; font-weight: bold; margin: 15px 0; }}
        .dimension-section {{ margin: 40px 0; }}
        .dimension-section h2 {{ color: #2c3e50; font-size: 1.8em; margin-bottom: 25px; padding-bottom: 12px; border-bottom: 3px solid #3498db; }}
        .dimension-card {{ background: #f8f9fa; border-radius: 15px; padding: 25px; margin-bottom: 20px; border-left: 5px solid #3498db; transition: all 0.3s ease; }}
        .dimension-card:hover {{ box-shadow: 0 6px 20px rgba(0,0,0,0.1); transform: translateX(3px); }}
        .dimension-card.passed {{ border-left-color: #27ae60; background: linear-gradient(to right, #f0f9f4, #f8f9fa); }}
        .dimension-card.failed {{ border-left-color: #e74c3c; background: linear-gradient(to right, #fef5f5, #f8f9fa); }}
        .dimension-card.partial {{ border-left-color: #f39c12; background: linear-gradient(to right, #fffbf0, #f8f9fa); }}
        .dim-header {{ display: flex; justify-content: space-between; align-items: center; margin-bottom: 15px; }}
        .dim-header h3 {{ color: #2c3e50; font-size: 1.3em; margin: 0; }}
        .dim-score {{ font-size: 1.1em; font-weight: 600; }}
        .checks-grid {{ display: grid; gap: 10px; }}
        .check-item {{ padding: 15px; background: white; border-radius: 8px; border-left: 3px solid #e9ecef; transition: all 0.2s ease; }}
        .check-item:hover {{ box-shadow: 0 3px 10px rgba(0,0,0,0.08); }}
        .check-item.passed {{ border-left-color: #27ae60; }}
        .check-item.failed {{ border-left-color: #e74c3c; }}
        .check-header {{ display: flex; justify-content: space-between; align-items: center; margin-bottom: 8px; }}
        .check-name {{ font-weight: 600; color: #2c3e50; font-size: 0.95em; }}
        .check-badges {{ display: flex; gap: 6px; align-items: center; }}
        .badge {{ padding: 4px 10px; border-radius: 15px; font-weight: 600; font-size: 0.8em; }}
        .badge.passed {{ background: #d4edda; color: #155724; }}
        .badge.failed {{ background: #f8d7da; color: #721c24; }}
        .severity-badge {{ padding: 3px 8px; border-radius: 12px; font-size: 0.7em; font-weight: 700; text-transform: uppercase; }}
        .severity-badge.error {{ background: #ff6b6b; color: white; }}
        .severity-badge.warn {{ background: #ffd93d; color: #856404; }}
        .check-details {{ display: grid; grid-template-columns: 1fr 1fr; gap: 8px; color: #6c757d; font-size: 0.85em; margin-bottom: 6px; }}
        .detail-label {{ font-weight: 600; color: #495057; }}
        .detail-value {{ color: #6c757d; }}
        .check-info {{ margin-top: 6px; padding: 8px; background: #f8f9fa; border-radius: 5px; font-size: 0.85em; color: #6c757d; border-left: 2px solid #dee2e6; }}
        .recommendations {{ background: linear-gradient(135deg, #fff3cd 0%, #ffe5a0 100%); border-left: 5px solid #f39c12; padding: 25px; border-radius: 12px; margin: 30px 0; box-shadow: 0 4px 15px rgba(243, 156, 18, 0.2); }}
        .recommendations h3 {{ color: #856404; margin-bottom: 15px; font-size: 1.3em; }}
        .recommendations ul {{ list-style: none; padding-left: 0; }}
        .recommendations li {{ padding: 10px 0; padding-left: 25px; position: relative; color: #856404; line-height: 1.5; }}
        .critical-list li::before {{ content: 'üö®'; position: absolute; left: 0; font-size: 1.1em; }}
        .warn-list li::before {{ content: '‚ö†Ô∏è'; position: absolute; left: 0; font-size: 1.1em; }}
        .success-banner {{ background: linear-gradient(135deg, #d4edda 0%, #c3e6cb 100%); border-left: 5px solid #27ae60; padding: 25px; border-radius: 12px; margin: 30px 0; box-shadow: 0 4px 15px rgba(39, 174, 96, 0.2); }}
        .success-banner h3 {{ color: #155724; margin-bottom: 12px; font-size: 1.3em; }}
        .success-banner p {{ color: #155724; font-size: 1em; line-height: 1.5; }}
        .footer {{ background: linear-gradient(135deg, #2c3e50 0%, #34495e 100%); color: white; padding: 30px; text-align: center; margin-top: 40px; }}
        .footer p {{ margin: 8px 0; opacity: 0.9; }}
        @media (max-width: 768px) {{
            .header h1 {{ font-size: 1.8em; }}
            .main-score .score {{ font-size: 3.5em; }}
            .check-details {{ grid-template-columns: 1fr; }}
        }}
        .academic-header {{
            background: linear-gradient(135deg, #1e3c72 0%, #2a5298 100%);
            color: white;
            padding: 25px 50px;
            border-bottom: 5px solid #f39c12;
        }}
        .academic-info {{
            display: grid;
            grid-template-columns: 2fr 1fr;
            gap: 30px;
            align-items: center;
        }}
        .course-info h2 {{
            font-size: 1.4em;
            margin-bottom: 8px;
            font-weight: 700;
            text-transform: uppercase;
            letter-spacing: 1px;
        }}
        .course-info p {{
            margin: 5px 0;
            opacity: 0.95;
            font-size: 0.95em;
        }}
        .student-info {{
            background: rgba(255,255,255,0.1);
            padding: 20px;
            border-radius: 12px;
            backdrop-filter: blur(10px);
        }}
        .student-info h3 {{
            font-size: 1.1em;
            margin-bottom: 12px;
            border-bottom: 2px solid rgba(255,255,255,0.3);
            padding-bottom: 8px;
        }}
        .student-info p {{
            margin: 6px 0;
            font-size: 0.9em;
        }}
        .tools-badge {{
            display: inline-block;
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            color: white;
            padding: 8px 16px;
            border-radius: 20px;
            font-size: 0.85em;
            font-weight: 600;
            margin-top: 10px;
            box-shadow: 0 4px 12px rgba(102, 126, 234, 0.3);
        }}
        .project-description {{
            background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
            padding: 25px 50px;
            border-left: 5px solid #3498db;
        }}
        .project-description h3 {{
            color: #2c3e50;
            font-size: 1.3em;
            margin-bottom: 12px;
            display: flex;
            align-items: center;
            gap: 10px;
        }}
        .project-description p {{
            color: #34495e;
            line-height: 1.7;
            font-size: 0.95em;
        }}
        .footer-enhanced {{
            background: linear-gradient(135deg, #1e3c72 0%, #2a5298 100%);
            color: white;
            padding: 40px 50px;
            border-top: 5px solid #f39c12;
        }}
        .footer-grid {{
            display: grid;
            grid-template-columns: repeat(3, 1fr);
            gap: 30px;
            margin-bottom: 25px;
        }}
        .footer-section h4 {{
            font-size: 1.1em;
            margin-bottom: 12px;
            padding-bottom: 8px;
            border-bottom: 2px solid rgba(255,255,255,0.3);
        }}
        .footer-section p {{
            margin: 6px 0;
            opacity: 0.9;
            font-size: 0.9em;
        }}
        .footer-badge {{
            display: inline-block;
            background: rgba(255,255,255,0.2);
            padding: 5px 12px;
            border-radius: 15px;
            font-size: 0.8em;
            margin: 3px;
        }}
        .footer-bottom {{
            text-align: center;
            padding-top: 20px;
            border-top: 1px solid rgba(255,255,255,0.2);
            opacity: 0.85;
        }}
        @media (max-width: 768px) {{
            .academic-info, .footer-grid {{
                grid-template-columns: 1fr;
            }}
            .academic-header, .project-description, .footer-enhanced {{
                padding: 20px 25px;
            }}
        }}
</style>
</head>
<body>
    <div class="container">
        <!-- HEADER AKADEMIK -->
        <div class="academic-header">
            <div class="academic-info">
                <div class="course-info">
                    <h2>üìö ANALISIS BISNIS DATA PERUSAHAAN</h2>
                    <p><strong>Judul Tugas:</strong> Aplikasi 8 - Sistem Monitoring Kualitas Data Berbasis AI untuk Program Gratis</p>
                    <p><strong>Fokus:</strong> Otomatisasi deteksi anomali data distribusi makanan bergizi dengan AI untuk rekomendasi perbaikan</p>
                    <div class="tools-badge">üîß Tools: Soda Core (Observability) & DQOps (Testing Sensor-Based)</div>
                </div>
                <div class="student-info">
                    <h3>üë• Tim Mahasiswa</h3>
                    <p><strong>202022510021</strong><br>MADE MARSHALL VIRA DEVA</p>
                    <p><strong>202022420034</strong><br>IRFAN VENNY RAHMAYANTI</p>
                </div>
            </div>
        </div>


        <!-- HEADER REPORT (yang sudah ada) -->
        <div class="header">
            <h1>üìä Laporan Kualitas Data Soda Core</h1>
            <div class="subtitle">Data MBG - Threshold yang Disesuaikan</div>
            <div class="meta">
                Dibuat: {datetime.now().strftime('%d %B %Y, %H:%M:%S')} |
                Dataset: {len(df):,} baris √ó {len(df.columns)} kolom
            </div>
        </div>

        <div class="content">
            <div class="main-score">
                <h2>Overall Data Quality Score</h2>
                <div class="score">{quality_score:.1f}/100</div>
                <div class="grade">{quality_grade}</div>
            </div>

            <div class="score-dashboard">
                <div class="score-card">
                    <h3>Total Checks</h3>
                    <div class="value" style="color: #3498db;">{total_checks}</div>
                </div>
                <div class="score-card">
                    <h3>Passed</h3>
                    <div class="value" style="color: #27ae60;">‚úÖ {passed_checks}</div>
                </div>
                <div class="score-card">
                    <h3>Failed</h3>
                    <div class="value" style="color: #e74c3c;">‚ùå {failed_checks}</div>
                </div>
                <div class="score-card">
                    <h3>Success Rate</h3>
                    <div class="value" style="color: #9b59b6;">{passed_checks/total_checks*100:.1f}%</div>
                </div>
            </div>

            {recommendations_html}

            <div class="dimension-section">
                <h2>üìã Detailed Results by Dimension</h2>
                {dimension_cards_html}
            </div>
        </div>

        <!-- FOOTER ENHANCED (ganti footer lama) -->
        <div class="footer-enhanced">
            <div class="footer-grid">
                <div class="footer-section">
                    <h4>üìö Informasi Akademik</h4>
                    <p><strong>Mata Kuliah:</strong><br>Analisis Bisnis Data Perusahaan</p>
                    <p><strong>Aplikasi:</strong> #8 - Monitoring Kualitas Data AI</p>
                    <p><strong>Metode:</strong> Process Improvement</p>
                </div>

                <div class="footer-section">
                    <h4>üë• Tim Pengembang</h4>
                    <p><strong>202022510021</strong><br>Made Marshall Vira Deva</p>
                    <p><strong>202022420034</strong><br>Irfan Venny Rahmayanti</p>
                </div>

                <div class="footer-section">
                    <h4>üîß Framework & Tools</h4>
                    <div class="footer-badge">Soda Core</div>
                    <div class="footer-badge">DQOps</div>
                    <div class="footer-badge">Python</div>
                    <div class="footer-badge">Pandas</div>
                    <div class="footer-badge">AI-Powered</div>
                </div>
            </div>

            <div class="footer-bottom">
                <p><strong>üèÜ Framework Pengujian Sensor DQOps</strong></p>
                <p>Didukung oleh Validasi Berbasis Sensor Mendalam dengan Threshold yang Disesuaikan</p>
                <p>ID Laporan: {timestamp} | 6 Sensor √ó {total_checks} Pengujian</p>
                <p>¬© 2024 Proyek Kualitas Data MBG | Tugas Analisis Bisnis Data Perusahaan</p>
            </div>
        </div>
    </div>
</body>
</html>"""

# ============================================
# 3.5 SAVE REPORTS
# ============================================
print("\n" + "="*80)
print("üíæ SAVING REPORTS")
print("="*80)

# All three report formats are written into the same folder.
reports_dir = f'{PROJECT_PATH}/reports'
os.makedirs(reports_dir, exist_ok=True)

# 1. Save JSON Report: metadata, adjusted thresholds, summary, per-dimension
#    scores and the raw check results.
report = dict(
    framework='Soda Core (Adjusted)',
    approach='Observability & Monitoring',
    adjustments=dict(
        content_min='100 chars (from 800)',
        title_min='10 chars (from 40)',
        word_min='50 words (from 100)',
        word_avg='250 words (from 300)',
        categories='Added MBG categories',
    ),
    scan_info=dict(
        timestamp=datetime.now().isoformat(),
        dataset_rows=len(df),
        dataset_columns=len(df.columns),
    ),
    summary=dict(
        total_checks=total_checks,
        passed=passed_checks,
        failed=failed_checks,
        quality_score=quality_score,
        quality_grade=quality_grade,
    ),
    dimension_scores={
        name: {
            'passed': tally['passed'],
            'total': tally['total'],
            'score': tally['passed']/tally['total']*100,
        }
        for name, tally in dimensions.items()
    },
    detailed_results=validation_results,
)

json_path = f'{reports_dir}/quality_report_adjusted_{timestamp}.json'
with open(json_path, 'w', encoding='utf-8') as f:
    # ensure_ascii=False keeps non-ASCII text readable in the file
    json.dump(report, f, indent=2, ensure_ascii=False)
print(f"\n‚úÖ JSON Report: {json_path}")

# 2. Save HTML Report
html_path = f'{reports_dir}/soda_core_report_adjusted_{timestamp}.html'
with open(html_path, 'w', encoding='utf-8') as f:
    f.write(html_report)
print(f"‚úÖ HTML Report: {html_path}")

# 3. Save CSV Results: one row per check, written as UTF-8 with a BOM ('utf-8-sig')
csv_path = f'{reports_dir}/soda_core_results_adjusted_{timestamp}.csv'
results_df = pd.DataFrame(validation_results)
results_df.to_csv(csv_path, index=False, encoding='utf-8-sig')
print(f"‚úÖ CSV Results: {csv_path}")

# ============================================
# COMPLETION
# ============================================
print("\n" + "="*80)
print("‚úÖ SODA CORE VALIDATION COMPLETE (ADJUSTED)!")
print("="*80)

# Percentages fall back to 0 when no checks were recorded, so the closing
# summary cannot raise ZeroDivisionError.
final_passed_pct = (passed_checks / total_checks * 100) if total_checks > 0 else 0
final_failed_pct = (failed_checks / total_checks * 100) if total_checks > 0 else 0

print("\nüìä Final Results:")
print(f"   ‚Ä¢ Quality Score: {quality_score:.1f}/100")
print(f"   ‚Ä¢ Grade: {quality_grade}")
print(f"   ‚Ä¢ Passed: {passed_checks}/{total_checks} ({final_passed_pct:.1f}%)")
print(f"   ‚Ä¢ Failed: {failed_checks}/{total_checks} ({final_failed_pct:.1f}%)")

# Show at most the first three failed checks, in the order they were recorded.
if failed_checks > 0:
    print("\n‚ö†Ô∏è  Top Issues:")
    top_failures = [r for r in validation_results if r['status'] == 'FAILED'][:3]
    for r in top_failures:
        print(f"   ‚Ä¢ {r['dimension']}: {r['check']}")

print(f"\nüìÅ Reports saved to: {reports_dir}/")
print("\nüéØ Adjustments made:")
print("   ‚Ä¢ Content min: 100 chars (was 800)")
print("   ‚Ä¢ Title min: 10 chars (was 40)")
print("   ‚Ä¢ Word min: 50 words (was 100)")
print("   ‚Ä¢ Word avg: 250 words (was 300)")
print("   ‚Ä¢ Categories: Added News, Rejabar, Ekonomi Syariah, Islam Digest, Visual")
print("   ‚Ä¢ Title-Content correlation: >0.0 (was >0.2)")
print("="*80)

🔬 STEP 3 PART 1: SODA CORE VALIDATION - SETUP

📊 Loading data with metrics...
--------------------------------------------------------------------------------
✅ Data loaded: 244 rows × 18 columns

📊 Quick Quality Overview:
   • Long content (≥2000 chars): 136 (55.7%)
   • Quality titles (≥50 chars): 222 (91.0%)
   • Complete records: 244 (100.0%)

🔍 RUNNING COMPREHENSIVE VALIDATION (ADJUSTED THRESHOLDS)

1️⃣  COMPLETENESS VALIDATION
--------------------------------------------------------------------------------
   ✅ Dataset not empty: 244 rows
   ✅ Title: 100.00%
   ✅ Content: 100.00%
   ✅ URL: 100.00%
   ✅ Date: 100.00%
   ✅ Category: 100.00%
   ✅ Complete records: 100.00%

✅ Part 1 complete! Run Part 2 next...

🔍 PART 2: VALIDITY, ACCURACY & UNIQUENESS (ADJUSTED)

2️⃣  VALIDITY VALIDATION
--------------------------------------------------------------------------------
   ✅ URL format: 100.00%
   ✅ HTTPS: 100.00%
   ✅ Valid co

In [None]:
"""
================================================================================
STEP 4: DQOPS-STYLE SENSOR TESTING (PART 1 - SETUP & BASE)
================================================================================
Purpose: Deep sensor-based validation with ADJUSTED realistic thresholds
Expected: Validation with appropriate quality standards
Generates both JSON and comprehensive HTML report
================================================================================
"""

import pandas as pd
import numpy as np
import json
import os
from datetime import datetime
from typing import Dict, Any, List, Tuple

print("="*80)
print("üî¨ STEP 4 PART 1: DQOPS SENSOR TESTING - SETUP")
print("="*80)

# ============================================
# HELPER: JSON Serialization Fix
# ============================================
def convert_to_native(obj):
    """Recursively convert NumPy/pandas values into JSON-friendly Python objects.

    Conversions performed:
      * dict   -> dict with converted values (keys are left untouched)
      * list   -> list with converted items
      * tuple  -> tuple with converted items
      * np.ndarray / pd.Series / pd.Index -> list with converted items
      * np.bool_    -> bool
      * np.integer  -> int
      * np.floating -> float, or None when the value is NaN
      * any other scalar missing value (None, float('nan'), pd.NA, pd.NaT) -> None

    Anything else is returned unchanged.

    Args:
        obj: Arbitrary (possibly nested) value.

    Returns:
        The converted value, built only from native Python containers and
        scalars wherever a conversion rule above applies.
    """
    if isinstance(obj, dict):
        return {k: convert_to_native(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [convert_to_native(item) for item in obj]
    if isinstance(obj, tuple):
        return tuple(convert_to_native(item) for item in obj)
    if isinstance(obj, (np.ndarray, pd.Series, pd.Index)):
        # tolist() already yields Python scalars; recurse so that NaN entries
        # are normalised to None just like top-level scalars.
        return convert_to_native(obj.tolist())
    if isinstance(obj, np.bool_):
        # np.bool_ is not a subclass of bool and json cannot serialise it.
        return bool(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        # Treat NumPy NaN the same way as a Python float NaN: emit None so the
        # resulting JSON stays valid.
        return None if np.isnan(obj) else float(obj)
    # pd.isna() returns an array for array-likes, which cannot be used in a
    # boolean context, so only consult it for true scalars.
    if pd.api.types.is_scalar(obj) and pd.isna(obj):
        return None
    return obj

# ============================================
# ENHANCED SENSOR BASE CLASS
# ============================================
class DataQualitySensor:
    """Shared foundation for all sensors.

    A sensor carries descriptive metadata (name, description, category) and a
    running list of check outcomes in ``self.results``. Subclasses provide
    ``collect`` (raw metrics from a DataFrame) and ``evaluate`` (turn those
    metrics into recorded checks).
    """

    def __init__(self, name: str, description: str, category: str):
        self.name, self.description, self.category = name, description, category
        # One dict per recorded check, in the order the checks were added.
        self.results = []

    def add_check(self, rule_name: str, passed: bool, actual: Any,
                  expected: Any, severity: str = "error", details: str = ""):
        """Record the outcome of one rule and hand ``passed`` back to the caller.

        ``actual`` and ``expected`` are stored as their string form.
        """
        outcome = 'PASSED' if passed else 'FAILED'
        entry = dict(
            rule=rule_name,
            status=outcome,
            actual=str(actual),
            expected=str(expected),
            severity=severity,
            details=details,
        )
        self.results.append(entry)
        return passed

    def collect(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Gather raw metrics from ``df``; must be overridden."""
        raise NotImplementedError

    def evaluate(self, metrics: Dict[str, Any]) -> Dict[str, Any]:
        """Turn collected metrics into recorded checks; must be overridden."""
        raise NotImplementedError

    def get_summary(self) -> Dict[str, int]:
        """Aggregate the recorded checks into counts and a pass percentage."""
        check_count = len(self.results)
        passed = sum(entry['status'] == 'PASSED' for entry in self.results)
        failures = [entry for entry in self.results if entry['status'] == 'FAILED']
        critical = sum(entry['severity'] == 'critical' for entry in failures)

        if self.results:
            rate = passed / check_count * 100
        else:
            rate = 0

        return {
            'total': check_count,
            'passed': passed,
            'failed': len(failures),
            'critical_failures': critical,
            'success_rate': rate
        }

# ============================================
# LOAD DATA
# ============================================
# Load the metrics-enriched dataset into the module-level `df` used by the
# sensor run further down in this cell.
print("\nüìä Loading data with metrics...")
print("-" * 80)

# Same location as LOCAL_PROJECT_PATH in the setup cell, which symlinks it to
# the Google Drive project folder.
PROJECT_PATH = '/content/soda_project'
data_path = f'{PROJECT_PATH}/data/mbg_data_with_metrics.csv'

# Fail fast with a readable hint instead of letting read_csv raise.
if not os.path.exists(data_path):
    print("‚ùå Data file not found! Please run Steps 1-3 first.")
    raise FileNotFoundError(f"{data_path} not found")

# 'utf-8-sig' decodes UTF-8 and drops a leading byte-order mark if present.
df = pd.read_csv(data_path, encoding='utf-8-sig')
print(f"‚úÖ Data loaded: {len(df):,} rows √ó {len(df.columns)} columns")

# Derive the length/word-count columns only when the CSV does not already
# carry them; missing text is treated as an empty string (length 0).
if 'content_length' not in df.columns:
    df['content_length'] = df['content'].fillna('').str.len()
if 'title_length' not in df.columns:
    df['title_length'] = df['title'].fillna('').str.len()
if 'word_count' not in df.columns:
    # Words are whitespace-separated tokens.
    df['word_count'] = df['content'].fillna('').str.split().str.len()

# Min / max / mean of each derived column as a quick sanity check.
print(f"\nüìä Quick Overview:")
print(f"   ‚Ä¢ Content length: min={df['content_length'].min()}, max={df['content_length'].max():,}, avg={df['content_length'].mean():.0f}")
print(f"   ‚Ä¢ Title length: min={df['title_length'].min()}, max={df['title_length'].max()}, avg={df['title_length'].mean():.0f}")
print(f"   ‚Ä¢ Word count: min={df['word_count'].min()}, max={df['word_count'].max():,}, avg={df['word_count'].mean():.0f}")

print("\n‚úÖ PART 1 COMPLETE - Base classes and data loaded!")
print("="*80)
print("‚ñ∂Ô∏è  Run Part 2 next to define sensors!")

"""
================================================================================
STEP 4 PART 2: DEFINING SENSORS 1-3 (ADJUSTED THRESHOLDS)
================================================================================
Run this after Part 1!
Thresholds adjusted based on actual data characteristics
================================================================================
"""

# Banner marking the start of Step 4, Part 2 (sensors 1-3) in the cell output.
print("="*80)
print("üî¨ STEP 4 PART 2: DEFINING SENSORS WITH ADJUSTED THRESHOLDS")
print("="*80)

# ============================================
# SENSOR 1: COMPLETENESS (STRICT - 100% for critical)
# ============================================
class CompletenessSensor(DataQualitySensor):
    """Sensor measuring how fully each column is populated.

    ``collect`` counts null / non-null values per column; ``evaluate`` applies
    a 100% rule to title/content/url, a 98% rule to date/category and a 95%
    rule to the average across all columns.
    """

    def __init__(self):
        super().__init__(
            "Completeness Sensor",
            "Monitors data completeness with zero-tolerance for critical fields",
            "Data Completeness"
        )

    def collect(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Return per-column totals, null counts and completeness percentage."""
        row_count = int(len(df))
        report = {}
        for column in df.columns:
            present = int(df[column].notna().sum())
            if row_count > 0:
                share = float(present / row_count * 100)
            else:
                share = float(0)
            report[column] = {
                'total': row_count,
                'non_null': present,
                'null': int(row_count - present),
                'completeness_pct': share
            }
        return report

    def evaluate(self, metrics: Dict[str, Any]) -> Dict[str, Any]:
        """Record the completeness checks and return them with the summary."""
        # Tier 1: fields that must not contain a single missing value.
        for field in ('title', 'content', 'url'):
            if field not in metrics:
                continue
            stats = metrics[field]
            pct, null_count = stats['completeness_pct'], stats['null']
            self.add_check(
                f'{field}_100_complete',
                pct == 100,
                f"{pct:.2f}% ({null_count} missing)",
                "100% (0 missing)",
                severity='critical',
                details=f"CRITICAL: {field} must be 100% complete"
            )

        # Tier 2: fields where up to 2% missing values are tolerated.
        for field in ('date', 'category'):
            if field not in metrics:
                continue
            stats = metrics[field]
            pct, null_count = stats['completeness_pct'], stats['null']
            self.add_check(
                f'{field}_98_complete',
                pct >= 98,
                f"{pct:.2f}% ({null_count} missing)",
                "‚â•98%",
                severity='error',
                details=f"{field} should be 98%+ complete"
            )

        # Dataset-wide figure: unweighted mean over every column's percentage.
        column_pcts = [entry['completeness_pct'] for entry in metrics.values()]
        avg_completeness = np.mean(column_pcts)
        self.add_check(
            'overall_completeness_95',
            avg_completeness >= 95,
            f"{avg_completeness:.2f}%",
            "‚â•95%",
            severity='error',
            details="Overall dataset completeness"
        )

        outcome = {'sensor': self.name, 'checks': self.results}
        outcome.update(self.get_summary())
        return outcome

# ============================================
# SENSOR 2: VALIDITY (STRICT FORMAT CHECKS)
# ============================================
class ValiditySensor(DataQualitySensor):
    """Sensor checking the format of URLs and the textual validity of
    titles and content.

    Null values are dropped before any percentage is computed, so every
    percentage is relative to the non-null values of its column.
    """

    def __init__(self):
        super().__init__(
            "Validity Sensor",
            "Validates data formats and types with strict rules",
            "Data Validity"
        )

    @staticmethod
    def _text_profile(column: pd.Series) -> Dict[str, Any]:
        """Profile a text column: non-blank count and count with a Latin letter."""
        values = column.dropna()
        count = int(len(values))

        non_blank = values.str.strip().str.len() > 0
        alphabetic = values.str.contains(r'[a-zA-Z]', regex=True, na=False)

        return {
            'total': count,
            'not_empty': int(non_blank.sum()),
            'has_text': int(alphabetic.sum()),
            'valid_pct': float(alphabetic.sum() / count * 100) if count > 0 else 0
        }

    def collect(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Collect URL format statistics and text profiles for content/title."""
        metrics = {}

        if 'url' in df.columns:
            links = df['url'].dropna()
            count = int(len(links))

            # Three independent tests: overall shape, https scheme, and the
            # presence of a dot followed by at least two lowercase letters.
            well_formed = links.str.match(r'^https?://[^\s]+$', na=False)
            secure = links.str.startswith('https://', na=False)
            with_domain = links.str.contains(r'\.[a-z]{2,}', regex=True, na=False)

            metrics['url'] = {
                'total': count,
                'valid_format': int(well_formed.sum()),
                'https_count': int(secure.sum()),
                'has_domain': int(with_domain.sum()),
                'format_pct': float(well_formed.sum() / count * 100) if count > 0 else 0,
                'https_pct': float(secure.sum() / count * 100) if count > 0 else 0,
                'domain_pct': float(with_domain.sum() / count * 100) if count > 0 else 0
            }

        # Content and title share the same profiling logic.
        if 'content' in df.columns:
            metrics['content'] = self._text_profile(df['content'])

        if 'title' in df.columns:
            metrics['title'] = self._text_profile(df['title'])

        return metrics

    def evaluate(self, metrics: Dict[str, Any]) -> Dict[str, Any]:
        """Record the validity checks and return them with the summary."""
        if 'url' in metrics:
            url = metrics['url']
            malformed = url['total'] - url['valid_format']

            # At least 98% of URLs must match the expected shape.
            self.add_check(
                'url_format_98_valid',
                url['format_pct'] >= 98,
                f"{url['format_pct']:.2f}%",
                "‚â•98%",
                severity='critical',
                details=f"{malformed} URLs have invalid format"
            )

            # At least 80% of URLs should use the https scheme.
            self.add_check(
                'url_https_80',
                url['https_pct'] >= 80,
                f"{url['https_pct']:.2f}%",
                "‚â•80%",
                severity='error',
                details="Security: Most URLs should use HTTPS"
            )

            # At least 99% of URLs must contain something domain-like.
            self.add_check(
                'url_has_domain',
                url['domain_pct'] >= 99,
                f"{url['domain_pct']:.2f}%",
                "‚â•99%",
                severity='error',
                details="URLs must contain valid domain"
            )

        # Every content value must contain at least one Latin letter.
        if 'content' in metrics:
            content_pct = metrics['content']['valid_pct']
            self.add_check(
                'content_valid_text',
                content_pct == 100,
                f"{content_pct:.2f}%",
                "100%",
                severity='critical',
                details="All content must contain readable text"
            )

        # Same requirement for titles.
        if 'title' in metrics:
            title_pct = metrics['title']['valid_pct']
            self.add_check(
                'title_valid_text',
                title_pct == 100,
                f"{title_pct:.2f}%",
                "100%",
                severity='critical',
                details="All titles must contain readable text"
            )

        outcome = {'sensor': self.name, 'checks': self.results}
        outcome.update(self.get_summary())
        return outcome

# ============================================
# SENSOR 3: ACCURACY (ADJUSTED THRESHOLDS) ‚ú® UPDATED
# ============================================
class AccuracySensor(DataQualitySensor):
    """Sensor validating length and word-count ranges.

    ``collect`` profiles the ``content_length``, ``title_length`` and
    ``word_count`` columns (whichever are present). ``evaluate`` compares the
    profiles against minimum, maximum, average and "quality share" thresholds.

    Each profile records the number of rows it was computed from under the
    ``'total'`` key, so ``evaluate`` depends only on the ``metrics`` argument
    and not on a module-level DataFrame.
    """

    def __init__(self):
        super().__init__(
            "Accuracy Sensor",
            "Validates data ranges with adjusted realistic standards",
            "Data Accuracy"
        )

    @staticmethod
    def _length_profile(series: pd.Series, low: int, high: int,
                        with_quartiles: bool = False) -> Dict[str, Any]:
        """Summarise a numeric column.

        Args:
            series: Column to profile.
            low: Values strictly below this are counted under ``below_<low>``.
            high: Values at or above this are counted under ``above_<high>``
                and form the "quality" share.
            with_quartiles: Also report the 25th and 75th percentiles.

        Returns:
            Dict with min/max/mean/median, optional p25/p75, the two counts,
            ``pct_quality`` (share of rows at or above ``high``) and ``total``
            (number of rows in ``series``).
        """
        n_rows = int(len(series))
        n_quality = int((series >= high).sum())

        profile = {
            'min': int(series.min()),
            'max': int(series.max()),
            'mean': float(series.mean()),
            'median': float(series.median()),
        }
        if with_quartiles:
            profile['p25'] = float(series.quantile(0.25))
            profile['p75'] = float(series.quantile(0.75))
        profile[f'below_{low}'] = int((series < low).sum())
        profile[f'above_{high}'] = n_quality
        profile['pct_quality'] = float(n_quality / n_rows * 100) if n_rows else 0.0
        profile['total'] = n_rows
        return profile

    @staticmethod
    def _total_label(metric: Dict[str, Any]) -> str:
        """Format the recorded row count, tolerating metrics without 'total'."""
        total = metric.get('total')
        return f"{total:,}" if total is not None else "unknown"

    def collect(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Profile whichever of the three length columns exist in ``df``."""
        metrics = {}

        if 'content_length' in df.columns:
            metrics['content_length'] = self._length_profile(
                df['content_length'], low=500, high=1500, with_quartiles=True
            )

        if 'title_length' in df.columns:
            metrics['title_length'] = self._length_profile(
                df['title_length'], low=20, high=40
            )

        if 'word_count' in df.columns:
            metrics['word_count'] = self._length_profile(
                df['word_count'], low=100, high=250
            )

        return metrics

    def evaluate(self, metrics: Dict[str, Any]) -> Dict[str, Any]:
        """Record the accuracy checks and return them with the summary.

        Args:
            metrics: Output of ``collect``.

        Returns:
            Dict with the sensor name, all recorded checks and summary counts.
        """
        if 'content_length' in metrics:
            c = metrics['content_length']

            # Shortest article must have at least 100 characters.
            self.add_check(
                'content_min_100',
                c['min'] >= 100,
                f"{c['min']} chars",
                "‚â•100 chars",
                severity='critical',
                details="Articles should be at least 100 characters"
            )

            # Longest article must not exceed 25,000 characters.
            self.add_check(
                'content_max_25000',
                c['max'] <= 25000,
                f"{c['max']:,} chars",
                "‚â§25,000 chars",
                severity='error',
                details="Exceptionally long content may indicate data quality issues"
            )

            # Mean article length must reach 1,200 characters.
            self.add_check(
                'content_avg_1200',
                c['mean'] >= 1200,
                f"{c['mean']:.0f} chars",
                "‚â•1,200 chars",
                severity='error',
                details="Average article length indicates overall content quality"
            )

            # At least half of the articles must have 1,500+ characters.
            self.add_check(
                'content_quality_50pct',
                c['pct_quality'] >= 50,
                f"{c['pct_quality']:.1f}%",
                "‚â•50%",
                severity='error',
                details=f"Only {c['above_1500']:,} of {self._total_label(c)} articles meet quality length standard"
            )

        if 'title_length' in metrics:
            t = metrics['title_length']

            # Shortest title must have at least 10 characters.
            self.add_check(
                'title_min_10',
                t['min'] >= 10,
                f"{t['min']} chars",
                "‚â•10 chars",
                severity='critical',
                details="Titles should be at least 10 characters"
            )

            # Longest title must not exceed 150 characters.
            self.add_check(
                'title_max_150',
                t['max'] <= 150,
                f"{t['max']} chars",
                "‚â§150 chars",
                severity='error',
                details="Titles should be concise (under 150 chars)"
            )

            # Mean title length must reach 40 characters.
            self.add_check(
                'title_avg_40',
                t['mean'] >= 40,
                f"{t['mean']:.1f} chars",
                "‚â•40 chars",
                severity='error',
                details="Average title length indicates quality headlines"
            )

            # At least half of the titles must have 40+ characters.
            self.add_check(
                'title_quality_50pct',
                t['pct_quality'] >= 50,
                f"{t['pct_quality']:.1f}%",
                "‚â•50%",
                severity='error',
                details=f"Only {t['above_40']:,} of {self._total_label(t)} titles meet quality standard"
            )

        if 'word_count' in metrics:
            w = metrics['word_count']

            # Shortest article must contain at least 50 words.
            self.add_check(
                'word_count_min_50',
                w['min'] >= 50,
                f"{w['min']} words",
                "‚â•50 words",
                severity='critical',
                details="Articles should contain at least 50 words"
            )

            # Mean word count must reach 250 words.
            self.add_check(
                'word_count_avg_250',
                w['mean'] >= 250,
                f"{w['mean']:.0f} words",
                "‚â•250 words",
                severity='error',
                details="Average word count indicates content depth"
            )

            # At least half of the articles must have 250+ words.
            self.add_check(
                'word_count_quality_50pct',
                w['pct_quality'] >= 50,
                f"{w['pct_quality']:.1f}%",
                "‚â•50%",
                severity='error',
                details=f"Only {w['above_250']:,} of {self._total_label(w)} articles have 250+ words"
            )

        return {'sensor': self.name, 'checks': self.results, **self.get_summary()}

# Closing banner for Part 2: lists the thresholds that were relaxed in the
# Accuracy sensor and points to the next part.
print("\n‚úÖ 3 Sensors defined (Completeness, Validity, Accuracy with adjusted thresholds)")
print("="*80)
print("üìä ADJUSTED THRESHOLDS:")
print("   ‚Ä¢ Content min: 100 chars (was 800)")
print("   ‚Ä¢ Title min: 10 chars (was 30)")
print("   ‚Ä¢ Word count min: 50 words (was 100)")
print("   ‚Ä¢ Word count avg: 250 words (was 300)")
print("="*80)
print("‚ñ∂Ô∏è  Run Part 3 next to define remaining sensors!")

"""
================================================================================
STEP 4 PART 3: SENSORS 4-6 (ADJUSTED THRESHOLDS)
================================================================================
Run this after Part 1 & 2!
================================================================================
"""

# Banner marking the start of Step 4, Part 3 (sensors 4-6) in the cell output.
print("="*80)
print("üî¨ STEP 4 PART 3: ADDITIONAL SENSORS WITH ADJUSTED THRESHOLDS")
print("="*80)

# ============================================
# SENSOR 4: UNIQUENESS (ZERO TOLERANCE)
# ============================================
class UniquenessSensor(DataQualitySensor):
    """Sensor counting duplicate URLs, content bodies and titles.

    Nulls are dropped before counting. A "duplicate" is every non-null value
    beyond the number of distinct values in the column.
    """

    def __init__(self):
        super().__init__(
            "Uniqueness Sensor",
            "Detects duplicates with zero tolerance for critical fields",
            "Data Uniqueness"
        )

    @staticmethod
    def _tally(column: pd.Series) -> Dict[str, Any]:
        """Count non-null, distinct and surplus (duplicate) values of a column."""
        present = column.dropna()
        count = int(len(present))
        distinct = int(present.nunique())
        return {
            'total': count,
            'unique': distinct,
            'duplicates': int(count - distinct),
            'uniqueness_pct': float(distinct / count * 100) if count > 0 else 0
        }

    def collect(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Collect uniqueness statistics for url, content and title columns."""
        metrics = {}

        if 'url' in df.columns:
            url_stats = self._tally(df['url'])
            # Keep up to five distinct URLs that occur more than once as examples.
            non_null_urls = df['url'].dropna()
            repeated = non_null_urls[non_null_urls.duplicated()]
            url_stats['dup_examples'] = repeated.unique()[:5].tolist()
            metrics['url'] = url_stats

        if 'content' in df.columns:
            metrics['content'] = self._tally(df['content'])

        if 'title' in df.columns:
            metrics['title'] = self._tally(df['title'])

        return metrics

    def evaluate(self, metrics: Dict[str, Any]) -> Dict[str, Any]:
        """Record the uniqueness checks and return them with the summary."""
        # URLs: no duplicate is tolerated.
        if 'url' in metrics:
            url = metrics['url']
            url_dups = url['duplicates']
            self.add_check(
                'url_100_unique',
                url_dups == 0,
                f"{url_dups} duplicates ({url['uniqueness_pct']:.2f}% unique)",
                "0 duplicates (100% unique)",
                severity='critical',
                details=f"CRITICAL: Found {url_dups} duplicate URLs - each article must have unique URL"
            )

        # Content bodies: no duplicate is tolerated either.
        if 'content' in metrics:
            content = metrics['content']
            content_dups = content['duplicates']
            self.add_check(
                'content_100_unique',
                content_dups == 0,
                f"{content_dups} duplicates ({content['uniqueness_pct']:.2f}% unique)",
                "0 duplicates (100% unique)",
                severity='critical',
                details=f"CRITICAL: Found {content_dups} duplicate articles - plagiarism or duplication issue"
            )

        # Titles: up to 2% repeated titles are accepted.
        if 'title' in metrics:
            title = metrics['title']
            title_dups = title['duplicates']
            self.add_check(
                'title_98_unique',
                title['uniqueness_pct'] >= 98,
                f"{title_dups} duplicates ({title['uniqueness_pct']:.2f}% unique)",
                "‚â§2% duplicates (98%+ unique)",
                severity='error',
                details=f"Found {title_dups} duplicate titles"
            )

        outcome = {'sensor': self.name, 'checks': self.results}
        outcome.update(self.get_summary())
        return outcome

# ============================================
# SENSOR 5: CONSISTENCY (ADJUSTED) ‚ú® UPDATED
# ============================================
class ConsistencySensor(DataQualitySensor):
    """Sensor checking category values against an allow-list and the
    correlation between title length and content size.

    All three checks are recorded with 'warning' severity.
    """

    def __init__(self):
        super().__init__(
            "Consistency Sensor",
            "Validates data consistency and relationships with flexible rules",
            "Data Consistency"
        )

    @staticmethod
    def _pair_correlation(df: pd.DataFrame, left: str, right: str) -> Dict[str, Any]:
        """Correlate two columns and label the magnitude of the coefficient."""
        coefficient = float(df[[left, right]].corr().iloc[0, 1])
        magnitude = abs(coefficient)
        if magnitude > 0.5:
            label = 'strong'
        elif magnitude > 0.3:
            label = 'moderate'
        else:
            label = 'weak'
        return {'correlation': coefficient, 'strength': label}

    def collect(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Collect category validity and the two length correlations."""
        metrics = {}

        if 'category' in df.columns:
            # Allow-list: generic news sections followed by the section names
            # that were added to match the scraped data.
            valid_categories = [
                'Politik', 'Ekonomi', 'Pendidikan', 'Kesehatan',
                'Sosial', 'Teknologi', 'Olahraga', 'Hiburan',
                'Internasional', 'Daerah', 'Hukum', 'Budaya',
                'News', 'Rejabar', 'Ekonomi Syariah', 'Islam Digest', 'Visual'
            ]

            labels = df['category'].dropna()
            count = int(len(labels))
            recognised = labels.isin(valid_categories)
            recognised_count = int(recognised.sum())
            unknown = labels[~recognised].unique().tolist()

            metrics['category'] = {
                'total': count,
                'valid': recognised_count,
                'invalid_count': int(count - recognised_count),
                'validity_pct': float(recognised_count / count * 100) if count > 0 else 0,
                'invalid_values': unknown[:10],
                'valid_categories': valid_categories
            }

        columns = df.columns
        if 'title_length' in columns and 'content_length' in columns:
            metrics['title_content_correlation'] = self._pair_correlation(
                df, 'title_length', 'content_length'
            )

        if 'title_length' in columns and 'word_count' in columns:
            metrics['title_wordcount_correlation'] = self._pair_correlation(
                df, 'title_length', 'word_count'
            )

        return metrics

    def evaluate(self, metrics: Dict[str, Any]) -> Dict[str, Any]:
        """Record the consistency checks and return them with the summary."""
        # At least 95% of the category values must be on the allow-list.
        if 'category' in metrics:
            cat = metrics['category']
            if cat['invalid_values']:
                note = f"Invalid categories found: {', '.join(map(str, cat['invalid_values'][:5]))}"
            else:
                note = "All categories valid"
            self.add_check(
                'category_95_valid',
                cat['validity_pct'] >= 95,
                f"{cat['validity_pct']:.2f}% valid ({cat['invalid_count']} invalid)",
                "‚â•95% valid",
                severity='warning',
                details=note
            )

        # Any positive title/content length correlation passes.
        if 'title_content_correlation' in metrics:
            pair = metrics['title_content_correlation']
            self.add_check(
                'title_content_correlation',
                pair['correlation'] > 0,
                f"{pair['correlation']:.3f} ({pair['strength']})",
                ">0 (positive)",
                severity='warning',
                details="Title and content lengths should show positive correlation"
            )

        # Any positive title length / word count correlation passes.
        if 'title_wordcount_correlation' in metrics:
            pair = metrics['title_wordcount_correlation']
            self.add_check(
                'title_wordcount_correlation',
                pair['correlation'] > 0,
                f"{pair['correlation']:.3f} ({pair['strength']})",
                ">0 (positive)",
                severity='warning',
                details="Title length should show positive correlation with word count"
            )

        outcome = {'sensor': self.name, 'checks': self.results}
        outcome.update(self.get_summary())
        return outcome

# ============================================
# SENSOR 6: STATISTICAL (DISTRIBUTION ANALYSIS)
# ============================================
class StatisticalSensor(DataQualitySensor):
    """Sensor describing the distribution of ``content_length``.

    Reports IQR-based outliers, skewness, kurtosis and the coefficient of
    variation. All checks are recorded with 'warning' severity.
    """

    def __init__(self):
        super().__init__(
            "Statistical Sensor",
            "Analyzes statistical distributions and outliers",
            "Statistical Quality"
        )

    def collect(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Collect distribution and consistency metrics for content_length."""
        metrics = {}
        if 'content_length' not in df.columns:
            return metrics

        lengths = df['content_length']

        # Outlier fences: 1.5 * IQR below Q1 and above Q3.
        q1 = float(lengths.quantile(0.25))
        q3 = float(lengths.quantile(0.75))
        iqr = q3 - q1
        low_fence = q1 - (1.5 * iqr)
        high_fence = q3 + (1.5 * iqr)

        too_short = int((lengths < low_fence).sum())
        too_long = int((lengths > high_fence).sum())
        outlier_total = too_short + too_long

        metrics['content_length_distribution'] = {
            'q1': q1,
            'q3': q3,
            'iqr': iqr,
            'outliers_low': too_short,
            'outliers_high': too_long,
            'outliers_total': outlier_total,
            'outliers_pct': float(outlier_total / len(df) * 100),
            'skewness': float(lengths.skew()),
            'kurtosis': float(lengths.kurtosis())
        }

        # Coefficient of variation: standard deviation as a percentage of the mean.
        spread = float(lengths.std())
        average = float(lengths.mean())
        cv = float((spread / average) * 100) if average > 0 else 0

        if cv < 30:
            level = 'high'
        elif cv < 50:
            level = 'moderate'
        else:
            level = 'low'

        metrics['content_length_consistency'] = {
            'std': spread,
            'mean': average,
            'cv': cv,
            'consistency': level
        }

        return metrics

    def evaluate(self, metrics: Dict[str, Any]) -> Dict[str, Any]:
        """Record the statistical checks and return them with the summary."""
        if 'content_length_distribution' in metrics:
            dist = metrics['content_length_distribution']

            # Fewer than 15% of the articles may fall outside the IQR fences.
            self.add_check(
                'outliers_under_15pct',
                dist['outliers_pct'] < 15,
                f"{dist['outliers_pct']:.2f}% ({dist['outliers_total']} articles)",
                "<15%",
                severity='warning',
                details=f"Outliers: {dist['outliers_low']} too short, {dist['outliers_high']} too long"
            )

            # Skewness must lie strictly between -2 and 2.
            skew = dist['skewness']
            self.add_check(
                'skewness_reasonable',
                -2 < skew < 2,
                f"{skew:.3f}",
                "-2 to 2",
                severity='warning',
                details="High skewness indicates unbalanced distribution"
            )

        # Coefficient of variation must stay below 70%.
        if 'content_length_consistency' in metrics:
            cons = metrics['content_length_consistency']
            self.add_check(
                'content_consistency',
                cons['cv'] < 70,
                f"{cons['cv']:.1f}% ({cons['consistency']})",
                "<70%",
                severity='warning',
                details="High variation indicates inconsistent article lengths"
            )

        outcome = {'sensor': self.name, 'checks': self.results}
        outcome.update(self.get_summary())
        return outcome

# Closing banner for Part 3: recaps the relaxed rules of the Consistency and
# Statistical sensors and points to the next part.
print("\n‚úÖ 3 More sensors defined (Uniqueness, Consistency, Statistical)")
print("   Total: 6 comprehensive sensors with adjusted thresholds")
print("="*80)
print("üìä KEY ADJUSTMENTS:")
print("   ‚Ä¢ Category validation: now includes News, Rejabar, etc.")
print("   ‚Ä¢ Correlations: relaxed to >0 (flexible)")
print("   ‚Ä¢ Outliers: relaxed to <15%")
print("   ‚Ä¢ All warnings changed to 'warning' severity")
print("="*80)
print("‚ñ∂Ô∏è  Run Part 4 next to execute all sensors!")

"""
================================================================================
STEP 4 PART 4: EXECUTE SENSORS & GENERATE REPORTS
================================================================================
Run this after Parts 1, 2, and 3!
================================================================================
"""

print("=" * 80, "üöÄ STEP 4 PART 4: EXECUTING ALL 6 SENSORS", "=" * 80, sep="\n")

# One instance of each sensor class, in reporting order.
sensors = [
    sensor_cls()
    for sensor_cls in (
        CompletenessSensor,
        ValiditySensor,
        AccuracySensor,
        UniquenessSensor,
        ConsistencySensor,
        StatisticalSensor,
    )
]

# Per-sensor records plus running totals across every sensor.
all_sensor_results = []
total_checks = total_passed = total_failed = 0
critical_failures = warnings_count = 0

for i, sensor in enumerate(sensors, 1):
    print(f"\n{i}. Running {sensor.name}...")
    print("-" * 80)

    # Two-phase run: gather metrics from the dataframe, then judge them.
    metrics = sensor.collect(df)
    result = sensor.evaluate(metrics)

    # Keep a converted copy of both the raw metrics and the verdicts.
    all_sensor_results.append(dict(
        sensor_name=sensor.name,
        sensor_description=sensor.description,
        category=sensor.category,
        metrics=convert_to_native(metrics),
        evaluation=convert_to_native(result),
    ))

    # Fold this sensor's summary into the overall counters.
    summary = sensor.get_summary()
    total_checks += summary['total']
    total_passed += summary['passed']
    total_failed += summary['failed']
    critical_failures += summary['critical_failures']

    # A "warning" is a failed check whose severity is 'warning'.
    warnings_in_sensor = len([
        r for r in sensor.results
        if r['status'] == 'FAILED' and r.get('severity') == 'warning'
    ])
    warnings_count += warnings_in_sensor

    # Per-sensor console report.
    print(f"   ‚úì Tests: {summary['total']}")
    print(f"   ‚úì Passed: {summary['passed']} ({summary['success_rate']:.1f}%)")
    print(f"   ‚úì Failed: {summary['failed']}")
    if summary['critical_failures'] > 0:
        print(f"   üö® CRITICAL FAILURES: {summary['critical_failures']}")
    if warnings_in_sensor > 0:
        print(f"   ‚ö†Ô∏è  WARNINGS: {warnings_in_sensor}")
# Overall score: percentage of passed checks (0 when no check was run).
overall_quality_score = 0 if total_checks <= 0 else total_passed / total_checks * 100

# Grade bands as (minimum score, label, colour), ordered from best to worst;
# the first band whose minimum is reached wins, otherwise the grade is POOR.
_GRADE_BANDS = (
    (95, "EXCELLENT üåü", "#27ae60"),
    (85, "GOOD ‚úÖ", "#2ecc71"),
    (75, "ACCEPTABLE ‚ö†Ô∏è", "#f39c12"),
    (60, "NEEDS IMPROVEMENT ‚ö†Ô∏è", "#e67e22"),
)
grade, grade_color = next(
    (
        (band_label, band_color)
        for band_floor, band_label, band_color in _GRADE_BANDS
        if overall_quality_score >= band_floor
    ),
    ("POOR ‚ùå", "#e74c3c"),
)

# ============================================
# SUMMARY
# ============================================
print("\n" + "="*80)
print("üìä SENSOR TESTING SUMMARY")
print("="*80)

# Guard the failure percentage against an empty run: with zero executed
# checks the plain division would raise ZeroDivisionError and abort the
# summary before anything useful is printed.
total_failed_pct = (total_failed / total_checks * 100) if total_checks > 0 else 0

print("\nüìä Overall Results:")
print(f"   ‚Ä¢ Total Sensors: {len(sensors)}")
print(f"   ‚Ä¢ Total Tests: {total_checks}")
print(f"   ‚Ä¢ ‚úÖ PASSED: {total_passed} ({overall_quality_score:.1f}%)")
print(f"   ‚Ä¢ ‚ùå FAILED: {total_failed} ({total_failed_pct:.1f}%)")
if critical_failures > 0:
    print(f"   ‚Ä¢ üö® CRITICAL: {critical_failures}")
if warnings_count > 0:
    print(f"   ‚Ä¢ ‚ö†Ô∏è  WARNINGS: {warnings_count}")
print(f"   ‚Ä¢ Quality Score: {overall_quality_score:.1f}/100")
print(f"   ‚Ä¢ Grade: {grade}")

# Failed checks by sensor: one line per sensor that has at least one failure,
# annotated with how many of those failures are critical / warnings.
if total_failed > 0:
    print("\n‚ö†Ô∏è  Failed Tests by Sensor:")
    for result in all_sensor_results:
        failed = result['evaluation']['failed']
        if failed > 0:
            critical_in_sensor = sum(1 for r in result['evaluation']['checks'] if r['status'] == 'FAILED' and r.get('severity') == 'critical')
            warnings_in_sensor = sum(1 for r in result['evaluation']['checks'] if r['status'] == 'FAILED' and r.get('severity') == 'warning')

            status_str = f"{result['sensor_name']}: {failed} failed"
            if critical_in_sensor > 0:
                status_str += f" (üö® {critical_in_sensor} critical)"
            if warnings_in_sensor > 0:
                status_str += f" (‚ö†Ô∏è  {warnings_in_sensor} warnings)"
            print(f"   ‚Ä¢ {status_str}")

print("\n‚úÖ Sensor execution complete!")
print("="*80)
print("‚ñ∂Ô∏è  Run Part 5 next to generate comprehensive HTML report!")

"""
================================================================================
STEP 4 PART 5: GENERATE COMPREHENSIVE HTML REPORT & SAVE RESULTS
================================================================================
Run this after Parts 1-4!
================================================================================
"""

print("=" * 80, "üìÑ STEP 4 PART 5: GENERATING COMPREHENSIVE HTML REPORT", "=" * 80, sep="\n")

# Compact timestamp of this run in the form YYYYMMDD_HHMMSS.
timestamp = f"{datetime.now():%Y%m%d_%H%M%S}"

# Helper function to generate sensor cards
def generate_sensor_cards(sensor_results):
    """Render one HTML "sensor card" per sensor result.

    Args:
        sensor_results: Iterable of dicts with the keys ``sensor_name``,
            ``sensor_description``, ``category`` and ``evaluation``. The
            evaluation dict must provide ``passed``, ``failed``, ``total``,
            ``success_rate`` and ``checks`` (``critical_failures`` is
            optional). Every check needs ``rule``, ``status``, ``actual`` and
            ``expected`` and may carry ``severity`` and ``details``.

    Returns:
        The concatenated HTML fragment; an empty string when there are no
        results.

    Raises:
        KeyError: If a required key is missing from a result or a check.
    """
    # Function-scope import so the helper does not rely on a file-level import.
    from html import escape

    def esc(value):
        """Return *value* as HTML-safe text (non-string values are accepted)."""
        return escape(str(value))

    fragments = []

    for result in sensor_results:
        evaluation = result['evaluation']
        checks = evaluation['checks']

        # Everything that originates from data is escaped before it is
        # embedded: thresholds such as "<70%" would otherwise be raw markup.
        sensor_name = esc(result['sensor_name'])
        sensor_desc = esc(result['sensor_description'])
        category = esc(result['category'])

        passed = evaluation['passed']
        failed = evaluation['failed']
        total = evaluation['total']
        success_rate = evaluation['success_rate']
        critical = evaluation.get('critical_failures', 0)

        # Count warnings
        warnings_in_sensor = sum(
            1 for c in checks
            if c['status'] == 'FAILED' and c.get('severity') == 'warning'
        )

        # Determine card style. Any remaining failure (e.g. the default
        # 'error' severity) gets the amber card; previously such a sensor fell
        # through to the green "success" style despite its failed checks.
        if success_rate == 100:
            card_class, status_color, status_icon = 'success', '#27ae60', '‚úÖ'
        elif critical > 0:
            card_class, status_color, status_icon = 'danger', '#e74c3c', 'üö®'
        elif warnings_in_sensor > 0 or failed > 0:
            card_class, status_color, status_icon = 'warning', '#f39c12', '‚ö†Ô∏è'
        else:
            card_class, status_color, status_icon = 'success', '#27ae60', '‚úÖ'

        fragments.append(f'''
        <div class="sensor-card {card_class}">
            <div class="sensor-header">
                <div class="sensor-title">
                    <h3>{status_icon} {sensor_name}</h3>
                    <p class="sensor-desc">{sensor_desc}</p>
                    <span class="sensor-category">{category}</span>
                </div>
                <div class="sensor-score" style="color: {status_color};">
                    <div class="score-big">{success_rate:.1f}%</div>
                    <div class="score-detail">{passed}/{total} passed</div>
                </div>
            </div>

            <div class="checks-container">
        ''')

        # Add each check
        for check in checks:
            check_status = check['status']
            is_passed = check_status == 'PASSED'
            check_class = 'passed' if is_passed else 'failed'
            check_icon = '‚úÖ' if is_passed else '‚ùå'
            severity = check.get('severity', 'error')

            # Only failed checks carry a severity badge.
            severity_badge = ''
            if check_status == 'FAILED':
                if severity == 'critical':
                    severity_badge = '<span class="severity critical">üö® CRITICAL</span>'
                elif severity == 'warning':
                    severity_badge = '<span class="severity warning">‚ö†Ô∏è WARNING</span>'
                else:
                    severity_badge = '<span class="severity error">‚ö†Ô∏è ERROR</span>'

            rule_title = esc(check['rule'].replace('_', ' ').title())
            actual = esc(check['actual'])
            expected = esc(check['expected'])
            details = check.get('details')
            details_html = (
                f'<div class="check-details">{esc(details)}</div>' if details else ''
            )

            fragments.append(f'''
                <div class="check-row {check_class}">
                    <div class="check-info">
                        <div class="check-title">
                            {check_icon} <strong>{rule_title}</strong>
                            {severity_badge}
                        </div>
                        <div class="check-metrics">
                            <span class="metric-item"><strong>Actual:</strong> {actual}</span>
                            <span class="metric-item"><strong>Expected:</strong> {expected}</span>
                        </div>
                        {details_html}
                    </div>
                </div>
            ''')

        fragments.append('''
            </div>
        </div>
        ''')

    return ''.join(fragments)

# Render the card markup for every sensor result gathered in Part 4.
sensor_cards_html = generate_sensor_cards(all_sensor_results)

# Generate recommendations based on failures
def generate_recommendations(sensor_results):
    """Render the "recommendations" HTML section from the failed checks.

    Failed checks are grouped by severity: ``critical``, ``warning`` and
    everything else (treated as high priority). Each group is rendered as its
    own list, capped at ten entries. When nothing failed, a success message
    with the number of executed tests is rendered instead.

    Args:
        sensor_results: Iterable of dicts with ``sensor_name`` and an
            ``evaluation`` dict containing ``checks`` (and normally
            ``total``). Every check needs ``rule``, ``status``, ``actual``
            and ``expected`` and may carry ``severity`` and ``details``.

    Returns:
        The HTML fragment wrapped in ``<div class="recommendations">``.

    Raises:
        KeyError: If a required key is missing from a result or a check.
    """
    # Function-scope import so the helper does not rely on a file-level import.
    from html import escape

    max_items_per_section = 10

    def esc(value):
        """Return *value* as HTML-safe text (non-string values are accepted)."""
        return escape(str(value))

    def render_section(css_class, heading, issues, expected_label):
        """Return the markup of one severity section ('' when it has no issues)."""
        if not issues:
            return ''
        pieces = [f'<div class="rec-section {css_class}">', f'<h3>{heading}</h3>', '<ul>']
        for issue in issues[:max_items_per_section]:
            rule_title = esc(issue['rule'].replace('_', ' ').title())
            pieces.append(f'''<li>
                <strong>{esc(issue['sensor'])} - {rule_title}</strong><br>
                <span class="issue-detail">Current: {esc(issue['actual'])} | {expected_label}: {esc(issue['expected'])}</span><br>
                <span class="issue-desc">{esc(issue['details'])}</span>
            </li>''')
        pieces.append('</ul></div>')
        return ''.join(pieces)

    critical_issues = []
    high_priority = []
    warnings_list = []
    # Counted from the argument itself rather than read from a module-level
    # total, so the function also works when called on its own.
    executed_checks = 0

    for result in sensor_results:
        sensor_name = result['sensor_name']
        evaluation = result['evaluation']
        checks = evaluation['checks']
        executed_checks += evaluation.get('total', len(checks))

        for check in checks:
            if check['status'] != 'FAILED':
                continue
            issue = {
                'sensor': sensor_name,
                'rule': check['rule'],
                'actual': check['actual'],
                'expected': check['expected'],
                'details': check.get('details', '')
            }
            severity = check.get('severity', 'error')
            if severity == 'critical':
                critical_issues.append(issue)
            elif severity == 'warning':
                warnings_list.append(issue)
            else:
                high_priority.append(issue)

    parts = ['<div class="recommendations">']
    parts.append(render_section(
        'critical-section', 'üö® Critical Issues (Must Fix Immediately)',
        critical_issues, 'Required'))
    parts.append(render_section(
        'error-section', '‚ö†Ô∏è High Priority Issues (Should Fix Soon)',
        high_priority, 'Target'))
    parts.append(render_section(
        'warning-section', 'üí° Warnings (Nice to Improve)',
        warnings_list, 'Target'))

    if not critical_issues and not high_priority and not warnings_list:
        parts.append('<div class="success-message">')
        parts.append('<h3>‚úÖ Excellent Data Quality!</h3>')
        parts.append(f'<p>All {executed_checks} sensor tests passed successfully. Your data meets production quality standards.</p>')
        parts.append('</div>')

    parts.append('</div>')
    return ''.join(parts)

# Build the recommendation markup from the same sensor results.
recommendations_html = generate_recommendations(all_sensor_results)

print(
    "‚úÖ HTML components generated!",
    "‚ñ∂Ô∏è  Compiling full HTML report...",
    "\n" + "=" * 80,
    sep="\n",
)


"""
================================================================================
STEP 4 PART 6: FINAL HTML TEMPLATE & SAVE ALL RESULTS
================================================================================
Run this after Parts 1-5!
Final step to generate HTML and save all reports
================================================================================
"""

# Part 6 banner: rule, title, rule.
print("=" * 80, "üìÑ STEP 4 PART 6: GENERATING FINAL HTML & SAVING RESULTS", "=" * 80, sep="\n")

# Full HTML Report with complete CSS
html_report = f"""<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>DQOps Sensor Testing Report - MBG Data (Adjusted Thresholds)</title>
    <style>
        * {{ margin: 0; padding: 0; box-sizing: border-box; }}

        body {{
            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            padding: 30px;
            line-height: 1.6;
            color: #2c3e50;
        }}

        .container {{
            max-width: 1600px;
            margin: 0 auto;
            background: white;
            border-radius: 25px;
            box-shadow: 0 25px 70px rgba(0,0,0,0.3);
            overflow: hidden;
        }}

        .header {{
            background: linear-gradient(135deg, #9b59b6 0%, #e74c3c 100%);
            color: white;
            padding: 60px;
            text-align: center;
            position: relative;
            overflow: hidden;
        }}

        .header::before {{
            content: '';
            position: absolute;
            top: -50%;
            left: -50%;
            width: 200%;
            height: 200%;
            background: repeating-linear-gradient(
                45deg,
                transparent,
                transparent 20px,
                rgba(255,255,255,0.05) 20px,
                rgba(255,255,255,0.05) 40px
            );
            animation: movePattern 30s linear infinite;
        }}

        @keyframes movePattern {{
            0% {{ transform: translate(0, 0); }}
            100% {{ transform: translate(70px, 70px); }}
        }}

        .header h1 {{
            font-size: 3.5em;
            margin-bottom: 15px;
            text-shadow: 3px 3px 6px rgba(0,0,0,0.3);
            position: relative;
            z-index: 1;
        }}

        .header .subtitle {{
            font-size: 1.4em;
            opacity: 0.95;
            position: relative;
            z-index: 1;
            margin-bottom: 10px;
        }}

        .header .meta {{
            margin-top: 20px;
            opacity: 0.85;
            font-size: 1em;
            position: relative;
            z-index: 1;
        }}

        .content {{
            padding: 60px;
        }}

        .hero-score {{
            background: linear-gradient(135deg, {grade_color} 0%, {grade_color}dd 100%);
            color: white;
            padding: 70px;
            border-radius: 25px;
            text-align: center;
            margin: 50px 0;
            box-shadow: 0 15px 40px rgba(0,0,0,0.3);
            position: relative;
            overflow: hidden;
        }}

        .hero-score::before {{
            content: '';
            position: absolute;
            top: 0;
            left: 0;
            right: 0;
            bottom: 0;
            background: radial-gradient(circle at 30% 50%, rgba(255,255,255,0.15) 0%, transparent 50%);
        }}

        .hero-score h2 {{
            font-size: 1.8em;
            margin-bottom: 30px;
            opacity: 0.95;
            position: relative;
            z-index: 1;
        }}

        .hero-score .score {{
            font-size: 8em;
            font-weight: bold;
            margin: 40px 0;
            text-shadow: 4px 4px 8px rgba(0,0,0,0.3);
            position: relative;
            z-index: 1;
            letter-spacing: -5px;
        }}

        .hero-score .grade {{
            font-size: 2.5em;
            font-weight: 600;
            position: relative;
            z-index: 1;
            text-transform: uppercase;
            letter-spacing: 3px;
        }}

        .stats-grid {{
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
            gap: 25px;
            margin: 50px 0;
        }}

        .stat-card {{
            background: linear-gradient(135deg, #f8f9fa 0%, #e9ecef 100%);
            padding: 35px;
            border-radius: 15px;
            text-align: center;
            box-shadow: 0 5px 20px rgba(0,0,0,0.1);
            transition: transform 0.3s ease;
        }}

        .stat-card:hover {{
            transform: translateY(-8px);
            box-shadow: 0 15px 35px rgba(0,0,0,0.15);
        }}

        .stat-card .icon {{
            font-size: 3em;
            margin-bottom: 15px;
        }}

        .stat-card .label {{
            color: #7f8c8d;
            font-size: 0.9em;
            text-transform: uppercase;
            letter-spacing: 1.5px;
            margin-bottom: 12px;
        }}

        .stat-card .value {{
            font-size: 3em;
            font-weight: bold;
            color: #2c3e50;
            margin: 15px 0;
        }}

        .stat-card .subvalue {{
            color: #95a5a6;
            font-size: 0.9em;
        }}

        .sensors-section {{
            margin: 60px 0;
        }}

        .sensors-section h2 {{
            color: #2c3e50;
            font-size: 2.5em;
            margin-bottom: 40px;
            text-align: center;
            padding-bottom: 20px;
            border-bottom: 3px solid #9b59b6;
        }}

        .sensor-card {{
            background: white;
            border-radius: 20px;
            padding: 40px;
            margin-bottom: 35px;
            box-shadow: 0 8px 25px rgba(0,0,0,0.1);
            border-left: 6px solid #3498db;
            transition: all 0.3s ease;
        }}

        .sensor-card:hover {{
            transform: translateX(8px);
            box-shadow: 0 12px 35px rgba(0,0,0,0.15);
        }}

        .sensor-card.success {{
            border-left-color: #27ae60;
            background: linear-gradient(to right, #f0f9f4, white);
        }}

        .sensor-card.warning {{
            border-left-color: #f39c12;
            background: linear-gradient(to right, #fffbf0, white);
        }}

        .sensor-card.danger {{
            border-left-color: #e74c3c;
            background: linear-gradient(to right, #fef5f5, white);
        }}

        .sensor-header {{
            display: flex;
            justify-content: space-between;
            align-items: center;
            margin-bottom: 30px;
            padding-bottom: 20px;
            border-bottom: 2px solid #ecf0f1;
        }}

        .sensor-title h3 {{
            color: #2c3e50;
            font-size: 1.8em;
            margin-bottom: 10px;
        }}

        .sensor-desc {{
            color: #7f8c8d;
            font-size: 1.05em;
            margin-bottom: 10px;
        }}

        .sensor-category {{
            display: inline-block;
            background: #3498db;
            color: white;
            padding: 5px 15px;
            border-radius: 20px;
            font-size: 0.85em;
            font-weight: 600;
        }}

        .sensor-score {{
            text-align: right;
        }}

        .score-big {{
            font-size: 4em;
            font-weight: bold;
            line-height: 1;
        }}

        .score-detail {{
            font-size: 1.1em;
            color: #7f8c8d;
            margin-top: 5px;
        }}

        .checks-container {{
            display: grid;
            gap: 15px;
        }}

        .check-row {{
            background: #f8f9fa;
            padding: 20px;
            border-radius: 12px;
            border-left: 4px solid #e9ecef;
            transition: all 0.2s ease;
        }}

        .check-row:hover {{
            background: #f1f3f5;
            box-shadow: 0 4px 12px rgba(0,0,0,0.08);
        }}

        .check-row.passed {{
            border-left-color: #27ae60;
            background: #f0f9f4;
        }}

        .check-row.failed {{
            border-left-color: #e74c3c;
            background: #fef5f5;
        }}

        .check-title {{
            font-size: 1.1em;
            color: #2c3e50;
            margin-bottom: 12px;
            display: flex;
            align-items: center;
            gap: 10px;
            flex-wrap: wrap;
        }}

        .severity {{
            padding: 4px 12px;
            border-radius: 15px;
            font-size: 0.75em;
            font-weight: 700;
            text-transform: uppercase;
            margin-left: 10px;
        }}

        .severity.critical {{
            background: #e74c3c;
            color: white;
            animation: pulse 2s infinite;
        }}

        .severity.error {{
            background: #f39c12;
            color: white;
        }}

        .severity.warning {{
            background: #f39c12;
            color: white;
            opacity: 0.8;
        }}

        @keyframes pulse {{
            0%, 100% {{ opacity: 1; }}
            50% {{ opacity: 0.7; }}
        }}

        .check-metrics {{
            display: flex;
            gap: 30px;
            margin-bottom: 10px;
            color: #6c757d;
            font-size: 0.95em;
            flex-wrap: wrap;
        }}

        .metric-item {{
            display: flex;
            gap: 8px;
        }}

        .check-details {{
            margin-top: 12px;
            padding: 12px;
            background: white;
            border-radius: 8px;
            color: #6c757d;
            font-size: 0.9em;
            border-left: 3px solid #dee2e6;
        }}

        .recommendations {{
            margin: 60px 0;
        }}

        .rec-section {{
            padding: 35px;
            border-radius: 15px;
            margin-bottom: 30px;
            box-shadow: 0 5px 20px rgba(0,0,0,0.1);
        }}

        .critical-section {{
            background: linear-gradient(135deg, #f8d7da 0%, #f5c6cb 100%);
            border-left: 6px solid #e74c3c;
        }}

        .critical-section h3 {{
            color: #721c24;
            font-size: 1.8em;
            margin-bottom: 20px;
        }}

        .critical-section ul {{
            list-style: none;
            padding: 0;
        }}

        .critical-section li {{
            padding: 15px;
            margin-bottom: 15px;
            background: white;
            border-radius: 10px;
            border-left: 4px solid #e74c3c;
        }}

        .error-section {{
            background: linear-gradient(135deg, #fff3cd 0%, #ffe5a0 100%);
            border-left: 6px solid #f39c12;
        }}

        .error-section h3 {{
            color: #856404;
            font-size: 1.8em;
            margin-bottom: 20px;
        }}

        .error-section ul {{
            list-style: none;
            padding: 0;
        }}

        .error-section li {{
            padding: 15px;
            margin-bottom: 15px;
            background: white;
            border-radius: 10px;
            border-left: 4px solid #f39c12;
        }}

        .warning-section {{
            background: linear-gradient(135deg, #e8f4fd 0%, #d1e7fd 100%);
            border-left: 6px solid #3498db;
        }}

        .warning-section h3 {{
            color: #004085;
            font-size: 1.8em;
            margin-bottom: 20px;
        }}

        .warning-section ul {{
            list-style: none;
            padding: 0;
        }}

        .warning-section li {{
            padding: 15px;
            margin-bottom: 15px;
            background: white;
            border-radius: 10px;
            border-left: 4px solid #3498db;
        }}

        .issue-detail {{
            color: #6c757d;
            font-size: 0.95em;
            display: block;
            margin: 5px 0;
        }}

        .issue-desc {{
            color: #495057;
            font-style: italic;
            display: block;
            margin-top: 5px;
        }}

        .success-message {{
            background: linear-gradient(135deg, #d4edda 0%, #c3e6cb 100%);
            border-left: 6px solid #27ae60;
            padding: 35px;
            border-radius: 15px;
            box-shadow: 0 5px 20px rgba(39, 174, 96, 0.2);
        }}

        .success-message h3 {{
            color: #155724;
            font-size: 1.8em;
            margin-bottom: 15px;
        }}

        .success-message p {{
            color: #155724;
            font-size: 1.1em;
            line-height: 1.6;
        }}

        .methodology {{
            background: #f8f9fa;
            padding: 40px;
            border-radius: 15px;
            margin: 60px 0;
        }}

        .methodology h2 {{
            color: #2c3e50;
            font-size: 2em;
            margin-bottom: 25px;
        }}

        .methodology p {{
            color: #495057;
            font-size: 1.05em;
            line-height: 1.8;
            margin: 15px 0;
        }}

        .adjustments-info {{
            background: linear-gradient(135deg, #e3f2fd 0%, #bbdefb 100%);
            border-left: 6px solid #2196f3;
            padding: 30px;
            border-radius: 15px;
            margin: 40px 0;
        }}

        .adjustments-info h3 {{
            color: #0d47a1;
            font-size: 1.6em;
            margin-bottom: 20px;
        }}

        .adjustments-info ul {{
            color: #1565c0;
            font-size: 1.05em;
            line-height: 1.8;
            padding-left: 25px;
        }}

        .adjustments-info li {{
            margin: 10px 0;
        }}

        .footer {{
            background: linear-gradient(135deg, #2c3e50 0%, #34495e 100%);
            color: white;
            padding: 50px;
            text-align: center;
            margin-top: 60px;
        }}

        .footer h3 {{
            font-size: 1.8em;
            margin-bottom: 20px;
        }}

        .footer p {{
            margin: 12px 0;
            opacity: 0.9;
            font-size: 1.05em;
        }}

        @media (max-width: 768px) {{
            .header h1 {{ font-size: 2.2em; }}
            .hero-score .score {{ font-size: 5em; }}
            .sensor-header {{ flex-direction: column; text-align: center; }}
            .check-metrics {{ flex-direction: column; gap: 10px; }}
            .stats-grid {{ grid-template-columns: 1fr; }}
        }}
        .academic-header {{
            background: linear-gradient(135deg, #1e3c72 0%, #2a5298 100%);
            color: white;
            padding: 25px 50px;
            border-bottom: 5px solid #f39c12;
        }}
        .academic-info {{
            display: grid;
            grid-template-columns: 2fr 1fr;
            gap: 30px;
            align-items: center;
        }}
        .course-info h2 {{
            font-size: 1.4em;
            margin-bottom: 8px;
            font-weight: 700;
            text-transform: uppercase;
            letter-spacing: 1px;
        }}
        .course-info p {{
            margin: 5px 0;
            opacity: 0.95;
            font-size: 0.95em;
        }}
        .student-info {{
            background: rgba(255,255,255,0.1);
            padding: 20px;
            border-radius: 12px;
            backdrop-filter: blur(10px);
        }}
        .student-info h3 {{
            font-size: 1.1em;
            margin-bottom: 12px;
            border-bottom: 2px solid rgba(255,255,255,0.3);
            padding-bottom: 8px;
        }}
        .student-info p {{
            margin: 6px 0;
            font-size: 0.9em;
        }}
        .tools-badge {{
            display: inline-block;
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            color: white;
            padding: 8px 16px;
            border-radius: 20px;
            font-size: 0.85em;
            font-weight: 600;
            margin-top: 10px;
            box-shadow: 0 4px 12px rgba(102, 126, 234, 0.3);
        }}
        .project-description {{
            background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
            padding: 25px 50px;
            border-left: 5px solid #3498db;
        }}
        .project-description h3 {{
            color: #2c3e50;
            font-size: 1.3em;
            margin-bottom: 12px;
            display: flex;
            align-items: center;
            gap: 10px;
        }}
        .project-description p {{
            color: #34495e;
            line-height: 1.7;
            font-size: 0.95em;
        }}
        .footer-enhanced {{
            background: linear-gradient(135deg, #1e3c72 0%, #2a5298 100%);
            color: white;
            padding: 40px 50px;
            border-top: 5px solid #f39c12;
        }}
        .footer-grid {{
            display: grid;
            grid-template-columns: repeat(3, 1fr);
            gap: 30px;
            margin-bottom: 25px;
        }}
        .footer-section h4 {{
            font-size: 1.1em;
            margin-bottom: 12px;
            padding-bottom: 8px;
            border-bottom: 2px solid rgba(255,255,255,0.3);
        }}
        .footer-section p {{
            margin: 6px 0;
            opacity: 0.9;
            font-size: 0.9em;
        }}
        .footer-badge {{
            display: inline-block;
            background: rgba(255,255,255,0.2);
            padding: 5px 12px;
            border-radius: 15px;
            font-size: 0.8em;
            margin: 3px;
        }}
        .footer-bottom {{
            text-align: center;
            padding-top: 20px;
            border-top: 1px solid rgba(255,255,255,0.2);
            opacity: 0.85;
        }}
        @media (max-width: 768px) {{
            .academic-info, .footer-grid {{
                grid-template-columns: 1fr;
            }}
            .academic-header, .project-description, .footer-enhanced {{
                padding: 20px 25px;
            }}
        }}
    </style>
</head>
<body>
    <div class="container">
         <!-- HEADER AKADEMIK -->
        <div class="academic-header">
            <div class="academic-info">
                <div class="course-info">
                    <h2>üìö ANALISIS BISNIS DATA PERUSAHAAN</h2>
                    <p><strong>Judul Tugas:</strong> Aplikasi 8 - Sistem Monitoring Kualitas Data Berbasis AI untuk Program Gratis</p>
                    <p><strong>Fokus:</strong> Otomatisasi deteksi anomali data distribusi makanan bergizi dengan AI untuk rekomendasi perbaikan</p>
                    <div class="tools-badge">üîß Tools: Soda Core (Observability) & DQOps (Testing Sensor-Based)</div>
                </div>
                <div class="student-info">
                    <h3>üë• Tim Mahasiswa</h3>
                    <p><strong>202022510021</strong><br>MADE MARSHALL VIRA DEVA</p>
                    <p><strong>202022420034</strong><br>IRFAN VENNY RAHMAYANTI</p>
                </div>
            </div>
        </div>

        <!-- HEADER REPORT (yang sudah ada) -->
        <div class="header">
            <h1>üî¨ Laporan Pengujian Sensor DQOps</h1>
            <div class="subtitle">Validasi Kualitas Data Berbasis Sensor Mendalam </div>
            <div class="meta">
                Dataset: Data MBG ({len(df):,} baris √ó {len(df.columns)} kolom)<br>
                Dibuat: {datetime.now().strftime('%A, %d %B %Y pukul %H:%M:%S')}
            </div>
        </div>

        <div class="content">
            <!-- Hero Score -->
            <div class="hero-score">
                <h2>üéØ Overall Data Quality Score</h2>
                <div class="score">{overall_quality_score:.1f}</div>
                <div class="grade">{grade}</div>
            </div>

            <!-- Statistics Dashboard -->
            <div class="stats-grid">
                <div class="stat-card">
                    <div class="icon">üî¨</div>
                    <div class="label">Total Sensors</div>
                    <div class="value">{len(sensors)}</div>
                    <div class="subvalue">6 categories</div>
                </div>

                <div class="stat-card">
                    <div class="icon">üéØ</div>
                    <div class="label">Total Tests</div>
                    <div class="value">{total_checks}</div>
                    <div class="subvalue">Comprehensive</div>
                </div>

                <div class="stat-card">
                    <div class="icon">‚úÖ</div>
                    <div class="label">Passed</div>
                    <div class="value" style="color: #27ae60;">{total_passed}</div>
                    <div class="subvalue">{overall_quality_score:.1f}% success</div>
                </div>

                <div class="stat-card">
                    <div class="icon">‚ùå</div>
                    <div class="label">Failed</div>
                    <div class="value" style="color: #e74c3c;">{total_failed}</div>
                    <div class="subvalue">{total_failed/total_checks*100:.1f}% failure</div>
                </div>

                <div class="stat-card">
                    <div class="icon">üö®</div>
                    <div class="label">Critical</div>
                    <div class="value" style="color: #c0392b;">{critical_failures}</div>
                    <div class="subvalue">Must fix</div>
                </div>

                <div class="stat-card">
                    <div class="icon">‚ö†Ô∏è</div>
                    <div class="label">Warnings</div>
                    <div class="value" style="color: #f39c12;">{warnings_count}</div>
                    <div class="subvalue">Nice to fix</div>
                </div>
            </div>

            <!-- Adjustments Info -->
            <div class="adjustments-info">
                <h3>üìä Adjusted Thresholds Applied</h3>
                <ul>
                    <li><strong>Content Length:</strong> Minimum 100 chars (was 800 chars) - more realistic for short articles</li>
                    <li><strong>Title Length:</strong> Minimum 10 chars (was 30 chars) - allows for concise headlines</li>
                    <li><strong>Word Count:</strong> Minimum 50 words (was 100 words) - realistic baseline</li>
                    <li><strong>Average Word Count:</strong> 250 words target (was 300 words) - adjusted to actual data</li>
                    <li><strong>Categories:</strong> Extended list including News, Rejabar, Ekonomi Syariah, Islam Digest, Visual</li>
                    <li><strong>Correlations:</strong> Relaxed to >0 (was >0.2) - allows for weaker relationships</li>
                    <li><strong>Severity Levels:</strong> Many checks downgraded to "warning" instead of "error"</li>
                </ul>
            </div>

            <!-- Recommendations -->
            {recommendations_html}

            <!-- Sensor Details -->
            <div class="sensors-section">
                <h2>üî¨ Detailed Sensor Results</h2>
                {sensor_cards_html}
            </div>

            <!-- Methodology -->
            <div class="methodology">
                <h2>üìö DQOps Sensor Methodology</h2>
                <p><strong>Approach:</strong> DQOps (Data Quality Operations) uses sensor-based testing to continuously monitor and validate data quality across multiple dimensions.</p>

                <p><strong>6 Sensor Categories:</strong></p>
                <p>1. <strong>Completeness Sensor:</strong> Monitors data completeness with zero-tolerance for critical fields (title, content, URL must be 100% complete)</p>
                <p>2. <strong>Validity Sensor:</strong> Validates data formats, URL structures, HTTPS usage, and content integrity</p>
                <p>3. <strong>Accuracy Sensor:</strong> Enforces realistic quality standards with adjusted thresholds for content length, title length, and word count</p>
                <p>4. <strong>Uniqueness Sensor:</strong> Zero-tolerance duplicate detection for URLs and content (100% uniqueness required)</p>
                <p>5. <strong>Consistency Sensor:</strong> Validates category values (expanded list), cross-field correlations, and data relationships</p>
                <p>6. <strong>Statistical Sensor:</strong> Analyzes distributions, detects outliers (15% threshold), and measures data consistency</p>

                <p><strong>Severity Levels:</strong></p>
                <p>‚Ä¢ <strong>CRITICAL:</strong> Must be fixed immediately (blocking issues)</p>
                <p>‚Ä¢ <strong>ERROR:</strong> Should be fixed soon (high priority)</p>
                <p>‚Ä¢ <strong>WARNING:</strong> Nice to improve (low priority, informational)</p>
            </div>
        </div>

        <div class="footer-enhanced">
            <div class="footer-grid">
                <div class="footer-section">
                    <h4>üìö Informasi Akademik</h4>
                    <p><strong>Mata Kuliah:</strong><br>Analisis Bisnis Data Perusahaan</p>
                    <p><strong>Aplikasi:</strong> #8 - Monitoring Kualitas Data AI</p>
                    <p><strong>Metode:</strong> Process Improvement</p>
                </div>

                <div class="footer-section">
                    <h4>üë• Tim Pengembang</h4>
                    <p><strong>202022510021</strong><br>Made Marshall Vira Deva</p>
                    <p><strong>202022420034</strong><br>Irfan Venny Rahmayanti</p>
                </div>

                <div class="footer-section">
                    <h4>üîß Framework & Tools</h4>
                    <div class="footer-badge">Soda Core</div>
                    <div class="footer-badge">DQOps</div>
                    <div class="footer-badge">Python</div>
                    <div class="footer-badge">Pandas</div>
                    <div class="footer-badge">AI-Powered</div>
                </div>
            </div>

            <div class="footer-bottom">
                <p><strong>üèÜ Framework Pengujian Sensor DQOps</strong></p>
                <p>Didukung oleh Validasi Berbasis Sensor Mendalam dengan Threshold yang Disesuaikan</p>
                <p>ID Laporan: {timestamp} | 6 Sensor √ó {total_checks} Pengujian</p>
                <p>¬© 2024 Proyek Kualitas Data MBG | Tugas Analisis Bisnis Data Perusahaan</p>
            </div>
        </div>
    </div>
</body>
</html>"""

print("‚úÖ Full HTML report compiled!")
# English -> Indonesian phrases for the sensor report, listed in the order of
# the report sections they belong to.
# NOTE(review): nothing in the visible part of this cell applies this mapping
# to html_report before it is written to disk - confirm where it is consumed.
translations = dict((
    # Hero score section
    ("Overall Data Quality Score", "Skor Kualitas Data Keseluruhan"),

    # Statistics dashboard labels
    ("Total Sensors", "Total Sensor"),
    ("Total Tests", "Total Pengujian"),
    ("Passed", "Lulus"),
    ("Failed", "Gagal"),
    ("Critical", "Kritis"),
    ("Warnings", "Peringatan"),
    ("Comprehensive", "Komprehensif"),
    ("success", "sukses"),
    ("failure", "gagal"),
    ("Must fix", "Harus diperbaiki"),
    ("Nice to fix", "Sebaiknya diperbaiki"),

    # Threshold adjustments section
    ("Adjusted Thresholds Applied", "Threshold yang Disesuaikan Diterapkan"),
    ("more realistic for short articles", "lebih realistis untuk artikel pendek"),
    ("allows for concise headlines", "memungkinkan judul yang ringkas"),
    ("realistic baseline", "baseline realistis"),
    ("adjusted to actual data", "disesuaikan dengan data aktual"),
    ("allows for weaker relationships", "memungkinkan hubungan yang lebih lemah"),

    # Recommendations section
    ("Critical Issues (Must Fix Immediately)", "Masalah Kritis (Harus Diperbaiki Segera)"),
    ("High Priority Issues (Should Fix Soon)", "Masalah Prioritas Tinggi (Sebaiknya Diperbaiki Segera)"),
    ("Warnings (Nice to Improve)", "Peringatan (Sebaiknya Ditingkatkan)"),
    ("Excellent Data Quality!", "Kualitas Data Sangat Baik!"),
    ("All sensor tests passed successfully", "Semua pengujian sensor berhasil dilakukan"),
    ("Your data meets production quality standards", "Data Anda memenuhi standar kualitas produksi"),

    # Sensor details section
    ("Detailed Sensor Results", "Hasil Sensor Detail"),

    # Methodology section
    ("DQOps Sensor Methodology", "Metodologi Sensor DQOps"),
    ("Approach", "Pendekatan"),
    ("Sensor Categories", "Kategori Sensor"),
    ("Severity Levels", "Tingkat Keparahan"),
    ("Must be fixed immediately (blocking issues)", "Harus diperbaiki segera (masalah bloking)"),
    ("Should be fixed soon (high priority)", "Sebaiknya diperbaiki segera (prioritas tinggi)"),
    ("Nice to improve (low priority, informational)", "Sebaiknya ditingkatkan (prioritas rendah, informasional)"),
))
# Save HTML Report
# Writes the compiled html_report string into the project's reports folder,
# creating the folder first if needed.  reports_dir and html_path are reused
# further down when the other report files are written and listed.
# PROJECT_PATH and timestamp are not defined in this part of the file;
# presumably they are set by an earlier part of the notebook.
reports_dir = f'{PROJECT_PATH}/reports'
os.makedirs(reports_dir, exist_ok=True)

html_path = f'{reports_dir}/dqops_sensor_report_adjusted_{timestamp}.html'
with open(html_path, 'w', encoding='utf-8') as f:
    f.write(html_report)

# Confirm the location and the on-disk size (in KB) of the written file.
print(f"\n‚úÖ HTML Report saved:")
print(f"   üìÅ {html_path}")
print(f"   üìä Size: {os.path.getsize(html_path) / 1024:.1f} KB")

print("\n‚úÖ PART 6 COMPLETE!")
print("="*80)
print("‚ñ∂Ô∏è  Run Part 7 (final) to save JSON, CSV, and summary reports!")

"""
================================================================================
STEP 4 PART 7: SAVE ALL REPORTS (FINAL)
================================================================================
Run this after Parts 1-6!
Saves JSON, CSV, metrics, and text summary
================================================================================
"""

print("="*80)
print("üíæ STEP 4 PART 7: SAVING ALL REPORTS TO GOOGLE DRIVE")
print("="*80)

# Prepare complete results package
# This dict is the full content of the JSON report: fixed labels, the
# threshold adjustments as human-readable strings, dataset dimensions, the
# aggregate counters and the per-sensor results.
# df, sensors, the total_* counters, grade and all_sensor_results are not
# defined in this part of the file; presumably earlier parts provide them.
complete_results = {
    'framework': 'DQOps Sensors (Adjusted Thresholds)',
    'approach': 'Deep Sensor-Based Testing with Realistic Standards',
    'timestamp': datetime.now().isoformat(),
    'adjustments': {
        'content_min': '100 chars (was 800)',
        'title_min': '10 chars (was 30)',
        'word_count_min': '50 words (was 100)',
        'word_count_avg': '250 words (was 300)',
        'categories': 'Extended to include News, Rejabar, etc.',
        'correlations': 'Relaxed to >0 (was >0.2)',
        'severity': 'Many checks downgraded to warning'
    },
    'dataset_info': {
        'rows': len(df),
        'columns': len(df.columns),
        # Deep memory usage summed over all columns, converted from bytes to MiB.
        'memory_mb': float(df.memory_usage(deep=True).sum() / 1024**2)
    },
    'summary': {
        'total_sensors': len(sensors),
        'total_tests': total_checks,
        'passed': total_passed,
        'failed': total_failed,
        'critical_failures': critical_failures,
        'warnings': warnings_count,
        'quality_score': float(overall_quality_score),
        'grade': grade,
        # Guarded: a run without any checks reports 0 instead of dividing by zero.
        'success_rate': float(total_passed / total_checks * 100) if total_checks > 0 else 0
    },
    'sensor_results': all_sensor_results
}

# 1. Save JSON Report (with native types)
# ensure_ascii=False keeps non-ASCII text readable in the UTF-8 output file.
# NOTE(review): json.dump raises TypeError for values that are not JSON-native
# (e.g. NumPy scalars); all_sensor_results is assumed to have been converted
# already - confirm.
json_path = f'{reports_dir}/dqops_sensor_testing_adjusted_{timestamp}.json'
with open(json_path, 'w', encoding='utf-8') as f:
    json.dump(complete_results, f, indent=2, ensure_ascii=False)
print(f"\n‚úÖ JSON Report saved:")
print(f"   üìÅ {json_path}")
print(f"   üìä Size: {os.path.getsize(json_path) / 1024:.1f} KB")

# 2. Save CSV Summary (for easy analysis)
# Flatten the nested sensor results into one row per executed check.
csv_data = []
for result in all_sensor_results:
    sensor_name = result['sensor_name']
    csv_data.extend(
        {
            'sensor': sensor_name,
            'category': result['category'],
            'test_rule': check['rule'],
            'status': check['status'],
            # A check without an explicit severity is exported as 'error'.
            'severity': check.get('severity', 'error'),
            'actual': check['actual'],
            'expected': check['expected'],
            'details': check.get('details', ''),
        }
        for check in result['evaluation']['checks']
    )

# 'utf-8-sig' writes a BOM at the start of the file.
csv_path = f'{reports_dir}/dqops_sensor_results_adjusted_{timestamp}.csv'
csv_df = pd.DataFrame(csv_data)
csv_df.to_csv(csv_path, index=False, encoding='utf-8-sig')
print(f"\n‚úÖ CSV Results saved:")
print(f"   üìÅ {csv_path}")
print(f"   üìä {len(csv_data)} test results")

# 3. Save detailed metrics (for deep analysis)
# Each sensor's 'metrics' maps a category name to a dict of named values;
# every named value becomes one CSV row.
metrics_data = []
for result in all_sensor_results:
    sensor_name = result['sensor_name']
    metrics = result['metrics']

    for metric_name, metric_values in metrics.items():
        if not isinstance(metric_values, dict):
            # Only dict-valued categories are exported; anything else is skipped.
            continue
        metrics_data.extend(
            {
                'sensor': sensor_name,
                'metric_category': metric_name,
                'metric_name': key,
                'value': value,
            }
            for key, value in metric_values.items()
        )

# The metrics file is only written when at least one metric was collected.
if metrics_data:
    metrics_path = f'{reports_dir}/dqops_metrics_adjusted_{timestamp}.csv'
    metrics_df = pd.DataFrame(metrics_data)
    metrics_df.to_csv(metrics_path, index=False, encoding='utf-8-sig')
    print(f"\n‚úÖ Metrics Data saved:")
    print(f"   üìÅ {metrics_path}")
    print(f"   üìä {len(metrics_data)} metrics collected")

# 4. GENERATE SUMMARY REPORT (TEXT)
# Opening part of the plain-text summary: generation time, dataset size, the
# list of threshold adjustments and the overall pass/fail figures.
# The failed percentage falls back to 0 when no tests ran, so an empty run no
# longer raises ZeroDivisionError here (the 'success_rate' entry of
# complete_results is guarded the same way).
summary_text = f"""
================================================================================
DQOPS SENSOR TESTING SUMMARY REPORT (ADJUSTED THRESHOLDS)
================================================================================
Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
Dataset: MBG Data ({len(df):,} rows √ó {len(df.columns)} columns)

================================================================================
THRESHOLD ADJUSTMENTS APPLIED
================================================================================
‚ú® Content Min: 100 chars (was 800 chars)
‚ú® Title Min: 10 chars (was 30 chars)
‚ú® Word Count Min: 50 words (was 100 words)
‚ú® Word Count Avg: 250 words (was 300 words)
‚ú® Categories: Extended list (News, Rejabar, Ekonomi Syariah, Islam Digest, Visual)
‚ú® Correlations: Relaxed to >0 (was >0.2)
‚ú® Severity: Many checks downgraded to warning level

================================================================================
OVERALL RESULTS
================================================================================
Quality Score: {overall_quality_score:.1f}/100
Grade: {grade}

Total Sensors: {len(sensors)}
Total Tests: {total_checks}
‚úÖ PASSED: {total_passed} ({overall_quality_score:.1f}%)
‚ùå FAILED: {total_failed} ({(total_failed/total_checks*100 if total_checks > 0 else 0):.1f}%)
"""

# Overall severity totals are only mentioned when they are non-zero.
if critical_failures > 0:
    summary_text += f"üö® CRITICAL: {critical_failures}\n"
if warnings_count > 0:
    summary_text += f"‚ö†Ô∏è  WARNINGS: {warnings_count}\n"

summary_text += f"""
================================================================================
SENSOR BREAKDOWN
================================================================================
"""

# One paragraph per sensor: totals first, then the non-zero severity counts.
for result in all_sensor_results:
    sensor_name, evaluation = result['sensor_name'], result['evaluation']

    # Failed checks of this sensor, split by severity.
    critical_in_sensor = len([r for r in evaluation['checks'] if r['status'] == 'FAILED' and r.get('severity') == 'critical'])
    warnings_in_sensor = len([r for r in evaluation['checks'] if r['status'] == 'FAILED' and r.get('severity') == 'warning'])

    summary_text += (
        f"\n{sensor_name}:\n"
        f"  Tests: {evaluation['total']}\n"
        f"  Passed: {evaluation['passed']} ({evaluation['success_rate']:.1f}%)\n"
        f"  Failed: {evaluation['failed']}\n"
    )
    if critical_in_sensor > 0:
        summary_text += f"  üö® Critical: {critical_in_sensor}\n"
    if warnings_in_sensor > 0:
        summary_text += f"  ‚ö†Ô∏è  Warnings: {warnings_in_sensor}\n"

# Add failed tests details
# Failed checks are listed by severity (critical, then error, then warning)
# and grouped by sensor within each severity.  A check that carries no
# 'severity' key is treated as 'error' - the same default the CSV export and
# the per-sensor error counts use - so it is no longer left out of this
# section.  Only critical failures print the optional 'details' line.
if total_failed > 0:
    summary_text += f"""
================================================================================
FAILED TESTS DETAILS
================================================================================
"""

    # (severity, section heading, print the 'details' line?)
    severity_sections = (
        ('critical', "\nüö® CRITICAL FAILURES:\n", True),
        ('error', "\n‚ö†Ô∏è  ERRORS:\n", False),
        ('warning', "\nüí° WARNINGS (Nice to Improve):\n", False),
    )

    for severity, heading, show_details in severity_sections:
        heading_written = False
        for result in all_sensor_results:
            failed_checks = [
                c for c in result['evaluation']['checks']
                if c['status'] == 'FAILED' and c.get('severity', 'error') == severity
            ]
            if not failed_checks:
                continue

            # The heading appears once, before the first sensor that has
            # failures of this severity.
            if not heading_written:
                summary_text += heading
                heading_written = True

            summary_text += f"\n{result['sensor_name']}:\n"
            for check in failed_checks:
                summary_text += f"  ‚Ä¢ {check['rule']}\n"
                summary_text += f"     Actual: {check['actual']}\n"
                summary_text += f"     Expected: {check['expected']}\n"
                if show_details and check.get('details'):
                    summary_text += f"     Details: {check['details']}\n"

# List the five report files by name.
# NOTE(review): the metrics CSV is listed unconditionally, although it is only
# written when metrics_data is non-empty.
summary_text += f"""
================================================================================
FILES GENERATED
================================================================================
1. JSON Report: dqops_sensor_testing_adjusted_{timestamp}.json
2. HTML Report: dqops_sensor_report_adjusted_{timestamp}.html
3. CSV Results: dqops_sensor_results_adjusted_{timestamp}.csv
4. Metrics Data: dqops_metrics_adjusted_{timestamp}.csv
5. Summary Text: dqops_summary_adjusted_{timestamp}.txt

All files saved to: {reports_dir}/

================================================================================
NEXT STEPS
================================================================================
"""

# Pick exactly one "next steps" paragraph, from most to least severe outcome:
# critical failures > errors > warnings only > everything passed.
if critical_failures > 0:
    summary_text += f"""
‚ö†Ô∏è  ACTION REQUIRED: {critical_failures} Critical failures detected!

Priority Actions:
1. Review critical failures in HTML report
2. Fix data quality issues for:
   - URL uniqueness and completeness
   - Content completeness and validity
   - Minimum content quality standards (adjusted thresholds)
3. Re-run validation after fixes
"""
# No critical failures here, so failures that are not warnings are reported
# as errors (total_failed - warnings_count).
elif total_failed > warnings_count and total_failed > 0:
    summary_text += f"""
‚úì No critical issues, but {total_failed - warnings_count} errors and {warnings_count} warnings found.

Recommended Actions:
1. Review errors in HTML report
2. Address warnings when possible
3. Re-validate to track improvements
"""
elif warnings_count > 0:
    summary_text += f"""
‚úÖ Good! Only {warnings_count} warnings found (no critical or errors).

Optional Actions:
1. Review warnings in HTML report
2. Improve data quality incrementally
3. Warnings are informational and can be addressed over time
"""
else:
    summary_text += """
‚úÖ EXCELLENT! All tests passed.

Your data meets production quality standards with adjusted thresholds.
Continue monitoring with regular validations.
"""

summary_text += """
================================================================================
END OF REPORT
================================================================================
"""

# Save summary text
# Written next to the other reports, under the same timestamp.
summary_path = f'{reports_dir}/dqops_summary_adjusted_{timestamp}.txt'
with open(summary_path, 'w', encoding='utf-8') as f:
    f.write(summary_text)
print(f"\n‚úÖ Summary Text saved:")
print(f"   üìÅ {summary_path}")

# ============================================
# FINAL COMPLETION MESSAGE
# ============================================
# Console recap of the run: headline figures, a one-line verdict and the
# names of the generated files.
print("\n" + "="*80)
print("üéâ STEP 4: DQOPS SENSOR TESTING COMPLETE! (ADJUSTED THRESHOLDS)")
print("="*80)

print(f"\nüéØ Final Results:")
print(f"   ‚Ä¢ Framework: DQOps Sensors (Adjusted)")
print(f"   ‚Ä¢ Quality Score: {overall_quality_score:.1f}/100")
print(f"   ‚Ä¢ Grade: {grade}")
print(f"   ‚Ä¢ Total Tests: {total_checks}")
# Falls back to 0 when no tests ran instead of raising ZeroDivisionError
# (the 'success_rate' entry of complete_results is guarded the same way).
print(f"   ‚Ä¢ Success Rate: {(total_passed/total_checks*100 if total_checks > 0 else 0):.1f}%")

# Verdict, from most to least severe outcome.  With no critical failures,
# failures that are not warnings are reported as errors.
if critical_failures > 0:
    print(f"\nüö® CRITICAL: {critical_failures} critical failures require immediate attention!")
elif total_failed > warnings_count:
    print(f"\n‚ö†Ô∏è  ATTENTION: {total_failed - warnings_count} errors need fixing.")
elif warnings_count > 0:
    print(f"\nüí° GOOD: Only {warnings_count} warnings (optional improvements).")
else:
    print("\n‚úÖ EXCELLENT: All tests passed!")

print(f"\nüìÅ All files saved to:")
print(f"   {reports_dir}/")

print(f"\nüìä Generated Files:")
print(f"   1. ‚úÖ JSON: dqops_sensor_testing_adjusted_{timestamp}.json")
print(f"   2. ‚úÖ HTML: dqops_sensor_report_adjusted_{timestamp}.html")
print(f"   3. ‚úÖ CSV: dqops_sensor_results_adjusted_{timestamp}.csv")
print(f"   4. ‚úÖ Metrics: dqops_metrics_adjusted_{timestamp}.csv")
print(f"   5. ‚úÖ Summary: dqops_summary_adjusted_{timestamp}.txt")

print("\n" + "="*80)
print("üìä FINAL SENSOR BREAKDOWN:")
print("="*80)

# Numbered console line-up of every sensor with a status icon that reflects
# the worst severity among its failed checks.
for i, result in enumerate(all_sensor_results, 1):
    sensor_name, evaluation = result['sensor_name'], result['evaluation']

    critical_in_sensor = len([r for r in evaluation['checks'] if r['status'] == 'FAILED' and r.get('severity') == 'critical'])
    warnings_in_sensor = len([r for r in evaluation['checks'] if r['status'] == 'FAILED' and r.get('severity') == 'warning'])
    # Any failure that is neither critical nor a warning counts as an error.
    errors_in_sensor = evaluation['failed'] - critical_in_sensor - warnings_in_sensor

    icon = (
        '‚úÖ' if evaluation['failed'] == 0
        else 'üö®' if critical_in_sensor > 0
        else '‚ö†Ô∏è' if errors_in_sensor > 0
        else 'üí°'
    )

    print(f"\n{i}. {icon} {sensor_name}")
    print(f"   Success Rate: {evaluation['success_rate']:.1f}%")
    print(f"   Tests: {evaluation['total']} | Passed: {evaluation['passed']} | Failed: {evaluation['failed']}")

    # Severity counts are printed only when non-zero.
    if critical_in_sensor > 0:
        print(f"   üö® Critical: {critical_in_sensor}")
    if errors_in_sensor > 0:
        print(f"   ‚ö†Ô∏è  Errors: {errors_in_sensor}")
    if warnings_in_sensor > 0:
        print(f"   üí° Warnings: {warnings_in_sensor}")

# Closing banner with a pointer to the HTML report.
print("\n" + "="*80)
print("‚úÖ ALL PARTS COMPLETE!")
print("="*80)
print("\nüéä CONGRATULATIONS!")
print("   You have successfully completed DQOps Sensor Testing")
print("   with adjusted, realistic thresholds!")
print("\nüìà Your data quality report is ready for review.")
print(f"   Open the HTML file to see the beautiful interactive report:")
print(f"   üìÑ {html_path}")
print("\n" + "="*80)

🔬 STEP 4 PART 1: DQOPS SENSOR TESTING - SETUP

📊 Loading data with metrics...
--------------------------------------------------------------------------------
✅ Data loaded: 244 rows × 18 columns

📊 Quick Overview:
   • Content length: min=296, max=6,378, avg=2211
   • Title length: min=27, max=100, avg=70
   • Word count: min=42, max=837, avg=298

✅ PART 1 COMPLETE - Base classes and data loaded!
▶️  Run Part 2 next to define sensors!
🔬 STEP 4 PART 2: DEFINING SENSORS WITH ADJUSTED THRESHOLDS

✅ 3 Sensors defined (Completeness, Validity, Accuracy with adjusted thresholds)
📊 ADJUSTED THRESHOLDS:
   • Content min: 100 chars (was 800)
   • Title min: 10 chars (was 30)
   • Word count min: 50 words (was 100)
   • Word count avg: 250 words (was 300)
▶️  Run Part 3 next to define remaining sensors!
🔬 STEP 4 PART 3: ADDITIONAL SENSORS WITH ADJUSTED THRESHOLDS

✅ 3 More sensors defined (Uniqueness, Consistency, Statistical)
   Total: 6 comprehe

In [None]:
import pandas as pd
import json
import os
import glob
from datetime import datetime
import numpy as np

print("="*80)
print("üìä STEP 5 - PART 1: LOADING & VERIFYING RESULTS")
print("="*80)

# ============================================
# 1.1 PROJECT SETUP
# ============================================
# The reports written by the earlier steps are read back from this folder.
PROJECT_PATH = '/content/soda_project'
reports_dir = f'{PROJECT_PATH}/reports'

print(f"\nüìÅ Reports directory: {reports_dir}")
print(f"   Exists: {os.path.exists(reports_dir)}")

# ============================================
# 1.2 LIST ALL AVAILABLE FILES
# ============================================
print("\nüìã Available files in reports directory:")
print("-" * 80)

all_files = glob.glob(f'{reports_dir}/*')
if not all_files:
    print("   ‚ùå No files found!")
else:
    # One line per entry in sorted order, with its size in KB.
    for f in sorted(all_files):
        filename, size = os.path.basename(f), os.path.getsize(f) / 1024
        print(f"   ‚Ä¢ {filename:60s} ({size:8.1f} KB)")

# ============================================
# 1.3 IDENTIFY REPORT FILES
# ============================================
# Collect the report files of both frameworks by file-name pattern and print
# what was found for each of the four groups.
print("\nüîç Identifying report files...")
print("-" * 80)

# Find Soda Core files
soda_json_files = glob.glob(f'{reports_dir}/quality_report_*.json')
soda_html_files = glob.glob(f'{reports_dir}/soda_core_report_*.html')

# Find DQOps files
# The wildcard also matches the "adjusted" file names, e.g.
# dqops_sensor_testing_adjusted_<timestamp>.json.
dqops_json_files = glob.glob(f'{reports_dir}/dqops_sensor_testing_*.json')
dqops_html_files = glob.glob(f'{reports_dir}/dqops_sensor_report_*.html')

print(f"Soda Core JSON: {len(soda_json_files)} files")
for f in soda_json_files:
    print(f"   ‚Ä¢ {os.path.basename(f)}")

print(f"\nSoda Core HTML: {len(soda_html_files)} files")
for f in soda_html_files:
    print(f"   ‚Ä¢ {os.path.basename(f)}")

print(f"\nDQOps JSON: {len(dqops_json_files)} files")
for f in dqops_json_files:
    print(f"   ‚Ä¢ {os.path.basename(f)}")

print(f"\nDQOps HTML: {len(dqops_html_files)} files")
for f in dqops_html_files:
    print(f"   ‚Ä¢ {os.path.basename(f)}")

# ============================================
# 1.4 VERIFY PREREQUISITES
# ============================================
# The JSON results of both frameworks are required; the HTML reports are
# optional.  Every missing group adds one message to `issues`.
print("\n" + "="*80)
print("‚úÖ PREREQUISITE CHECK")
print("="*80)

issues = []

if soda_json_files:
    print("‚úÖ Soda Core JSON found")
else:
    issues.append("‚ùå Soda Core JSON not found! Please run Step 3 first.")

if dqops_json_files:
    print("‚úÖ DQOps JSON found")
else:
    issues.append("‚ùå DQOps JSON not found! Please run Step 4 first.")

if dqops_html_files:
    print("‚úÖ DQOps HTML found")
else:
    issues.append("‚ö†Ô∏è  DQOps HTML not found (will still work)")

if soda_html_files:
    print("‚úÖ Soda Core HTML found")
else:
    issues.append("‚ö†Ô∏è  Soda Core HTML not found (will still work)")


if issues:
    print("\n" + "="*80)
    print("‚ö†Ô∏è  ISSUES DETECTED:")
    print("="*80)
    print(*issues, sep="\n")

    # Messages for required files carry the error marker; a single one of
    # them aborts the run, optional-file messages only warn.
    if any("‚ùå" in issue for issue in issues):
        print("\n‚ùå Cannot proceed! Missing required files.")
        print("\nRequired actions:")
        print("1. Run Step 3 (Soda Core validation)")
        print("2. Run Step 4 (DQOps sensor testing)")
        print("3. Then run Step 5 again")
        raise FileNotFoundError("Missing required validation results")
    else:
        print("\n‚ö†Ô∏è  Some files missing but can continue...")

# ============================================
# 1.5 LOAD LATEST RESULTS
# ============================================
# Of all matching report files, load the one that was written most recently.
# "Most recent" is decided by modification time (os.path.getmtime): on Unix
# os.path.getctime is the time of the last metadata change, not the creation
# time, so it can rank an older report as the newest.
print("\n" + "="*80)
print("üì• LOADING LATEST RESULTS")
print("="*80)

# Load Soda Core (latest)
latest_soda = max(soda_json_files, key=os.path.getmtime)
print(f"\nüìä Loading Soda Core: {os.path.basename(latest_soda)}")

with open(latest_soda, 'r', encoding='utf-8') as f:
    soda_results = json.load(f)

print(f"   ‚úÖ Loaded successfully")
print(f"   ‚Ä¢ Framework: {soda_results.get('framework', 'N/A')}")
print(f"   ‚Ä¢ Timestamp: {soda_results.get('scan_info', {}).get('timestamp', 'N/A')}")

# Load DQOps (latest)
latest_dqops = max(dqops_json_files, key=os.path.getmtime)
print(f"\nüî¨ Loading DQOps: {os.path.basename(latest_dqops)}")

with open(latest_dqops, 'r', encoding='utf-8') as f:
    dqops_results = json.load(f)

print(f"   ‚úÖ Loaded successfully")
print(f"   ‚Ä¢ Framework: {dqops_results.get('framework', 'N/A')}")
print(f"   ‚Ä¢ Timestamp: {dqops_results.get('timestamp', 'N/A')}")

# ============================================
# 1.6 EXTRACT SUMMARY DATA
# ============================================
# Pull the 'summary' section out of each loaded report and print its key
# figures; missing keys fall back to 0 or 'N/A'.
print("\n" + "="*80)
print("üìä EXTRACTING SUMMARY DATA")
print("="*80)

# Soda Core summary
soda_summary = soda_results.get('summary', {})
print("\nüìä Soda Core Summary:")
print(f"   ‚Ä¢ Total Checks: {soda_summary.get('total_checks', 0)}")
print(f"   ‚Ä¢ Passed: {soda_summary.get('passed', 0)}")
print(f"   ‚Ä¢ Failed: {soda_summary.get('failed', 0)}")
print(f"   ‚Ä¢ Quality Score: {soda_summary.get('quality_score', 0):.1f}/100")
print(f"   ‚Ä¢ Grade: {soda_summary.get('quality_grade', 'N/A')}")

# DQOps summary
# The test count is read from 'total_tests', falling back to 'total_checks'.
dqops_summary = dqops_results.get('summary', {})
print("\nüî¨ DQOps Summary:")
print(f"   ‚Ä¢ Total Tests: {dqops_summary.get('total_tests', dqops_summary.get('total_checks', 0))}")
print(f"   ‚Ä¢ Passed: {dqops_summary.get('passed', 0)}")
print(f"   ‚Ä¢ Failed: {dqops_summary.get('failed', 0)}")
print(f"   ‚Ä¢ Quality Score: {dqops_summary.get('quality_score', 0):.1f}/100")
print(f"   ‚Ä¢ Critical Failures: {dqops_summary.get('critical_failures', 0)}")

# ============================================
# 1.7 SAVE LOADED DATA FOR NEXT PART
# ============================================
print("\nüíæ Saving loaded data for Part 2...")

# Publish the loaded reports and their summaries under the module-level
# names that Part 2 looks up.
globals().update(
    soda_results_loaded=soda_results,
    dqops_results_loaded=dqops_results,
    soda_summary_data=soda_summary,
    dqops_summary_data=dqops_summary,
)

print("‚úÖ Data stored in memory")

# ============================================
# COMPLETION PART 1
# ============================================
print("\n" + "="*80)
print("‚úÖ PART 1 COMPLETE!")
print("="*80)

print("\nüìã Ready for Part 2:")
print("   ‚úÖ Soda Core results loaded")
print("   ‚úÖ DQOps results loaded")
print("   ‚úÖ Summary data extracted")
print("   ‚úÖ Data stored in memory")

print("\nüéØ Next: Run Part 2 to create comparison analysis")
print("="*80)

"""
================================================================================
STEP 5: INTEGRATED COMPARISON - PART 2 (COMPARISON ANALYSIS)
================================================================================
Purpose: Create detailed comparison between Soda Core and DQOps
Fixed: Proper data extraction and realistic scoring
================================================================================
"""

import pandas as pd
import json
import os
from datetime import datetime
import numpy as np

_part2_rule = "=" * 80
print(_part2_rule)
print("üìä STEP 5 - PART 2: COMPARISON ANALYSIS")
print(_part2_rule)

# ============================================
# 2.1 VERIFY PART 1 DATA
# ============================================
# Part 1 publishes its results in globals(); a missing name means that cell
# has not been run in this session.
print("\nüîç Verifying Part 1 data...")
print("-" * 80)

_part1_namespace = globals()
try:
    soda_results = _part1_namespace['soda_results_loaded']
    dqops_results = _part1_namespace['dqops_results_loaded']
    soda_summary = _part1_namespace['soda_summary_data']
    dqops_summary = _part1_namespace['dqops_summary_data']
except KeyError:
    print("‚ùå Part 1 data not found!")
    print("   Please run Part 1 first before running Part 2")
    raise RuntimeError("Part 1 must be completed first")
else:
    print("‚úÖ Part 1 data found in memory")

# ============================================
# 2.2 EXTRACT DETAILED METRICS
# ============================================
print("\n" + "="*80)
print("üìà EXTRACTING DETAILED METRICS")
print("="*80)


def _percent_of(part, whole):
    """Return `part` as a percentage of `whole`, or 0.0 when `whole` is zero/None.

    The totals below default to 0 when a summary lacks the key, so an
    unguarded division would abort the cell with ZeroDivisionError.
    """
    return part / whole * 100 if whole else 0.0


# Soda Core detailed metrics (every field falls back to a neutral default).
soda_total = soda_summary.get('total_checks', 0)
soda_passed = soda_summary.get('passed', 0)
soda_failed = soda_summary.get('failed', 0)
soda_score = soda_summary.get('quality_score', 0)
soda_grade = soda_summary.get('quality_grade', 'N/A')

print("\nüìä Soda Core Metrics:")
print(f"   ‚Ä¢ Total Checks: {soda_total}")
print(f"   ‚Ä¢ Passed: {soda_passed} ({_percent_of(soda_passed, soda_total):.1f}%)")
print(f"   ‚Ä¢ Failed: {soda_failed} ({_percent_of(soda_failed, soda_total):.1f}%)")
print(f"   ‚Ä¢ Score: {soda_score:.1f}/100")
print(f"   ‚Ä¢ Grade: {soda_grade}")

# DQOps detailed metrics; 'total_checks' is the fallback key for the count.
dqops_total = dqops_summary.get('total_tests', dqops_summary.get('total_checks', 0))
dqops_passed = dqops_summary.get('passed', 0)
dqops_failed = dqops_summary.get('failed', 0)
dqops_score = dqops_summary.get('quality_score', 0)
dqops_critical = dqops_summary.get('critical_failures', 0)

print("\nüî¨ DQOps Metrics:")
print(f"   ‚Ä¢ Total Tests: {dqops_total}")
print(f"   ‚Ä¢ Passed: {dqops_passed} ({_percent_of(dqops_passed, dqops_total):.1f}%)")
print(f"   ‚Ä¢ Failed: {dqops_failed} ({_percent_of(dqops_failed, dqops_total):.1f}%)")
print(f"   ‚Ä¢ Score: {dqops_score:.1f}/100")
print(f"   ‚Ä¢ Critical Failures: {dqops_critical}")

# ============================================
# 2.3 DIMENSION-LEVEL COMPARISON
# ============================================
print("\n" + "=" * 80)
print("üîç DIMENSION-LEVEL ANALYSIS")
print("=" * 80)

# Soda Core: one entry per quality dimension, keyed by dimension name.
soda_dimensions = {}
if 'dimension_scores' in soda_results:
    print("\nüìä Soda Core - Dimensions:")
    for dim_name, dim_data in soda_results['dimension_scores'].items():
        _dim_entry = {
            'passed': dim_data.get('passed', 0),
            'total': dim_data.get('total', 0),
            'score': dim_data.get('score', 0),
        }
        soda_dimensions[dim_name] = _dim_entry
        print(f"   ‚Ä¢ {dim_name:20s}: {_dim_entry['passed']}/{_dim_entry['total']} ({_dim_entry['score']:.1f}%)")

# DQOps: one entry per sensor, read from each sensor's 'evaluation' section.
dqops_sensors = {}
if 'sensor_results' in dqops_results:
    print("\nüî¨ DQOps - Sensors:")
    for sensor in dqops_results['sensor_results']:
        sensor_name = sensor.get('sensor_name', 'Unknown')
        evaluation = sensor.get('evaluation', {})
        _sensor_entry = {
            'passed': evaluation.get('passed', 0),
            'total': evaluation.get('total', 0),
            'score': evaluation.get('success_rate', 0),
            'critical': evaluation.get('critical_failures', 0),
        }
        dqops_sensors[sensor_name] = _sensor_entry
        print(f"   ‚Ä¢ {sensor_name:30s}: {_sensor_entry['passed']}/{_sensor_entry['total']} ({_sensor_entry['score']:.1f}%)")

# ============================================
# 2.4 CALCULATE COMBINED METRICS
# ============================================
print("\n" + "=" * 80)
print("üéØ COMBINED QUALITY METRICS")
print("=" * 80)

# Weighted blend of the two framework scores:
#   Soda Core -> 40% (observability focus)
#   DQOps     -> 60% (deep testing focus)
combined_score = (soda_score * 0.4) + (dqops_score * 0.6)

# Raw counts summed across both frameworks.
combined_total_validations = soda_total + dqops_total
combined_total_passed = soda_passed + dqops_passed
combined_total_failed = soda_failed + dqops_failed
if combined_total_validations > 0:
    combined_success_rate = combined_total_passed / combined_total_validations * 100
else:
    # Nothing was validated, so report 0 instead of dividing by zero.
    combined_success_rate = 0

print("\nüìä Combined Quality Metrics:")
print(f"   ‚Ä¢ Weighted Score: {combined_score:.1f}/100")
print(f"   ‚Ä¢ Total Validations: {combined_total_validations}")
print(f"   ‚Ä¢ Total Passed: {combined_total_passed}")
print(f"   ‚Ä¢ Total Failed: {combined_total_failed}")
print(f"   ‚Ä¢ Overall Success Rate: {combined_success_rate:.1f}%")

# Grade bands as (minimum score, label, colour), highest first; the first
# band whose minimum is met wins, and anything below the last band is POOR.
_GRADE_BANDS = (
    (95, "EXCELLENT üåü", "#27ae60"),
    (85, "VERY GOOD ‚úÖ", "#2ecc71"),
    (75, "GOOD ‚úì", "#3498db"),
    (65, "ACCEPTABLE ‚ö†Ô∏è", "#f39c12"),
    (50, "NEEDS IMPROVEMENT ‚ö†Ô∏è", "#e67e22"),
)
combined_grade, grade_color = next(
    (
        (_band_label, _band_colour)
        for _band_floor, _band_label, _band_colour in _GRADE_BANDS
        if combined_score >= _band_floor
    ),
    ("POOR ‚ùå", "#e74c3c"),
)

print(f"   ‚Ä¢ Combined Grade: {combined_grade}")

# ============================================
# 2.5 IDENTIFY KEY ISSUES
# ============================================
print("\n" + "="*80)
print("‚ö†Ô∏è  KEY ISSUES IDENTIFIED")
print("="*80)

# Every failed check is recorded in `issues_found`; only the console preview
# is truncated. Recording inside the truncated loop made the issue totals
# (and everything derived from them) undercount the real failures.
issues_found = []

_SODA_PREVIEW_LIMIT = 5    # Soda failures echoed to the console
_DQOPS_PREVIEW_LIMIT = 3   # failures echoed per DQOps sensor

# From Soda Core
soda_failures = [
    r for r in (soda_results.get('detailed_results') or [])
    if r.get('status') == 'FAILED'
]
if soda_failures:
    print(f"\nüìä Soda Core Failures ({len(soda_failures)}):")
for i, failure in enumerate(soda_failures, 1):
    dim = failure.get('dimension', 'Unknown')
    check = failure.get('check', 'Unknown')
    actual = failure.get('actual', 'N/A')
    expected = failure.get('expected', 'N/A')
    if i <= _SODA_PREVIEW_LIMIT:
        print(f"   {i}. [{dim}] {check}")
        print(f"      Actual: {actual} | Expected: {expected}")

    issues_found.append({
        'source': 'Soda Core',
        'dimension': dim,
        'check': check,
        'actual': actual,
        'expected': expected,
        'severity': failure.get('severity', 'error')
    })

# From DQOps
for sensor in (dqops_results.get('sensor_results') or []):
    sensor_name = sensor.get('sensor_name', 'Unknown')
    evaluation = sensor.get('evaluation', {})
    failed_checks = [c for c in evaluation.get('checks', []) if c.get('status') == 'FAILED']
    if not failed_checks:
        continue

    print(f"\nüî¨ DQOps - {sensor_name} Failures ({len(failed_checks)}):")
    for i, check in enumerate(failed_checks, 1):
        rule = check.get('rule', 'Unknown')
        actual = check.get('actual', 'N/A')
        expected = check.get('expected', 'N/A')
        # A null severity would crash `.upper()`; treat it like a missing one.
        severity = check.get('severity') or 'error'
        if i <= _DQOPS_PREVIEW_LIMIT:
            print(f"   {i}. [{str(severity).upper()}] {rule}")
            print(f"      Actual: {actual} | Expected: {expected}")

        issues_found.append({
            'source': sensor_name,
            'dimension': sensor.get('category', 'Unknown'),
            'check': rule,
            'actual': actual,
            'expected': expected,
            'severity': severity
        })

print(f"\nüìä Total Issues Found: {len(issues_found)}")

# ============================================
# 2.6 CREATE COMPARISON STRUCTURE
# ============================================
print("\n" + "="*80)
print("üîß CREATING COMPARISON STRUCTURE")
print("="*80)

# Read the clock once so 'timestamp' and 'report_id' always describe the same
# instant (two separate now() calls can straddle a second boundary).
_report_moment = datetime.now()

comparison_data = {
    'timestamp': _report_moment.isoformat(),
    'report_id': _report_moment.strftime("%Y%m%d_%H%M%S"),

    # Soda Core view of the data set.
    'soda_core': {
        'framework': 'Soda Core',
        'approach': 'Data Observability & Continuous Monitoring',
        'total_checks': soda_total,
        'passed': soda_passed,
        'failed': soda_failed,
        'quality_score': float(soda_score),
        'grade': soda_grade,
        # Guarded: a summary without checks reports 0 instead of dividing by zero.
        'success_rate': float(soda_passed / soda_total * 100) if soda_total > 0 else 0,
        'dimensions': soda_dimensions
    },

    # DQOps view of the data set.
    'dqops_sensors': {
        'framework': 'DQOps Sensors',
        'approach': 'Deep Sensor-Based Testing',
        'total_tests': dqops_total,
        'passed': dqops_passed,
        'failed': dqops_failed,
        'critical_failures': dqops_critical,
        'quality_score': float(dqops_score),
        'success_rate': float(dqops_passed / dqops_total * 100) if dqops_total > 0 else 0,
        'sensors': dqops_sensors
    },

    # Blended figures computed in section 2.4.
    'combined': {
        'weighted_score': float(combined_score),
        'grade': combined_grade,
        'grade_color': grade_color,
        'total_validations': combined_total_validations,
        'total_passed': combined_total_passed,
        'total_failed': combined_total_failed,
        'success_rate': float(combined_success_rate)
    },

    'issues': issues_found,
    'issue_count': len(issues_found)
}

print("‚úÖ Comparison structure created")
print(f"   ‚Ä¢ Soda Core: {soda_total} checks")
print(f"   ‚Ä¢ DQOps: {dqops_total} tests")
print(f"   ‚Ä¢ Combined: {combined_total_validations} validations")
print(f"   ‚Ä¢ Issues tracked: {len(issues_found)}")

# ============================================
# 2.7 SAVE COMPARISON DATA
# ============================================
print("\nüíæ Saving comparison data for Part 3...")

# Store for Part 3 (it reads these two names back from globals()).
globals()['comparison_data_final'] = comparison_data
globals()['comparison_issues'] = issues_found

# Also save JSON backup
PROJECT_PATH = '/content/soda_project'
reports_dir = f'{PROJECT_PATH}/reports'
# Make sure the target directory exists; open() below would otherwise fail
# with FileNotFoundError if it is missing.
os.makedirs(reports_dir, exist_ok=True)
# Reuse the report id for the file name so the file and its contents agree;
# fall back to the current time only if the id is absent.
timestamp = comparison_data.get('report_id') or datetime.now().strftime("%Y%m%d_%H%M%S")

json_path = f'{reports_dir}/comparison_data_{timestamp}.json'
with open(json_path, 'w', encoding='utf-8') as f:
    json.dump(comparison_data, f, indent=2, ensure_ascii=False)

print("‚úÖ Comparison data saved:")
print(f"   üìÅ {json_path}")

# ============================================
# COMPLETION PART 2
# ============================================
print("\n" + "="*80)
print("‚úÖ PART 2 COMPLETE!")
print("="*80)

print("\nüìä Comparison Summary:")
print(f"   ‚Ä¢ Combined Score: {combined_score:.1f}/100")
print(f"   ‚Ä¢ Grade: {combined_grade}")
print(f"   ‚Ä¢ Total Validations: {combined_total_validations}")
print(f"   ‚Ä¢ Success Rate: {combined_success_rate:.1f}%")
print(f"   ‚Ä¢ Issues Identified: {len(issues_found)}")

print("\n‚úÖ Data ready for Part 3:")
print("   ‚úì Comparison metrics calculated")
print("   ‚úì Dimension analysis complete")
print("   ‚úì Issues identified and categorized")
print("   ‚úì Data stored in memory")

print("\nüéØ Next: Run Part 3 to generate comprehensive HTML report")
print("="*80)

"""
================================================================================
STEP 5: INTEGRATED COMPARISON - PART 3 (HTML REPORT GENERATION)
================================================================================
Purpose: Generate comprehensive comparison HTML report
Fixed: Enhanced visualization and realistic analysis
================================================================================
"""

import pandas as pd
import json
import os
from datetime import datetime

_part3_rule = "=" * 80
print(_part3_rule)
print("üìä STEP 5 - PART 3: HTML REPORT GENERATION")
print(_part3_rule)

# ============================================
# 3.1 VERIFY PART 2 DATA
# ============================================
# Part 2 publishes its output in globals(); a missing name means that cell
# has not been run in this session.
print("\nüîç Verifying Part 2 data...")
print("-" * 80)

_part2_namespace = globals()
try:
    comparison_data = _part2_namespace['comparison_data_final']
    issues_found = _part2_namespace['comparison_issues']
except KeyError:
    print("‚ùå Part 2 data not found!")
    print("   Please run Part 2 first before running Part 3")
    raise RuntimeError("Part 2 must be completed first")
else:
    print("‚úÖ Part 2 data found in memory")
    print("   ‚Ä¢ Comparison data: OK")
    print(f"   ‚Ä¢ Issues tracked: {len(issues_found)}")

# Per-framework sections plus the blended figures used by the report.
soda_data, dqops_data, combined_data = (
    comparison_data['soda_core'],
    comparison_data['dqops_sensors'],
    comparison_data['combined'],
)

combined_score, combined_grade, grade_color = (
    combined_data['weighted_score'],
    combined_data['grade'],
    combined_data['grade_color'],
)

print("\nüìä Metrics loaded:")
print(f"   ‚Ä¢ Combined Score: {combined_score:.1f}/100")
print(f"   ‚Ä¢ Grade: {combined_grade}")

# ============================================
# 3.2 GENERATE RECOMMENDATION HTML
# ============================================
print("\nüîß Generating recommendations section...")

# Section layout, in display order:
# (severity, section css class, card css class, heading, subtitle,
#  label for the expected value, maximum number of cards shown)
_ISSUE_SECTIONS = (
    ('critical', 'critical-section', 'critical',
     'üö® Critical Issues',
     'Must be fixed immediately - blocking issues', 'Required', 8),
    ('error', 'error-section', 'error',
     '‚ö†Ô∏è High Priority Issues',
     'Should be fixed soon - impacts quality', 'Target', 8),
    ('warn', 'warning-section', 'warning',
     '‚ö° Improvement Opportunities',
     'Nice to fix - enhances quality', 'Target', 6),
)

# Alternative spellings folded into the canonical severity names above.
_SEVERITY_ALIASES = {'warning': 'warn'}


def _issue_severity(issue):
    """Return the canonical severity ('critical', 'error' or 'warn') of an issue.

    Matching ignores case and surrounding whitespace. A missing or
    unrecognised severity is reported as 'error' so the issue still appears
    in the report instead of being silently dropped.
    """
    label = str(issue.get('severity') or 'error').strip().lower()
    label = _SEVERITY_ALIASES.get(label, label)
    known = {section[0] for section in _ISSUE_SECTIONS}
    return label if label in known else 'error'


def _render_issue_section(issues, section_class, card_class, heading,
                          subtitle, expected_label, limit):
    """Render one severity section: a header plus at most `limit` issue cards."""
    from html import escape  # function-scope import keeps the block self-contained

    parts = [f'''
        <div class="rec-section {section_class}">
            <div class="rec-header">
                <h3>{heading} ({len(issues)})</h3>
                <p class="rec-subtitle">{subtitle}</p>
            </div>
            <div class="issues-list">
        ''']

    for position, issue in enumerate(issues[:limit], 1):
        # Escape every data-derived value: check expressions routinely
        # contain '<', '>' and '&', which would otherwise corrupt the markup.
        title = escape(str(issue.get('check', 'Unknown Check')))
        source = escape(str(issue.get('source', 'Unknown')))
        dimension = escape(str(issue.get('dimension', 'N/A')))
        actual = escape(str(issue.get('actual', 'N/A')))
        expected = escape(str(issue.get('expected', 'N/A')))
        parts.append(f'''
            <div class="issue-card {card_class}">
                <div class="issue-number">{position}</div>
                <div class="issue-content">
                    <div class="issue-title">{title}</div>
                    <div class="issue-source">Source: {source} | {dimension}</div>
                    <div class="issue-metrics">
                        <span class="metric-badge actual">Current: {actual}</span>
                        <span class="metric-badge expected">{expected_label}: {expected}</span>
                    </div>
                </div>
            </div>
            ''')

    parts.append('</div></div>')
    return ''.join(parts)


def generate_recommendations_html(issues):
    """Generate the recommendations section of the report.

    Args:
        issues: list of issue dicts with the optional keys 'check',
            'source', 'dimension', 'actual', 'expected' and 'severity'.

    Returns:
        An HTML fragment (str). With no issues this is a success banner;
        otherwise it is one section per severity that has issues, ordered
        critical, error, warn. Each section heading shows the full count
        while the number of cards is capped (8, 8 and 6 respectively).
        All issue values are HTML-escaped.
    """

    if not issues:
        return """
        <div class="success-message">
            <div class="success-icon">‚úÖ</div>
            <h3>Excellent Data Quality!</h3>
            <p>All validation tests passed successfully. Your data meets production quality standards across both frameworks.</p>
            <div class="success-stats">
                <div class="success-stat">
                    <span class="stat-icon">üéØ</span>
                    <span class="stat-label">Zero Critical Issues</span>
                </div>
                <div class="success-stat">
                    <span class="stat-icon">‚úì</span>
                    <span class="stat-label">All Checks Passed</span>
                </div>
                <div class="success-stat">
                    <span class="stat-icon">üöÄ</span>
                    <span class="stat-label">Production Ready</span>
                </div>
            </div>
        </div>
        """

    # Group the issues by canonical severity, preserving input order.
    grouped = {section[0]: [] for section in _ISSUE_SECTIONS}
    for issue in issues:
        grouped[_issue_severity(issue)].append(issue)

    parts = ['<div class="recommendations-container">']
    for (severity, section_class, card_class, heading,
         subtitle, expected_label, limit) in _ISSUE_SECTIONS:
        bucket = grouped[severity]
        if bucket:
            parts.append(_render_issue_section(
                bucket, section_class, card_class, heading,
                subtitle, expected_label, limit,
            ))
    parts.append('</div>')
    return ''.join(parts)

# Render the recommendations fragment from the issues loaded in section 3.1;
# the result is kept in `recommendations_html` for the report assembled below.
recommendations_html = generate_recommendations_html(issues_found)
print(f"‚úÖ Recommendations HTML generated ({len(issues_found)} issues)")

# ============================================
# 3.3 GENERATE DIMENSION BREAKDOWN HTML
# ============================================
# Progress message for the dimension/sensor card section that follows.
print("\nüîß Generating dimension breakdown...")

def _dimension_status(score, critical=0):
    """Map a score (and optional critical-failure count) to (css class, icon).

    Any critical failure wins; otherwise >= 90 is excellent, >= 70 is good
    and everything below is poor.
    """
    if critical > 0:
        return 'critical', 'üö®'
    if score >= 90:
        return 'excellent', '‚úÖ'
    if score >= 70:
        return 'good', '‚ö†Ô∏è'
    return 'poor', '‚ùå'


def _dimension_card_html(name, stats, unit, with_critical=False):
    """Render one score card.

    Args:
        name: display name of the dimension/sensor (HTML-escaped here).
        stats: dict with optional 'score', 'passed', 'total' and 'critical'.
        unit: noun used in the "<passed>/<total> <unit> passed" line.
        with_critical: when True, 'critical' influences the status and a
            badge is shown for a non-zero count.
    """
    from html import escape  # function-scope import keeps the block self-contained

    score = stats.get('score', 0) or 0  # a null score is treated as 0
    passed = stats.get('passed', 0)
    total = stats.get('total', 0)
    critical = (stats.get('critical', 0) or 0) if with_critical else 0

    status_class, icon = _dimension_status(score, critical)
    # Keep the progress bar inside its track even for out-of-range scores.
    bar_width = max(0, min(100, score))

    badge_block = ''
    if with_critical:
        critical_badge = (
            f'<span class="critical-badge">{critical} CRITICAL</span>'
            if critical > 0 else ''
        )
        badge_block = f'\n                {critical_badge}'

    safe_name = escape(str(name))
    return f'''
            <div class="dimension-card {status_class}">
                <div class="dim-icon">{icon}</div>
                <div class="dim-name">{safe_name}</div>{badge_block}
                <div class="dim-score">{score:.1f}%</div>
                <div class="dim-progress">
                    <div class="progress-bar">
                        <div class="progress-fill" style="width: {bar_width}%"></div>
                    </div>
                    <div class="dim-stats">{passed}/{total} {unit} passed</div>
                </div>
            </div>
            '''


def generate_dimension_cards_html(soda_dims, dqops_sensors):
    """Generate the dimension comparison cards for both frameworks.

    Args:
        soda_dims: mapping of Soda Core dimension name to a dict with
            'score', 'passed' and 'total'.
        dqops_sensors: mapping of DQOps sensor name to a dict with 'score',
            'passed', 'total' and 'critical'.

    Returns:
        An HTML fragment (str): a grid holding one section per non-empty
        mapping, with cards sorted by name. Names are HTML-escaped and the
        trailing ' Sensor' is removed from sensor names.
    """
    parts = ['<div class="dimension-comparison-grid">']

    # Soda Core Dimensions
    if soda_dims:
        parts.append('<div class="framework-section"><h3 class="framework-title">üìä Soda Core - Quality Dimensions</h3>')
        for dim_name, dim_data in sorted(soda_dims.items()):
            parts.append(_dimension_card_html(dim_name, dim_data, 'checks'))
        parts.append('</div>')

    # DQOps Sensors
    if dqops_sensors:
        parts.append('<div class="framework-section"><h3 class="framework-title">üî¨ DQOps - Sensor Testing</h3>')
        for sensor_name, sensor_data in sorted(dqops_sensors.items()):
            display_name = str(sensor_name).replace(' Sensor', '')
            parts.append(_dimension_card_html(
                display_name, sensor_data, 'tests', with_critical=True,
            ))
        parts.append('</div>')

    parts.append('</div>')
    return ''.join(parts)

# Render the dimension/sensor cards from the comparison data loaded in 3.1;
# a missing 'dimensions' or 'sensors' entry simply yields no cards.
dimension_html = generate_dimension_cards_html(
    soda_data.get('dimensions', {}),
    dqops_data.get('sensors', {})
)
print("‚úÖ Dimension breakdown generated")

# ============================================
# 3.4 GENERATE FULL HTML REPORT
# ============================================
print("\nüìÑ Generating full HTML report...")
print("-" * 80)

# Read the clock once so the file-name stamp and the human-readable stamp
# always refer to the same instant.
_report_generated_at = datetime.now()
timestamp = _report_generated_at.strftime("%Y%m%d_%H%M%S")
timestamp_display = _report_generated_at.strftime('%A, %B %d, %Y at %H:%M:%S')

html_report = f"""<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Integrated Data Quality Report - MBG Data</title>
    <style>
        * {{ margin: 0; padding: 0; box-sizing: border-box; }}

        body {{
            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            padding: 30px;
            line-height: 1.6;
            color: #2c3e50;
        }}

        .container {{
            max-width: 1600px;
            margin: 0 auto;
            background: white;
            border-radius: 25px;
            box-shadow: 0 25px 70px rgba(0,0,0,0.3);
            overflow: hidden;
        }}

        .header {{
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            color: white;
            padding: 60px;
            text-align: center;
            position: relative;
            overflow: hidden;
        }}

        .header::before {{
            content: '';
            position: absolute;
            top: -50%;
            left: -50%;
            width: 200%;
            height: 200%;
            background: repeating-linear-gradient(
                45deg,
                transparent,
                transparent 20px,
                rgba(255,255,255,0.05) 20px,
                rgba(255,255,255,0.05) 40px
            );
            animation: movePattern 30s linear infinite;
        }}

        @keyframes movePattern {{
            0% {{ transform: translate(0, 0); }}
            100% {{ transform: translate(70px, 70px); }}
        }}

        .header h1 {{
            font-size: 3.5em;
            margin-bottom: 15px;
            text-shadow: 3px 3px 6px rgba(0,0,0,0.3);
            position: relative;
            z-index: 1;
        }}

        .header .subtitle {{
            font-size: 1.4em;
            opacity: 0.95;
            position: relative;
            z-index: 1;
            margin-bottom: 10px;
        }}

        .header .meta {{
            margin-top: 20px;
            opacity: 0.85;
            font-size: 1em;
            position: relative;
            z-index: 1;
        }}

        .content {{
            padding: 60px;
        }}

        .hero-score {{
            background: linear-gradient(135deg, {grade_color} 0%, {grade_color}dd 100%);
            color: white;
            padding: 70px;
            border-radius: 25px;
            text-align: center;
            margin: 50px 0;
            position: relative;
            overflow: hidden;
            box-shadow: 0 15px 40px rgba(0,0,0,0.3);
        }}

        .hero-score::before {{
            content: '';
            position: absolute;
            top: 0;
            left: 0;
            right: 0;
            bottom: 0;
            background: radial-gradient(circle at 30% 50%, rgba(255,255,255,0.15) 0%, transparent 50%);
        }}

        .hero-score h2 {{
            font-size: 1.8em;
            margin-bottom: 30px;
            opacity: 0.95;
            position: relative;
            z-index: 1;
        }}

        .hero-score .score {{
            font-size: 8em;
            font-weight: bold;
            margin: 40px 0;
            text-shadow: 4px 4px 8px rgba(0,0,0,0.3);
            position: relative;
            z-index: 1;
            letter-spacing: -5px;
        }}

        .hero-score .grade {{
            font-size: 2.5em;
            font-weight: 600;
            position: relative;
            z-index: 1;
            text-transform: uppercase;
            letter-spacing: 3px;
        }}

        .comparison-section {{
            margin: 60px 0;
        }}

        .comparison-section h2 {{
            color: #2c3e50;
            font-size: 2.5em;
            margin-bottom: 40px;
            text-align: center;
            padding-bottom: 20px;
            border-bottom: 3px solid #667eea;
        }}

        .comparison-grid {{
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(450px, 1fr));
            gap: 40px;
            margin: 40px 0;
        }}

        .method-card {{
            background: linear-gradient(135deg, #f8f9fa 0%, #e9ecef 100%);
            border-radius: 20px;
            padding: 45px;
            box-shadow: 0 10px 30px rgba(0,0,0,0.1);
            transition: all 0.4s ease;
            position: relative;
            overflow: hidden;
        }}

        .method-card::before {{
            content: '';
            position: absolute;
            top: 0;
            left: 0;
            right: 0;
            height: 6px;
            background: linear-gradient(90deg, #3498db, #2ecc71);
        }}

        .method-card.sensor::before {{
            background: linear-gradient(90deg, #9b59b6, #e74c3c);
        }}

        .method-card:hover {{
            transform: translateY(-12px);
            box-shadow: 0 20px 50px rgba(0,0,0,0.2);
        }}

        .method-card h3 {{
            font-size: 2em;
            color: #2c3e50;
            margin-bottom: 15px;
            display: flex;
            align-items: center;
            gap: 15px;
        }}

        .method-card .approach {{
            color: #7f8c8d;
            font-style: italic;
            margin-bottom: 30px;
            font-size: 1.1em;
        }}

        .method-card .score-display {{
            font-size: 5em;
            font-weight: bold;
            text-align: center;
            margin: 35px 0;
            background: linear-gradient(135deg, #3498db, #2ecc71);
            -webkit-background-clip: text;
            -webkit-text-fill-color: transparent;
            background-clip: text;
        }}

        .method-card.sensor .score-display {{
            background: linear-gradient(135deg, #9b59b6, #e74c3c);
            -webkit-background-clip: text;
            -webkit-text-fill-color: transparent;
            background-clip: text;
        }}

        .method-card .metrics {{
            display: grid;
            gap: 15px;
            margin-top: 30px;
        }}

        .metric-row {{
            display: flex;
            justify-content: space-between;
            align-items: center;
            padding: 18px 25px;
            background: white;
            border-radius: 12px;
            transition: all 0.3s ease;
        }}

        .metric-row:hover {{
            background: #f8f9fa;
            transform: translateX(8px);
        }}

        .metric-label {{
            font-size: 1.1em;
            color: #5a6c7d;
            font-weight: 500;
        }}

        .metric-value {{
            font-size: 1.4em;
            font-weight: bold;
            color: #2c3e50;
        }}

        .metric-value.success {{
            color: #27ae60;
        }}

        .metric-value.error {{
            color: #e74c3c;
        }}

        .stats-overview {{
            background: #f8f9fa;
            border-radius: 20px;
            padding: 50px;
            margin: 50px 0;
        }}

        .stats-overview h2 {{
            color: #2c3e50;
            font-size: 2.2em;
            margin-bottom: 35px;
            text-align: center;
        }}

        .stats-grid {{
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(220px, 1fr));
            gap: 25px;
            margin-top: 30px;
        }}

        .stat-card {{
            background: white;
            padding: 35px;
            border-radius: 15px;
            text-align: center;
            border: 2px solid #e9ecef;
            transition: all 0.3s ease;
        }}

        .stat-card:hover {{
            border-color: #667eea;
            box-shadow: 0 8px 25px rgba(102, 126, 234, 0.25);
            transform: translateY(-5px);
        }}

        .stat-card .icon {{
            font-size: 3em;
            margin-bottom: 15px;
        }}

        .stat-card .label {{
            color: #7f8c8d;
            font-size: 0.95em;
            text-transform: uppercase;
            letter-spacing: 1.2px;
            margin-bottom: 12px;
        }}

        .stat-card .value {{
            color: #2c3e50;
            font-size: 2.8em;
            font-weight: bold;
            margin: 15px 0;
        }}

        .comparison-viz {{
            background: white;
            border-radius: 20px;
            padding: 45px;
            margin: 50px 0;
            box-shadow: 0 5px 20px rgba(0,0,0,0.08);
        }}

        .comparison-viz h3 {{
            color: #2c3e50;
            font-size: 1.8em;
            margin-bottom: 30px;
        }}

        .bar-comparison {{
            display: grid;
            gap: 25px;
        }}

        .bar-item {{
            display: grid;
            grid-template-columns: 150px 1fr;
            align-items: center;
            gap: 20px;
        }}

        .bar-label {{
            font-weight: 600;
            color: #2c3e50;
            font-size: 1.1em;
        }}

        .bar-container {{
            background: #e9ecef;
            border-radius: 10px;
            overflow: hidden;
            height: 45px;
            position: relative;
        }}

        .bar-fill {{
            height: 100%;
            display: flex;
            align-items: center;
            justify-content: flex-end;
            padding-right: 15px;
            color: white;
            font-weight: bold;
            transition: width 1.5s ease;
            font-size: 1.1em;
        }}

        .bar-fill.soda {{
            background: linear-gradient(90deg, #3498db, #2ecc71);
        }}

        .bar-fill.sensor {{
            background: linear-gradient(90deg, #9b59b6, #e74c3c);
        }}

        .bar-fill.combined {{
            background: linear-gradient(90deg, #667eea, #764ba2);
        }}

        .recommendations-container {{
            margin: 60px 0;
        }}

        .rec-section {{
            margin-bottom: 40px;
            border-radius: 20px;
            overflow: hidden;
            box-shadow: 0 8px 25px rgba(0,0,0,0.1);
        }}

        .rec-header {{
            padding: 30px;
            color: white;
        }}

        .critical-section .rec-header {{
            background: linear-gradient(135deg, #e74c3c 0%, #c0392b 100%);
        }}

        .error-section .rec-header {{
            background: linear-gradient(135deg, #f39c12 0%, #e67e22 100%);
        }}

        .warning-section .rec-header {{
            background: linear-gradient(135deg, #3498db 0%, #2980b9 100%);
        }}

        .rec-header h3 {{
            font-size: 2em;
            margin-bottom: 10px;
        }}

        .rec-subtitle {{
            font-size: 1.1em;
            opacity: 0.95;
        }}

        .issues-list {{
            background: white;
            padding: 30px;
        }}

        .issue-card {{
            display: flex;
            gap: 20px;
            padding: 25px;
            margin-bottom: 15px;
            background: #f8f9fa;
            border-radius: 15px;
            border-left: 5px solid #e9ecef;
            transition: all 0.3s ease;
        }}

        .issue-card:hover {{
            box-shadow: 0 5px 20px rgba(0,0,0,0.1);
            transform: translateX(5px);
        }}

        .issue-card.critical {{
            border-left-color: #e74c3c;
            background: linear-gradient(to right, #fef5f5, #f8f9fa);
        }}

        .issue-card.error {{
            border-left-color: #f39c12;
            background: linear-gradient(to right, #fffbf0, #f8f9fa);
        }}

        .issue-card.warning {{
            border-left-color: #3498db;
            background: linear-gradient(to right, #f0f8ff, #f8f9fa);
        }}

        .issue-number {{
            background: #667eea;
            color: white;
            width: 40px;
            height: 40px;
            border-radius: 50%;
            display: flex;
            align-items: center;
            justify-content: center;
            font-weight: bold;
            font-size: 1.2em;
            flex-shrink: 0;
        }}

        .issue-content {{
            flex: 1;
        }}

        .issue-title {{
            font-size: 1.2em;
            font-weight: 600;
            color: #2c3e50;
            margin-bottom: 8px;
        }}

        .issue-source {{
            color: #7f8c8d;
            font-size: 0.95em;
            margin-bottom: 12px;
        }}

        .issue-metrics {{
            display: flex;
            gap: 15px;
            flex-wrap: wrap;
        }}

        .metric-badge {{
            padding: 8px 16px;
            border-radius: 20px;
            font-size: 0.9em;
            font-weight: 600;
        }}

        .metric-badge.actual {{
            background: #fff3cd;
            color: #856404;
        }}

        .metric-badge.expected {{
            background: #d4edda;
            color: #155724;
        }}

        .success-message {{
            background: linear-gradient(135deg, #d4edda 0%, #c3e6cb 100%);
            border-left: 6px solid #27ae60;
            padding: 50px;
            border-radius: 20px;
            text-align: center;
            box-shadow: 0 10px 30px rgba(39, 174, 96, 0.2);
            margin: 60px 0;
        }}

        .success-icon {{
            font-size: 5em;
            margin-bottom: 20px;
        }}

        .success-message h3 {{
            color: #155724;
            font-size: 2.5em;
            margin-bottom: 20px;
        }}

        .success-message p {{
            color: #155724;
            font-size: 1.3em;
            line-height: 1.8;
            max-width: 800px;
            margin: 0 auto 30px;
        }}

        .success-stats {{
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
            gap: 25px;
            margin-top: 30px;
        }}

        .success-stat {{
            background: white;
            padding: 25px;
            border-radius: 15px;
            display: flex;
            flex-direction: column;
            align-items: center;
            gap: 10px;
        }}

        .stat-icon {{
            font-size: 2.5em;
        }}

        .stat-label {{
            color: #155724;
            font-weight: 600;
            font-size: 1.1em;
        }}

        .dimension-comparison-grid {{
            display: grid;
            gap: 40px;
            margin: 50px 0;
        }}

        .framework-section {{
            background: white;
            border-radius: 20px;
            padding: 40px;
            box-shadow: 0 5px 20px rgba(0,0,0,0.08);
        }}

        .framework-title {{
            color: #2c3e50;
            font-size: 2em;
            margin-bottom: 30px;
            padding-bottom: 15px;
            border-bottom: 3px solid #667eea;
        }}

        .dimension-card {{
            background: #f8f9fa;
            border-radius: 15px;
            padding: 25px;
            margin-bottom: 20px;
            border-left: 5px solid #e9ecef;
            transition: all 0.3s ease;
        }}

        .dimension-card:hover {{
            box-shadow: 0 5px 20px rgba(0,0,0,0.1);
            transform: translateX(5px);
        }}

        .dimension-card.excellent {{
            border-left-color: #27ae60;
            background: linear-gradient(to right, #f0f9f4, #f8f9fa);
        }}

        .dimension-card.good {{
            border-left-color: #f39c12;
            background: linear-gradient(to right, #fffbf0, #f8f9fa);
        }}

        .dimension-card.poor {{
            border-left-color: #e74c3c;
            background: linear-gradient(to right, #fef5f5, #f8f9fa);
        }}

        .dimension-card.critical {{
            border-left-color: #c0392b;
            background: linear-gradient(to right, #fee, #f8f9fa);
        }}

        .dim-icon {{
            font-size: 2em;
            margin-bottom: 10px;
        }}

        .dim-name {{
            font-size: 1.3em;
            font-weight: 600;
            color: #2c3e50;
            margin-bottom: 15px;
        }}

        .critical-badge {{
            display: inline-block;
            background: #e74c3c;
            color: white;
            padding: 5px 12px;
            border-radius: 15px;
            font-size: 0.8em;
            font-weight: 700;
            margin-bottom: 10px;
        }}

        .dim-score {{
            font-size: 2.5em;
            font-weight: bold;
            color: #667eea;
            margin: 15px 0;
        }}

        .dim-progress {{
            margin-top: 15px;
        }}

        .progress-bar {{
            background: #e9ecef;
            border-radius: 10px;
            height: 12px;
            overflow: hidden;
            margin-bottom: 10px;
        }}

        .progress-fill {{
            height: 100%;
            background: linear-gradient(90deg, #667eea, #764ba2);
            border-radius: 10px;
            transition: width 1s ease;
        }}

        .dim-stats {{
            color: #7f8c8d;
            font-size: 0.95em;
        }}

        .methodology {{
            background: #fff;
            padding: 50px;
            border-radius: 20px;
            margin: 60px 0;
            box-shadow: 0 5px 20px rgba(0,0,0,0.08);
        }}

        .methodology h2 {{
            color: #2c3e50;
            font-size: 2.2em;
            margin-bottom: 30px;
            text-align: center;
        }}

        .methodology p {{
            color: #495057;
            font-size: 1.1em;
            line-height: 1.8;
            margin: 20px 0;
        }}

        .methodology strong {{
            color: #2c3e50;
        }}

        .footer {{
            background: linear-gradient(135deg, #2c3e50 0%, #34495e 100%);
            color: white;
            padding: 50px;
            text-align: center;
            margin-top: 60px;
        }}

        .footer h3 {{
            font-size: 1.8em;
            margin-bottom: 20px;
        }}

        .footer p {{
            margin: 12px 0;
            opacity: 0.9;
            font-size: 1.05em;
        }}

        @media (max-width: 768px) {{
            .header h1 {{ font-size: 2.2em; }}
            .hero-score .score {{ font-size: 5em; }}
            .comparison-grid {{ grid-template-columns: 1fr; }}
            .method-card .score-display {{ font-size: 3.5em; }}
            .bar-item {{ grid-template-columns: 1fr; }}
        }}
        .academic-header {{
            background: linear-gradient(135deg, #1e3c72 0%, #2a5298 100%);
            color: white;
            padding: 25px 50px;
            border-bottom: 5px solid #f39c12;
        }}
        .academic-info {{
            display: grid;
            grid-template-columns: 2fr 1fr;
            gap: 30px;
            align-items: center;
        }}
        .course-info h2 {{
            font-size: 1.4em;
            margin-bottom: 8px;
            font-weight: 700;
            text-transform: uppercase;
            letter-spacing: 1px;
        }}
        .course-info p {{
            margin: 5px 0;
            opacity: 0.95;
            font-size: 0.95em;
        }}
        .student-info {{
            background: rgba(255,255,255,0.1);
            padding: 20px;
            border-radius: 12px;
            backdrop-filter: blur(10px);
        }}
        .student-info h3 {{
            font-size: 1.1em;
            margin-bottom: 12px;
            border-bottom: 2px solid rgba(255,255,255,0.3);
            padding-bottom: 8px;
        }}
        .student-info p {{
            margin: 6px 0;
            font-size: 0.9em;
        }}
        .tools-badge {{
            display: inline-block;
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            color: white;
            padding: 8px 16px;
            border-radius: 20px;
            font-size: 0.85em;
            font-weight: 600;
            margin-top: 10px;
            box-shadow: 0 4px 12px rgba(102, 126, 234, 0.3);
        }}
        .project-description {{
            background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
            padding: 25px 50px;
            border-left: 5px solid #3498db;
        }}
        .project-description h3 {{
            color: #2c3e50;
            font-size: 1.3em;
            margin-bottom: 12px;
            display: flex;
            align-items: center;
            gap: 10px;
        }}
        .project-description p {{
            color: #34495e;
            line-height: 1.7;
            font-size: 0.95em;
        }}
        .footer-enhanced {{
            background: linear-gradient(135deg, #1e3c72 0%, #2a5298 100%);
            color: white;
            padding: 40px 50px;
            border-top: 5px solid #f39c12;
        }}
        .footer-grid {{
            display: grid;
            grid-template-columns: repeat(3, 1fr);
            gap: 30px;
            margin-bottom: 25px;
        }}
        .footer-section h4 {{
            font-size: 1.1em;
            margin-bottom: 12px;
            padding-bottom: 8px;
            border-bottom: 2px solid rgba(255,255,255,0.3);
        }}
        .footer-section p {{
            margin: 6px 0;
            opacity: 0.9;
            font-size: 0.9em;
        }}
        .footer-badge {{
            display: inline-block;
            background: rgba(255,255,255,0.2);
            padding: 5px 12px;
            border-radius: 15px;
            font-size: 0.8em;
            margin: 3px;
        }}
        .footer-bottom {{
            text-align: center;
            padding-top: 20px;
            border-top: 1px solid rgba(255,255,255,0.2);
            opacity: 0.85;
        }}
        @media (max-width: 768px) {{
            .academic-info, .footer-grid {{
                grid-template-columns: 1fr;
            }}
            .academic-header, .project-description, .footer-enhanced {{
                padding: 20px 25px;
            }}
        }}
    </style>
</head>
<body>
    <div class="container">
        <!-- HEADER AKADEMIK -->
        <div class="academic-header">
            <div class="academic-info">
                <div class="course-info">
                    <h2>üìö ANALISIS BISNIS DATA PERUSAHAAN</h2>
                    <p><strong>Judul Tugas:</strong> Aplikasi 8 - Sistem Monitoring Kualitas Data Berbasis AI untuk Program Gratis</p>
                    <p><strong>Fokus:</strong> Otomatisasi deteksi anomali data distribusi makanan bergizi dengan AI untuk rekomendasi perbaikan</p>
                    <div class="tools-badge">üîß Tools: Soda Core (Observability) & DQOps (Testing Sensor-Based)</div>
                </div>
                <div class="student-info">
                    <h3>üë• Tim Mahasiswa</h3>
                    <p><strong>202022510021</strong><br>MADE MARSHALL VIRA DEVA</p>
                    <p><strong>202022420034</strong><br>IRFAN VENNY RAHMAYANTI</p>
                </div>
            </div>
        </div>



        <!-- HEADER REPORT -->
        <div class="header">
            <h1>üîç Laporan Analisis Kualitas Data Komprehensif</h1>
            <div class="subtitle">Perbandingan Framework: Soda Core (Observability) vs DQOps (Sensor Testing)</div>
            <div class="meta">
                Dataset: Data MBG ({len(df):,} baris √ó {len(df.columns)} kolom)<br>
                Pendekatan: Validasi Berlapis dengan AI-Powered Recommendations<br>
                Dibuat: {timestamp_display}
            </div>
        </div>

        <div class="content">
            <!-- Hero Score -->
            <div class="hero-score">
                <h2>üéØ Combined Quality Score</h2>
                <div class="score">{combined_score:.1f}</div>
                <div class="grade">{combined_grade}</div>
            </div>

            <!-- Comparison Section -->
            <div class="comparison-section">
                <h2>‚öñÔ∏è Method Comparison</h2>
                <div class="comparison-grid">
                    <!-- Soda Core Card -->
                    <div class="method-card">
                        <h3>üìä Soda Core</h3>
                        <div class="approach">{soda_data['approach']}</div>
                        <div class="score-display">{soda_data['quality_score']:.1f}</div>
                        <div class="metrics">
                            <div class="metric-row">
                                <span class="metric-label">Total Checks</span>
                                <span class="metric-value">{soda_data['total_checks']}</span>
                            </div>
                            <div class="metric-row">
                                <span class="metric-label">Passed</span>
                                <span class="metric-value success">‚úÖ {soda_data['passed']}</span>
                            </div>
                            <div class="metric-row">
                                <span class="metric-label">Failed</span>
                                <span class="metric-value error">‚ùå {soda_data['failed']}</span>
                            </div>
                            <div class="metric-row">
                                <span class="metric-label">Success Rate</span>
                                <span class="metric-value success">{soda_data['success_rate']:.1f}%</span>
                            </div>
                        </div>
                    </div>

                    <!-- DQOps Sensors Card -->
                    <div class="method-card sensor">
                        <h3>üî¨ DQOps Sensors</h3>
                        <div class="approach">{dqops_data['approach']}</div>
                        <div class="score-display">{dqops_data['quality_score']:.1f}</div>
                        <div class="metrics">
                            <div class="metric-row">
                                <span class="metric-label">Total Tests</span>
                                <span class="metric-value">{dqops_data['total_tests']}</span>
                            </div>
                            <div class="metric-row">
                                <span class="metric-label">Passed</span>
                                <span class="metric-value success">‚úÖ {dqops_data['passed']}</span>
                            </div>
                            <div class="metric-row">
                                <span class="metric-label">Failed</span>
                                <span class="metric-value error">‚ùå {dqops_data['failed']}</span>
                            </div>
                            <div class="metric-row">
                                <span class="metric-label">Critical Issues</span>
                                <span class="metric-value error">üö® {dqops_data['critical_failures']}</span>
                            </div>
                        </div>
                    </div>
                </div>
            </div>

            <!-- Visual Comparison -->
            <div class="comparison-viz">
                <h3>üìä Score Comparison Visualization</h3>
                <div class="bar-comparison">
                    <div class="bar-item">
                        <div class="bar-label">Soda Core</div>
                        <div class="bar-container">
                            <div class="bar-fill soda" style="width: {soda_data['quality_score']:.1f}%;">
                                {soda_data['quality_score']:.1f}%
                            </div>
                        </div>
                    </div>
                    <div class="bar-item">
                        <div class="bar-label">DQOps Sensors</div>
                        <div class="bar-container">
                            <div class="bar-fill sensor" style="width: {dqops_data['quality_score']:.1f}%;">
                                {dqops_data['quality_score']:.1f}%
                            </div>
                        </div>
                    </div>
                    <div class="bar-item">
                        <div class="bar-label">Combined</div>
                        <div class="bar-container">
                            <div class="bar-fill combined" style="width: {combined_score:.1f}%;">
                                {combined_score:.1f}%
                            </div>
                        </div>
                    </div>
                </div>
            </div>

            <!-- Statistics Overview -->
            <div class="stats-overview">
                <h2>üìà Combined Statistics</h2>
                <div class="stats-grid">
                    <div class="stat-card">
                        <div class="icon">üéØ</div>
                        <div class="label">Total Validations</div>
                        <div class="value">{combined_data['total_validations']}</div>
                    </div>
                    <div class="stat-card">
                        <div class="icon">‚úÖ</div>
                        <div class="label">Total Passed</div>
                        <div class="value" style="color: #27ae60;">{combined_data['total_passed']}</div>
                    </div>
                    <div class="stat-card">
                        <div class="icon">‚ùå</div>
                        <div class="label">Total Failed</div>
                        <div class="value" style="color: #e74c3c;">{combined_data['total_failed']}</div>
                    </div>
                    <div class="stat-card">
                        <div class="icon">üìä</div>
                        <div class="label">Success Rate</div>
                        <div class="value" style="color: #3498db;">{combined_data['success_rate']:.1f}%</div>
                    </div>
                </div>
            </div>

            <!-- Recommendations -->
            <h2 style="text-align: center; color: #2c3e50; font-size: 2.5em; margin: 60px 0 40px; border-bottom: 3px solid #667eea; padding-bottom: 20px;">
                üí° Recommendations & Issues
            </h2>
            {recommendations_html}

            <!-- Dimension Breakdown -->
            <h2 style="text-align: center; color: #2c3e50; font-size: 2.5em; margin: 60px 0 40px; border-bottom: 3px solid #667eea; padding-bottom: 20px;">
                üìä Detailed Dimension Analysis
            </h2>
            {dimension_html}

            <!-- Methodology -->
            <div class="methodology">
                <h2>üî¨ Validation Methodology</h2>
                <p><strong>Dual-Layer Approach:</strong> This report combines two complementary data quality frameworks to provide comprehensive validation coverage.</p>

                <p><strong>Soda Core (Observability Layer):</strong> Provides continuous monitoring and high-level quality checks. Focuses on overall data health, completeness, and consistency across the dataset. Best for ongoing data observability and alerting.</p>

                <p><strong>DQOps Sensors (Testing Layer):</strong> Implements sensor-based deep testing with granular validation rules. Detects anomalies, validates formats, and ensures data accuracy at the field level. Best for detailed quality testing and issue identification.</p>

                <p><strong>Combined Scoring:</strong> The overall quality score uses a weighted average (40% Soda Core + 60% DQOps) to emphasize deep testing while maintaining observability insights. This provides a balanced view of data quality across both breadth and depth.</p>

                <p><strong>Quality Dimensions Covered:</strong></p>
                <ul style="margin-left: 40px; margin-top: 15px; line-height: 2;">
                    <li><strong>Completeness:</strong> Missing data and null value detection</li>
                    <li><strong>Validity:</strong> Format correctness and type validation</li>
                    <li><strong>Accuracy:</strong> Value ranges and business rule compliance</li>
                    <li><strong>Uniqueness:</strong> Duplicate detection and key constraints</li>
                    <li><strong>Consistency:</strong> Cross-field validation and referential integrity</li>
                    <li><strong>Statistical Quality:</strong> Distribution analysis and outlier detection</li>
                </ul>
            </div>
        </div>

        <div class="footer-enhanced">
            <div class="footer-grid">
                <div class="footer-section">
                    <h4>üìö Informasi Akademik</h4>
                    <p><strong>Mata Kuliah:</strong><br>Analisis Bisnis Data Perusahaan</p>
                    <p><strong>Aplikasi:</strong> #8 - Monitoring Kualitas Data AI</p>
                    <p><strong>Metode:</strong> Process Improvement</p>
                </div>

                <div class="footer-section">
                    <h4>üë• Tim Pengembang</h4>
                    <p><strong>202022510021</strong><br>Made Marshall Vira Deva</p>
                    <p><strong>202022420034</strong><br>Irfan Venny Rahmayanti</p>
                </div>

                <div class="footer-section">
                    <h4>üîß Framework & Tools</h4>
                    <div class="footer-badge">Soda Core</div>
                    <div class="footer-badge">DQOps</div>
                    <div class="footer-badge">Python</div>
                    <div class="footer-badge">Pandas</div>
                    <div class="footer-badge">AI-Powered</div>
                </div>
            </div>

            <div class="footer-bottom">
                <p><strong>üèÜ Framework Pengujian Sensor DQOps</strong></p>
                <p>Didukung oleh Validasi Berbasis Sensor Mendalam dengan Threshold yang Disesuaikan</p>
                <p>ID Laporan: {timestamp} | 6 Sensor √ó {total_checks} Pengujian</p>
                <p>¬© 2024 Proyek Kualitas Data MBG | Tugas Analisis Bisnis Data Perusahaan</p>
            </div>
        </div>
    </div>
</body>
</html>"""

# The rendered report is written as-is; nothing is merged into it here.
# `full_html` is kept as a separate name in case other cells reference it.
full_html = html_report

# Save HTML report
PROJECT_PATH = '/content/soda_project'
reports_dir = f'{PROJECT_PATH}/reports'
html_path = f'{reports_dir}/integrated_comparison_{timestamp}.html'

# Ensure the target directory exists so the write below does not fail with
# FileNotFoundError when this cell runs on a runtime where the reports
# folder has not been created yet.
os.makedirs(reports_dir, exist_ok=True)

# Explicit UTF-8: the report contains non-ASCII characters (emoji, symbols).
with open(html_path, 'w', encoding='utf-8') as f:
    f.write(full_html)

print("\n‚úÖ HTML Report saved!")
print(f"   üìÅ {os.path.basename(html_path)}")
print(f"   üìä Size: {os.path.getsize(html_path) / 1024:.1f} KB")

# ============================================
# 3.5 FINAL SUMMARY
# ============================================
# Prints a console recap of the integrated report: headline score, generated
# files, presentation talking points, and a severity-based call to action.
print("\n" + "="*80)
print("‚úÖ STEP 5 - PART 3 COMPLETE!")
print("="*80)

print("\nüìä Final Report Summary:")
print(f"   ‚Ä¢ Combined Score: {combined_score:.1f}/100")
print(f"   ‚Ä¢ Grade: {combined_grade}")
print(f"   ‚Ä¢ Total Validations: {combined_data['total_validations']}")
print(f"   ‚Ä¢ Success Rate: {combined_data['success_rate']:.1f}%")
print(f"   ‚Ä¢ Issues Identified: {len(issues_found)}")

print("\nüìÅ All files saved to Google Drive:")
print(f"   {reports_dir}/")

print("\nüìÑ Generated Files:")
print(f"   1. ‚úÖ JSON: integrated_comparison_{timestamp}.json")
print(f"   2. ‚úÖ HTML: integrated_comparison_{timestamp}.html")

print("\nüéØ For Your Presentation:")
print("   ‚úÖ Dual-layer validation approach (Soda Core + DQOps)")
print("   ‚úÖ Comprehensive quality analysis")
print(f"   ‚úÖ Combined quality score: {combined_score:.1f}/100")
print(f"   ‚úÖ Detailed issue tracking ({len(issues_found)} issues)")
print("   ‚úÖ Download HTML report for visualizations")

if issues_found:
    # Count by severity without building throwaway lists.
    critical_count = sum(1 for i in issues_found if i.get('severity') == 'critical')
    error_count = sum(1 for i in issues_found if i.get('severity') == 'error')

    if critical_count > 0:
        print("\nüö® ACTION REQUIRED:")
        print(f"   ‚Ä¢ {critical_count} critical issues need immediate attention")
        print("   ‚Ä¢ Review the HTML report for details")
    elif error_count > 0:
        print("\n‚ö†Ô∏è  IMPROVEMENTS NEEDED:")
        print(f"   ‚Ä¢ {error_count} high-priority issues found")
        print("   ‚Ä¢ Consider addressing these to improve quality")
    else:
        # Issues exist but none are tagged 'critical' or 'error'. Previously
        # this case printed nothing at all, which read as if the summary had
        # been cut off; report it explicitly instead.
        print("\nNOTE: LOWER-SEVERITY ISSUES ONLY:")
        print(f"   ‚Ä¢ {len(issues_found)} lower-severity issues found")
        print("   ‚Ä¢ Review the HTML report for details")
else:
    print("\n‚úÖ EXCELLENT: All validations passed!")

print("\n" + "="*80)
print("üéâ ALL STEPS COMPLETE!")
print("="*80)
print("\nYou now have 3 comprehensive reports:")
print("   1. üìä Soda Core Report (Observability)")
print("   2. üî¨ DQOps Sensor Report (Deep Testing)")
print("   3. üîç Integrated Comparison Report (Combined Analysis)")
print("\nAll files are saved in Google Drive and ready for your presentation!")
print("="*80)


üìä STEP 5 - PART 1: LOADING & VERIFYING RESULTS

üìÅ Reports directory: /content/soda_project/reports
   Exists: True

üìã Available files in reports directory:
--------------------------------------------------------------------------------
   ‚Ä¢ dqops_metrics_adjusted_20251130_174155.csv                   (     6.7 KB)
   ‚Ä¢ dqops_sensor_report_adjusted_20251130_174155.html            (    52.6 KB)
   ‚Ä¢ dqops_sensor_results_adjusted_20251130_174155.csv            (     4.2 KB)
   ‚Ä¢ dqops_sensor_testing_adjusted_20251130_174155.json           (    17.6 KB)
   ‚Ä¢ dqops_summary_adjusted_20251130_174155.txt                   (     3.4 KB)
   ‚Ä¢ quality_report_adjusted_20251130_174153.json                 (     7.2 KB)
   ‚Ä¢ soda_core_report_adjusted_20251130_174153.html               (    37.8 KB)
   ‚Ä¢ soda_core_results_adjusted_20251130_174153.csv               (     2.4 KB)

üîç Identifying report files...
----------------------------------------------------------------