In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Data Overview and Quality Assessment\n",
    "\n",
    "**Project:** Automated Regulatory Reporting  \n",
    "**Author:** Nathan, Senior Data Scientist  \n",
    "**Agency:** Regulatory Agency  \n",
    "**Date:** August 2025  \n",
    "\n",
    "## Purpose\n",
    "This notebook provides an initial overview of the loan-level data, including:\n",
    "- Data loading and initial exploration\n",
    "- Data quality assessment\n",
    "- Basic summary statistics\n",
    "- Preliminary visualizations\n",
    "\n",
    "---"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Standard-library and third-party imports\n",
    "import sys\n",
    "from datetime import datetime\n",
    "from pathlib import Path\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import plotly.express as px\n",
    "import plotly.graph_objects as go\n",
    "\n",
    "# Locate the project root so the local `modules` and `config` packages import cleanly.\n",
    "# Walk up from the working directory to the nearest ancestor that contains both packages.\n",
    "# A substring test on the full path would misfire whenever an unrelated parent folder\n",
    "# happens to contain the text 'notebooks'.\n",
    "_cwd = Path.cwd().resolve()\n",
    "project_root = next(\n",
    "    (p for p in (_cwd, *_cwd.parents) if (p / 'modules').is_dir() and (p / 'config').is_dir()),\n",
    "    # Fallback when no such ancestor exists: step out of a directory named exactly 'notebooks'\n",
    "    _cwd.parent if _cwd.name == 'notebooks' else _cwd,\n",
    ")\n",
    "\n",
    "# Guard the append so re-running this cell does not pile up duplicate sys.path entries\n",
    "if str(project_root) not in sys.path:\n",
    "    sys.path.append(str(project_root))\n",
    "\n",
    "# Project-local helper modules\n",
    "from modules.report_utils import save_figure, save_table, display_summary_stats\n",
    "from modules.cleaning import comprehensive_clean, clean_loan_data\n",
    "from modules.validation import generate_quality_report, validate_loan_data\n",
    "from modules.visuals import plot_time_series, plot_distribution_comparison, generate_sample_data\n",
    "\n",
    "# NOTE(review): wildcard import. This notebook appears to rely on it for PROJECT_ROOT,\n",
    "# ENTERPRISE_COLORS, RAW_DATA_PATH, DRAFT_DIR and get_timestamped_path; consider importing\n",
    "# those names explicitly once the full set used by the notebook is confirmed.\n",
    "from config.config import *\n",
    "\n",
    "print(\"✅ All modules imported successfully\")\n",
    "print(f\"📁 Project root: {PROJECT_ROOT}\")\n",
    "print(f\"📊 Enterprise colors: {list(ENTERPRISE_COLORS.keys())}\")"
   ]
  },
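  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Data Loading\n",
    "\n",
    "Load raw loan-level data from the raw data directory (`RAW_DATA_PATH`). Set `DATA_FILE` in the cell below to the name of your data file; if that file is not found, sample data is generated for demonstration instead."
   ]
  },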
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Data-loading settings\n",
    "DATA_FILE = \"loan_sample_data.csv\"  # File name looked up under RAW_DATA_PATH\n",
    "PRINT_MODE = \"YES\"  # \"YES\" enables file saving; set to \"NO\" to disable it\n",
    "\n",
    "# Full path of the expected input file\n",
    "data_path = RAW_DATA_PATH / DATA_FILE\n",
    "\n",
    "# Fall back to generated demonstration data when the file is absent; otherwise read the CSV\n",
    "if not data_path.exists():\n",
    "    print(\"⚠️  Data file not found. Generating sample data for demonstration...\")\n",
    "    raw_df = generate_sample_data('loan', n_records=5000)\n",
    "    print(f\"✅ Sample data generated: {len(raw_df):,} records, {len(raw_df.columns)} columns\")\n",
    "else:\n",
    "    print(f\"📥 Loading data from: {data_path}\")\n",
    "    raw_df = pd.read_csv(data_path)\n",
    "    print(f\"✅ Data loaded: {len(raw_df):,} records, {len(raw_df.columns)} columns\")\n",
    "\n",
    "# Column dtypes, then the total in-memory footprint converted from bytes to MB\n",
    "print(f\"\\n📋 Data Types:\")\n",
    "print(raw_df.dtypes)\n",
    "\n",
    "print(f\"\\n📊 Memory Usage: {raw_df.memory_usage(deep=True).sum() / 1024 ** 2:.1f} MB\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Initial Data Exploration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Preview the first records\n",
    "print(\"🔍 First 5 rows:\")\n",
    "display(raw_df.head())\n",
    "\n",
    "# Null count per column and its share of all rows, largest count first\n",
    "print(\"\\n📊 Missing Values Summary:\")\n",
    "missing_summary = raw_df.isna().sum()\n",
    "missing_pct = (missing_summary / len(raw_df) * 100).round(2)\n",
    "missing_df = pd.DataFrame(\n",
    "    {'Missing_Count': missing_summary, 'Missing_Percentage': missing_pct}\n",
    ").sort_values('Missing_Count', ascending=False)\n",
    "\n",
    "# Show only the columns that actually contain nulls\n",
    "display(missing_df.loc[missing_df['Missing_Count'].gt(0)])\n",
    "\n",
    "# Descriptive statistics for numeric columns, when there are any\n",
    "numeric_cols = raw_df.select_dtypes(include=[np.number]).columns\n",
    "if not numeric_cols.empty:\n",
    "    print(f\"\\n📈 Numeric Columns Summary:\")\n",
    "    display(raw_df[numeric_cols].describe())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Data Quality Assessment\n",
    "\n",
    "Perform comprehensive data quality analysis using our validation module."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Generate comprehensive quality report\n",
    "print(\"🔍 Generating data quality report...\")\n",
    "quality_report = generate_quality_report(raw_df, include_profiling=True)\n",
    "\n",
    "# Display key quality metrics\n",
    "print(f\"\\n📊 Data Quality Summary:\")\n",
    "print(f\"   Overall Completeness: {quality_report['completeness']['overall_completeness']:.1f}%\")\n",
    "print(f\"   Duplicate Records: {quality_report['consistency']['duplicate_records']:,}\")\n",
    "print(f\"   Critical Missing Columns: {len(quality_report['completeness']['critical_missing'])}\")\n",
    "\n",
    "# Show recommendations if any\n",
    "if quality_report['recommendations']:\n",
    "    print(f\"\\n⚠️  Data Quality Recommendations:\")\n",
    "    for i, rec in enumerate(quality_report['recommendations'][:5]):  # Show top 5\n",
    "        print(f\"   {i+1}. [{rec['priority']}] {rec['issue']}\")\n",
    "        print(f\"      → {rec['recommendation']}\")\n",
    "\n",
    "# Save quality report\n",
    "if PRINT_MODE == \"YES\":\n",
    "    report_path = get_timestamped_path(DRAFT_DIR) / 'data_quality_report.xlsx'\n",
    "    # Note: Full export functionality woul