In [None]:
# @title Markdown Knowledge Base Builder v4.7 (Diagnostic & Hardened)
# @markdown This version adds enhanced diagnostic logging and more specific error handling
# @markdown to resolve potential conflicts and provide clearer feedback on failures.

# @markdown ### 1. Setup
# @markdown - Place your .md files into the `DOCUMENT_DIRECTORY` specified below.
# @markdown - Ensure your `GOOGLE_API_KEY` is correctly saved in Colab Secrets.

# =============================================================================
# 1. INSTALLATION
# =============================================================================
!pip install tqdm pandas google-generativeai -q

import os
import json
import textwrap
import time
import hashlib
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor, as_completed
import sqlite3
import shutil
import threading
import sys

import google.generativeai as genai
from google.colab import userdata, drive, auth
from tqdm.notebook import tqdm
import pandas as pd
from google.api_core import exceptions as google_exceptions
from urllib3.exceptions import ProtocolError


print("--- Markdown Knowledge Base Builder v4.7 ---")

# Mount Google Drive at the beginning
try:
    if not os.path.exists('/content/drive'):
        drive.mount('/content/drive', force_remount=True)
        print("‚úÖ Google Drive mounted successfully.")
    else:
        print("‚úÖ Google Drive is already mounted.")
except Exception as e:
    print(f"\n‚ùå CRITICAL ERROR: Google Drive mount failed: {e}")
    print("Please ensure you have given this notebook permission to access your Google Drive.")


# =============================================================================
# 2. CONFIGURATION - TUNED FOR RELIABILITY
# =============================================================================
DOCUMENT_DIRECTORY = '/content/drive/MyDrive/Documents/markdown'
OUTPUT_DIRECTORY = '/content/drive/MyDrive/Documents/analyzed_markdown'
QUARANTINE_DIRECTORY = os.path.join(OUTPUT_DIRECTORY, '_quarantine')
DB_FILE = os.path.join(OUTPUT_DIRECTORY, 'knowledge_base_md.sqlite')

MAX_GEMINI_RETRIES = 8
GEMINI_RETRY_DELAY = 10
MAX_WORKERS = 3
SUPPORTED_EXTENSIONS = ['.md']
MAX_SPLIT_SIZE_MB = 9

# =============================================================================
# 3. DATABASE FUNCTIONS (No changes needed)
# =============================================================================
def init_database():
    with sqlite3.connect(DB_FILE) as conn:
        cursor = conn.cursor()
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS documents (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                filename TEXT NOT NULL,
                file_hash TEXT NOT NULL,
                analysis_json TEXT NOT NULL,
                processed_at TEXT NOT NULL,
                UNIQUE(filename, file_hash)
            )
        ''')
        conn.commit()

def add_to_database(filename, file_hash, analysis_result):
    with sqlite3.connect(DB_FILE) as conn:
        cursor = conn.cursor()
        cursor.execute(
            "INSERT OR REPLACE INTO documents (filename, file_hash, analysis_json, processed_at) VALUES (?, ?, ?, ?)",
            (filename, file_hash, json.dumps(analysis_result), analysis_result['processedAt'])
        )
        conn.commit()

def is_file_in_database(filename, file_hash):
    with sqlite3.connect(DB_FILE) as conn:
        cursor = conn.cursor()
        cursor.execute("SELECT 1 FROM documents WHERE filename = ? AND file_hash = ?", (filename, file_hash))
        return cursor.fetchone() is not None

def get_all_from_database():
    with sqlite3.connect(DB_FILE) as conn:
        cursor = conn.cursor()
        cursor.execute("SELECT analysis_json FROM documents")
        return [json.loads(row[0]) for row in cursor.fetchall()]

def get_last_n_filenames(n=5):
    with sqlite3.connect(DB_FILE) as conn:
        cursor = conn.cursor()
        cursor.execute("SELECT filename FROM documents ORDER BY id DESC LIMIT ?", (n,))
        return [row[0] for row in cursor.fetchall()]

# =============================================================================
# 4. CORE HELPER FUNCTIONS
# =============================================================================
def get_file_hash(filepath):
    hasher = hashlib.md5()
    try:
        with open(filepath, 'rb') as f:
            while chunk := f.read(8192): hasher.update(chunk)
        return hasher.hexdigest()
    except Exception: return None

def get_file_content_as_text(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read()
    except Exception:
        return None

def find_supported_files_recursive(directory):
    print(f"\n--- Scanning for {', '.join(SUPPORTED_EXTENSIONS)} files recursively... ---")
    found_files = []
    for root, _, files in os.walk(directory):
        sanitized_root = root.encode(sys.stdout.encoding, errors='replace').decode(sys.stdout.encoding)
        print(f"  -> Scanning directory: {sanitized_root}", end='\r')
        for file in files:
            if os.path.splitext(file)[1].lower() in SUPPORTED_EXTENSIONS:
                found_files.append(os.path.join(root, file))
    sys.stdout.write("\n\033[K")
    print(f"Scan complete. Found {len(found_files)} total supported files.")
    return sorted(found_files)

def analyze_with_gemini_with_retries(prompt, filename, model):
    current_delay = GEMINI_RETRY_DELAY
    for attempt in range(MAX_GEMINI_RETRIES):
        try:
            response = model.generate_content(prompt, request_options={'timeout': 600})
            cleaned_response = response.text.strip()
            if cleaned_response.startswith('```json'):
                cleaned_response = cleaned_response[7:-3].strip()
            if not cleaned_response:
                return None
            return json.loads(cleaned_response)
        except (google_exceptions.ServiceUnavailable, google_exceptions.ResourceExhausted, ProtocolError) as e:
            if attempt < MAX_GEMINI_RETRIES - 1:
                print(f"  -> [WARN] Retriable API error for '{filename}' (Attempt {attempt + 1}/{MAX_GEMINI_RETRIES}). Retrying in {current_delay}s. Error: {type(e).__name__}")
                time.sleep(current_delay)
                current_delay *= 1.5
            else:
                print(f"  -> [ERROR] API call for '{filename}' failed after {MAX_GEMINI_RETRIES} attempts due to persistent API issues.")
                return None
        except Exception as e:
            print(f"  -> [ERROR] A non-retriable error occurred for '{filename}' during API call or JSON parsing: {e}")
            return None

# =============================================================================
# 5. MAIN PROCESSING LOGIC
# =============================================================================
def create_archival_prompt(doc_text_snippet, filename, knowledge_summary):
    prompt = textwrap.dedent(f"""
        You are an expert archivist AI. Analyze the document text provided below and create a structured record for a knowledge base.
        Provide ONLY a valid JSON response. If information for a field is not available in the document, state "Information not available in document".

        DOCUMENT FILENAME: {filename}
        CONTEXT OF PREVIOUSLY PROCESSED FILES: {knowledge_summary or "First document in series."}

        DOCUMENT TEXT SNIPPET:
        ---
        {doc_text_snippet}
        ---

        Required JSON Output (Based on standard archival principles):
        {{
          "fileName": "{filename}",
          "fileType": "{os.path.splitext(filename)[1]}",
          "provenance": "Who created this file? For what purpose?",
          "originalOrder": "How is the document structured or organized?",
          "primaryValue": "What immediate administrative, fiscal, or legal purpose does it serve?",
          "secondaryValue": "What evidential or informational value does it hold for future research?",
          "significance": "Does it document key decisions, events, or perspectives?",
          "uniqueness": "Is the information available elsewhere or is it unique to this document?",
          "usability": "Is the file well-organized, readable, and accessible?",
          "context": "How does it connect to the other files mentioned in the CONTEXT section?",
          "intrinsicValue": "Does the file‚Äôs original form add value beyond its text content (e.e., unique layout, annotations)?",
          "summary": "Provide a concise one-paragraph summary of the document's core content."
        }}
    """)
    return prompt

def process_document(file_path, model):
    filename = os.path.basename(file_path)
    print(f"  -> [Thread] Starting analysis for: {filename}...")
    try:
        document_text = get_file_content_as_text(file_path)
        if not document_text: return None, "Failed to extract text"

        max_chars = 15000
        snippet = document_text[:max_chars] + ("\n[...]" if len(document_text) > max_chars else "")

        last_files = get_last_n_filenames(5)
        knowledge_summary = "; ".join(last_files)

        prompt = create_archival_prompt(snippet, filename, knowledge_summary)
        analysis_result = analyze_with_gemini_with_retries(prompt, filename, model)

        if not analysis_result:
            return None, "Gemini analysis failed or returned empty"

        # NEW: Diagnostic print
        print(f"  -> [Thread] Successfully received Gemini analysis for: {filename}")

        analysis_result['rawText'] = document_text
        analysis_result['filePath'] = file_path
        analysis_result['processedAt'] = datetime.now().isoformat()
        return analysis_result, "Success"
    except Exception as exc:
        # NEW: This will catch any unexpected errors during the processing of a single file
        print(f"  -> [CRITICAL] An unexpected error occurred while processing {filename}: {exc}")
        return None, f"Unexpected error: {exc}"


def heartbeat(stop_event, pbar):
    while not stop_event.is_set():
        time.sleep(60)
        if stop_event.is_set():
            break
        progress = f"{pbar.n}/{pbar.total} files completed." if pbar.total > 0 else "waiting for files to process."
        print(f"\n‚ù§Ô∏è  [Heartbeat at {datetime.now().strftime('%H:%M:%S')}] Analysis is in progress... {progress}\n")


def generate_html_report(report_data, version):
    df = pd.DataFrame(report_data).reindex(columns=['File', 'Status', 'Details'])
    html = df.to_html(index=False, justify='left', border=0, classes='table table-striped')
    html_template = f"""
    <html><head><title>Processing Report</title><style>
        body {{ font-family: sans-serif; margin: 2em; background-color: #f9f9f9; color: #333; }}
        h1 {{ color: #1a1a1a; }} table {{ width: 100%; border-collapse: collapse; box-shadow: 0 2px 3px rgba(0,0,0,0.1); }}
        th, td {{ padding: 12px 15px; text-align: left; border-bottom: 1px solid #ddd; }}
        th {{ background-color: #4CAF50; color: white; }} tr:nth-child(even) {{ background-color: #f2f2f2; }}
    </style></head><body>
        <h1>Knowledge Base Processing Report (Version: {version})</h1>
        <p><strong>Generated on:</strong> {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>{html}
    </body></html>"""
    report_path = os.path.join(OUTPUT_DIRECTORY, f'_report_v{version}.html')
    with open(report_path, 'w') as f: f.write(html_template)
    print(f"üìÑ HTML report saved to: {report_path}")

def main():
    print("\nüöÄ Starting Archival Knowledge Base Generation")
    try:
        print("--- Verifying Google API Key ---")
        API_KEY = userdata.get('GOOGLE_API_KEY')
        if not API_KEY:
            print("\n‚ùå CRITICAL ERROR: 'GOOGLE_API_KEY' not found in Colab Secrets.")
            return
        print("‚úÖ GOOGLE_API_KEY loaded successfully.")
        genai.configure(api_key=API_KEY)


        for dir_path in [OUTPUT_DIRECTORY, QUARANTINE_DIRECTORY]:
            if not os.path.exists(dir_path): os.makedirs(dir_path)

        auth.authenticate_user()
        model = genai.GenerativeModel('gemini-2.5-flash-preview-05-20')
    except Exception as e:
        print(f"\n‚ùå Critical setup error: {e}"); return

    init_database()
    all_files = find_supported_files_recursive(DOCUMENT_DIRECTORY)
    session_report, files_to_process = [], []

    print("\n--- Checking files against database cache... ---")
    for fp in tqdm(all_files, desc="Verifying Files"):
        filename, file_hash = os.path.basename(fp), get_file_hash(fp)
        if not file_hash or is_file_in_database(filename, file_hash):
            session_report.append({"File": filename, "Status": "Skipped", "Details": "Already processed"})
            continue
        files_to_process.append(fp)

    print(f"\nFound {len(files_to_process)} new or modified files to process.")

    version = datetime.now().strftime('%Y%m%d%H%M%S')
    print(f"\n--- Starting processing for version: {version} ---")

    if files_to_process:
        stop_heartbeat = threading.Event()
        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            pbar = tqdm(total=len(files_to_process), desc="Analyzing Documents")
            heartbeat_thread = threading.Thread(target=heartbeat, args=(stop_heartbeat, pbar))
            heartbeat_thread.start()

            future_to_file = {executor.submit(process_document, filepath, model): filepath for filepath in files_to_process}

            try:
                for future in as_completed(future_to_file):
                    filepath = future_to_file[future]
                    filename = os.path.basename(filepath)
                    try:
                        result, status = future.result()
                        if result:
                            file_hash = get_file_hash(filepath)
                            add_to_database(filename, file_hash, result)
                            session_report.append({"File": filename, "Status": "Success", "Details": "Added to database"})
                        else: raise Exception(status)
                    except Exception as exc:
                        session_report.append({"File": filename, "Status": "Failed", "Details": str(exc)})
                        shutil.move(filepath, os.path.join(QUARANTINE_DIRECTORY, filename))
                    pbar.update(1)
            finally:
                pbar.close()
                stop_heartbeat.set()
                heartbeat_thread.join()

    print("\n--- Finalizing Knowledge Base ---")
    knowledge_base = get_all_from_database()
    final_save_path = os.path.join(OUTPUT_DIRECTORY, f'md_knowledge_base_v{version}.json')
    with open(final_save_path, 'w') as f: json.dump(knowledge_base, f, indent=2)

    print(f"\nüéâ KNOWLEDGE BASE COMPLETE! üéâ")
    print(f"üß† Total documents in knowledge base: {len(knowledge_base)}")
    print(f"‚úÖ Final knowledge base saved to: {final_save_path}")

    if os.path.exists(final_save_path) and os.path.getsize(final_save_path) > MAX_SPLIT_SIZE_MB * 1024 * 1024:
        print(f"\n--- Splitting Knowledge Base (Max size: {MAX_SPLIT_SIZE_MB}MB) ---")
        with open(final_save_path, 'r') as f: full_data = json.load(f)
        part_num, current_chunk, current_size = 1, [], 0
        for item in full_data:
            item_str = json.dumps(item)
            item_size = len(item_str.encode('utf-8'))
            if current_size + item_size > MAX_SPLIT_SIZE_MB * 1024 * 1024 and current_chunk:
                part_path = os.path.join(OUTPUT_DIRECTORY, f'md_knowledge_base_v{version}_part_{part_num}.json')
                with open(part_path, 'w') as f: json.dump(current_chunk, f, indent=2)
                print(f"‚úÖ Created part {part_num}: {part_path}")
                part_num += 1
                current_chunk, current_size = [item], item_size
            else:
                current_chunk.append(item)
                current_size += item_size
        if current_chunk:
            part_path = os.path.join(OUTPUT_DIRECTORY, f'md_knowledge_base_v{version}_part_{part_num}.json')
            with open(part_path, 'w') as f: json.dump(current_chunk, f, indent=2)
            print(f"‚úÖ Created part {part_num}: {part_path}")
        print(f"\nSplitting complete. Created {part_num} parts.")

    generate_html_report(session_report, version)

if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        print("\nProcess interrupted by user. Progress has been saved.")
    finally:
        print("\n--- Script Finished ---")

In [1]:
# @title Markdown Knowledge Base Builder v4.8 (Free Tier Optimized)
# @markdown Optimized for Gemini free tier: sequential processing, proper rate limiting

# =============================================================================
# 1. INSTALLATION
# =============================================================================
!pip install tqdm pandas google-generativeai -q

import os
import json
import textwrap
import time
import hashlib
from datetime import datetime
import sqlite3
import shutil
import sys

import google.generativeai as genai
from google.colab import userdata, drive, auth
from tqdm.notebook import tqdm
import pandas as pd
from google.api_core import exceptions as google_exceptions

print("--- Markdown Knowledge Base Builder v4.8 (Free Tier) ---")

# Mount Google Drive
try:
    if not os.path.exists('/content/drive'):
        drive.mount('/content/drive', force_remount=True)
        print("‚úÖ Google Drive mounted successfully.")
    else:
        print("‚úÖ Google Drive is already mounted.")
except Exception as e:
    print(f"\n‚ùå CRITICAL ERROR: Google Drive mount failed: {e}")
    sys.exit(1)

# =============================================================================
# 2. CONFIGURATION - FREE TIER OPTIMIZED
# =============================================================================
DOCUMENT_DIRECTORY = '/content/drive/MyDrive/Documents/markdown'
OUTPUT_DIRECTORY = '/content/drive/MyDrive/Documents/analyzed_markdown'
QUARANTINE_DIRECTORY = os.path.join(OUTPUT_DIRECTORY, '_quarantine')
DB_FILE = os.path.join(OUTPUT_DIRECTORY, 'knowledge_base_md.sqlite')

# FREE TIER SETTINGS (critical for stability)
MAX_GEMINI_RETRIES = 5
GEMINI_RETRY_DELAY = 20  # Start with 20 seconds
REQUEST_DELAY = 4  # 4 seconds between requests
SUPPORTED_EXTENSIONS = ['.md']
MAX_SPLIT_SIZE_MB = 9

# =============================================================================
# 3. DATABASE FUNCTIONS
# =============================================================================
def init_database():
    with sqlite3.connect(DB_FILE) as conn:
        cursor = conn.cursor()
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS documents (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                filename TEXT NOT NULL,
                file_hash TEXT NOT NULL,
                analysis_json TEXT NOT NULL,
                processed_at TEXT NOT NULL,
                UNIQUE(filename, file_hash)
            )
        ''')
        conn.commit()

def add_to_database(filename, file_hash, analysis_result):
    with sqlite3.connect(DB_FILE) as conn:
        cursor = conn.cursor()
        cursor.execute(
            "INSERT OR REPLACE INTO documents (filename, file_hash, analysis_json, processed_at) VALUES (?, ?, ?, ?)",
            (filename, file_hash, json.dumps(analysis_result), analysis_result['processedAt'])
        )
        conn.commit()

def is_file_in_database(filename, file_hash):
    with sqlite3.connect(DB_FILE) as conn:
        cursor = conn.cursor()
        cursor.execute("SELECT 1 FROM documents WHERE filename = ? AND file_hash = ?", (filename, file_hash))
        return cursor.fetchone() is not None

def get_all_from_database():
    with sqlite3.connect(DB_FILE) as conn:
        cursor = conn.cursor()
        cursor.execute("SELECT analysis_json FROM documents")
        return [json.loads(row[0]) for row in cursor.fetchall()]

def get_last_n_filenames(n=5):
    with sqlite3.connect(DB_FILE) as conn:
        cursor = conn.cursor()
        cursor.execute("SELECT filename FROM documents ORDER BY id DESC LIMIT ?", (n,))
        return [row[0] for row in cursor.fetchall()]

# =============================================================================
# 4. HELPER FUNCTIONS
# =============================================================================
def get_file_hash(filepath):
    hasher = hashlib.md5()
    try:
        with open(filepath, 'rb') as f:
            while chunk := f.read(8192):
                hasher.update(chunk)
        return hasher.hexdigest()
    except Exception:
        return None

def get_file_content_as_text(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read()
    except Exception:
        return None

def find_supported_files_recursive(directory):
    print(f"\n--- Scanning for {', '.join(SUPPORTED_EXTENSIONS)} files ---")
    found_files = []
    for root, _, files in os.walk(directory):
        for file in files:
            if os.path.splitext(file)[1].lower() in SUPPORTED_EXTENSIONS:
                found_files.append(os.path.join(root, file))
    print(f"Found {len(found_files)} total supported files.")
    return sorted(found_files)

def analyze_with_gemini_with_retries(prompt, filename, model):
    """Free tier optimized with exponential backoff"""
    current_delay = GEMINI_RETRY_DELAY

    for attempt in range(MAX_GEMINI_RETRIES):
        try:
            response = model.generate_content(
                prompt,
                request_options={'timeout': 600}
            )
            cleaned_response = response.text.strip()

            # Remove code block markers
            if cleaned_response.startswith('```json'):
                cleaned_response = cleaned_response[7:-3].strip()
            elif cleaned_response.startswith('```'):
                cleaned_response = cleaned_response[3:-3].strip()

            if not cleaned_response:
                print(f"  ‚ö†Ô∏è  Empty response for '{filename}'")
                return None

            return json.loads(cleaned_response)

        except google_exceptions.ResourceExhausted:
            if attempt < MAX_GEMINI_RETRIES - 1:
                print(f"  ‚ö†Ô∏è  Rate limit hit for '{filename}' (Attempt {attempt + 1}/{MAX_GEMINI_RETRIES})")
                print(f"  ‚è≥ Waiting {current_delay}s...")
                time.sleep(current_delay)
                current_delay *= 2  # Exponential backoff
            else:
                print(f"  ‚ùå Rate limit exceeded after {MAX_GEMINI_RETRIES} attempts: {filename}")
                return None

        except google_exceptions.ServiceUnavailable:
            if attempt < MAX_GEMINI_RETRIES - 1:
                print(f"  ‚ö†Ô∏è  Service unavailable for '{filename}' (Attempt {attempt + 1}/{MAX_GEMINI_RETRIES})")
                print(f"  ‚è≥ Waiting {current_delay}s...")
                time.sleep(current_delay)
                current_delay *= 1.5
            else:
                print(f"  ‚ùå Service unavailable after {MAX_GEMINI_RETRIES} attempts: {filename}")
                return None

        except json.JSONDecodeError as e:
            print(f"  ‚ùå Invalid JSON for '{filename}': {str(e)[:100]}")
            return None

        except Exception as e:
            print(f"  ‚ùå Unexpected error for '{filename}': {type(e).__name__}")
            if attempt < MAX_GEMINI_RETRIES - 1:
                time.sleep(current_delay)
                current_delay *= 1.5
            else:
                return None

    return None

# =============================================================================
# 5. PROCESSING LOGIC
# =============================================================================
def create_archival_prompt(doc_text_snippet, filename, knowledge_summary):
    prompt = textwrap.dedent(f"""
        You are an expert archivist AI. Analyze this document and create a structured JSON record.
        Provide ONLY valid JSON. If information is unavailable, state "Information not available in document".

        DOCUMENT: {filename}
        CONTEXT: {knowledge_summary or "First document"}

        TEXT:
        ---
        {doc_text_snippet}
        ---

        Required JSON:
        {{
          "fileName": "{filename}",
          "fileType": "{os.path.splitext(filename)[1]}",
          "provenance": "Who created this? For what purpose?",
          "originalOrder": "How is it structured?",
          "primaryValue": "What purpose does it serve?",
          "secondaryValue": "What research value does it hold?",
          "significance": "Does it document key decisions or events?",
          "uniqueness": "Is this information available elsewhere?",
          "usability": "Is it well-organized and accessible?",
          "context": "How does it relate to other files?",
          "intrinsicValue": "Does the original form add value?",
          "summary": "One-paragraph summary of core content."
        }}
    """)
    return prompt

def process_document(file_path, model):
    """Sequential processing with rate limiting"""
    filename = os.path.basename(file_path)

    try:
        document_text = get_file_content_as_text(file_path)
        if not document_text:
            return None, "Failed to extract text"

        # Limit snippet size
        max_chars = 12000  # Smaller to reduce token usage
        snippet = document_text[:max_chars]
        if len(document_text) > max_chars:
            snippet += "\n[... content truncated ...]"

        last_files = get_last_n_filenames(3)  # Reduced from 5
        knowledge_summary = "; ".join(last_files) if last_files else "First document"

        prompt = create_archival_prompt(snippet, filename, knowledge_summary)
        analysis_result = analyze_with_gemini_with_retries(prompt, filename, model)

        if not analysis_result:
            return None, "Gemini analysis failed"

        analysis_result['rawText'] = document_text
        analysis_result['filePath'] = file_path
        analysis_result['processedAt'] = datetime.now().isoformat()
        return analysis_result, "Success"

    except Exception as exc:
        print(f"  ‚ùå Critical error processing {filename}: {exc}")
        return None, f"Error: {exc}"

def generate_html_report(report_data, version):
    df = pd.DataFrame(report_data).reindex(columns=['File', 'Status', 'Details'])
    html = df.to_html(index=False, justify='left', border=0, classes='table table-striped')
    html_template = f"""
    <html><head><title>Processing Report</title><style>
        body {{ font-family: sans-serif; margin: 2em; background-color: #f9f9f9; }}
        h1 {{ color: #1a1a1a; }}
        table {{ width: 100%; border-collapse: collapse; box-shadow: 0 2px 3px rgba(0,0,0,0.1); }}
        th, td {{ padding: 12px; text-align: left; border-bottom: 1px solid #ddd; }}
        th {{ background-color: #4CAF50; color: white; }}
    </style></head><body>
        <h1>Knowledge Base Processing Report v{version}</h1>
        <p><strong>Generated:</strong> {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>
        {html}
    </body></html>"""
    report_path = os.path.join(OUTPUT_DIRECTORY, f'_report_v{version}.html')
    with open(report_path, 'w') as f:
        f.write(html_template)
    print(f"üìÑ Report saved: {report_path}")

def main():
    print("\nüöÄ Starting Knowledge Base Generation (Free Tier Mode)")

    try:
        print("--- Verifying API Key ---")
        API_KEY = userdata.get('GOOGLE_API_KEY')
        if not API_KEY:
            print("\n‚ùå CRITICAL: 'GOOGLE_API_KEY' not found in Colab Secrets.")
            print("Go to: üîë icon (left sidebar) > Add Secret > Name: GOOGLE_API_KEY")
            return
        print("‚úÖ API Key loaded")
        genai.configure(api_key=API_KEY)

        for dir_path in [OUTPUT_DIRECTORY, QUARANTINE_DIRECTORY]:
            if not os.path.exists(dir_path):
                os.makedirs(dir_path)

        auth.authenticate_user()
        # Use stable model - better for free tier
        model = genai.GenerativeModel('gemini-2.5-flash')

    except Exception as e:
        print(f"\n‚ùå Setup error: {e}")
        return

    init_database()
    all_files = find_supported_files_recursive(DOCUMENT_DIRECTORY)

    if not all_files:
        print(f"\n‚ö†Ô∏è  No .md files found in: {DOCUMENT_DIRECTORY}")
        return

    session_report, files_to_process = [], []

    print("\n--- Checking cache ---")
    for fp in tqdm(all_files, desc="Verifying Files"):
        filename, file_hash = os.path.basename(fp), get_file_hash(fp)
        if not file_hash or is_file_in_database(filename, file_hash):
            session_report.append({"File": filename, "Status": "Skipped", "Details": "Already processed"})
            continue
        files_to_process.append(fp)

    print(f"\nüìù {len(files_to_process)} new files to process")

    if not files_to_process:
        print("‚úÖ All files already processed!")
        return

    version = datetime.now().strftime('%Y%m%d%H%M%S')
    print(f"\n--- Processing (Version: {version}) ---")
    print(f"‚è±Ô∏è  Estimated time: ~{len(files_to_process) * 8 / 60:.1f} minutes")

    # SEQUENTIAL processing for free tier
    for i, filepath in enumerate(tqdm(files_to_process, desc="Analyzing")):
        filename = os.path.basename(filepath)

        try:
            result, status = process_document(filepath, model)

            if result:
                file_hash = get_file_hash(filepath)
                add_to_database(filename, file_hash, result)
                session_report.append({"File": filename, "Status": "‚úÖ Success", "Details": "Added to database"})
            else:
                raise Exception(status)

        except Exception as exc:
            session_report.append({"File": filename, "Status": "‚ùå Failed", "Details": str(exc)[:100]})
            # Move to quarantine
            try:
                shutil.move(filepath, os.path.join(QUARANTINE_DIRECTORY, filename))
            except:
                pass

        # Rate limiting between requests (critical for free tier)
        if i < len(files_to_process) - 1:
            time.sleep(REQUEST_DELAY)

    print("\n--- Finalizing ---")
    knowledge_base = get_all_from_database()
    final_save_path = os.path.join(OUTPUT_DIRECTORY, f'md_knowledge_base_v{version}.json')

    with open(final_save_path, 'w') as f:
        json.dump(knowledge_base, f, indent=2)

    print(f"\nüéâ COMPLETE!")
    print(f"üìä Total documents: {len(knowledge_base)}")
    print(f"‚úÖ Saved to: {final_save_path}")

    # Split if needed
    if os.path.getsize(final_save_path) > MAX_SPLIT_SIZE_MB * 1024 * 1024:
        print(f"\n--- Splitting (Max: {MAX_SPLIT_SIZE_MB}MB) ---")
        with open(final_save_path, 'r') as f:
            full_data = json.load(f)

        part_num, current_chunk, current_size = 1, [], 0

        for item in full_data:
            item_size = len(json.dumps(item).encode('utf-8'))

            if current_size + item_size > MAX_SPLIT_SIZE_MB * 1024 * 1024 and current_chunk:
                part_path = os.path.join(OUTPUT_DIRECTORY, f'md_kb_v{version}_part{part_num}.json')
                with open(part_path, 'w') as f:
                    json.dump(current_chunk, f, indent=2)
                print(f"‚úÖ Part {part_num}: {part_path}")
                part_num += 1
                current_chunk, current_size = [item], item_size
            else:
                current_chunk.append(item)
                current_size += item_size

        if current_chunk:
            part_path = os.path.join(OUTPUT_DIRECTORY, f'md_kb_v{version}_part{part_num}.json')
            with open(part_path, 'w') as f:
                json.dump(current_chunk, f, indent=2)
            print(f"‚úÖ Part {part_num}: {part_path}")

    generate_html_report(session_report, version)

if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        print("\n‚ö†Ô∏è  Interrupted by user. Progress saved.")
    finally:
        print("\n--- Finished ---")

--- Markdown Knowledge Base Builder v4.8 (Free Tier) ---
Mounted at /content/drive
‚úÖ Google Drive mounted successfully.

üöÄ Starting Knowledge Base Generation (Free Tier Mode)
--- Verifying API Key ---
‚úÖ API Key loaded

--- Scanning for .md files ---
Found 301 total supported files.

--- Checking cache ---


Verifying Files:   0%|          | 0/301 [00:00<?, ?it/s]


üìù 293 new files to process

--- Processing (Version: 20251002183946) ---
‚è±Ô∏è  Estimated time: ~39.1 minutes


Analyzing:   0%|          | 0/293 [00:00<?, ?it/s]

  ‚ùå Invalid JSON for 'blueprint_for_a_gpt_powered_redditor_design_deployment_and_ethical_interaction.md': Expecting ',' delimiter: line 4 column 328 (char 449)




  ‚ùå Unexpected error for 'mammoth_cave_wikibot_option.md': TooManyRequests




  ‚ùå Unexpected error for 'mammoth_cave_wikibot_option.md': TooManyRequests




  ‚ùå Unexpected error for 'metadata_strategy_research_report_refined.md': TooManyRequests




  ‚ùå Unexpected error for 'monetizing_local_website_staged_plan.md': TooManyRequests




  ‚ùå Unexpected error for 'monetizing_local_website_staged_plan.md': TooManyRequests

‚ö†Ô∏è  Interrupted by user. Progress saved.

--- Finished ---


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
