<a href="https://colab.research.google.com/github/moeezmujahid70/hebrew-nlp-llms/blob/main/improved_%3E_Replace_modern_Hebrew_words_Workflow_11_07_2025.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **AI Enhance Outline Workflow**
1. Add your **Anthropic API key** (you can get one [here](https://console.anthropic.com/settings/keys)) as a Colab secret named `antropic_api_key`.
2. Add your **OpenAI API key** (you can get one [here](https://platform.openai.com/api-keys)) as a Colab secret named `openai_api_key`.
3. Run the '**Add API Keys and Install Dependencies**' cell.
4. When it finishes, run the workflow below.






In [None]:
# @title **Add API Keys and Install Dependencies**
# Colab shell escapes ("!"): install the third-party packages used by this notebook.
!pip install -q -U google-genai
!pip install -q --upgrade python-docx pandas
!pip install anthropic

#!pip install mammoth sqlite3-binaries
# Install necessary dependencies for processing .doc files

# NOTE(review): the recorded output of this cell shows the textract install
# failing during metadata generation (pip suggests pip<24.1) — confirm whether
# textract is still required before relying on it.
!pip install python-docx textract
#!apt-get install -y antiword catdoc

# API keys are read from Colab's secret store. The secret names must match the
# strings passed to userdata.get() exactly (note the spelling 'antropic_api_key').
from google.colab import userdata
antropic_api_key = userdata.get('antropic_api_key')
grok_api_key = userdata.get('grok_api_key')
openai_api_key = userdata.get('openai_api_key')

Collecting textract
  Using cached textract-1.6.5-py3-none-any.whl.metadata (2.5 kB)
Requested textract from https://files.pythonhosted.org/packages/6b/3e/ac16b6bf28edf78296aea7d0cb416b49ed30282ac8c711662541015ee6f3/textract-1.6.5-py3-none-any.whl has invalid metadata: .* suffix can only be used with `==` or `!=` operators
    extract-msg (<=0.29.*)
                 ~~~~~~~^
Please use pip<24.1 if you need to use this version.[0m[33m
[0m  Using cached textract-1.6.4.tar.gz (17 kB)
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (setup.py) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for outp

In [None]:
import json
import csv
import re
import requests
from typing import Dict, List, Tuple, Optional
from tqdm import tqdm
import pandas as pd
from google.colab import files
from docx import Document
import io
import os
import sqlite3
from dataclasses import dataclass
from collections import defaultdict
import hashlib
# import re
import re as regex_module




@dataclass
class HebrewMatch:
    """One detected occurrence of a modern Hebrew word and its proposed replacement."""
    word: str  # the modern word that was matched
    traditional_equivalent: str  # the replacement proposed for `word`
    confidence: float  # score used to rank matches and to gate replacement
    context_before: str  # text immediately preceding the match
    context_after: str  # text immediately following the match
    position: Tuple[int, int]  # (start, end) span of the match
    source: str  # 'database' or 'grok' or 'context'

@dataclass
class ModernWordCategory:
    """Categories of modern Hebrew words.

    The attributes below carry no type annotations, so ``@dataclass`` does not
    turn them into fields; they are plain class-level string constants that are
    accessed as ``ModernWordCategory.NAME``.
    """
    TRANSLATABLE = "translatable"          # Has traditional equivalent
    UNTRANSLATABLE = "untranslatable"      # No traditional equivalent
    CONTEXTUAL = "contextual"              # Depends on context
    ALREADY_TRADITIONAL = "traditional"    # Already traditional Hebrew


def find_forbidden_words(document_text: str, forbidden_list: list) -> list:
    """
    Report which forbidden Hebrew words/phrases occur in a document.

    Matching is a plain substring test against the entry exactly as given;
    entries that are empty or whitespace-only are ignored.

    Args:
        document_text: The text content to search through
        forbidden_list: List of forbidden Hebrew words/phrases

    Returns:
        The forbidden entries present in the document, each listed once, in
        the order they first appear in ``forbidden_list``
    """
    present = (
        entry
        for entry in forbidden_list
        if entry.strip() and entry in document_text
    )
    # dict.fromkeys drops repeats while keeping first-seen order.
    return list(dict.fromkeys(present))



def scan_uploaded_docx_files():
    """Prompt for an upload in Colab and return the text of the first DOCX file.

    Files whose name does not end in ``.docx`` are skipped with a warning.
    Processing stops at the first DOCX file: its paragraphs (``doc.paragraphs``)
    are joined with newlines and returned.

    Returns:
        The extracted text, or an empty string when extraction fails or when
        no DOCX file was uploaded (so every path returns a ``str``).
    """
    print("Please select HEBREW DOCX files to upload...")
    uploaded = files.upload()

    for filename, file_content in uploaded.items():
        print(f"\nScanning file: {filename}")

        if not filename.lower().endswith('.docx'):
            print(f"Warning: {filename} is not a DOCX file. Skipping...")
            continue

        try:
            # python-docx needs a file-like object, so wrap the uploaded bytes.
            doc = Document(io.BytesIO(file_content))
            return '\n'.join(paragraph.text for paragraph in doc.paragraphs)
        except Exception as e:
            # Best-effort: report the problem and hand back an empty document.
            print(f"Error extracting text from DOCX: {e}")
            return ""

    # Nothing usable was uploaded; keep the return type consistent.
    return ""




def apply_replacements_safely(text: str, matches: List[HebrewMatch], grok_suggestions: Dict[str, str]) -> Tuple[str, List[Dict]]:
    """
    Annotate modern words in ``text`` as ``{word}[equivalent]`` without nesting brackets.

    Database matches are applied first, highest confidence first, and only when
    their confidence exceeds 0.5. Grok suggestions are then applied to words that
    were not already handled. A word is skipped when ``{word}`` already appears
    in the text.

    Text inside an existing ``{...}[...]`` annotation is never rewritten, so a
    later word that happens to equal an earlier equivalent cannot produce
    ``{a}[{b}[c]]``. Equivalents are inserted literally (backslashes are not
    interpreted as regex replacement escapes). Empty/blank words are ignored.

    Args:
        text: The text to annotate.
        matches: Database matches; ``word``, ``traditional_equivalent``,
            ``confidence`` and ``source`` are read from each.
        grok_suggestions: Mapping of modern word to traditional equivalent.

    Returns:
        ``(processed_text, replacement_log)`` where each log entry is a dict with
        keys ``word``, ``replacement``, ``confidence``, ``source``, ``priority``.
    """
    # A span this function has already produced, e.g. "{word}[equivalent]".
    annotated_span = r'\{[^{}]*\}\[[^\[\]]*\]'

    processed_text = text
    replacement_log = []
    replaced_words = set()  # Track words already replaced

    def _annotate(word: str, equivalent: str) -> bool:
        """Annotate free-standing occurrences of ``word``; return True if any changed."""
        nonlocal processed_text

        # A blank word would turn the pattern into r'\b\b' and match everywhere.
        if not word.strip() or f"{{{word}}}" in processed_text:
            return False

        replacement = f"{{{word}}}[{equivalent}]"
        # The annotated-span alternative is listed first so existing annotations
        # are consumed whole and left untouched; only group 1 is a real hit.
        pattern = re.compile(annotated_span + r'|(\b' + re.escape(word) + r'\b)')
        hits = 0

        def _swap(found):
            nonlocal hits
            if found.group(1) is None:
                return found.group(0)
            hits += 1
            return replacement

        # A callable replacement is inserted verbatim (no backslash processing).
        updated = pattern.sub(_swap, processed_text)
        if not hits:
            return False

        processed_text = updated
        replaced_words.add(word)
        return True

    # Sort matches by confidence (highest first) to prioritize better replacements
    sorted_matches = sorted(matches, key=lambda x: x.confidence, reverse=True)

    print(f"Applying {len(sorted_matches)} database matches...")

    # Apply database matches first (higher priority)
    for match in sorted_matches:
        # Only apply high-confidence replacements, and each word at most once.
        if not (match.confidence > 0.5) or match.word in replaced_words:
            continue

        if _annotate(match.word, match.traditional_equivalent):
            replacement_log.append({
                'word': match.word,
                'replacement': match.traditional_equivalent,
                'confidence': match.confidence,
                'source': match.source,
                'priority': 'database'
            })

    print(f"Applying {len(grok_suggestions)} Grok suggestions...")

    # Apply Grok suggestions (for words not already replaced)
    for modern_word, traditional_word in grok_suggestions.items():
        if modern_word in replaced_words:
            continue

        if _annotate(modern_word, traditional_word):
            replacement_log.append({
                'word': modern_word,
                'replacement': traditional_word,
                'confidence': 0.7,  # Grok suggestions get medium confidence
                'source': 'grok_filtered',
                'priority': 'grok'
            })
        else:
            print(f"  ⏭️ Skipping {modern_word} (not found or already replaced)")

    return processed_text, replacement_log



class SmartModernWordFilter:
    """Decide which Hebrew words are worth sending on for translation.

    Two fixed vocabularies drive the decision: words for modern concepts that
    are treated as having no traditional equivalent, and words that are already
    traditional. Anything in neither set is classified as contextual.
    """

    def __init__(self):
        # Modern concepts: treated as having NO traditional equivalent.
        self.untranslatable_words = {
            # Technology
            "מחשב", "אינטרנט", "טלפון", "פלאפון", "מייל", "אימייל", "וואטסאפ",
            "פייסבוק", "גוגל", "אפליקציה", "תוכנה", "חומרה", "פרוגרמר",
            "האקר", "וירוס", "מחשוב", "דיגיטלי", "אלקטרוני", "רדיו", "טלוויזיה",
            "וידאו", "סלפי", "בלוטות'", "וויפי", "GPS", "USB", "סמארטפון",

            # Transportation
            "מכונית", "אוטובוס", "רכבת", "מטוס", "אופנוע", "אופניים", "מונית",
            "רמזור", "חניון", "תחנת דלק", "כביש מהיר", "גשר עליון",

            # Modern institutions/concepts
            "בנק", "ביטוח", "משכנתא", "קרדיט", "דביט", "עסקה", "השקעה",
            "מניה", "בורסה", "אינפלציה", "דפלציה", "GDP", "אבטלה",
            "דמוקרטיה", "קפיטליזם", "סוציאליזם", "קומוניזם",

            # Modern professions
            "מנהל", "מזכירה", "רואה חשבון", "עורך דין", "מהנדס", "אדריכל",
            "פסיכולוג", "סוציולוג", "עיתונאי", "צלם", "שחקן", "במאי",

            # Modern objects/items
            "מקרר", "תנור", "מיקרוגל", "מכונת כביסה", "שואב אבק", "מזגן",
            "שעון יד", "משקפיים", "עדשות מגע", "תרופה", "ויטמין",
            "פלסטיק", "נייר", "עט", "עיפרון", "מחברת", "ספר לימוד",

            # Sports and recreation
            "כדורגל", "כדורסל", "טניס", "שחייה", "ריצה", "כושר", "חדר כושר",
            "אולימפיאדה", "מדליה", "אליפות", "ליגה", "קבוצה",

            # Modern food/products
            "פיצה", "המבורגר", "סנדוויץ'", "שוקולד", "גלידה", "קפה", "תה",
            "קוקה קולא", "מים מינרלים", "יוגורט", "קורנפלקס",
        }

        # Already traditional Hebrew: never processed.
        self.already_traditional = {
            # Religious terms
            "אלוהים", "ה'", "בורא", "קדוש", "ברוך", "אמן", "הללויה",
            "תפילה", "ברכה", "מצווה", "תורה", "תנך", "משנה", "תלמוד",
            "שבת", "חג", "פסח", "סוכות", "שבועות", "ראש השנה", "יום כיפור",
            "בית כנסת", "בית מדרש", "ישיבה", "כהן", "לוי", "ישראל",

            # Biblical Hebrew
            "נפש", "רוח", "לב", "לבב", "עין", "אוזן", "פה", "לשון",
            "יד", "רגל", "ראש", "גוף", "בשר", "דם", "עצם",
            "שמים", "ארץ", "ים", "נהר", "הר", "גבעה", "עמק", "מדבר",
            "אור", "חושך", "יום", "לילה", "בוקר", "ערב", "שנה", "חודש",

            # Traditional values/concepts
            "חכמה", "בינה", "דעת", "אמונה", "יראה", "אהבה", "שמחה",
            "שלום", "צדק", "חסד", "רחמים", "סליחה", "תשובה",
        }

    def categorize_word(self, word: str) -> str:
        """Return the ``ModernWordCategory`` constant that applies to ``word``.

        Surrounding whitespace is ignored. The traditional vocabulary is
        consulted before the untranslatable one; a word in neither is
        reported as contextual.
        """
        token = word.strip()

        if token in self.already_traditional:
            return ModernWordCategory.ALREADY_TRADITIONAL
        if token in self.untranslatable_words:
            return ModernWordCategory.UNTRANSLATABLE
        # Unknown words are left for Grok to judge.
        return ModernWordCategory.CONTEXTUAL

    def filter_words_for_grok_processing(self, word_list: List[str]) -> List[str]:
        """Keep only the contextual words from ``word_list``, preserving order.

        The first five skipped words are printed with their category, followed
        by a one-line summary if more than five were skipped.
        """
        kept = []
        skipped = 0

        for candidate in word_list:
            label = self.categorize_word(candidate)

            if label != ModernWordCategory.CONTEXTUAL:
                skipped += 1
                if skipped <= 5:  # Only show the first few skipped words
                    print(f"  Skipping '{candidate}' - {label}")
                continue

            kept.append(candidate)

        if skipped > 5:
            print(f"  ... and {skipped - 5} more skipped words")

        return kept

class EnhancedHebrewTextProcessor:
    def __init__(self, claude_api_key: str = None, grok_api_key: str = None):
        """Set up credentials, in-memory state, the word filter and the local database.

        Args:
            claude_api_key: Stored on the instance as ``claude_api_key``.
            grok_api_key: Stored on the instance as ``grok_api_key``.
        """
        # Credentials
        self.claude_api_key = claude_api_key
        self.grok_api_key = grok_api_key

        # Hebrew morphology: one-letter prefixes and common suffixes.
        self.hebrew_prefixes = ['ב', 'כ', 'ל', 'מ', 'ש', 'ה', 'ו']
        self.hebrew_suffixes = ['ים', 'ות', 'ה', 'י', 'ך', 'נו', 'כם', 'הם', 'הן']

        # In-memory working state
        self.csv_database = {}
        self.processed_cache = {}
        self.context_patterns = defaultdict(list)
        self.word_frequency = defaultdict(int)

        # Pre-filter that decides which words are worth processing.
        self.word_filter = SmartModernWordFilter()

        # SQLite store used for word lookups.
        self.init_local_database()

    def init_local_database(self):
        """Initialize local SQLite database for caching and analysis"""
        self.conn = sqlite3.connect(':memory:')  # In-memory for Colab

        self.conn.execute('''
            CREATE TABLE words (
                id INTEGER PRIMARY KEY,
                modern_word TEXT UNIQUE,
                traditional_word TEXT,
                confidence REAL,
                frequency INTEGER DEFAULT 1,
                source TEXT,
                context_patterns TEXT,
                last_updated TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        ''')

        self.conn.execute('''
            CREATE TABLE word_contexts (
                id INTEGER PRIMARY KEY,
                word_id INTEGER,
                before_context TEXT,
                after_context TEXT,
                replacement_used TEXT,
                confidence REAL,
                FOREIGN KEY (word_id) REFERENCES words (id)
            )
        ''')

        self.conn.commit()

    def normalize_hebrew_text(self, text: str) -> str:
        """Return ``text`` in the canonical form used for word matching.

        Every character in U+0591-U+05C7 (nikud and the other marks in that
        range) is removed, the five final-form letters are replaced by their
        regular forms, and surrounding whitespace is stripped.
        """
        # Drop vowel points / marks so pointed and unpointed spellings compare equal.
        without_marks = re.sub(r'[\u0591-\u05C7]', '', text)

        # Final form -> regular form, applied in one pass.
        final_to_regular = str.maketrans({
            'ך': 'כ',
            'ם': 'מ',
            'ן': 'נ',
            'ף': 'פ',
            'ץ': 'צ',
        })

        return without_marks.translate(final_to_regular).strip()

    def extract_word_variants(self, word: str) -> List[str]:
        """Generate morphological variants of Hebrew words"""
        variants = [word]
        normalized = self.normalize_hebrew_text(word)

        # Add variant with/without prefixes
        for prefix in self.hebrew_prefixes:
            if word.startswith(prefix) and len(word) > 2:
                variants.append(word[1:])  # Without prefix
            else:
                variants.append(prefix + word)  # With prefix

        # Add variant with/without common suffixes
        for suffix in self.hebrew_suffixes:
            if word.endswith(suffix) and len(word) > len(suffix) + 1:
                variants.append(word[:-len(suffix)])  # Without suffix
            else:
                variants.append(word + suffix)  # With suffix

        return list(set(variants))

    def enhanced_google_sheet_loader(self, sheet_url: str, sheet_name: str = "Sheet1"):
        """Enhanced Google Sheets loader with validation and caching"""
        import re

        # Extract sheet ID
        match = re.search(r'/spreadsheets/d/([a-zA-Z0-9-_]+)', sheet_url)
        if not match:
            raise ValueError("Invalid Google Sheets URL")

        sheet_id = match.group(1)
        csv_url = f"https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv&sheet={sheet_name}"

        # Create cache key
        cache_key = hashlib.md5(csv_url.encode()).hexdigest()

        try:
            print("Loading Google Sheets data...")
            df = pd.read_csv(csv_url, encoding='utf-8')

            # Flexible column detection
            if len(df.columns) >= 2:
                # Auto-detect columns based on content
                modern_col = df.columns[0]
                traditional_col = df.columns[1]
                source_col = df.columns[2] if len(df.columns) > 2 else None

                print(f"Detected columns: {modern_col} -> {traditional_col}")

                # Clean and validate data
                df = df.dropna(subset=[modern_col, traditional_col])
                df[modern_col] = df[modern_col].astype(str).str.strip()
                df[traditional_col] = df[traditional_col].astype(str).str.strip()

                # Filter out invalid entries
                df = df[df[modern_col].str.len() > 0]
                df = df[df[traditional_col].str.len() > 0]
                df = df[df[modern_col] != df[traditional_col]]  # Exclude identical words

                # Store in local database with metadata
                for _, row in df.iterrows():
                    modern_word = self.normalize_hebrew_text(row[modern_col])
                    traditional_word = row[traditional_col]
                    source = row[source_col] if source_col else 'google_sheet'

                    self.conn.execute('''
                        INSERT OR REPLACE INTO words
                        (modern_word, traditional_word, confidence, source)
                        VALUES (?, ?, ?, ?)
                    ''', (modern_word, traditional_word, 0.8, source))

                    # Also store variants
                    for variant in self.extract_word_variants(modern_word):
                        if variant != modern_word:
                            self.conn.execute('''
                                INSERT OR IGNORE INTO words
                                (modern_word, traditional_word, confidence, source)
                                VALUES (?, ?, ?, ?)
                            ''', (variant, traditional_word, 0.6, f"{source}_variant"))

                self.conn.commit()

                # Update main dictionary for backward compatibility
                self.csv_database = dict(zip(
                    df[modern_col].apply(self.normalize_hebrew_text),
                    df[traditional_col]
                ))

                print(f"Successfully loaded {len(self.csv_database)} word pairs")
                print(f"Generated {self.conn.execute('SELECT COUNT(*) FROM words').fetchone()[0]} total entries with variants")

                # Show sample
                sample_items = list(self.csv_database.items())[:5]
                for modern, traditional in sample_items:
                    print(f"  {modern} -> {traditional}")

        except Exception as e:
            print(f"Error loading Google Sheets: {e}")

    def context_aware_word_detection(self, text: str, window_size: int = 50) -> List[HebrewMatch]:
        """Enhanced word detection with context analysis"""
        matches = []
        normalized_text = self.normalize_hebrew_text(text)

        # Get all words from database
        cursor = self.conn.execute('''
            SELECT modern_word, traditional_word, confidence, source
            FROM words ORDER BY confidence DESC
        ''')

        for modern_word, traditional_word, confidence, source in cursor.fetchall():
            # Find word boundaries more accurately
            pattern = r'\b' + re.escape(modern_word) + r'\b'

            for match in re.finditer(pattern, normalized_text):
                start, end = match.span()

                # Extract context
                context_start = max(0, start - window_size)
                context_end = min(len(normalized_text), end + window_size)

                context_before = normalized_text[context_start:start].strip()
                context_after = normalized_text[end:context_end].strip()

                # Calculate context-based confidence adjustment
                context_confidence = self.calculate_context_confidence(
                    modern_word, context_before, context_after
                )

                adjusted_confidence = min(1.0, confidence * context_confidence)

                matches.append(HebrewMatch(
                    word=modern_word,
                    traditional_equivalent=traditional_word,
                    confidence=adjusted_confidence,
                    context_before=context_before,
                    context_after=context_after,
                    position=(start, end),
                    source=source
                ))

        # Sort by confidence and remove duplicates
        matches.sort(key=lambda x: x.confidence, reverse=True)
        return self.remove_overlapping_matches(matches)

    def calculate_context_confidence(self, word: str, before: str, after: str) -> float:
        """Return a context multiplier in (0, 1] for a matched word.

        The score starts at 1.0, is multiplied by 1.2 for every religious
        indicator found in the surrounding text and by 0.8 for every modern
        indicator, and is finally capped at 1.0. ``word`` itself is not used.
        """
        surrounding = before + ' ' + after

        # (multiplier, indicator words) - religious boosts are applied before
        # modern penalties.
        weighted_indicators = (
            (1.2, ['נפש', 'רוח', 'אלוקי', 'קדוש', 'ברוך', 'תפילה', 'מצווה']),
            (0.8, ['טכנולוגיה', 'אינטרנט', 'מחשב', 'פלאפון']),
        )

        score = 1.0
        for multiplier, indicators in weighted_indicators:
            for indicator in indicators:
                if indicator in surrounding:
                    score *= multiplier

        return min(1.0, score)

    def remove_overlapping_matches(self, matches: List[HebrewMatch]) -> List[HebrewMatch]:
        """Resolve overlapping matches, preferring the higher-confidence one.

        ``matches`` is sorted in place by start position. Walking left to
        right, a match that starts before the previously kept match ends
        replaces it only when its confidence is strictly higher; otherwise it
        is dropped.

        Returns:
            A new list of non-overlapping matches ordered by start position
            (empty when ``matches`` is empty).
        """
        if not matches:
            return []

        matches.sort(key=lambda m: m.position[0])

        kept = []
        for candidate in matches:
            if kept and candidate.position[0] < kept[-1].position[1]:
                # Overlap with the last kept match: keep the stronger of the two.
                if candidate.confidence > kept[-1].confidence:
                    kept[-1] = candidate
                continue
            kept.append(candidate)

        return kept

    def chunk_words_for_grok(self, words: List[str], max_words_per_chunk: int = 250) -> List[List[str]]:
        """
        Split the filtered word list into consecutive chunks for Grok.

        Every chunk holds ``max_words_per_chunk`` words except possibly the
        last one. A limit below 1 yields one word per chunk.
        """
        # A chunk is closed as soon as it reaches the limit, so any limit
        # below 1 behaves like a limit of 1.
        step = max(1, max_words_per_chunk)
        chunks = [words[start:start + step] for start in range(0, len(words), step)]

        print(f"Created {len(chunks)} word chunks from {len(words)} filtered words")
        return chunks


    def validate_suggestions_with_grok(self, raw_suggestions: Dict[str, str]) -> Dict[str, str]:
        """
        Validate Grok's initial suggestions using a second Grok API call.

        The suggestions are embedded as JSON in a review prompt and posted to
        the chat-completions endpoint. Every JSON object found in the reply is
        parsed, and only entries whose key is present in ``raw_suggestions``
        are kept.

        Args:
            raw_suggestions: Dictionary of modern_word: traditional_equivalent from first pass

        Returns:
            - ``{}`` when no Grok API key is configured or ``raw_suggestions`` is empty.
            - The validated subset when the API answers with HTTP 200 (this may
              be empty if the reply contains no parseable JSON object).
            - ``raw_suggestions`` unchanged when the API returns a non-200
              status or the call raises (fail-open fallback).
        """
        if not self.grok_api_key or not raw_suggestions:
            return {}

        print(f"\n=== GROK VALIDATION PASS ===")
        print(f"Validating {len(raw_suggestions)} initial suggestions...")

        # Create validation prompt
        # (doubled braces in the f-string below render as literal "{" / "}")
        validation_prompt = f"""
            You are a Hebrew linguistics expert reviewing translation suggestions for errors.

            TASK: Validate these modern→traditional Hebrew translations. Return ONLY the ones that are correct.

            TRANSLATIONS TO VALIDATE:
            {json.dumps(raw_suggestions, ensure_ascii=False, indent=2)}

            VALIDATION CRITERIA:
            ❌ REJECT if:
            - Wrong meaning entirely (semantic mismatch)
            - Over-elevation (mundane→sacred inappropriately)
            - Under-elevation (sacred→mundane inappropriately)
            - Modern word disguised as traditional
            - Not actually from traditional Hebrew sources (biblical/mishnaic/medieval)
            - False cognates or sound-alike words with different meanings

            ✅ KEEP if:
            - Exact semantic match between modern and traditional word
            - Appropriate intensity level for the context
            - Genuinely traditional Hebrew from authentic sources
            - Natural usage in traditional Hebrew contexts
            - Preserves the original meaning precisely

            CRITICAL EXAMPLES OF ERRORS TO CATCH:
            ❌ "משימה" → "שליחות" (task ≠ divine mission - wrong intensity level)
            ❌ "פגיעות" → "חולין" (vulnerability ≠ secular/mundane - completely wrong meaning)
            ❌ "בינוניות" → "פשטות" (mediocrity ≠ simplicity - different concepts entirely)
            ❌ "השראה" → "נבואה" (inspiration ≠ prophecy - too specific/elevated)
            ❌ "עוול" → "עול" (injustice ≠ yoke - false cognate)

            EXAMPLES OF GOOD TRANSLATIONS TO KEEP:
            ✅ "פחדים" → "יראה" (fears → awe - appropriate spiritual elevation)
            ✅ "אתגר" → "נסיון" (challenge → test/trial - perfect semantic match)
            ✅ "עצבות" → "דאבה" (sadness → sorrow - authentic biblical term)

            CRITICAL INSTRUCTION:
            When in doubt, REJECT the suggestion.
            Only approve translations where you are confident of:
            1. Semantic accuracy
            2. Appropriate register/intensity
            3. Traditional Hebrew authenticity

            RESPONSE FORMAT:
            Return ONLY the validated translations as clean JSON:
            {{"modern_word": "traditional_equivalent"}}

            If a translation fails validation, omit it entirely from the response.
            Do not include explanations, just the validated JSON object.
            """

        try:
            headers = {
                'Content-Type': 'application/json',
                'Authorization': f'Bearer {self.grok_api_key}'
            }

            data = {
                'model': 'grok-3-latest',
                'messages': [{'role': 'user', 'content': validation_prompt}],
                'max_tokens': 3500,
                'temperature': 0.1  # Very low temperature for consistent validation
            }

            print("Sending validation request to Grok...")

            response = requests.post(
                'https://api.x.ai/v1/chat/completions',
                headers=headers,
                json=data,
                timeout=30
            )

            if response.status_code == 200:
                result = response.json()
                content = result['choices'][0]['message']['content']

                print("Received validation response, parsing...")

                # Extract JSON from response.
                # The pattern matches a brace-delimited object that may contain
                # one further level of nested braces; the reply may wrap the
                # JSON in other text, so every candidate is tried.
                json_pattern = r'\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}'
                json_matches = re.findall(json_pattern, content, re.DOTALL)

                validated_suggestions = {}

                for json_match in json_matches:
                    try:
                        parsed_suggestions = json.loads(json_match)
                        if isinstance(parsed_suggestions, dict):
                            # Additional safety check - ensure suggested words were in original
                            # NOTE(review): the stored value is the one returned by the
                            # validation reply, which may differ from the value in
                            # raw_suggestions - confirm this is intended.
                            for modern, traditional in parsed_suggestions.items():
                                if modern in raw_suggestions:
                                    validated_suggestions[modern] = traditional
                                else:
                                    print(f"  ⚠️  Validation added unknown word: {modern} (ignored)")
                    except json.JSONDecodeError as e:
                        print(f"  ❌ JSON parsing error in validation: {e}")
                        continue

                # Calculate and display validation statistics
                original_count = len(raw_suggestions)
                validated_count = len(validated_suggestions)
                rejection_rate = ((original_count - validated_count) / original_count * 100) if original_count > 0 else 0

                print(f"\n📊 VALIDATION RESULTS:")
                print(f"Original suggestions: {original_count}")
                print(f"Validated suggestions: {validated_count}")
                print(f"Rejected suggestions: {original_count - validated_count}")
                print(f"Rejection rate: {rejection_rate:.1f}%")

                # Show what was rejected (for debugging)
                # rejected = set(raw_suggestions.keys()) - set(validated_suggestions.keys())
                # if rejected:
                #     print(f"\n🗑️  REJECTED SUGGESTIONS:")
                #     for word in rejected:
                #         print(f"   {word} → {raw_suggestions[word]} (filtered out)")

                # print(f"\n✅ VALIDATED SUGGESTIONS:")
                # for modern, traditional in validated_suggestions.items():
                #     print(f"   {modern} → {traditional}")

                return validated_suggestions

            else:
                print(f"❌ Grok validation API error: {response.status_code}")
                if response.status_code == 429:
                    print("   Rate limit hit - consider adding delays between calls")
                print(f"   Response: {response.text}")

                # Fallback: return original suggestions with warning
                # (fail-open: the unvalidated suggestions are used as-is)
                print("⚠️  Falling back to unvalidated suggestions")
                return raw_suggestions

        except Exception as e:
            # Covers network errors/timeouts as well as an unexpected reply
            # shape (e.g. missing 'choices'), since all of that runs inside this try.
            print(f"❌ Error in Grok validation call: {e}")
            # Fallback: return original suggestions
            print("⚠️  Falling back to unvalidated suggestions")
            return raw_suggestions


    def enhanced_grok_api_call(self, text: str, known_words: List[str]) -> Dict[str, str]:
        """ENHANCED: Grok API call with smart filtering integration"""
        if not self.grok_api_key:
            return {}

        print(f"\n=== SMART FILTERING FOR GROK ===")
        print(f"Words detected from database matching: {len(known_words)}")

        # NEW: Apply smart filtering before processing
        # Extract words that appear in the text for filtering
        text_words = re.findall(r'[\u0590-\u05FF]+', text)  # Extract Hebrew words
        unique_text_words = list(set(text_words))

        # Filter words to only those worth sending to Grok
        filtered_words = self.word_filter.filter_words_for_grok_processing(unique_text_words)

        print(f"Words in text: {len(unique_text_words)}")
        print(f"Words to process with Grok: {len(filtered_words)}")
        # print(f"API calls saved by filtering: {len(unique_text_words) - len(filtered_words)}")

        if not filtered_words:
            print("No words need Grok processing - all are either untranslatable or traditional!")
            return {}

        # Create cache key for this text segment
        text_hash = hashlib.md5(text.encode()).hexdigest()

        # Check cache first
        if text_hash in self.processed_cache:
            return self.processed_cache[text_hash]

        sentences = self.split_into_sentences(text)
        sentence_chunks = self.chunk_sentences(sentences, max_tokens=2500)

        all_suggestions = {}

        for i, sentence_chunk in enumerate(tqdm(sentence_chunks, desc="Processing with Grok")):
            chunk_text = " ".join(sentence_chunk)

            # ENHANCED: Better prompt with filtering guidance
            sample_untranslatable = list(self.word_filter.untranslatable_words)[:15]
            sample_traditional = list(self.word_filter.already_traditional)[:10]

            # Enhanced prompt with examples and constraints
            prompt = f"""
            You are a scholar of Lashon Hakodesh (Sacred Hebrew) specializing in religious and spiritual texts.

            TASK: Transform modern Hebrew words into their traditional/biblical equivalents while preserving exact meaning and spiritual depth.

            CRITICAL CONTEXT: This text discusses spiritual matters - souls, divine service, and religious psychology. Choose traditional terms that ENHANCE rather than diminish sacred concepts

            TEXT TO ANALYZE:
            {chunk_text}


            PRECISION RULES:
            1. Verify semantic equivalence - meaning must match exactly
            2. Preserve spiritual intensity - prefer elevated over mundane language
            3  Secular terminology with religious equivalents
            4. Use only authentic traditional Hebrew from Tanakh/Mishna/medieval sources
            5. If modern word is already traditional, suggest keeping it


            CRITICAL EXCLUSIONS - DO NOT suggest traditional equivalents for:
            ❌ Modern technology: {sample_untranslatable}
            ❌ Modern objects that didn't exist in ancient times
            ❌ Already traditional terms: {sample_traditional}
            ❌ Words already in our database: {known_words[:50]}


            EXAMPLES of excellence:
            ✓ "פחדים" → "יראה" (fears → sacred awe - deeper spiritual meaning)
            ✓ "אתגר" → "נסיון" (challenge → divine test - theological precision)
            ✓ "שעמום" → "שיממון" (boredom → desolation - creative biblical root usage)

            COMMON ERRORS TO AVOID:
            ❌ Semantic mismatch: "פגיעות" ≠ "חולין" (vulnerability ≠ secular)
            ❌ Over-elevation: "משימה" ≠ "שליחות" (task ≠ divine mission)
            ❌ Wrong specificity: "השראה" ≠ "נבואה" (inspiration ≠ prophecy)
            ❌ False cognates: "עוול" ≠ "עול" (injustice ≠ yoke)
            ❌ Modern slang as traditional: "תסכול", "פספוס" are NOT traditional


            RESPONSE FORMAT:
            Return ONLY valid JSON: {{"modern_word": "traditional_equivalent"}}
            If no suitable words found, return: {{}}
            """

            try:
                headers = {
                    'Content-Type': 'application/json',
                    'Authorization': f'Bearer {self.grok_api_key}'
                }

                data = {
                    'model': 'grok-3-latest',
                    'messages': [{'role': 'user', 'content': prompt}],
                    'max_tokens': 4000,
                    'temperature': 0.2  # Lower temperature for consistency
                }

                response = requests.post(
                    'https://api.x.ai/v1/chat/completions',
                    headers=headers,
                    json=data,
                    timeout=30
                )

                if response.status_code == 200:
                    result = response.json()
                    content = result['choices'][0]['message']['content']

                    # Better JSON extraction
                    json_pattern = r'\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}'
                    json_matches = re.findall(json_pattern, content, re.DOTALL)

                    for json_match in json_matches:
                        try:
                            chunk_suggestions = json.loads(json_match)
                            if isinstance(chunk_suggestions, dict):
                                # Filter and validate suggestions with our smart filter
                                filtered = self.validate_grok_suggestions_with_filter(chunk_suggestions, known_words)
                                # filtered = self.validate_grok_suggestions_strict(chunk_suggestions,known_words,filtered_words)
                                all_suggestions.update(filtered)
                        except json.JSONDecodeError:
                            continue

                else:
                    print(f"Grok API error: {response.status_code}")

            except Exception as e:
                print(f"Error in Grok API call: {e}")
                continue

        # Cache results
        self.processed_cache[text_hash] = all_suggestions
        return all_suggestions


    def validate_grok_suggestions_strict(self, suggestions: Dict[str, str],
                                   known_words: List[str],
                                   original_word_chunk: List[str]) -> Dict[str, str]:
        """
        Strictly validate Grok suggestions against the word chunk that was sent.

        A suggestion ``modern -> traditional`` is kept only when all of the
        following hold:
          1. ``modern`` is neither in ``known_words`` nor in ``self.csv_database``;
          2. ``modern`` is one of the words in ``original_word_chunk``;
          3. the word filter does not categorise ``modern`` as untranslatable
             or already traditional;
          4. both sides are strings longer than one character after stripping,
             they differ from each other, and both contain Hebrew characters.

        Every decision is reported with a ``print`` line.

        Args:
            suggestions: Mapping of modern word to suggested traditional word,
                as parsed from the model's JSON answer.
            known_words: Words that already have a database entry.
            original_word_chunk: The words that were sent for analysis.

        Returns:
            A dict with the accepted suggestions; always a dict, possibly empty.
        """
        validated = {}
        # Sets give constant-time membership tests inside the loop.
        known = set(known_words)
        allowed = set(original_word_chunk)

        for modern, traditional in suggestions.items():
            # 1. Skip if already in our database
            if modern in known or modern in self.csv_database:
                print(f"    ⏭️ Skipping {modern} (already in database)")
                continue

            # 2. CRITICAL: Only accept words that were in our filtered chunk
            if modern not in allowed:
                print(f"    ❌ Rejecting {modern} (not in filtered words)")
                continue

            # 3. Check with our smart filter again
            category = self.word_filter.categorize_word(modern)
            if category in [ModernWordCategory.UNTRANSLATABLE, ModernWordCategory.ALREADY_TRADITIONAL]:
                print(f"    ❌ Rejecting {modern} ({category})")
                continue

            # 4. Basic validation. The isinstance check comes first because a
            #    parsed JSON value may be null, a number or a nested object,
            #    on which .strip() would raise.
            if (isinstance(traditional, str) and
                len(modern.strip()) > 1 and
                len(traditional.strip()) > 1 and
                modern != traditional and
                self.is_hebrew_word(modern) and
                self.is_hebrew_word(traditional)):

                validated[modern] = traditional
                print(f"    ✅ Validated: {modern} → {traditional}")
            else:
                print(f"    ❌ Failed basic validation: {modern} → {traditional}")

        # Return only after every suggestion has been examined; returning from
        # inside the loop would stop after the first fully checked item.
        return validated

    def validate_grok_suggestions_with_filter(self, suggestions: Dict[str, str], known_words: List[str]) -> Dict[str, str]:
        """
        Validate Grok suggestions using the database and the smart word filter.

        A suggestion ``modern -> traditional`` is kept only when ``modern`` is
        not already known (``known_words`` / ``self.csv_database``), the
        suggested value is a string, the word filter does not categorise
        ``modern`` as untranslatable or already traditional, and the basic
        checks pass (both sides longer than one character after stripping,
        different from each other, both containing Hebrew characters).

        Args:
            suggestions: Mapping of modern word to suggested traditional word,
                as parsed from the model's JSON answer.
            known_words: Words that already have a database entry.

        Returns:
            A dict with the accepted suggestions (possibly empty).
        """
        validated = {}
        # A set gives constant-time membership tests inside the loop.
        known = set(known_words)

        for modern, traditional in suggestions.items():
            # Skip if already in database
            if modern in known or modern in self.csv_database:
                continue

            # Parsed JSON values are not guaranteed to be strings (null,
            # numbers, nested objects). Skip those entries instead of letting
            # .strip() raise AttributeError part-way through the batch.
            if not isinstance(traditional, str):
                continue

            # Check with our smart filter
            category = self.word_filter.categorize_word(modern)
            if category in [ModernWordCategory.UNTRANSLATABLE, ModernWordCategory.ALREADY_TRADITIONAL]:
                print(f"  Filtered out Grok suggestion: {modern} ({category})")
                continue

            # Basic validation
            if (len(modern.strip()) > 1 and
                len(traditional.strip()) > 1 and
                modern != traditional and
                self.is_hebrew_word(modern) and
                self.is_hebrew_word(traditional)):

                validated[modern] = traditional

        return validated


    def grok_call_with_valiadations(self, text: str, known_words: list) -> Dict[str, str]:
        """
        Two-stage Grok pipeline: propose replacements, then re-check them.

        First asks ``enhanced_grok_api_call`` for candidate replacements for
        ``text``. When that yields nothing, an empty dict is returned and the
        validation pass is skipped; otherwise the candidates are handed to
        ``validate_suggestions_with_grok`` and its result is returned.
        """
        candidates = self.enhanced_grok_api_call(text, known_words)

        # No candidates means there is nothing to validate.
        return self.validate_suggestions_with_grok(candidates) if candidates else {}



    def is_hebrew_word(self, word: str) -> bool:
        """Return True when ``word`` has at least one character in U+0590-U+05FF."""
        # A single hit anywhere in the string is enough; the word does not
        # have to consist solely of Hebrew characters.
        return re.search(r'[\u0590-\u05FF]', word) is not None

    def split_into_sentences(self, text: str) -> List[str]:
        """
        Split ``text`` into sentences, keeping each terminator on its sentence.

        The text is cut at every character matched by ``sentence_endings``.
        Each piece is stripped of surrounding whitespace and re-joined with
        the terminator that followed it. Results of five characters or fewer
        (terminator included) are dropped.
        """
        # Sentence terminators, Hebrew punctuation included
        sentence_endings = r'[.!?׃։]'

        # With a capturing group, re.split alternates text and terminator:
        # [text, mark, text, mark, ..., text]
        pieces = re.split(f'({sentence_endings})', text)
        bodies = pieces[0::2]
        # The final text piece has no terminator after it.
        marks = pieces[1::2] + ['']

        joined = (body.strip() + mark for body, mark in zip(bodies, marks))
        # Filter very short segments
        return [candidate for candidate in joined if len(candidate) > 5]

    def chunk_sentences(self, sentences: List[str], max_tokens: int = 2500) -> List[List[str]]:
        """
        Group consecutive sentences into chunks bounded by an estimated token budget.

        A sentence's cost is estimated as its non-space character count
        floor-divided by 1.5, plus its number of spaces. Sentences are added
        to the current chunk in order. When adding one would push the running
        cost above ``max_tokens`` and the chunk is not empty, the chunk is
        closed and the sentence starts a new one. A single sentence that
        exceeds the budget on its own therefore still gets a chunk to itself.

        Returns:
            The list of chunks, each a list of sentences in original order.
        """
        groups = []
        bucket = []
        used = 0

        for item in sentences:
            spaces = item.count(' ')
            # Estimate tokens (non-space chars / 1.5 + spaces)
            cost = (len(item) - spaces) // 1.5 + spaces

            if bucket and used + cost > max_tokens:
                # Budget exceeded: close the current chunk and start afresh.
                groups.append(bucket)
                bucket = []
                used = 0

            bucket.append(item)
            used += cost

        if bucket:
            groups.append(bucket)

        return groups

    def generate_analysis_report(self, matches: List[HebrewMatch], grok_suggestions: Dict[str, str] = None) -> str:
        """
        Build the plain-text analysis report.

        The report has a header, counts of matches per confidence band
        (high > 0.8, medium in (0.5, 0.8], low <= 0.5), then one section per
        match source listing at most the ten highest-confidence matches.
        When ``grok_suggestions`` is non-empty, its size is added to the
        summary and a final section lists at most thirty of its entries.

        Args:
            matches: Objects exposing ``word``, ``traditional_equivalent``,
                ``confidence`` and ``source``.
            grok_suggestions: Optional mapping of modern to traditional words.

        Returns:
            The report lines joined with newlines.
        """
        scores = [entry.confidence for entry in matches]
        n_high = sum(1 for score in scores if score > 0.8)
        n_medium = sum(1 for score in scores if 0.5 < score <= 0.8)
        n_low = sum(1 for score in scores if score <= 0.5)

        # Header and summary statistics
        lines = [
            "=== HEBREW TEXT ANALYSIS REPORT WITH SMART FILTERING ===\n",
            f"Total modern words found: {len(matches)}",
            f"High confidence (>0.8): {n_high}",
            f"Medium confidence (0.5-0.8): {n_medium}",
            f"Low confidence (≤0.5): {n_low}",
        ]
        if grok_suggestions:
            lines.append(f"Additional Grok suggestions: {len(grok_suggestions)}")
        lines.append("")

        # Bucket matches by source, keeping first-seen order of the sources
        grouped = defaultdict(list)
        for entry in matches:
            grouped[entry.source].append(entry)

        for origin, bucket in grouped.items():
            lines.append(f"\n--- {(origin or 'UNKNOWN').upper()} SUGGESTIONS ---")
            ranked = sorted(bucket, key=lambda x: x.confidence, reverse=True)
            lines.extend(
                f"{hit.word} → {hit.traditional_equivalent} (confidence: {hit.confidence:.2f})"
                for hit in ranked[:10]
            )

        if grok_suggestions:
            lines.append("\n--- GROK API SUGGESTIONS (FILTERED) ---")
            shown = list(grok_suggestions.items())[:30]
            lines.extend(f"{modern} → {traditional}" for modern, traditional in shown)

        return "\n".join(lines)

# Enhanced main function with smart filtering
def enhanced_main():
    """Run the complete Hebrew word-replacement workflow.

    Steps, in order:
      1. Build an ``EnhancedHebrewTextProcessor`` from the module-level
         ``antropic_api_key`` and ``grok_api_key``.
      2. Load the word database from the Google Sheet.
      3. Obtain the document text via ``scan_uploaded_docx_files``; stop
         early when there is none.
      4. Detect database matches and fetch validated Grok suggestions.
      5. Apply the replacements with ``apply_replacements_safely``.
      6. Write the processed text, the analysis report, the replacement log
         and the Grok suggestions to files, then trigger the download of
         the first two.

    Returns:
        None. Results are written to files and summarised on stdout.
    """
    # Local import: keeps the function usable even if the cell that imports
    # ``re`` at module level has not been run.
    import re

    # Initialize enhanced processor with filtering
    processor = EnhancedHebrewTextProcessor(
        claude_api_key=antropic_api_key,
        grok_api_key=grok_api_key
    )

    # Load database with enhancements.
    # Alternative sheet (currently unused):
    #   https://docs.google.com/spreadsheets/d/14iTTuymFbo-pcepnd1k6djBiSgkL6TSVK6b3_DVWzgQ/edit?usp=sharing
    sheet_url = "https://docs.google.com/spreadsheets/d/1eXWRIA8LTk7fvQdn_-b1VT9Ao-V3CGMg1a0BdsJbZ34/edit?usp=sharing"
    processor.enhanced_google_sheet_loader(sheet_url)

    # Get document text
    print("Please upload your Hebrew document...")
    text = scan_uploaded_docx_files()

    if not text:
        print("No text to process")
        return

    print(f"Processing document with {len(text)} characters...")

    # Enhanced word detection with context analysis
    matches = processor.context_aware_word_detection(text)
    print(f"Found {len(matches)} potential replacements from database")

    # Get additional suggestions from Grok (with smart filtering + validation)
    known_words = list(processor.csv_database.keys())
    grok_suggestions = processor.grok_call_with_valiadations(text, known_words)
    print(f"Grok provided {len(grok_suggestions)} additional suggestions")

    # Apply database matches and Grok suggestions to the text
    processed_text, replacement_log = apply_replacements_safely(text, matches, grok_suggestions)

    # Generate enhanced analysis report
    report = processor.generate_analysis_report(matches, grok_suggestions)

    # --- Filtering statistics ---
    heb_re = re.compile(r'[\u0590-\u05FF]+')

    all_words = heb_re.findall(text)
    unique_words = set(all_words)
    db_hits = len(matches)
    to_grok = processor.word_filter.filter_words_for_grok_processing(list(unique_words))
    grok_hits = len(grok_suggestions)

    # Every log entry is either a Grok replacement or a database one, so the
    # database count is the remainder.
    total_repl = len(replacement_log)
    grok_repl_count = sum(1 for r in replacement_log if r['source'] == 'grok_filtered')
    db_repl_count = total_repl - grok_repl_count

    filtering_stats = f"""
    === SMART FILTERING STATISTICS ===
    Total words in text: {len(all_words)}
    Unique words in text: {len(unique_words)}
    Words from database: {db_hits}
    Words sent to Grok: {len(to_grok)}
    Grok suggestions received: {grok_hits}
    Total replacements applied: {total_repl}

    === REPLACEMENT BREAKDOWN ===
    Database replacements: {db_repl_count}
    Grok replacements: {grok_repl_count}
    """

    # Save outputs
    with open('enhanced_processed.txt', 'w', encoding='utf-8') as f:
        f.write(processed_text)

    with open('analysis_report.txt', 'w', encoding='utf-8') as f:
        f.write(report + "\n\n" + filtering_stats)

    with open('replacement_log.json', 'w', encoding='utf-8') as f:
        json.dump(replacement_log, f, ensure_ascii=False, indent=2)

    with open('grok_suggestions.json', 'w', encoding='utf-8') as f:
        json.dump(grok_suggestions, f, ensure_ascii=False, indent=2)

    # Download the two human-readable outputs. The JSON files are written
    # above but deliberately not downloaded.
    files.download('enhanced_processed.txt')
    files.download('analysis_report.txt')

    print(f"\n🎉 PROCESSING COMPLETE! 🎉")
    print(f"📊 Applied {total_repl} total replacements")
    print(f"🗄️ Database replacements: {db_repl_count}")
    print(f"🤖 Grok replacements: {grok_repl_count}")
    print("📁 Downloaded files:")
    print("   - enhanced_processed.txt (main output)")
    print("   - analysis_report.txt (detailed analysis)")
    print("📁 Saved in the working directory (not downloaded):")
    print("   - replacement_log.json (all replacements)")
    print("   - grok_suggestions.json (Grok suggestions)")

if __name__ == "__main__":
    # Optional: call demo_filtering() first to see how filtering works.
    divider = "=" * 50

    # Banner shown before the workflow starts.
    for banner_line in (
        "\n" + divider,
        "Ready to run enhanced_main()!",
        "Make sure to:",
        "1. Set your API keys in the enhanced_main() function",
    ):
        print(banner_line)

    enhanced_main()
    print(divider)


Ready to run enhanced_main()!
Make sure to:
1. Set your API keys in the enhanced_main() function
Loading Google Sheets data...
Detected columns: word -> traditional
Successfully loaded 220 word pairs
Generated 3731 total entries with variants
  אליל -> אפס חסר כוח
  אחז -> חלק אחד ממאה, מאית
  אני מאמינ -> שלושה‑עשר עיקרי אמונה
  אנכ -> קו ניצב/יורד מן הגובה
  איש מעשה -> בעל מצוות; חסיד
Please upload your Hebrew document...
Please select HEBREW DOCX files to upload...


Saving הפחד מחוסר ערך ומשימת החיים של איש הרו1 קלוד (1) (2).docx to הפחד מחוסר ערך ומשימת החיים של איש הרו1 קלוד (1) (2) (4).docx

Scanning file: הפחד מחוסר ערך ומשימת החיים של איש הרו1 קלוד (1) (2) (4).docx
Processing document with 8982 characters...
Found 46 potential replacements from database

=== SMART FILTERING FOR GROK ===
Words detected from database matching: 220
  Skipping 'רוח' - traditional
  Skipping 'אור' - traditional
  Skipping 'חכמה' - traditional
  Skipping 'צדק' - traditional
  Skipping 'חסד' - traditional
  ... and 1 more skipped words
Words in text: 631
Words to process with Grok: 625


Processing with Grok: 100%|██████████| 3/3 [00:38<00:00, 12.83s/it]

  Filtered out Grok suggestion: נפש (traditional)

=== GROK VALIDATION PASS ===
Validating 146 initial suggestions...
Sending validation request to Grok...





Received validation response, parsing...

📊 VALIDATION RESULTS:
Original suggestions: 146
Validated suggestions: 84
Rejected suggestions: 62
Rejection rate: 42.5%
Grok provided 84 additional suggestions
Applying 46 database matches...
Applying 84 Grok suggestions...
  ⏭️ Skipping משימת החיים (not found or already replaced)
  ⏭️ Skipping מרדף (not found or already replaced)
  ⏭️ Skipping מאמץ (not found or already replaced)
  ⏭️ Skipping דפוסים (not found or already replaced)
  ⏭️ Skipping מחויבויות (not found or already replaced)
  ⏭️ Skipping התפזר (not found or already replaced)
  ⏭️ Skipping התמדה (not found or already replaced)
  ⏭️ Skipping ראייה (not found or already replaced)
  ⏭️ Skipping תובנה (not found or already replaced)
  ⏭️ Skipping חוסר מעש (not found or already replaced)
  ⏭️ Skipping כאב נפשי (not found or already replaced)
  ⏭️ Skipping חוויה (not found or already replaced)
  ⏭️ Skipping שעמום (not found or already replaced)
  ⏭️ Skipping שואפת (not found or already 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


🎉 PROCESSING COMPLETE! 🎉
📊 Applied 65 total replacements
🗄️ Database replacements: 7
🤖 Grok replacements: 58
📁 Downloaded files:
   - enhanced_processed.txt (main output)
   - analysis_report.txt (detailed analysis)
   - replacement_log.json (all replacements)


#code DUMP


In [None]:


    # def enhanced_grok_api_call(self, text: str, known_words: List[str]) -> Dict[str, str]:
    #     """FIXED: Grok API call that actually uses the filtered words"""
    #     if not self.grok_api_key:
    #         return {}

    #     print(f"\n=== SMART FILTERING FOR GROK ===")
    #     print(f"Words detected from database matching: {len(known_words)}")

    #     # Extract words that appear in the text for filtering
    #     text_words = re.findall(r'[\u0590-\u05FF]+', text)  # Extract Hebrew words
    #     unique_text_words = list(set(text_words))

    #     # Filter words to only those worth sending to Grok
    #     filtered_words = self.word_filter.filter_words_for_grok_processing(unique_text_words)

    #     print(f"Words in text: {len(unique_text_words)}")
    #     print(f"Words to process with Grok: {len(filtered_words)}")
    #     print(f"API calls saved by filtering: {len(unique_text_words) - len(filtered_words)}")

    #     if not filtered_words:
    #         print("No words need Grok processing - all are either untranslatable or traditional!")
    #         return {}

    #     # Create cache key for this specific set of filtered words
    #     words_hash = hashlib.md5(str(sorted(filtered_words)).encode()).hexdigest()

    #     # Check cache first
    #     if words_hash in self.processed_cache:
    #         print("Using cached results for these filtered words")
    #         return self.processed_cache[words_hash]

    #     print(f"Sending {len(filtered_words)} filtered words to Grok for analysis...")

    #     # FIXED: Process filtered words in chunks instead of entire text
    #     word_chunks = self.chunk_words_for_grok(filtered_words, max_words_per_chunk=250)

    #     all_suggestions = {}

    #     for i, word_chunk in enumerate(tqdm(word_chunks, desc="Processing word chunks with Grok")):
    #         print(f"Processing word chunk {i+1}/{len(word_chunks)}: {word_chunk}")

    #         # FIXED: Better prompt focusing on specific filtered words
    #         sample_untranslatable = list(self.word_filter.untranslatable_words)[:10]
    #         sample_traditional = list(self.word_filter.already_traditional)[:8]

    #         prompt = f"""
    #         You are an expert in Hebrew linguistics specializing in Lashon Hakodesh (Traditional Hebrew).

    #         TASK: For each modern Hebrew word in the list below, provide its traditional Hebrew equivalent ONLY if it has one.
    #         The words are extracted from sentences, so you can take sentence context into account.

    #         WORDS TO ANALYZE: {word_chunk}

    #         CRITICAL EXCLUSIONS - DO NOT suggest equivalents for:
    #         ❌ Modern technology: {sample_untranslatable}
    #         ❌ Already traditional terms: {sample_traditional}
    #         ❌ Words already in our database: {known_words[:30]}

    #         INCLUDE ONLY words that are:
    #         ✓ Modern Hebrew neologisms with clear traditional equivalents
    #         ✓ Modern psychological/philosophical terms → traditional concepts
    #         ✓ Modern abstract concepts → traditional Hebrew terms

    #         EXAMPLES:
    #         ✓ "פחדים" → "יראה" (modern fears → traditional awe/fear)
    #         ✓ "מהות" → "עצם" (modern essence → traditional essence)
    #         ✓ "אתגר" → "נסיון" (modern challenge → traditional test/trial)

    #         RESPONSE FORMAT:
    #         Return ONLY valid JSON with words that have traditional equivalents:
    #         {{"modern_word": "traditional_equivalent"}}

    #         If no words have traditional equivalents, return: {{}}
    #         """

    #         try:
    #             headers = {
    #                 'Content-Type': 'application/json',
    #                 'Authorization': f'Bearer {self.grok_api_key}'
    #             }

    #             data = {
    #                 'model': 'grok-4-latest',  # Updated model name
    #                 'messages': [{'role': 'user', 'content': prompt}],
    #                 'max_tokens': 3500,  # Reduced since we're sending fewer words
    #                 'temperature': 0.1   # Very low temperature for consistency
    #             }

    #             response = requests.post(
    #                 'https://api.x.ai/v1/chat/completions',
    #                 headers=headers,
    #                 json=data,
    #                 timeout=30
    #             )

    #             if response.status_code == 200:
    #                 result = response.json()
    #                 content = result['choices'][0]['message']['content']

    #                 # Extract JSON from response
    #                 json_pattern = r'\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}'
    #                 json_matches = re.findall(json_pattern, content, re.DOTALL)

    #                 for json_match in json_matches:
    #                     try:
    #                         chunk_suggestions = json.loads(json_match)
    #                         if isinstance(chunk_suggestions, dict):
    #                             # Validate that suggested words were actually in our filtered list
    #                             # validated = self.validate_grok_suggestions_with_filter(chunk_suggestions, known_words)
    #                             validated = self.validate_grok_suggestions_strict(
    #                                 chunk_suggestions, known_words, word_chunk
    #                             )
    #                             all_suggestions.update(validated)
    #                             print(f"  ✅ Got {len(validated)} valid suggestions from chunk {i+1}")
    #                     except json.JSONDecodeError as e:
    #                         print(f"  ❌ JSON parsing error in chunk {i+1}: {e}")
    #                         continue

    #             else:
    #                 print(f"  ❌ Grok API error for chunk {i+1}: {response.status_code}")
    #                 if response.status_code == 429:
    #                     print("     Rate limit hit - consider adding delays")

    #         except Exception as e:
    #             print(f"  ❌ Error in Grok API call for chunk {i+1}: {e}")
    #             continue

    #     # Cache results
    #     self.processed_cache[words_hash] = all_suggestions
    #     print(f"Total suggestions from Grok: {len(all_suggestions)}")
    #     return all_suggestions