In [None]:
import torch
import os
import glob
import time
from PIL import Image
from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration

# Clear GPU memory and set environment
torch.cuda.empty_cache()
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

def setup_llava_model():
    """Setup LLaVA model with proper error handling and retries"""
    print("🚀 Setting up LLaVA model...")
    
    # Use LLaVA-NeXT which is more stable and faster to download
    model_id = "llava-hf/llava-v1.6-mistral-7b-hf"
    
    try:
        print(f"📥 Loading processor from {model_id}...")
        processor = LlavaNextProcessor.from_pretrained(model_id)
        print("✅ Processor loaded!")
        
        print(f"📥 Loading model from {model_id}...")
        model = LlavaNextForConditionalGeneration.from_pretrained(
            model_id,
            torch_dtype=torch.float16,
            low_cpu_mem_usage=True,
            device_map="auto"
        )
        print("✅ Model loaded!")
        
        return processor, model
        
    except Exception as e:
        print(f"❌ Error with {model_id}: {e}")
        print("🔄 Trying alternative model...")
        
        # Fallback to smaller, more reliable model
        fallback_model = "llava-hf/llava-1.5-7b-hf"
        try:
            print(f"📥 Loading fallback processor from {fallback_model}...")
            processor = LlavaNextProcessor.from_pretrained(fallback_model)
            print("✅ Fallback processor loaded!")
            
            print(f"📥 Loading fallback model from {fallback_model}...")
            model = LlavaNextForConditionalGeneration.from_pretrained(
                fallback_model,
                torch_dtype=torch.float16,
                low_cpu_mem_usage=True,
                device_map="auto"
            )
            print("✅ Fallback model loaded!")
            
            return processor, model
            
        except Exception as e2:
            print(f"❌ Fallback also failed: {e2}")
            raise Exception("Both primary and fallback models failed to load")

def process_single_image_llava(image_path, processor, model, output_folder):
    """Process a single image with LLaVA"""
    filename = os.path.basename(image_path)
    base_name = os.path.splitext(filename)[0]
    text_file_path = os.path.join(output_folder, f"{base_name}.txt")
    
    try:
        # Load and prepare image
        image = Image.open(image_path).convert("RGB")
        
        # Resize if too large (important for GPU memory)
        max_size = 512
        if max(image.size) > max_size:
            image.thumbnail((max_size, max_size), Image.Resampling.LANCZOS)
        
        # Prepare conversation
        conversation = [
            {
                "role": "user",
                "content": [
                    {"type": "image"},
                    {"type": "text", "text": "Extract all text from this image. Provide only the extracted text without any additional commentary or explanations."}
                ]
            }
        ]
        
        # Apply chat template and tokenize
        prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
        inputs = processor(images=image, text=prompt, return_tensors="pt").to("cuda")
        
        # Generate with optimized settings
        print(f"🔍 Processing {filename}...")
        start_time = time.time()
        
        with torch.no_grad():
            output = model.generate(
                **inputs,
                max_new_tokens=200,
                do_sample=False,
                temperature=0.1,
                pad_token_id=processor.tokenizer.eos_token_id
            )
        
        # Decode output
        generated_text = processor.decode(output[0], skip_special_tokens=True)
        
        # Clean up the output - extract only the assistant's response
        if "assistant" in generated_text.lower():
            # Find the last occurrence of assistant and extract text after it
            assistant_pos = generated_text.lower().rfind("assistant")
            if assistant_pos != -1:
                extracted_text = generated_text[assistant_pos + len("assistant"):].strip()
            else:
                extracted_text = generated_text.strip()
        else:
            extracted_text = generated_text.strip()
        
        # Remove any remaining conversation artifacts
        lines = extracted_text.split('\n')
        cleaned_lines = []
        for line in lines:
            line = line.strip()
            if line and not any(skip in line.lower() for skip in ['user:', 'assistant:', 'extract all text', 'provide only']):
                cleaned_lines.append(line)
        
        final_text = '\n'.join(cleaned_lines).strip()
        
        end_time = time.time()
        
        # Save result
        with open(text_file_path, 'w', encoding='utf-8') as f:
            f.write(final_text)
        
        print(f"✅ Processed in {end_time - start_time:.2f}s")
        if final_text:
            preview = final_text[:100] + "..." if len(final_text) > 100 else final_text
            print(f"📝 Extracted: {preview}")
        else:
            print("⚠️ No text found")
        
        return True
        
    except Exception as e:
        print(f"❌ Error processing {filename}: {e}")
        # Save error file
        with open(text_file_path, 'w', encoding='utf-8') as f:
            f.write(f"ERROR: {str(e)}")
        return False

def llava_ocr_main(images_folder="part_2_images", output_folder="extracted_text/llava_fixed"):
    """Main LLaVA OCR function"""
    print("🔥 STARTING FIXED LLaVA OCR PROCESSING 🔥")
    
    # Create output directory
    os.makedirs(output_folder, exist_ok=True)
    
    # Setup model
    try:
        processor, model = setup_llava_model()
    except Exception as e:
        print(f"💀 Failed to setup model: {e}")
        return
    
    # Find images
    image_extensions = ["*.jpg", "*.jpeg", "*.png", "*.bmp", "*.tiff"]
    image_files = []
    
    for ext in image_extensions:
        image_files.extend(glob.glob(os.path.join(images_folder, ext)))
        image_files.extend(glob.glob(os.path.join(images_folder, ext.upper())))
    
    if not image_files:
        print(f"❌ No images found in {images_folder}")
        return
    
    print(f"🖼️ Found {len(image_files)} images to process")
    
    # Process images one by one
    successful = 0
    failed = 0
    
    for i, image_path in enumerate(image_files, 1):
        print(f"\n📷 [{i}/{len(image_files)}] Processing: {os.path.basename(image_path)}")
        
        success = process_single_image_llava(image_path, processor, model, output_folder)
        
        if success:
            successful += 1
        else:
            failed += 1
        
        # Clear GPU memory after each image
        torch.cuda.empty_cache()
    
    print(f"\n🎉 LLaVA OCR COMPLETE! 🎉")
    print(f"✅ Successful: {successful}")
    print(f"❌ Failed: {failed}")
    print(f"📁 Results saved to: {output_folder}")

# Run the fixed LLaVA implementation
if __name__ == "__main__":
    llava_ocr_main()

# FIXED LLaVA Implementation - WORKING VERSION

In [None]:
import os
import glob
import cv2
import easyocr
import pyttsx3
from langdetect import detect
import time

def simple_ocr_solution(images_folder="part_2_images", output_folder="extracted_text"):
    """
    Simple working OCR solution using EasyOCR that actually works
    """
    # Create output directory
    os.makedirs(output_folder, exist_ok=True)
    print(f"Output folder: {output_folder}")
    
    # Initialize EasyOCR with multiple languages
    print("Initializing EasyOCR...")
    reader = easyocr.Reader(['en', 'ar', 'ur'], gpu=True, verbose=False)
    print("✅ EasyOCR ready!")
    
    # Initialize TTS
    tts_engine = pyttsx3.init()
    
    # Get all image files
    image_extensions = ["*.jpg", "*.jpeg", "*.png", "*.bmp", "*.tiff"]
    image_files = []
    
    for ext in image_extensions:
        image_files.extend(glob.glob(os.path.join(images_folder, ext)))
        image_files.extend(glob.glob(os.path.join(images_folder, ext.upper())))
    
    if not image_files:
        print(f"❌ No images found in {images_folder}")
        return
    
    print(f"Found {len(image_files)} images to process")
    
    successful_extractions = 0
    
    # Process each image
    for i, image_path in enumerate(image_files, 1):
        filename = os.path.basename(image_path)
        base_name = os.path.splitext(filename)[0]
        text_file_path = os.path.join(output_folder, f"{base_name}.txt")
        
        print(f"\n[{i}/{len(image_files)}] Processing: {filename}")
        
        try:
            # Read image with OpenCV
            image = cv2.imread(image_path)
            if image is None:
                print(f"❌ Could not read image: {image_path}")
                continue
            
            # Extract text using EasyOCR
            start_time = time.time()
            results = reader.readtext(image, paragraph=True, detail=0)
            end_time = time.time()
            
            # Combine all text
            extracted_text = " ".join(results).strip()
            
            if extracted_text:
                # Try to detect language
                try:
                    detected_lang = detect(extracted_text)
                    print(f"✅ Text extracted in {end_time - start_time:.2f}s - Language: {detected_lang}")
                    
                    # Save to file with language info
                    with open(text_file_path, 'w', encoding='utf-8') as f:
                        f.write(f"LANG:{detected_lang}\n")
                        f.write(extracted_text)
                    
                    print(f"💾 Saved to: {text_file_path}")
                    print(f"📝 Preview: {extracted_text[:100]}...")
                    
                    successful_extractions += 1
                    
                except Exception as lang_error:
                    print(f"⚠️ Language detection failed: {lang_error}")
                    with open(text_file_path, 'w', encoding='utf-8') as f:
                        f.write("LANG:unknown\n")
                        f.write(extracted_text)
                    successful_extractions += 1
            else:
                print("⚠️ No text detected")
                # Save empty file
                with open(text_file_path, 'w', encoding='utf-8') as f:
                    f.write("LANG:none\n")
                    f.write("")
                
        except Exception as e:
            print(f"❌ Error processing {filename}: {e}")
            continue
    
    print(f"\n=== Processing Complete ===")
    print(f"✅ Successfully extracted text from {successful_extractions}/{len(image_files)} images")
    
    # Ask if user wants to read files aloud
    if successful_extractions > 0:
        try:
            response = input(f"\n🔊 Read {successful_extractions} text files aloud? (y/n): ").lower()
            if response in ['y', 'yes']:
                read_text_files_aloud(output_folder, tts_engine)
        except:
            print("Skipping TTS...")
    
    return output_folder

def read_text_files_aloud(text_folder, tts_engine):
    """Read text files aloud"""
    text_files = glob.glob(os.path.join(text_folder, "*.txt"))
    
    for i, text_file in enumerate(text_files, 1):
        print(f"\n🔊 Reading file {i}/{len(text_files)}: {os.path.basename(text_file)}")
        
        try:
            with open(text_file, 'r', encoding='utf-8') as f:
                lines = f.readlines()
            
            if len(lines) > 1:
                content = "".join(lines[1:]).strip()  # Skip language line
                if content:
                    tts_engine.say(content)
                    tts_engine.runAndWait()
                    time.sleep(1)
                else:
                    print("No content to read")
            else:
                print("File is empty")
                
        except Exception as e:
            print(f"Error reading {text_file}: {e}")

# Run the simple solution
if __name__ == "__main__":
    simple_ocr_solution()

# Working Simple OCR Solution (EasyOCR + TTS)

In [None]:
# Download LLaVA model in smaller chunks with retries
import os
import time
from transformers import AutoProcessor, LlavaForConditionalGeneration
import torch

# Fix for slow downloads
os.environ['HF_HUB_DISABLE_PROGRESS_BARS'] = '0'
os.environ['HF_HUB_DOWNLOAD_TIMEOUT'] = '3600'  # 1 hour timeout

def download_llava_model():
    """Download LLaVA model with retry mechanism"""
    max_retries = 3
    
    for attempt in range(max_retries):
        try:
            print(f"Attempt {attempt + 1}/{max_retries}")
            
            # Try processor first
            print("Downloading processor...")
            processor = AutoProcessor.from_pretrained(
                "llava-hf/llava-1.5-7b-hf", 
                use_fast=False,
                trust_remote_code=True,
                cache_dir="./models"  # Local cache
            )
            print("✅ Processor downloaded!")
            
            # Then model
            print("Downloading model (this will take a while)...")
            model = LlavaForConditionalGeneration.from_pretrained(
                "llava-hf/llava-1.5-7b-hf",
                torch_dtype=torch.float16,
                low_cpu_mem_usage=True,
                trust_remote_code=True,
                cache_dir="./models"  # Local cache
            )
            print("✅ Model downloaded successfully!")
            
            return processor, model
            
        except Exception as e:
            print(f"❌ Attempt {attempt + 1} failed: {e}")
            if attempt < max_retries - 1:
                print("Retrying in 10 seconds...")
                time.sleep(10)
            else:
                print("All attempts failed. Check your internet connection.")
                return None, None

# Run the download
processor, model = download_llava_model()

# Working LLaVA Setup (Fixed)

## Environment Report

**Current Package Versions (Baseline before LLAVA/SHIKRA integration):**

- **Python**: 3.13.3
- **torch**: 2.7.1+cu128
- **torchvision**: 0.22.1+cu128
- **torchaudio**: 2.7.1+cu128
- **easyocr**: 1.7.2
- **opencv-python**: 4.12.0.88
- **pillow**: 11.2.1
- **numpy**: 2.2.6
- **pyttsx3**: 2.99
- **langdetect**: 1.0.9
- **transformers**: 4.52.3
- **gradio**: 5.31.0
- **CUDA Available**: Yes
- **CUDA Version**: 12.8
- **GPU**: NVIDIA GeForce RTX 2060
- **GPU Memory**: 12.88 GB

# Imports and Inits

In [1]:
import os
import glob
import torch
from PIL import Image
import time
import gc
from transformers import AutoProcessor, LlavaForConditionalGeneration
import easyocr
import pyttsx3
from langdetect import detect
import cv2
import numpy as np

## Moving Hugging Face default Download Dir

# # LLAVA OCR Cell - GPU Accelerated Implementation
# import os

# # IMPORTANT: Set cache BEFORE any imports from transformers/huggingface
# os.environ['HF_HOME'] = 'D:/HuggingFaceCache'
# os.environ['TRANSFORMERS_CACHE'] = 'D:/HuggingFaceCache/transformers'
# os.environ['HUGGINGFACE_HUB_CACHE'] = 'D:/HuggingFaceCache/hub'

# # Now import everything else
# import glob
# import torch
# from PIL import Image
# import time
# import gc
# from transformers import AutoProcessor, LlavaForConditionalGeneration

# Force GPU usage
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
torch.backends.cudnn.benchmark = True

# Check GPU availability
if not torch.cuda.is_available():
    print("ERROR: CUDA not available. LLAVA requires GPU acceleration.")
    exit(1)

print("=== LLAVA OCR Processing ===")
print(f"GPU: {torch.cuda.get_device_name(0)}")
print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
print(f"HuggingFace cache location: {os.environ.get('HF_HOME', 'default')}")

  from .autonotebook import tqdm as notebook_tqdm
2025-07-11 23:06:10.304046: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752257170.342471    5747 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752257170.353169    5747 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1752257170.428920    5747 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1752257170.428954    5747 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1752257170.428957    5747

=== LLAVA OCR Processing ===
GPU: NVIDIA GeForce RTX 2050
GPU Memory: 3.96 GB
HuggingFace cache location: default


# Simple TTS using pyttsx3 Gpu based:

In [None]:
# Force GPU usage and optimization for RTX 2060
os.environ['CUDA_VISIBLE_DEVICES'] = '0'  # Use first GPU
torch.backends.cudnn.benchmark = True  # Enable cudnn autotuner for performance

# Check GPU and setup
if not torch.cuda.is_available():
    print("ERROR: CUDA not available. This script requires GPU acceleration.")
    print("Please check your NVIDIA drivers and PyTorch installation.")
    exit(1)

# Display GPU information
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA version: {torch.version.cuda}")
print(f"GPU detected: {torch.cuda.get_device_name(0)}")
print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

# Clear GPU memory before starting
torch.cuda.empty_cache()
gc.collect()

# Initialize EasyOCR reader with explicit GPU settings
print("Initializing EasyOCR with GPU acceleration...")
reader = easyocr.Reader(
    ["ar", "ur", "en"], 
    gpu=True,
    verbose=False,
    # For RTX 2060, set reasonable batch size and model parameters
    detector=True,
    recognizer=True
)

# Initialize TTS engine
tts_engine = pyttsx3.init()

def log_gpu_memory():
    """Log current GPU memory usage"""
    allocated = torch.cuda.memory_allocated(0) / 1e9
    reserved = torch.cuda.memory_reserved(0) / 1e9
    print(f"GPU Memory: {allocated:.2f} GB allocated, {reserved:.2f} GB reserved")

def process_images_ocr_save_text(images_folder, output_folder):
    """
    Process all images in the folder with GPU-accelerated OCR and save to text files
    """
    # Create output folder if it doesn't exist
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
        print(f"Created output folder: {output_folder}")
    
    # Get all image files in the folder
    image_extensions = ["*.jpg", "*.jpeg", "*.png", "*.bmp", "*.tiff"]
    image_files = []

    for ext in image_extensions:
        image_files.extend(glob.glob(os.path.join(images_folder, ext)))
        image_files.extend(glob.glob(os.path.join(images_folder, ext.upper())))

    if not image_files:
        print(f"No images found in {images_folder}")
        return

    print(f"Found {len(image_files)} images to process")
    
    # Report initial GPU memory
    log_gpu_memory()

    # Process files
    for i, image_path in enumerate(image_files, 1):
        image_filename = os.path.basename(image_path)
        base_name = os.path.splitext(image_filename)[0]
        text_file_path = os.path.join(output_folder, f"{base_name}.txt")
        
        print(f"\n--- Processing Image {i}/{len(image_files)}: {image_filename} ---")

        try:
            # Read image
            image = cv2.imread(image_path)
            if image is None:
                print(f"Could not read image: {image_path}")
                continue

            # Optimize image for GPU processing (resize large images)
            h, w = image.shape[:2]
            max_dim = 2000  # Optimal for RTX 2060 memory
            if max(h, w) > max_dim:
                scale = max_dim / max(h, w)
                image = cv2.resize(image, (int(w * scale), int(h * scale)))
                print(f"Resized image to {image.shape[1]}x{image.shape[0]} to optimize GPU memory")
            
            # Report GPU memory before OCR
            log_gpu_memory()

            # Extract text using GPU-accelerated EasyOCR
            print("Extracting text with GPU acceleration...")
            start_time = time.time()
            
            # For RTX 2060, use appropriate batch size
            results = reader.readtext(
                image,
                batch_size=2,  # Adjust based on your GPU memory
                paragraph=True,  # Group text into paragraphs
                detail=0  # 0 for more accuracy
            )

            end_time = time.time()
            print(f"OCR completed in {end_time - start_time:.2f} seconds")

            # Combine all detected text with confidence filtering
            extracted_text = " ".join(
                [result[1] for result in results if result[2] > 0.5]
            )  # confidence > 0.5

            if not extracted_text.strip():
                print("No text detected in this image")
                continue

            print(f"Extracted text: {extracted_text}")
            
            # Try to detect language
            try:
                detected_lang = detect(extracted_text)
                print(f"Detected language: {detected_lang}")
                
                # Save text with language information to file
                with open(text_file_path, 'w', encoding='utf-8') as text_file:
                    text_file.write(f"LANG:{detected_lang}\n")
                    text_file.write(extracted_text)
                
                print(f"Saved text to: {text_file_path}")
                
            except Exception as lang_error:
                print(f"Language detection failed: {lang_error}")
                # Save text without language information
                with open(text_file_path, 'w', encoding='utf-8') as text_file:
                    text_file.write(f"LANG:unknown\n")
                    text_file.write(extracted_text)
                
                print(f"Saved text to: {text_file_path} (language unknown)")

            # Clear GPU memory after each image
            torch.cuda.empty_cache()
            
        except Exception as e:
            print(f"Error processing {image_path}: {e}")
            # Clear GPU memory on error
            torch.cuda.empty_cache()
            continue

    # Final GPU memory cleanup
    torch.cuda.empty_cache()
    gc.collect()
    print("\n--- OCR Processing Complete ---")
    log_gpu_memory()
    
    return output_folder

def read_text_files_aloud(text_folder):
    """
    Read all text files in the folder aloud using TTS, one by one with clear separation
    """
    # Get all text files
    text_files = glob.glob(os.path.join(text_folder, "*.txt"))
    
    if not text_files:
        print(f"No text files found in {text_folder}")
        return
    
    print(f"\nFound {len(text_files)} text files to read")
    
    # Language mapping for TTS
    lang_mapping = {
        "en": "english",
        "ar": "arabic",
        "ur": "urdu",
        "hi": "hindi",
        "fa": "persian",
        "ps": "pashto"
    }
    
    # Sort text files alphabetically to ensure consistent reading order
    text_files.sort()
    
    for i, text_file_path in enumerate(text_files, 1):
        file_name = os.path.basename(text_file_path)
        print(f"\n===== Reading File {i}/{len(text_files)}: {file_name} =====")
        
        # Announce the file being read (optional)
        tts_engine.setProperty("rate", 150)
        announcement = f"Reading file {i} of {len(text_files)}: {os.path.splitext(file_name)[0]}"
        print(announcement)
        tts_engine.say(announcement)
        tts_engine.runAndWait()
        
        # Pause between announcement and content
        time.sleep(1)
        
        try:
            # Read text file
            with open(text_file_path, 'r', encoding='utf-8') as file:
                lines = file.readlines()
                
            if not lines:
                print(f"File is empty: {text_file_path}")
                continue
                
            # Extract language information from first line
            lang_line = lines[0].strip()
            if lang_line.startswith("LANG:"):
                detected_lang = lang_line[5:]
                print(f"Language: {detected_lang}")
                # Remove the language line
                content = "".join(lines[1:])
            else:
                # No language information, treat all lines as content
                detected_lang = "unknown"
                content = "".join(lines)
            
            if not content.strip():
                print("No content to read")
                continue
                
            # Print a preview of the content
            content_preview = content[:100] + "..." if len(content) > 100 else content
            print(f"Text content: {content_preview}")
            
            # Get available voices
            voices = tts_engine.getProperty("voices")
            
            # Try to set appropriate voice based on language
            voice_set = False
            for voice in voices:
                tts_lang = lang_mapping.get(detected_lang, "english")
                if (tts_lang.lower() in voice.name.lower() or 
                    detected_lang in voice.id.lower()):
                    tts_engine.setProperty("voice", voice.id)
                    voice_set = True
                    print(f"Using voice: {voice.name}")
                    break
            
            if not voice_set:
                print(f"No specific voice found for {detected_lang}, using default")
            
            # Adjust speech rate based on language
            if detected_lang in ["ar", "ur"]:
                # Slower for Arabic and Urdu
                tts_engine.setProperty("rate", 130)
            else:
                tts_engine.setProperty("rate", 150)
            
            # Read the text aloud
            print(f"Reading text aloud...")
            tts_engine.say(content)
            tts_engine.runAndWait()
            
            # Pause between files to clearly separate them
            print("Finished reading file.")
            time.sleep(2)
            
        except Exception as e:
            print(f"Error reading {text_file_path}: {e}")
            continue
    
    print("\n===== All Text Files Have Been Read =====")

def get_user_confirmation():
    """
    Ask the user if they want to proceed to the TTS reading phase
    """
    while True:
        response = input("\nOCR processing complete. Proceed with reading text files? (y/n): ").lower()
        if response in ['y', 'yes']:
            return True
        elif response in ['n', 'no']:
            return False
        else:
            print("Please enter 'y' or 'n'")

# Main process
if __name__ == "__main__":
    try:
        # Define folders
        images_folder = "part_2_images"
        output_folder = "extracted_text"
        
        # Print GPU optimization message
        print("=== Running GPU-Optimized OCR for RTX 2060 ===")
        
        # Step 1: Process images with OCR and save text
        start_time = time.time()
        text_folder = process_images_ocr_save_text(images_folder, output_folder)
        end_time = time.time()
        
        print(f"OCR processing completed in {end_time - start_time:.2f} seconds")
        
        # Optional: Ask for user confirmation before proceeding to TTS
        if get_user_confirmation():
            # Step 2: Read the saved text files aloud one by one
            read_text_files_aloud(text_folder)
        else:
            print("TTS reading canceled. Text files are saved in the output folder.")
        
        print("Process completed successfully!")
        
    except KeyboardInterrupt:
        print("\nProcess interrupted by user")
        # Clean up GPU memory on interrupt
        torch.cuda.empty_cache()
    except Exception as e:
        print(f"An error occurred: {e}")
        # Clean up GPU memory on error
        torch.cuda.empty_cache()
    finally:
        # Final cleanup
        torch.cuda.empty_cache()
        gc.collect()

In [None]:
# Try a smaller vision model first
processor = AutoProcessor.from_pretrained("microsoft/git-base", use_fast=False)
print("Downloaded!")

# LLava Model


In [2]:
# # LLAVA OCR Cell - GPU Accelerated Implementation

# # Force GPU usage
# os.environ['CUDA_VISIBLE_DEVICES'] = '0'
# os.environ['HF_HOME'] = 'D:/HuggingFaceCache'
# torch.backends.cudnn.benchmark = True

# # Check GPU availability
# if not torch.cuda.is_available():
#     print("ERROR: CUDA not available. LLAVA requires GPU acceleration.")
#     exit(1)

# print("=== LLAVA OCR Processing ===")
# print(f"GPU: {torch.cuda.get_device_name(0)}")
# print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

# def process_images_with_llava(images_folder="part_2_images", output_folder="extracted_text/llava"):
#     """
#     Process images using LLAVA model for OCR with GPU acceleration
#     """
#     # Create output directory if it doesn't exist
#     os.makedirs(output_folder, exist_ok=True)
#     print(f"Output folder: {output_folder}")

#     # # Clear GPU memory before starting
#     # torch.cuda.empty_cache()
#     # gc.collect()

#     # Initialize LLAVA model and processor
#     print("Loading LLAVA model...")
#     try:
#         # TODO: Replace with your specific LLAVA model path/name
#         model_name = "llava-hf/llava-1.5-7b-hf"  # Example model name

#         processor = AutoProcessor.from_pretrained(model_name, use_fast=True)
#         model = LlavaForConditionalGeneration.from_pretrained(
#             model_name,
#             torch_dtype=torch.float16,  # Use fp16 for RTX 2060 efficiency
#             device_map="cuda",
#             low_cpu_mem_usage=True
#         )
#         model.eval()
#         print("LLAVA model loaded successfully!")

#     except Exception as e:
#         print(f"Error loading LLAVA model: {e}")
#         print("Please ensure you have the correct model name/path")
#         return

#     # Get all image files
#     image_extensions = ["*.jpg", "*.jpeg", "*.png", "*.bmp", "*.tiff"]
#     image_files = []

#     for ext in image_extensions:
#         image_files.extend(glob.glob(os.path.join(images_folder, ext)))
#         image_files.extend(glob.glob(os.path.join(images_folder, ext.upper())))

#     if not image_files:
#         print(f"No images found in {images_folder}")
#         return

#     print(f"Found {len(image_files)} images to process")

#     # Process each image
#     processed_count = 0
#     skipped_count = 0

#     for i, image_path in enumerate(image_files, 1):
#         image_filename = os.path.basename(image_path)
#         base_name = os.path.splitext(image_filename)[0]
#         text_file_path = os.path.join(output_folder, f"{base_name}.txt")

#         # Check if text file already exists (efficiency check)
#         if os.path.exists(text_file_path):
#             print(f"\n[{i}/{len(image_files)}] Skipping {image_filename} - already processed")
#             skipped_count += 1
#             continue

#         print(f"\n[{i}/{len(image_files)}] Processing {image_filename}...")

#         try:
#             # Load and preprocess image
#             image = Image.open(image_path).convert("RGB")

#             # Resize large images to optimize GPU memory
#             max_dim = 1024  # Adjust based on your GPU memory
#             if max(image.size) > max_dim:
#                 image.thumbnail((max_dim, max_dim), Image.Resampling.LANCZOS)
#                 print(f"Resized image to {image.size} for GPU optimization")

#             # Prepare prompt for OCR task
#             prompt = "USER: <image>\nExtract all text from this image. Provide only the extracted text without any additional commentary.\nASSISTANT:"

#             # Process with LLAVA
#             start_time = time.time()

#             # TODO: Adjust this section based on your specific LLAVA implementation
#             inputs = processor(prompt, image, return_tensors="pt").to("cuda")

#             # Generate text with GPU
#             with torch.no_grad():
#                 generated_ids = model.generate(
#                     **inputs,
#                     max_new_tokens=1024,
#                     temperature=0.1,  # Low temperature for more accurate OCR
#                     do_sample=False,
#                     use_cache=True
#                 )

#             # Decode the generated text
#             extracted_text = processor.decode(generated_ids[0], skip_special_tokens=True)

#             # Remove the prompt from the output
#             if "ASSISTANT:" in extracted_text:
#                 extracted_text = extracted_text.split("ASSISTANT:")[-1].strip()

#             end_time = time.time()
#             print(f"OCR completed in {end_time - start_time:.2f} seconds")

#             if not extracted_text.strip():
#                 print("No text detected in this image")
#                 # Save empty file to avoid reprocessing
#                 with open(text_file_path, 'w', encoding='utf-8') as f:
#                     f.write("")
#                 continue

#             # Save extracted text
#             with open(text_file_path, 'w', encoding='utf-8') as f:
#                 f.write(extracted_text)

#             print(f"Saved text to: {text_file_path}")
#             print(f"Text preview: {extracted_text[:100]}..." if len(extracted_text) > 100 else f"Text: {extracted_text}")

#             processed_count += 1

#             # Clear GPU memory after each image
#             torch.cuda.empty_cache()

#         except Exception as e:
#             print(f"Error processing {image_filename}: {e}")
#             # Save error file to avoid reprocessing
#             with open(text_file_path, 'w', encoding='utf-8') as f:
#                 f.write(f"ERROR: {str(e)}")
#             torch.cuda.empty_cache()
#             continue

#     # Final cleanup
#     del model
#     del processor
#     torch.cuda.empty_cache()
#     gc.collect()

#     print(f"\n=== LLAVA Processing Complete ===")
#     print(f"Processed: {processed_count} images")
#     print(f"Skipped: {skipped_count} images (already processed)")
#     print(f"Total: {len(image_files)} images")

# # Run LLAVA OCR processing
# if __name__ == "__main__":
#     process_images_with_llava()


# LLAVA OCR Cell - GPU Accelerated Implementation

import os
import gc
import time
import glob
import torch
from PIL import Image
from transformers import AutoProcessor, LlavaForConditionalGeneration
import warnings

warnings.filterwarnings("ignore")

# Force GPU usage
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# os.environ['HF_HOME'] = 'D:/HuggingFaceCache'
torch.backends.cudnn.benchmark = True

# Check GPU availability
device = "cuda" if torch.cuda.is_available() else "cpu"
use_gpu = torch.cuda.is_available()

print("=== LLAVA OCR Processing ===")
if use_gpu:
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(
        f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB"
    )
    print("Using GPU acceleration")
else:
    print("GPU not available - using CPU (will be slower)")
    print("For better performance, install CUDA-compatible PyTorch")


def process_images_with_llava(
    images_folder="part_2_images", output_folder="extracted_text/llava"
):
    """
    Process images using LLAVA model for OCR with GPU acceleration
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)
    print(f"Output folder: {output_folder}")

    # Clear GPU memory before starting (only if GPU available)
    if use_gpu:
        torch.cuda.empty_cache()
        gc.collect()

    # Initialize LLAVA model and processor
    print("Loading LLAVA model...")
    try:
        # Updated model name - use the correct LLAVA model
        model_name = "llava-hf/llava-1.5-7b-hf"

        print("Downloading/Loading processor...")
        processor = AutoProcessor.from_pretrained(
            model_name, use_fast=False, trust_remote_code=True
        )

        print("Downloading/Loading model...")
        model = LlavaForConditionalGeneration.from_pretrained(
            model_name,
            torch_dtype=torch.float16
            if use_gpu
            else torch.float32,  # Use fp16 only if GPU available
            device_map="auto"
            if use_gpu
            else None,  # Use auto device mapping only if GPU available
            low_cpu_mem_usage=True,
            trust_remote_code=True,
        )

        # Move model to appropriate device
        if use_gpu and not next(model.parameters()).is_cuda:
            model = model.to(device)
        elif not use_gpu:
            model = model.to(device)

        model.eval()
        print("LLAVA model loaded successfully!")

    except Exception as e:
        print(f"Error loading LLAVA model: {e}")
        print("Trying alternative approach...")

        # Alternative: Try different model or installation
        try:
            # Alternative model names to try
            alternative_models = [
                "llava-hf/llava-1.5-13b-hf",
                "llava-hf/llava-v1.6-mistral-7b-hf",
                "llava-hf/llava-v1.6-vicuna-7b-hf",
            ]

            for alt_model in alternative_models:
                try:
                    print(f"Trying {alt_model}...")
                    processor = AutoProcessor.from_pretrained(
                        alt_model, trust_remote_code=True
                    )
                    model = LlavaForConditionalGeneration.from_pretrained(
                        alt_model,
                        torch_dtype=torch.float16 if use_gpu else torch.float32,
                        device_map="auto" if use_gpu else None,
                        low_cpu_mem_usage=True,
                        trust_remote_code=True,
                    )
                    if use_gpu and not next(model.parameters()).is_cuda:
                        model = model.to(device)
                    elif not use_gpu:
                        model = model.to(device)
                    model.eval()
                    print(f"Successfully loaded {alt_model}!")
                    model_name = alt_model
                    break
                except Exception as alt_e:
                    print(f"Failed to load {alt_model}: {alt_e}")
                    continue
            else:
                print("All model loading attempts failed.")
                print("Please ensure you have installed the requirements:")
                print("pip install transformers torch torchvision accelerate")
                print("pip install git+https://github.com/huggingface/transformers.git")
                return

        except Exception as final_e:
            print(f"Final error: {final_e}")
            return

    # Get all image files
    image_extensions = ["*.jpg", "*.jpeg", "*.png", "*.bmp", "*.tiff"]
    image_files = []

    for ext in image_extensions:
        image_files.extend(glob.glob(os.path.join(images_folder, ext)))
        image_files.extend(glob.glob(os.path.join(images_folder, ext.upper())))

    if not image_files:
        print(f"No images found in {images_folder}")
        return

    print(f"Found {len(image_files)} images to process")

    # Process each image
    processed_count = 0
    skipped_count = 0

    for i, image_path in enumerate(image_files, 1):
        image_filename = os.path.basename(image_path)
        base_name = os.path.splitext(image_filename)[0]
        text_file_path = os.path.join(output_folder, f"{base_name}.txt")

        # Check if text file already exists (efficiency check)
        if os.path.exists(text_file_path):
            print(
                f"\n[{i}/{len(image_files)}] Skipping {image_filename} - already processed"
            )
            skipped_count += 1
            continue

        print(f"\n[{i}/{len(image_files)}] Processing {image_filename}...")

        try:
            # Load and preprocess image
            image = Image.open(image_path).convert("RGB")

            # Resize large images to optimize memory
            max_dim = 1024 if use_gpu else 512  # Smaller images for CPU processing
            if max(image.size) > max_dim:
                image.thumbnail((max_dim, max_dim), Image.Resampling.LANCZOS)
                print(
                    f"Resized image to {image.size} for {'GPU' if use_gpu else 'CPU'} optimization"
                )

            # Prepare prompt for OCR task - Updated format
            prompt = "USER: <image>\nExtract all text from this image. Provide only the extracted text without any additional commentary.\nASSISTANT:"

            # Process with LLAVA
            start_time = time.time()

            # Updated processing approach
            inputs = processor(prompt, image, return_tensors="pt")

            # Move inputs to appropriate device
            if use_gpu:
                inputs = {
                    k: v.to(device) if isinstance(v, torch.Tensor) else v
                    for k, v in inputs.items()
                }
            else:
                inputs = {
                    k: v.to(device) if isinstance(v, torch.Tensor) else v
                    for k, v in inputs.items()
                }

            # Generate text with appropriate device
            with torch.no_grad():
                generated_ids = model.generate(
                    **inputs,
                    max_new_tokens=1024 if use_gpu else 512,  # Reduce tokens for CPU
                    temperature=0.1,  # Low temperature for more accurate OCR
                    do_sample=False,
                    use_cache=True,
                    pad_token_id=processor.tokenizer.eos_token_id,
                )

            # Decode the generated text
            generated_text = processor.decode(
                generated_ids[0], skip_special_tokens=True
            )

            # Clean up the output - remove prompt
            if "ASSISTANT:" in generated_text:
                extracted_text = generated_text.split("ASSISTANT:")[-1].strip()
            else:
                extracted_text = generated_text.strip()

            # Remove any remaining prompt artifacts
            if "USER:" in extracted_text:
                extracted_text = extracted_text.split("USER:")[-1].strip()

            end_time = time.time()
            print(f"OCR completed in {end_time - start_time:.2f} seconds")

            if not extracted_text.strip():
                print("No text detected in this image")
                # Save empty file to avoid reprocessing
                with open(text_file_path, "w", encoding="utf-8") as f:
                    f.write("")
                continue

            # Save extracted text
            with open(text_file_path, "w", encoding="utf-8") as f:
                f.write(extracted_text)

            print(f"Saved text to: {text_file_path}")
            print(
                f"Text preview: {extracted_text[:100]}..."
                if len(extracted_text) > 100
                else f"Text: {extracted_text}"
            )

            processed_count += 1

            # Clear memory after each image (only if GPU available)
            if use_gpu:
                torch.cuda.empty_cache()

        except Exception as e:
            print(f"Error processing {image_filename}: {e}")
            # Save error file to avoid reprocessing
            with open(text_file_path, "w", encoding="utf-8") as f:
                f.write(f"ERROR: {str(e)}")
            if use_gpu:
                torch.cuda.empty_cache()
            continue

    # Final cleanup
    del model
    del processor
    if use_gpu:
        torch.cuda.empty_cache()
    gc.collect()

    print(f"\n=== LLAVA Processing Complete ===")
    print(f"Device used: {'GPU' if use_gpu else 'CPU'}")
    print(f"Processed: {processed_count} images")
    print(f"Skipped: {skipped_count} images (already processed)")
    print(f"Total: {len(image_files)} images")


# Install requirements function
def install_requirements():
    """Install required packages if not already installed"""
    import subprocess
    import sys

    required_packages = [
        "transformers>=4.36.0",
        "torch",
        "torchvision",
        "accelerate",
        "pillow",
        "bitsandbytes",  # For efficient loading
    ]

    print("Installing/updating required packages...")
    for package in required_packages:
        try:
            subprocess.check_call([sys.executable, "-m", "pip", "install", package])
        except subprocess.CalledProcessError as e:
            print(f"Failed to install {package}: {e}")

    # Install latest transformers from git for LLAVA support
    try:
        subprocess.check_call(
            [
                sys.executable,
                "-m",
                "pip",
                "install",
                "git+https://github.com/huggingface/transformers.git",
            ]
        )
    except subprocess.CalledProcessError as e:
        print(f"Failed to install transformers from git: {e}")


# Run LLAVA OCR processing
if __name__ == "__main__":
    # Uncomment the line below if you need to install requirements
    # install_requirements()

    process_images_with_llava()

=== LLAVA OCR Processing ===
GPU: NVIDIA GeForce RTX 2050
GPU Memory: 3.96 GB
Using GPU acceleration
Output folder: extracted_text/llava
Loading LLAVA model...
Downloading/Loading processor...


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


Downloading/Loading model...


Downloading shards:   0%|          | 0/3 [05:47<?, ?it/s]


KeyboardInterrupt: 

# Shikra Model

In [1]:
# # SHIKRA OCR Cell - GPU Accelerated Implementation
# import os
# import glob
# import torch
# from PIL import Image
# import time
# import gc
# from transformers import AutoProcessor, AutoModelForVision2Seq

# # Force GPU usage
# os.environ['CUDA_VISIBLE_DEVICES'] = '0'
# torch.backends.cudnn.benchmark = True

# # Check GPU availability
# if not torch.cuda.is_available():
#     print("ERROR: CUDA not available. SHIKRA requires GPU acceleration.")
#     exit(1)

# print("=== SHIKRA OCR Processing ===")
# print(f"GPU: {torch.cuda.get_device_name(0)}")
# print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

# def process_images_with_shikra(images_folder="part_2_images", output_folder="extracted_text/shikra"):
#     """
#     Process images using SHIKRA model for OCR with GPU acceleration
#     """
#     # Create output directory if it doesn't exist
#     os.makedirs(output_folder, exist_ok=True)
#     print(f"Output folder: {output_folder}")
    
#     # Clear GPU memory before starting
#     torch.cuda.empty_cache()
#     gc.collect()
    
#     # Initialize SHIKRA model and processor
#     print("Loading SHIKRA model...")
#     try:
#         # TODO: Replace with your specific SHIKRA model path/name
#         model_name = "shikras/shikra-7b-delta-v1"  # Example model name
        
#         processor = AutoProcessor.from_pretrained(model_name, use_fast=False)
#         model = AutoModelForVision2Seq.from_pretrained(
#             model_name,
#             torch_dtype=torch.float16,  # Use fp16 for RTX 2060 efficiency
#             device_map="cuda",
#             low_cpu_mem_usage=True
#         )
#         model.eval()
#         print("SHIKRA model loaded successfully!")
        
#     except Exception as e:
#         print(f"Error loading SHIKRA model: {e}")
#         print("Please ensure you have the correct model name/path")
#         return
    
#     # Get all image files
#     image_extensions = ["*.jpg", "*.jpeg", "*.png", "*.bmp", "*.tiff"]
#     image_files = []
    
#     for ext in image_extensions:
#         image_files.extend(glob.glob(os.path.join(images_folder, ext)))
#         image_files.extend(glob.glob(os.path.join(images_folder, ext.upper())))
    
#     if not image_files:
#         print(f"No images found in {images_folder}")
#         return
    
#     print(f"Found {len(image_files)} images to process")
    
#     # Process each image
#     processed_count = 0
#     skipped_count = 0
    
#     for i, image_path in enumerate(image_files, 1):
#         image_filename = os.path.basename(image_path)
#         base_name = os.path.splitext(image_filename)[0]
#         text_file_path = os.path.join(output_folder, f"{base_name}.txt")
        
#         # Check if text file already exists (efficiency check)
#         if os.path.exists(text_file_path):
#             print(f"\n[{i}/{len(image_files)}] Skipping {image_filename} - already processed")
#             skipped_count += 1
#             continue
        
#         print(f"\n[{i}/{len(image_files)}] Processing {image_filename}...")
        
#         try:
#             # Load and preprocess image
#             image = Image.open(image_path).convert("RGB")
            
#             # Resize large images to optimize GPU memory
#             max_dim = 1024  # Adjust based on your GPU memory
#             if max(image.size) > max_dim:
#                 image.thumbnail((max_dim, max_dim), Image.Resampling.LANCZOS)
#                 print(f"Resized image to {image.size} for GPU optimization")
            
#             # Prepare prompt for OCR task
#             # SHIKRA might use a different prompt format - adjust as needed
#             prompt = "<image> Extract and transcribe all text visible in this image."
            
#             # Process with SHIKRA
#             start_time = time.time()
            
#             # TODO: Adjust this section based on your specific SHIKRA implementation
#             inputs = processor(
#                 text=prompt,
#                 images=image,
#                 return_tensors="pt"
#             ).to("cuda")
            
#             # Generate text with GPU
#             with torch.no_grad():
#                 generated_ids = model.generate(
#                     **inputs,
#                     max_new_tokens=1024,
#                     temperature=0.1,  # Low temperature for more accurate OCR
#                     do_sample=False,
#                     num_beams=1,  # Adjust based on accuracy vs speed tradeoff
#                     use_cache=True
#                 )
            
#             # Decode the generated text
#             extracted_text = processor.decode(generated_ids[0], skip_special_tokens=True)
            
#             # Clean up the output (remove prompt if included)
#             if prompt in extracted_text:
#                 extracted_text = extracted_text.replace(prompt, "").strip()
            
#             end_time = time.time()
#             print(f"OCR completed in {end_time - start_time:.2f} seconds")
            
#             if not extracted_text.strip():
#                 print("No text detected in this image")
#                 # Save empty file to avoid reprocessing
#                 with open(text_file_path, 'w', encoding='utf-8') as f:
#                     f.write("")
#                 continue
            
#             # Save extracted text
#             with open(text_file_path, 'w', encoding='utf-8') as f:
#                 f.write(extracted_text)
            
#             print(f"Saved text to: {text_file_path}")
#             print(f"Text preview: {extracted_text[:100]}..." if len(extracted_text) > 100 else f"Text: {extracted_text}")
            
#             processed_count += 1
            
#             # Clear GPU memory after each image
#             torch.cuda.empty_cache()
            
#         except Exception as e:
#             print(f"Error processing {image_filename}: {e}")
#             # Save error file to avoid reprocessing
#             with open(text_file_path, 'w', encoding='utf-8') as f:
#                 f.write(f"ERROR: {str(e)}")
#             torch.cuda.empty_cache()
#             continue
    
#     # Final cleanup
#     del model
#     del processor
#     torch.cuda.empty_cache()
#     gc.collect()
    
#     print(f"\n=== SHIKRA Processing Complete ===")
#     print(f"Processed: {processed_count} images")
#     print(f"Skipped: {skipped_count} images (already processed)")
#     print(f"Total: {len(image_files)} images")

# # Run SHIKRA OCR processing
# if __name__ == "__main__":
#     process_images_with_shikra()





# import os
# from transformers import AutoProcessor, AutoModelForVision2Seq
# from PIL import Image
# import torch

# # Initialize model and processor from Hugging Face
# model_name = "shikras/shikra-7b-delta-v1-0708"
# processor = AutoProcessor.from_pretrained(model_name)
# model = AutoModelForVision2Seq.from_pretrained(model_name)

# # Path to the folder containing images
# image_folder = 'part_2_images'

# # Function to process and extract text from images
# def extract_text_from_images(image_folder):
#     # List all image files in the folder
#     image_files = [f for f in os.listdir(image_folder) if f.endswith(('.png', '.jpg', '.jpeg'))]

#     extracted_text = {}

#     for img_file in image_files:
#         img_path = os.path.join(image_folder, img_file)

#         # Open image
#         image = Image.open(img_path)

#         # Preprocess the image and make predictions
#         inputs = processor(images=image, return_tensors="pt")
#         pixel_values = inputs["pixel_values"].to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

#         # Get predictions from the model
#         with torch.no_grad():
#             outputs = model.generate(pixel_values=pixel_values)
        
#         # Decode the generated output (i.e., text)
#         extracted_text[img_file] = processor.decode(outputs[0], skip_special_tokens=True)

#         print(f"Extracted text from {img_file}:")
#         print(extracted_text[img_file])
#         print('-' * 50)

#     # Save extracted text to a file
#     with open("extracted_text.txt", "w") as f:
#         for img_file, text in extracted_text.items():
#             f.write(f"Text from {img_file}:\n{text}\n\n")

# # Call the function
# extract_text_from_images(image_folder)

import os
from dotenv import load_dotenv
from transformers import AutoProcessor, AutoModelForVision2Seq
from PIL import Image
import torch

# Load environment variables from .env file
load_dotenv()

# Get variables from the .env file
MODEL_NAME = os.getenv("MODEL_NAME")
IMAGE_FOLDER = os.getenv("IMAGE_FOLDER")
OUTPUT_FILE = os.getenv("OUTPUT_FILE")

# Check if variables are loaded correctly
if not MODEL_NAME or not IMAGE_FOLDER or not OUTPUT_FILE:
    raise ValueError("Missing one or more environment variables.")

# Initialize model and processor from Hugging Face
processor = AutoProcessor.from_pretrained(MODEL_NAME)
model = AutoModelForVision2Seq.from_pretrained(MODEL_NAME)

# Function to process and extract text from images
def extract_text_from_images(image_folder):
    image_files = [f for f in os.listdir(image_folder) if f.endswith(('.png', '.jpg', '.jpeg'))]
    extracted_text = {}

    for img_file in image_files:
        img_path = os.path.join(image_folder, img_file)
        image = Image.open(img_path)

        # Preprocess the image
        inputs = processor(images=image, return_tensors="pt")
        pixel_values = inputs["pixel_values"].to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

        # Get predictions from the model
        with torch.no_grad():
            outputs = model.generate(pixel_values=pixel_values)

        # Decode the generated output (i.e., text)
        extracted_text[img_file] = processor.decode(outputs[0], skip_special_tokens=True)

        print(f"Extracted text from {img_file}:")
        print(extracted_text[img_file])
        print('-' * 50)

    # Save extracted text to a file
    with open(OUTPUT_FILE, "w") as f:
        for img_file, text in extracted_text.items():
            f.write(f"Text from {img_file}:\n{text}\n\n")

# Call the function
extract_text_from_images(IMAGE_FOLDER)


  from .autonotebook import tqdm as notebook_tqdm


RuntimeError: Failed to import transformers.models.auto.processing_auto because of the following error (look up to see its traceback):
operator torchvision::nms does not exist

# Hugging face version

In [1]:
import torch
import os
import glob
from PIL import Image
from transformers import AutoProcessor, LlavaForConditionalGeneration

# Load the model in half-precision
print("Loading LLAVA model for batch processing...")
model = LlavaForConditionalGeneration.from_pretrained("liuhaotian/llava-v1.6-vicuna-7b", torch_dtype=torch.float16, device_map="auto")
processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf", use_fast=False)

def process_images_batch(images_folder="part_2_images", output_folder="extracted_text/llava_batch", batch_size=2):
    """
    Process images from folder using batch processing with LLAVA
    """
    # Create output directory
    os.makedirs(output_folder, exist_ok=True)
    
    # Get all image files
    image_extensions = ["*.jpg", "*.jpeg", "*.png", "*.bmp", "*.tiff"]
    image_files = []
    
    for ext in image_extensions:
        image_files.extend(glob.glob(os.path.join(images_folder, ext)))
        image_files.extend(glob.glob(os.path.join(images_folder, ext.upper())))
    
    if not image_files:
        print(f"No images found in {images_folder}")
        return
    
    print(f"Found {len(image_files)} images to process in batches of {batch_size}")
    
    # Process images in batches
    for i in range(0, len(image_files), batch_size):
        batch_files = image_files[i:i + batch_size]
        print(f"\nProcessing batch {i//batch_size + 1}: {len(batch_files)} images")
        
        # Prepare conversations for this batch
        conversations = []
        valid_files = []
        
        for image_path in batch_files:
            try:
                # Load and validate image
                image = Image.open(image_path).convert("RGB")
                
                # Create conversation for this image
                conversation = [
                    {
                        "role": "user",
                        "content": [
                            {"type": "image", "image": image},
                            {"type": "text", "text": "Extract all text from this image. Provide only the extracted text without any additional commentary."},
                        ],
                    },
                ]
                
                conversations.append(conversation)
                valid_files.append(image_path)
                
            except Exception as e:
                print(f"Error loading {image_path}: {e}")
                continue
        
        if not conversations:
            print("No valid images in this batch, skipping...")
            continue
        
        try:
            # Process batch
            print(f"Processing {len(conversations)} images...")
            
            inputs = processor.apply_chat_template(
                conversations,
                add_generation_prompt=True,
                tokenize=True,
                return_dict=True,
                padding=True,
                return_tensors="pt"
            ).to(model.device, torch.float16)
            
            # Generate text for all images in batch
            with torch.no_grad():
                generate_ids = model.generate(**inputs, max_new_tokens=512, do_sample=False)
            
            # Decode results
            results = processor.batch_decode(generate_ids, skip_special_tokens=True)
            
            # Save results for each image
            for j, (image_path, result) in enumerate(zip(valid_files, results)):
                filename = os.path.basename(image_path)
                base_name = os.path.splitext(filename)[0]
                text_file_path = os.path.join(output_folder, f"{base_name}.txt")
                
                # Clean up the result text
                if "assistant" in result.lower():
                    # Find the last occurrence of assistant and take text after it
                    parts = result.lower().split("assistant")
                    if len(parts) > 1:
                        extracted_text = result[result.lower().rfind("assistant") + len("assistant"):].strip()
                    else:
                        extracted_text = result.strip()
                else:
                    extracted_text = result.strip()
                
                # Remove any remaining artifacts
                lines = extracted_text.split('\n')
                cleaned_lines = []
                for line in lines:
                    line = line.strip()
                    if line and not line.lower().startswith(('user:', 'assistant:', 'extract all text')):
                        cleaned_lines.append(line)
                
                final_text = '\n'.join(cleaned_lines).strip()
                
                # Save to file
                with open(text_file_path, 'w', encoding='utf-8') as f:
                    f.write(final_text)
                
                print(f"Saved: {filename} -> {text_file_path}")
                if final_text:
                    preview = final_text[:100] + "..." if len(final_text) > 100 else final_text
                    print(f"Preview: {preview}")
                else:
                    print("No text extracted")
            
            # Clear GPU memory after each batch
            torch.cuda.empty_cache()
            
        except Exception as e:
            print(f"Error processing batch: {e}")
            # Save error files for this batch
            for image_path in valid_files:
                filename = os.path.basename(image_path)
                base_name = os.path.splitext(filename)[0]
                text_file_path = os.path.join(output_folder, f"{base_name}.txt")
                with open(text_file_path, 'w', encoding='utf-8') as f:
                    f.write(f"ERROR: {str(e)}")
            torch.cuda.empty_cache()
            continue
    
    print(f"\n=== Batch Processing Complete ===")
    print(f"Results saved to: {output_folder}")

# Run the batch processing
if __name__ == "__main__":
    process_images_batch(batch_size=2)  # Adjust batch_size based on your GPU memory

ModuleNotFoundError: No module named 'torch'

# LLava via Olama



In [2]:
import os
import glob
import ollama
from PIL import Image


def direct_ollama_llava_ocr(
    images_folder="part_2_images", output_folder="extracted_text/ollama_direct"
):
    """Process images with Ollama LLaVA directly (no HTTP requests)"""

    # Create output directory
    os.makedirs(output_folder, exist_ok=True)

    # Get all image files
    image_extensions = ["*.jpg", "*.jpeg", "*.png", "*.bmp", "*.tiff"]
    image_files = []

    for ext in image_extensions:
        image_files.extend(glob.glob(os.path.join(images_folder, ext)))
        image_files.extend(glob.glob(os.path.join(images_folder, ext.upper())))

    if not image_files:
        print(f"No images found in {images_folder}")
        return

    print(f"Found {len(image_files)} images to process")

    successful = 0

    for i, image_path in enumerate(image_files, 1):
        filename = os.path.basename(image_path)
        base_name = os.path.splitext(filename)[0]
        text_file_path = os.path.join(output_folder, f"{base_name}.txt")

        print(f"[{i}/{len(image_files)}] Processing: {filename}")

        try:
            # Use Ollama directly with the image file
            response = ollama.generate(
                model="llava",
                prompt="Extract all text from this image. Provide only the extracted text without any commentary.",
                images=[image_path],
            )

            extracted_text = response["response"].strip()

            # Save extracted text
            with open(text_file_path, "w", encoding="utf-8") as f:
                f.write(extracted_text)

            print(f"✅ Saved: {text_file_path}")
            if extracted_text:
                preview = (
                    extracted_text[:100] + "..."
                    if len(extracted_text) > 100
                    else extracted_text
                )
                print(f"📝 Text: {preview}")
            else:
                print("⚠️ No text extracted")

            successful += 1

        except Exception as e:
            print(f"❌ Error processing {filename}: {e}")
            with open(text_file_path, "w", encoding="utf-8") as f:
                f.write(f"ERROR: {str(e)}")

    print(
        f"\n🎉 OCR Complete! Successfully processed {successful}/{len(image_files)} images"
    )
    print(f"📁 Results saved to: {output_folder}")


# Run the OCR processing
if __name__ == "__main__":
    direct_ollama_llava_ocr()

Found 3 images to process
[1/3] Processing: IMG_20250629_214514_439.jpg
✅ Saved: extracted_text/ollama_direct/IMG_20250629_214514_439.txt
📝 Text: The image is blurry and rotated, making it difficult to read the text with certainty. However, I wil...
[2/3] Processing: IMG_20250629_214324_528.jpg
✅ Saved: extracted_text/ollama_direct/IMG_20250629_214324_528.txt
📝 Text: The image shows a page with handwritten notes, and it appears to be a personal study or workbook. He...
[3/3] Processing: textbook_img.jpg
✅ Saved: extracted_text/ollama_direct/textbook_img.txt
📝 Text: The text in the image reads:

"Episode 25: When the Moon Splits

Synopsis:

The war for Ryloth rages...

🎉 OCR Complete! Successfully processed 3/3 images
📁 Results saved to: extracted_text/ollama_direct


# Gemini Api Test

In [1]:
from google import genai

# The client gets the API key from the environment variable `GEMINI_API_KEY` automatically
client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-pro", contents="which gemini model to use for ocr?"
)
print(response.text)

Of course! This is an excellent question, as Gemini's multimodal capabilities make it a powerful, but different, kind of tool for OCR.

Here’s a breakdown of which Gemini model to use and how it compares to traditional OCR services.

### The Short Answer: The Model to Use

For OCR tasks, you should use the Gemini models with **vision capabilities**. As of now, your primary choices are:

1.  **Gemini 1.5 Pro:** **This is the recommended choice.** It's the latest and most capable publicly available model. It has a massive context window (up to 1 million tokens), which means it can analyze very large documents, multiple documents at once, or even frames from a video for text. Its reasoning and accuracy are state-of-the-art.
2.  **Gemini Pro Vision:** This was the first widely available multimodal model in the Gemini family. It is still very powerful, reliable, and a great choice for most standard OCR tasks. If `1.5 Pro` is not available to you or seems like overkill, this is your go-to mo

# Gemini OCR 

In [5]:
#!/usr/bin/env python3
import os
from pathlib import Path
from google import genai

def extract_text_from_images():
    # Initialize the Gemini client
    client = genai.Client()
    
    # Define paths
    images_dir = Path("part_2_images")
    output_dir = Path("gemini 2.5 pro/extracted_text")
    
    # Ensure output directory exists
    output_dir.mkdir(parents=True, exist_ok=True)
    
    # Process each image file
    for image_file in images_dir.glob("*.jpg"):
        print(f"Processing {image_file.name}...")
        
        try:
            # Read the image file
            with open(image_file, "rb") as f:
                image_data = f.read()
            
            # Create the prompt for OCR
            import base64
            image_b64 = base64.b64encode(image_data).decode()
            
            response = client.models.generate_content(
                model="gemini-2.5-pro",
                contents=[
                    {
                        "role": "user",
                        "parts": [
                            {
                                "text": "Extract all text from this image. Return only the text content, no additional commentary."
                            },
                            {
                                "inline_data": {
                                    "mime_type": "image/jpeg",
                                    "data": image_b64
                                }
                            }
                        ]
                    }
                ]
            )
            
            # Save extracted text to file
            output_file = output_dir / f"{image_file.stem}_extracted.txt"
            with open(output_file, "w", encoding="utf-8") as f:
                f.write(response.text)
            
            print(f"✓ Extracted text saved to {output_file}")
            
        except Exception as e:
            print(f"✗ Error processing {image_file.name}: {str(e)}")

if __name__ == "__main__":
    extract_text_from_images()
    

Processing IMG_20250629_214514_439.jpg...
✓ Extracted text saved to gemini 2.5 pro/extracted_text/IMG_20250629_214514_439_extracted.txt
Processing textbook_img.jpg...
✓ Extracted text saved to gemini 2.5 pro/extracted_text/textbook_img_extracted.txt
Processing IMG_20250629_214324_528.jpg...
✓ Extracted text saved to gemini 2.5 pro/extracted_text/IMG_20250629_214324_528_extracted.txt
