## Environment Report

**Current Package Versions (Baseline before LLAVA/SHIKRA integration):**

- **Python**: 3.13.3
- **torch**: 2.7.1+cu128
- **torchvision**: 0.22.1+cu128
- **torchaudio**: 2.7.1+cu128
- **easyocr**: 1.7.2
- **opencv-python**: 4.12.0.88
- **pillow**: 11.2.1
- **numpy**: 2.2.6
- **pyttsx3**: 2.99
- **langdetect**: 1.0.9
- **transformers**: 4.52.3
- **gradio**: 5.31.0
- **CUDA Available**: Yes
- **CUDA Version**: 12.8
- **GPU**: NVIDIA GeForce RTX 2060
- **GPU Memory**: 12.88 GB

# Imports and Inits

In [1]:
import os
import glob
import torch
from PIL import Image
import time
import gc
from transformers import AutoProcessor, LlavaForConditionalGeneration
import easyocr
import pyttsx3
from langdetect import detect
import cv2
import numpy as np

## Moving Hugging Face default Download Dir

# # LLAVA OCR Cell - GPU Accelerated Implementation
# import os

# # IMPORTANT: Set cache BEFORE any imports from transformers/huggingface
# os.environ['HF_HOME'] = 'D:/HuggingFaceCache'
# os.environ['TRANSFORMERS_CACHE'] = 'D:/HuggingFaceCache/transformers'
# os.environ['HUGGINGFACE_HUB_CACHE'] = 'D:/HuggingFaceCache/hub'

# # Now import everything else
# import glob
# import torch
# from PIL import Image
# import time
# import gc
# from transformers import AutoProcessor, LlavaForConditionalGeneration

# Force GPU usage
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
torch.backends.cudnn.benchmark = True

# Check GPU availability
if not torch.cuda.is_available():
    print("ERROR: CUDA not available. LLAVA requires GPU acceleration.")
    exit(1)

print("=== LLAVA OCR Processing ===")
print(f"GPU: {torch.cuda.get_device_name(0)}")
print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
print(f"HuggingFace cache location: {os.environ.get('HF_HOME', 'default')}")

  from .autonotebook import tqdm as notebook_tqdm
2025-07-11 01:52:08.266639: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752180728.279169   14124 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752180728.282756   14124 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1752180728.293164   14124 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1752180728.293180   14124 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1752180728.293181   14124

=== LLAVA OCR Processing ===
GPU: NVIDIA GeForce RTX 2050
GPU Memory: 3.96 GB
HuggingFace cache location: default


# Simple TTS using pyttsx3 Gpu based:

In [None]:
# Force GPU usage and optimization for RTX 2060
os.environ['CUDA_VISIBLE_DEVICES'] = '0'  # Use first GPU
torch.backends.cudnn.benchmark = True  # Enable cudnn autotuner for performance

# Check GPU and setup
if not torch.cuda.is_available():
    print("ERROR: CUDA not available. This script requires GPU acceleration.")
    print("Please check your NVIDIA drivers and PyTorch installation.")
    exit(1)

# Display GPU information
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA version: {torch.version.cuda}")
print(f"GPU detected: {torch.cuda.get_device_name(0)}")
print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

# Clear GPU memory before starting
torch.cuda.empty_cache()
gc.collect()

# Initialize EasyOCR reader with explicit GPU settings
print("Initializing EasyOCR with GPU acceleration...")
reader = easyocr.Reader(
    ["ar", "ur", "en"], 
    gpu=True,
    verbose=False,
    # For RTX 2060, set reasonable batch size and model parameters
    detector=True,
    recognizer=True
)

# Initialize TTS engine
tts_engine = pyttsx3.init()

def log_gpu_memory():
    """Log current GPU memory usage"""
    allocated = torch.cuda.memory_allocated(0) / 1e9
    reserved = torch.cuda.memory_reserved(0) / 1e9
    print(f"GPU Memory: {allocated:.2f} GB allocated, {reserved:.2f} GB reserved")

def process_images_ocr_save_text(images_folder, output_folder):
    """
    Process all images in the folder with GPU-accelerated OCR and save to text files
    """
    # Create output folder if it doesn't exist
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
        print(f"Created output folder: {output_folder}")
    
    # Get all image files in the folder
    image_extensions = ["*.jpg", "*.jpeg", "*.png", "*.bmp", "*.tiff"]
    image_files = []

    for ext in image_extensions:
        image_files.extend(glob.glob(os.path.join(images_folder, ext)))
        image_files.extend(glob.glob(os.path.join(images_folder, ext.upper())))

    if not image_files:
        print(f"No images found in {images_folder}")
        return

    print(f"Found {len(image_files)} images to process")
    
    # Report initial GPU memory
    log_gpu_memory()

    # Process files
    for i, image_path in enumerate(image_files, 1):
        image_filename = os.path.basename(image_path)
        base_name = os.path.splitext(image_filename)[0]
        text_file_path = os.path.join(output_folder, f"{base_name}.txt")
        
        print(f"\n--- Processing Image {i}/{len(image_files)}: {image_filename} ---")

        try:
            # Read image
            image = cv2.imread(image_path)
            if image is None:
                print(f"Could not read image: {image_path}")
                continue

            # Optimize image for GPU processing (resize large images)
            h, w = image.shape[:2]
            max_dim = 2000  # Optimal for RTX 2060 memory
            if max(h, w) > max_dim:
                scale = max_dim / max(h, w)
                image = cv2.resize(image, (int(w * scale), int(h * scale)))
                print(f"Resized image to {image.shape[1]}x{image.shape[0]} to optimize GPU memory")
            
            # Report GPU memory before OCR
            log_gpu_memory()

            # Extract text using GPU-accelerated EasyOCR
            print("Extracting text with GPU acceleration...")
            start_time = time.time()
            
            # For RTX 2060, use appropriate batch size
            results = reader.readtext(
                image,
                batch_size=2,  # Adjust based on your GPU memory
                paragraph=True,  # Group text into paragraphs
                detail=0  # 0 for more accuracy
            )

            end_time = time.time()
            print(f"OCR completed in {end_time - start_time:.2f} seconds")

            # Combine all detected text with confidence filtering
            extracted_text = " ".join(
                [result[1] for result in results if result[2] > 0.5]
            )  # confidence > 0.5

            if not extracted_text.strip():
                print("No text detected in this image")
                continue

            print(f"Extracted text: {extracted_text}")
            
            # Try to detect language
            try:
                detected_lang = detect(extracted_text)
                print(f"Detected language: {detected_lang}")
                
                # Save text with language information to file
                with open(text_file_path, 'w', encoding='utf-8') as text_file:
                    text_file.write(f"LANG:{detected_lang}\n")
                    text_file.write(extracted_text)
                
                print(f"Saved text to: {text_file_path}")
                
            except Exception as lang_error:
                print(f"Language detection failed: {lang_error}")
                # Save text without language information
                with open(text_file_path, 'w', encoding='utf-8') as text_file:
                    text_file.write(f"LANG:unknown\n")
                    text_file.write(extracted_text)
                
                print(f"Saved text to: {text_file_path} (language unknown)")

            # Clear GPU memory after each image
            torch.cuda.empty_cache()
            
        except Exception as e:
            print(f"Error processing {image_path}: {e}")
            # Clear GPU memory on error
            torch.cuda.empty_cache()
            continue

    # Final GPU memory cleanup
    torch.cuda.empty_cache()
    gc.collect()
    print("\n--- OCR Processing Complete ---")
    log_gpu_memory()
    
    return output_folder

def read_text_files_aloud(text_folder):
    """
    Read all text files in the folder aloud using TTS, one by one with clear separation
    """
    # Get all text files
    text_files = glob.glob(os.path.join(text_folder, "*.txt"))
    
    if not text_files:
        print(f"No text files found in {text_folder}")
        return
    
    print(f"\nFound {len(text_files)} text files to read")
    
    # Language mapping for TTS
    lang_mapping = {
        "en": "english",
        "ar": "arabic",
        "ur": "urdu",
        "hi": "hindi",
        "fa": "persian",
        "ps": "pashto"
    }
    
    # Sort text files alphabetically to ensure consistent reading order
    text_files.sort()
    
    for i, text_file_path in enumerate(text_files, 1):
        file_name = os.path.basename(text_file_path)
        print(f"\n===== Reading File {i}/{len(text_files)}: {file_name} =====")
        
        # Announce the file being read (optional)
        tts_engine.setProperty("rate", 150)
        announcement = f"Reading file {i} of {len(text_files)}: {os.path.splitext(file_name)[0]}"
        print(announcement)
        tts_engine.say(announcement)
        tts_engine.runAndWait()
        
        # Pause between announcement and content
        time.sleep(1)
        
        try:
            # Read text file
            with open(text_file_path, 'r', encoding='utf-8') as file:
                lines = file.readlines()
                
            if not lines:
                print(f"File is empty: {text_file_path}")
                continue
                
            # Extract language information from first line
            lang_line = lines[0].strip()
            if lang_line.startswith("LANG:"):
                detected_lang = lang_line[5:]
                print(f"Language: {detected_lang}")
                # Remove the language line
                content = "".join(lines[1:])
            else:
                # No language information, treat all lines as content
                detected_lang = "unknown"
                content = "".join(lines)
            
            if not content.strip():
                print("No content to read")
                continue
                
            # Print a preview of the content
            content_preview = content[:100] + "..." if len(content) > 100 else content
            print(f"Text content: {content_preview}")
            
            # Get available voices
            voices = tts_engine.getProperty("voices")
            
            # Try to set appropriate voice based on language
            voice_set = False
            for voice in voices:
                tts_lang = lang_mapping.get(detected_lang, "english")
                if (tts_lang.lower() in voice.name.lower() or 
                    detected_lang in voice.id.lower()):
                    tts_engine.setProperty("voice", voice.id)
                    voice_set = True
                    print(f"Using voice: {voice.name}")
                    break
            
            if not voice_set:
                print(f"No specific voice found for {detected_lang}, using default")
            
            # Adjust speech rate based on language
            if detected_lang in ["ar", "ur"]:
                # Slower for Arabic and Urdu
                tts_engine.setProperty("rate", 130)
            else:
                tts_engine.setProperty("rate", 150)
            
            # Read the text aloud
            print(f"Reading text aloud...")
            tts_engine.say(content)
            tts_engine.runAndWait()
            
            # Pause between files to clearly separate them
            print("Finished reading file.")
            time.sleep(2)
            
        except Exception as e:
            print(f"Error reading {text_file_path}: {e}")
            continue
    
    print("\n===== All Text Files Have Been Read =====")

def get_user_confirmation():
    """
    Ask the user if they want to proceed to the TTS reading phase
    """
    while True:
        response = input("\nOCR processing complete. Proceed with reading text files? (y/n): ").lower()
        if response in ['y', 'yes']:
            return True
        elif response in ['n', 'no']:
            return False
        else:
            print("Please enter 'y' or 'n'")

# Main process
if __name__ == "__main__":
    try:
        # Define folders
        images_folder = "part_2_images"
        output_folder = "extracted_text"
        
        # Print GPU optimization message
        print("=== Running GPU-Optimized OCR for RTX 2060 ===")
        
        # Step 1: Process images with OCR and save text
        start_time = time.time()
        text_folder = process_images_ocr_save_text(images_folder, output_folder)
        end_time = time.time()
        
        print(f"OCR processing completed in {end_time - start_time:.2f} seconds")
        
        # Optional: Ask for user confirmation before proceeding to TTS
        if get_user_confirmation():
            # Step 2: Read the saved text files aloud one by one
            read_text_files_aloud(text_folder)
        else:
            print("TTS reading canceled. Text files are saved in the output folder.")
        
        print("Process completed successfully!")
        
    except KeyboardInterrupt:
        print("\nProcess interrupted by user")
        # Clean up GPU memory on interrupt
        torch.cuda.empty_cache()
    except Exception as e:
        print(f"An error occurred: {e}")
        # Clean up GPU memory on error
        torch.cuda.empty_cache()
    finally:
        # Final cleanup
        torch.cuda.empty_cache()
        gc.collect()

In [None]:
# Try a smaller vision model first
processor = AutoProcessor.from_pretrained("microsoft/git-base", use_fast=False)
print("Downloaded!")

# LLava Model


In [None]:
# # LLAVA OCR Cell - GPU Accelerated Implementation

# # Force GPU usage
# os.environ['CUDA_VISIBLE_DEVICES'] = '0'
# os.environ['HF_HOME'] = 'D:/HuggingFaceCache' 
# torch.backends.cudnn.benchmark = True

# # Check GPU availability
# if not torch.cuda.is_available():
#     print("ERROR: CUDA not available. LLAVA requires GPU acceleration.")
#     exit(1)

# print("=== LLAVA OCR Processing ===")
# print(f"GPU: {torch.cuda.get_device_name(0)}")
# print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

# def process_images_with_llava(images_folder="part_2_images", output_folder="extracted_text/llava"):
#     """
#     Process images using LLAVA model for OCR with GPU acceleration
#     """
#     # Create output directory if it doesn't exist
#     os.makedirs(output_folder, exist_ok=True)
#     print(f"Output folder: {output_folder}")
    
#     # # Clear GPU memory before starting
#     # torch.cuda.empty_cache()
#     # gc.collect()
    
#     # Initialize LLAVA model and processor
#     print("Loading LLAVA model...")
#     try:
#         # TODO: Replace with your specific LLAVA model path/name
#         model_name = "llava-hf/llava-1.5-7b-hf"  # Example model name
        
#         processor = AutoProcessor.from_pretrained(model_name, use_fast=True)
#         model = LlavaForConditionalGeneration.from_pretrained(
#             model_name,
#             torch_dtype=torch.float16,  # Use fp16 for RTX 2060 efficiency
#             device_map="cuda",
#             low_cpu_mem_usage=True
#         )
#         model.eval()
#         print("LLAVA model loaded successfully!")
        
#     except Exception as e:
#         print(f"Error loading LLAVA model: {e}")
#         print("Please ensure you have the correct model name/path")
#         return
    
#     # Get all image files
#     image_extensions = ["*.jpg", "*.jpeg", "*.png", "*.bmp", "*.tiff"]
#     image_files = []
    
#     for ext in image_extensions:
#         image_files.extend(glob.glob(os.path.join(images_folder, ext)))
#         image_files.extend(glob.glob(os.path.join(images_folder, ext.upper())))
    
#     if not image_files:
#         print(f"No images found in {images_folder}")
#         return
    
#     print(f"Found {len(image_files)} images to process")
    
#     # Process each image
#     processed_count = 0
#     skipped_count = 0
    
#     for i, image_path in enumerate(image_files, 1):
#         image_filename = os.path.basename(image_path)
#         base_name = os.path.splitext(image_filename)[0]
#         text_file_path = os.path.join(output_folder, f"{base_name}.txt")
        
#         # Check if text file already exists (efficiency check)
#         if os.path.exists(text_file_path):
#             print(f"\n[{i}/{len(image_files)}] Skipping {image_filename} - already processed")
#             skipped_count += 1
#             continue
        
#         print(f"\n[{i}/{len(image_files)}] Processing {image_filename}...")
        
#         try:
#             # Load and preprocess image
#             image = Image.open(image_path).convert("RGB")
            
#             # Resize large images to optimize GPU memory
#             max_dim = 1024  # Adjust based on your GPU memory
#             if max(image.size) > max_dim:
#                 image.thumbnail((max_dim, max_dim), Image.Resampling.LANCZOS)
#                 print(f"Resized image to {image.size} for GPU optimization")
            
#             # Prepare prompt for OCR task
#             prompt = "USER: <image>\nExtract all text from this image. Provide only the extracted text without any additional commentary.\nASSISTANT:"
            
#             # Process with LLAVA
#             start_time = time.time()
            
#             # TODO: Adjust this section based on your specific LLAVA implementation
#             inputs = processor(prompt, image, return_tensors="pt").to("cuda")
            
#             # Generate text with GPU
#             with torch.no_grad():
#                 generated_ids = model.generate(
#                     **inputs,
#                     max_new_tokens=1024,
#                     temperature=0.1,  # Low temperature for more accurate OCR
#                     do_sample=False,
#                     use_cache=True
#                 )
            
#             # Decode the generated text
#             extracted_text = processor.decode(generated_ids[0], skip_special_tokens=True)
            
#             # Remove the prompt from the output
#             if "ASSISTANT:" in extracted_text:
#                 extracted_text = extracted_text.split("ASSISTANT:")[-1].strip()
            
#             end_time = time.time()
#             print(f"OCR completed in {end_time - start_time:.2f} seconds")
            
#             if not extracted_text.strip():
#                 print("No text detected in this image")
#                 # Save empty file to avoid reprocessing
#                 with open(text_file_path, 'w', encoding='utf-8') as f:
#                     f.write("")
#                 continue
            
#             # Save extracted text
#             with open(text_file_path, 'w', encoding='utf-8') as f:
#                 f.write(extracted_text)
            
#             print(f"Saved text to: {text_file_path}")
#             print(f"Text preview: {extracted_text[:100]}..." if len(extracted_text) > 100 else f"Text: {extracted_text}")
            
#             processed_count += 1
            
#             # Clear GPU memory after each image
#             torch.cuda.empty_cache()
            
#         except Exception as e:
#             print(f"Error processing {image_filename}: {e}")
#             # Save error file to avoid reprocessing
#             with open(text_file_path, 'w', encoding='utf-8') as f:
#                 f.write(f"ERROR: {str(e)}")
#             torch.cuda.empty_cache()
#             continue
    
#     # Final cleanup
#     del model
#     del processor
#     torch.cuda.empty_cache()
#     gc.collect()
    
#     print(f"\n=== LLAVA Processing Complete ===")
#     print(f"Processed: {processed_count} images")
#     print(f"Skipped: {skipped_count} images (already processed)")
#     print(f"Total: {len(image_files)} images")

# # Run LLAVA OCR processing
# if __name__ == "__main__":
#     process_images_with_llava()



# LLAVA OCR Cell - GPU Accelerated Implementation

import os
import gc
import time
import glob
import torch
from PIL import Image
from transformers import AutoProcessor, LlavaForConditionalGeneration
import warnings
warnings.filterwarnings("ignore")

# Force GPU usage
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
# os.environ['HF_HOME'] = 'D:/HuggingFaceCache' 
torch.backends.cudnn.benchmark = True

# Check GPU availability
device = "cuda" if torch.cuda.is_available() else "cpu"
use_gpu = torch.cuda.is_available()

print("=== LLAVA OCR Processing ===")
if use_gpu:
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
    print("Using GPU acceleration")
else:
    print("GPU not available - using CPU (will be slower)")
    print("For better performance, install CUDA-compatible PyTorch")

def process_images_with_llava(images_folder="part_2_images", output_folder="extracted_text/llava"):
    """
    Process images using LLAVA model for OCR with GPU acceleration
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)
    print(f"Output folder: {output_folder}")
    
    # Clear GPU memory before starting (only if GPU available)
    if use_gpu:
        torch.cuda.empty_cache()
        gc.collect()
    
    # Initialize LLAVA model and processor
    print("Loading LLAVA model...")
    try:
        # Updated model name - use the correct LLAVA model
        model_name = "llava-hf/llava-1.5-7b-hf"
        
        print("Downloading/Loading processor...")
        processor = AutoProcessor.from_pretrained(
            model_name, 
            use_fast=False,
            trust_remote_code=True
        )
        
        print("Downloading/Loading model...")
        model = LlavaForConditionalGeneration.from_pretrained(
            model_name,
            torch_dtype=torch.float16 if use_gpu else torch.float32,  # Use fp16 only if GPU available
            device_map="auto" if use_gpu else None,  # Use auto device mapping only if GPU available
            low_cpu_mem_usage=True,
            trust_remote_code=True
        )
        
        # Move model to appropriate device
        if use_gpu and not next(model.parameters()).is_cuda:
            model = model.to(device)
        elif not use_gpu:
            model = model.to(device)
        
        model.eval()
        print("LLAVA model loaded successfully!")
        
    except Exception as e:
        print(f"Error loading LLAVA model: {e}")
        print("Trying alternative approach...")
        
        # Alternative: Try different model or installation
        try:
            # Alternative model names to try
            alternative_models = [
                "llava-hf/llava-1.5-13b-hf",
                "llava-hf/llava-v1.6-mistral-7b-hf",
                "llava-hf/llava-v1.6-vicuna-7b-hf"
            ]
            
            for alt_model in alternative_models:
                try:
                    print(f"Trying {alt_model}...")
                    processor = AutoProcessor.from_pretrained(alt_model, trust_remote_code=True)
                    model = LlavaForConditionalGeneration.from_pretrained(
                        alt_model,
                        torch_dtype=torch.float16 if use_gpu else torch.float32,
                        device_map="auto" if use_gpu else None,
                        low_cpu_mem_usage=True,
                        trust_remote_code=True
                    )
                    if use_gpu and not next(model.parameters()).is_cuda:
                        model = model.to(device)
                    elif not use_gpu:
                        model = model.to(device)
                    model.eval()
                    print(f"Successfully loaded {alt_model}!")
                    model_name = alt_model
                    break
                except Exception as alt_e:
                    print(f"Failed to load {alt_model}: {alt_e}")
                    continue
            else:
                print("All model loading attempts failed.")
                print("Please ensure you have installed the requirements:")
                print("pip install transformers torch torchvision accelerate")
                print("pip install git+https://github.com/huggingface/transformers.git")
                return
                
        except Exception as final_e:
            print(f"Final error: {final_e}")
            return
    
    # Get all image files
    image_extensions = ["*.jpg", "*.jpeg", "*.png", "*.bmp", "*.tiff"]
    image_files = []
    
    for ext in image_extensions:
        image_files.extend(glob.glob(os.path.join(images_folder, ext)))
        image_files.extend(glob.glob(os.path.join(images_folder, ext.upper())))
    
    if not image_files:
        print(f"No images found in {images_folder}")
        return
    
    print(f"Found {len(image_files)} images to process")
    
    # Process each image
    processed_count = 0
    skipped_count = 0
    
    for i, image_path in enumerate(image_files, 1):
        image_filename = os.path.basename(image_path)
        base_name = os.path.splitext(image_filename)[0]
        text_file_path = os.path.join(output_folder, f"{base_name}.txt")
        
        # Check if text file already exists (efficiency check)
        if os.path.exists(text_file_path):
            print(f"\n[{i}/{len(image_files)}] Skipping {image_filename} - already processed")
            skipped_count += 1
            continue
        
        print(f"\n[{i}/{len(image_files)}] Processing {image_filename}...")
        
        try:
            # Load and preprocess image
            image = Image.open(image_path).convert("RGB")
            
            # Resize large images to optimize memory
            max_dim = 1024 if use_gpu else 512  # Smaller images for CPU processing
            if max(image.size) > max_dim:
                image.thumbnail((max_dim, max_dim), Image.Resampling.LANCZOS)
                print(f"Resized image to {image.size} for {'GPU' if use_gpu else 'CPU'} optimization")
            
            # Prepare prompt for OCR task - Updated format
            prompt = "USER: <image>\nExtract all text from this image. Provide only the extracted text without any additional commentary.\nASSISTANT:"
            
            # Process with LLAVA
            start_time = time.time()
            
            # Updated processing approach
            inputs = processor(prompt, image, return_tensors="pt")
            
            # Move inputs to appropriate device
            if use_gpu:
                inputs = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}
            else:
                inputs = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}
            
            # Generate text with appropriate device
            with torch.no_grad():
                generated_ids = model.generate(
                    **inputs,
                    max_new_tokens=1024 if use_gpu else 512,  # Reduce tokens for CPU
                    temperature=0.1,  # Low temperature for more accurate OCR
                    do_sample=False,
                    use_cache=True,
                    pad_token_id=processor.tokenizer.eos_token_id
                )
            
            # Decode the generated text
            generated_text = processor.decode(generated_ids[0], skip_special_tokens=True)
            
            # Clean up the output - remove prompt
            if "ASSISTANT:" in generated_text:
                extracted_text = generated_text.split("ASSISTANT:")[-1].strip()
            else:
                extracted_text = generated_text.strip()
            
            # Remove any remaining prompt artifacts
            if "USER:" in extracted_text:
                extracted_text = extracted_text.split("USER:")[-1].strip()
            
            end_time = time.time()
            print(f"OCR completed in {end_time - start_time:.2f} seconds")
            
            if not extracted_text.strip():
                print("No text detected in this image")
                # Save empty file to avoid reprocessing
                with open(text_file_path, 'w', encoding='utf-8') as f:
                    f.write("")
                continue
            
            # Save extracted text
            with open(text_file_path, 'w', encoding='utf-8') as f:
                f.write(extracted_text)
            
            print(f"Saved text to: {text_file_path}")
            print(f"Text preview: {extracted_text[:100]}..." if len(extracted_text) > 100 else f"Text: {extracted_text}")
            
            processed_count += 1
            
            # Clear memory after each image (only if GPU available)
            if use_gpu:
                torch.cuda.empty_cache()
            
        except Exception as e:
            print(f"Error processing {image_filename}: {e}")
            # Save error file to avoid reprocessing
            with open(text_file_path, 'w', encoding='utf-8') as f:
                f.write(f"ERROR: {str(e)}")
            if use_gpu:
                torch.cuda.empty_cache()
            continue
    
    # Final cleanup
    del model
    del processor
    if use_gpu:
        torch.cuda.empty_cache()
    gc.collect()
    
    print(f"\n=== LLAVA Processing Complete ===")
    print(f"Device used: {'GPU' if use_gpu else 'CPU'}")
    print(f"Processed: {processed_count} images")
    print(f"Skipped: {skipped_count} images (already processed)")
    print(f"Total: {len(image_files)} images")

# Install requirements function
def install_requirements():
    """Install required packages if not already installed"""
    import subprocess
    import sys
    
    required_packages = [
        "transformers>=4.36.0",
        "torch",
        "torchvision", 
        "accelerate",
        "pillow",
        "bitsandbytes"  # For efficient loading
    ]
    
    print("Installing/updating required packages...")
    for package in required_packages:
        try:
            subprocess.check_call([sys.executable, "-m", "pip", "install", package])
        except subprocess.CalledProcessError as e:
            print(f"Failed to install {package}: {e}")
    
    # Install latest transformers from git for LLAVA support
    try:
        subprocess.check_call([sys.executable, "-m", "pip", "install", "git+https://github.com/huggingface/transformers.git"])
    except subprocess.CalledProcessError as e:
        print(f"Failed to install transformers from git: {e}")

# Run LLAVA OCR processing
if __name__ == "__main__":
    # Uncomment the line below if you need to install requirements
    # install_requirements()
    
    process_images_with_llava()

  from .autonotebook import tqdm as notebook_tqdm
2025-07-11 01:58:30.995160: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752181111.007441   14370 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752181111.011080   14370 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1752181111.021525   14370 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1752181111.021542   14370 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1752181111.021544   14370

=== LLAVA OCR Processing ===
GPU: NVIDIA GeForce RTX 2050
GPU Memory: 3.96 GB
Using GPU acceleration
Output folder: extracted_text/llava
Loading LLAVA model...
Downloading/Loading processor...
Error loading LLAVA model: 
LlamaTokenizer requires the SentencePiece library but it was not found in your environment. Check out the instructions on the
installation page of its repo: https://github.com/google/sentencepiece#installation and follow the ones
that match your environment. Please note that you may need to restart your runtime after installation.

Trying alternative approach...
Trying llava-hf/llava-1.5-13b-hf...


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

# Simplified version LLava

In [None]:
import os
import glob
import torch
from PIL import Image
from transformers import AutoProcessor, LlavaForConditionalGeneration

def process_images_with_llava(images_folder="part_2_images", output_folder="extracted_text/llava"):
    """Simple LLAVA OCR processing"""
    
    # Setup
    os.makedirs(output_folder, exist_ok=True)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")
    
    # Load model
    print("Loading LLAVA model...")
    model_name = "llava-hf/llava-1.5-7b-hf"
    processor = AutoProcessor.from_pretrained(model_name, use_fast=False)
    model = LlavaForConditionalGeneration.from_pretrained(
        model_name,
        torch_dtype=torch.float16 if device == "cuda" else torch.float32,
        device_map="auto" if device == "cuda" else None
    ).to(device)
    
    # Get images
    extensions = ["*.jpg", "*.jpeg", "*.png", "*.bmp"]
    image_files = []
    for ext in extensions:
        image_files.extend(glob.glob(os.path.join(images_folder, ext)))
    
    print(f"Found {len(image_files)} images")
    
    # Process each image
    for i, image_path in enumerate(image_files, 1):
        filename = os.path.basename(image_path)
        text_file = os.path.join(output_folder, f"{os.path.splitext(filename)[0]}.txt")
        
        if os.path.exists(text_file):
            print(f"[{i}/{len(image_files)}] Skipping {filename} - already processed")
            continue
            
        print(f"[{i}/{len(image_files)}] Processing {filename}...")
        
        try:
            # Load image
            image = Image.open(image_path).convert("RGB")
            
            # Resize if too large
            if max(image.size) > 1024:
                image.thumbnail((1024, 1024), Image.Resampling.LANCZOS)
            
            # OCR prompt
            prompt = "USER: <image>\nExtract all text from this image.\nASSISTANT:"
            
            # Process
            inputs = processor(prompt, image, return_tensors="pt").to(device)
            
            with torch.no_grad():
                output = model.generate(**inputs, max_new_tokens=512, do_sample=False)
            
            # Extract text
            text = processor.decode(output[0], skip_special_tokens=True)
            if "ASSISTANT:" in text:
                text = text.split("ASSISTANT:")[-1].strip()
            
            # Save
            with open(text_file, 'w', encoding='utf-8') as f:
                f.write(text)
            
            print(f"Extracted: {text[:100]}...")
            
        except Exception as e:
            print(f"Error: {e}")
            with open(text_file, 'w', encoding='utf-8') as f:
                f.write(f"ERROR: {e}")
    
    print("Processing complete!")

if __name__ == "__main__":
    process_images_with_llava()

# Shikra Model

In [None]:
# SHIKRA OCR Cell - GPU Accelerated Implementation
import os
import glob
import torch
from PIL import Image
import time
import gc
from transformers import AutoProcessor, AutoModelForVision2Seq

# Force GPU usage
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
torch.backends.cudnn.benchmark = True

# Check GPU availability
if not torch.cuda.is_available():
    print("ERROR: CUDA not available. SHIKRA requires GPU acceleration.")
    exit(1)

print("=== SHIKRA OCR Processing ===")
print(f"GPU: {torch.cuda.get_device_name(0)}")
print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

def process_images_with_shikra(images_folder="part_2_images", output_folder="extracted_text/shikra"):
    """
    Process images using SHIKRA model for OCR with GPU acceleration
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)
    print(f"Output folder: {output_folder}")
    
    # Clear GPU memory before starting
    torch.cuda.empty_cache()
    gc.collect()
    
    # Initialize SHIKRA model and processor
    print("Loading SHIKRA model...")
    try:
        # TODO: Replace with your specific SHIKRA model path/name
        model_name = "shikras/shikra-7b-v1"  # Example model name
        
        processor = AutoProcessor.from_pretrained(model_name, use_fast=False)
        model = AutoModelForVision2Seq.from_pretrained(
            model_name,
            torch_dtype=torch.float16,  # Use fp16 for RTX 2060 efficiency
            device_map="cuda",
            low_cpu_mem_usage=True
        )
        model.eval()
        print("SHIKRA model loaded successfully!")
        
    except Exception as e:
        print(f"Error loading SHIKRA model: {e}")
        print("Please ensure you have the correct model name/path")
        return
    
    # Get all image files
    image_extensions = ["*.jpg", "*.jpeg", "*.png", "*.bmp", "*.tiff"]
    image_files = []
    
    for ext in image_extensions:
        image_files.extend(glob.glob(os.path.join(images_folder, ext)))
        image_files.extend(glob.glob(os.path.join(images_folder, ext.upper())))
    
    if not image_files:
        print(f"No images found in {images_folder}")
        return
    
    print(f"Found {len(image_files)} images to process")
    
    # Process each image
    processed_count = 0
    skipped_count = 0
    
    for i, image_path in enumerate(image_files, 1):
        image_filename = os.path.basename(image_path)
        base_name = os.path.splitext(image_filename)[0]
        text_file_path = os.path.join(output_folder, f"{base_name}.txt")
        
        # Check if text file already exists (efficiency check)
        if os.path.exists(text_file_path):
            print(f"\n[{i}/{len(image_files)}] Skipping {image_filename} - already processed")
            skipped_count += 1
            continue
        
        print(f"\n[{i}/{len(image_files)}] Processing {image_filename}...")
        
        try:
            # Load and preprocess image
            image = Image.open(image_path).convert("RGB")
            
            # Resize large images to optimize GPU memory
            max_dim = 1024  # Adjust based on your GPU memory
            if max(image.size) > max_dim:
                image.thumbnail((max_dim, max_dim), Image.Resampling.LANCZOS)
                print(f"Resized image to {image.size} for GPU optimization")
            
            # Prepare prompt for OCR task
            # SHIKRA might use a different prompt format - adjust as needed
            prompt = "<image> Extract and transcribe all text visible in this image."
            
            # Process with SHIKRA
            start_time = time.time()
            
            # TODO: Adjust this section based on your specific SHIKRA implementation
            inputs = processor(
                text=prompt,
                images=image,
                return_tensors="pt"
            ).to("cuda")
            
            # Generate text with GPU
            with torch.no_grad():
                generated_ids = model.generate(
                    **inputs,
                    max_new_tokens=1024,
                    temperature=0.1,  # Low temperature for more accurate OCR
                    do_sample=False,
                    num_beams=1,  # Adjust based on accuracy vs speed tradeoff
                    use_cache=True
                )
            
            # Decode the generated text
            extracted_text = processor.decode(generated_ids[0], skip_special_tokens=True)
            
            # Clean up the output (remove prompt if included)
            if prompt in extracted_text:
                extracted_text = extracted_text.replace(prompt, "").strip()
            
            end_time = time.time()
            print(f"OCR completed in {end_time - start_time:.2f} seconds")
            
            if not extracted_text.strip():
                print("No text detected in this image")
                # Save empty file to avoid reprocessing
                with open(text_file_path, 'w', encoding='utf-8') as f:
                    f.write("")
                continue
            
            # Save extracted text
            with open(text_file_path, 'w', encoding='utf-8') as f:
                f.write(extracted_text)
            
            print(f"Saved text to: {text_file_path}")
            print(f"Text preview: {extracted_text[:100]}..." if len(extracted_text) > 100 else f"Text: {extracted_text}")
            
            processed_count += 1
            
            # Clear GPU memory after each image
            torch.cuda.empty_cache()
            
        except Exception as e:
            print(f"Error processing {image_filename}: {e}")
            # Save error file to avoid reprocessing
            with open(text_file_path, 'w', encoding='utf-8') as f:
                f.write(f"ERROR: {str(e)}")
            torch.cuda.empty_cache()
            continue
    
    # Final cleanup
    del model
    del processor
    torch.cuda.empty_cache()
    gc.collect()
    
    print(f"\n=== SHIKRA Processing Complete ===")
    print(f"Processed: {processed_count} images")
    print(f"Skipped: {skipped_count} images (already processed)")
    print(f"Total: {len(image_files)} images")

# Run SHIKRA OCR processing
if __name__ == "__main__":
    process_images_with_shikra()

=== SHIKRA OCR Processing ===
GPU: NVIDIA GeForce RTX 2060
GPU Memory: 12.88 GB
Output folder: extracted_text/shikra
Loading SHIKRA model...
Error loading SHIKRA model: shikras/shikra-7b-v1 is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`
Please ensure you have the correct model name/path


# Hugging face version

In [None]:
import torch
import os
import glob
from PIL import Image
from transformers import AutoProcessor, LlavaForConditionalGeneration

# Load the model in half-precision
print("Loading LLAVA model for batch processing...")
model = LlavaForConditionalGeneration.from_pretrained("liuhaotian/llava-v1.6-vicuna-7b", torch_dtype=torch.float16, device_map="auto")
processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf", use_fast=False)

def process_images_batch(images_folder="part_2_images", output_folder="extracted_text/llava_batch", batch_size=2):
    """
    Process images from folder using batch processing with LLAVA
    """
    # Create output directory
    os.makedirs(output_folder, exist_ok=True)
    
    # Get all image files
    image_extensions = ["*.jpg", "*.jpeg", "*.png", "*.bmp", "*.tiff"]
    image_files = []
    
    for ext in image_extensions:
        image_files.extend(glob.glob(os.path.join(images_folder, ext)))
        image_files.extend(glob.glob(os.path.join(images_folder, ext.upper())))
    
    if not image_files:
        print(f"No images found in {images_folder}")
        return
    
    print(f"Found {len(image_files)} images to process in batches of {batch_size}")
    
    # Process images in batches
    for i in range(0, len(image_files), batch_size):
        batch_files = image_files[i:i + batch_size]
        print(f"\nProcessing batch {i//batch_size + 1}: {len(batch_files)} images")
        
        # Prepare conversations for this batch
        conversations = []
        valid_files = []
        
        for image_path in batch_files:
            try:
                # Load and validate image
                image = Image.open(image_path).convert("RGB")
                
                # Create conversation for this image
                conversation = [
                    {
                        "role": "user",
                        "content": [
                            {"type": "image", "image": image},
                            {"type": "text", "text": "Extract all text from this image. Provide only the extracted text without any additional commentary."},
                        ],
                    },
                ]
                
                conversations.append(conversation)
                valid_files.append(image_path)
                
            except Exception as e:
                print(f"Error loading {image_path}: {e}")
                continue
        
        if not conversations:
            print("No valid images in this batch, skipping...")
            continue
        
        try:
            # Process batch
            print(f"Processing {len(conversations)} images...")
            
            inputs = processor.apply_chat_template(
                conversations,
                add_generation_prompt=True,
                tokenize=True,
                return_dict=True,
                padding=True,
                return_tensors="pt"
            ).to(model.device, torch.float16)
            
            # Generate text for all images in batch
            with torch.no_grad():
                generate_ids = model.generate(**inputs, max_new_tokens=512, do_sample=False)
            
            # Decode results
            results = processor.batch_decode(generate_ids, skip_special_tokens=True)
            
            # Save results for each image
            for j, (image_path, result) in enumerate(zip(valid_files, results)):
                filename = os.path.basename(image_path)
                base_name = os.path.splitext(filename)[0]
                text_file_path = os.path.join(output_folder, f"{base_name}.txt")
                
                # Clean up the result text
                if "assistant" in result.lower():
                    # Find the last occurrence of assistant and take text after it
                    parts = result.lower().split("assistant")
                    if len(parts) > 1:
                        extracted_text = result[result.lower().rfind("assistant") + len("assistant"):].strip()
                    else:
                        extracted_text = result.strip()
                else:
                    extracted_text = result.strip()
                
                # Remove any remaining artifacts
                lines = extracted_text.split('\n')
                cleaned_lines = []
                for line in lines:
                    line = line.strip()
                    if line and not line.lower().startswith(('user:', 'assistant:', 'extract all text')):
                        cleaned_lines.append(line)
                
                final_text = '\n'.join(cleaned_lines).strip()
                
                # Save to file
                with open(text_file_path, 'w', encoding='utf-8') as f:
                    f.write(final_text)
                
                print(f"Saved: {filename} -> {text_file_path}")
                if final_text:
                    preview = final_text[:100] + "..." if len(final_text) > 100 else final_text
                    print(f"Preview: {preview}")
                else:
                    print("No text extracted")
            
            # Clear GPU memory after each batch
            torch.cuda.empty_cache()
            
        except Exception as e:
            print(f"Error processing batch: {e}")
            # Save error files for this batch
            for image_path in valid_files:
                filename = os.path.basename(image_path)
                base_name = os.path.splitext(filename)[0]
                text_file_path = os.path.join(output_folder, f"{base_name}.txt")
                with open(text_file_path, 'w', encoding='utf-8') as f:
                    f.write(f"ERROR: {str(e)}")
            torch.cuda.empty_cache()
            continue
    
    print(f"\n=== Batch Processing Complete ===")
    print(f"Results saved to: {output_folder}")

# Run the batch processing
if __name__ == "__main__":
    process_images_batch(batch_size=2)  # Adjust batch_size based on your GPU memory

  from .autonotebook import tqdm as notebook_tqdm
2025-07-11 02:05:52.383426: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752181552.395305   18129 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752181552.398922   18129 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1752181552.409883   18129 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1752181552.409900   18129 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1752181552.409902   18129

Loading LLAVA model for batch processing...


Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

# LLaVA-Torch Alternative Implementation

In [1]:
# Install llava-torch package
import subprocess
import sys
import os

def install_llava_torch():
    """Install llava-torch package"""
    try:
        print("Installing llava-torch...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", "llava-torch"])
        print("✅ llava-torch installed successfully!")
        return True
    except subprocess.CalledProcessError as e:
        print(f"❌ Failed to install llava-torch: {e}")
        return False

# Install the package
install_success = install_llava_torch()

if install_success:
    print("Ready to use llava-torch for OCR processing!")

Installing llava-torch...
Collecting llava-torch
  Downloading llava_torch-1.2.2.post1-py3-none-any.whl (102 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 102.2/102.2 KB 584.4 kB/s eta 0:00:00
Collecting torchvision==0.15.2
  Downloading torchvision-0.15.2-cp310-cp310-manylinux1_x86_64.whl (6.0 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 6.0/6.0 MB 5.1 MB/s eta 0:00:00
Collecting gradio==4.16.0
  Downloading gradio-4.16.0-py3-none-any.whl (16.7 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 16.7/16.7 MB 3.9 MB/s eta 0:00:00
Collecting httpx==0.24.0
  Downloading httpx-0.24.0-py3-none-any.whl (75 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 75.3/75.3 KB 3.4 MB/s eta 0:00:00
Collecting torch==2.0.1
  Downloading torch-2.0.1-cp310-cp310-manylinux1_x86_64.whl (619.9 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 619.9/619.9 MB 2.0 MB/s eta 0:00:00
Collecting transformers==4.36.2
  Downloading transformers-4.36.2-py3-none-any.whl (8.2 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━



Collecting shellingham>=1.3.0
  Using cached shellingham-1.5.4-py2.py3-none-any.whl (9.8 kB)
Collecting svgwrite
  Downloading svgwrite-1.4.3-py3-none-any.whl (67 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 67.1/67.1 KB 2.1 MB/s eta 0:00:00
Collecting rpds-py>=0.7.1
  Using cached rpds_py-0.26.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (383 kB)
Collecting referencing>=0.28.4
  Using cached referencing-0.36.2-py3-none-any.whl (26 kB)
Collecting jsonschema-specifications>=2023.03.6
  Using cached jsonschema_specifications-2025.4.1-py3-none-any.whl (18 kB)
Building wheels for collected packages: wavedrom
  Building wheel for wavedrom (setup.py): started
  Building wheel for wavedrom (setup.py): finished with status 'done'
  Created wheel for wavedrom: filename=wavedrom-2.0.3.post3-py2.py3-none-any.whl size=30106 sha256=9cae49b94f8e9a01c77874c7b12995ccc4ceb848eebb29dad9b84728b68afbd4
  Stored in directory: /home/osama/.cache/pip/wheels/9c/52/8c/38b454b42f712f325e26f

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchaudio 2.7.1+cu118 requires torch==2.7.1, but you have torch 2.0.1 which is incompatible.


Successfully installed accelerate-0.21.0 aiofiles-23.2.1 altair-5.5.0 bitsandbytes-0.41.0 cmake-4.0.3 einops-0.6.1 einops-exts-0.0.4 fastapi-0.116.0 ffmpy-0.6.0 gradio-4.16.0 gradio-client-0.8.1 h11-0.14.0 httpcore-0.17.3 httpx-0.24.0 importlib-resources-6.5.2 jsonschema-4.24.0 jsonschema-specifications-2025.4.1 latex2mathml-3.78.0 lit-18.1.8 llava-torch-1.2.2.post1 markdown2-2.5.3 markupsafe-2.1.5 narwhals-1.46.0 numpy-1.26.4 nvidia-cublas-cu11-11.10.3.66 nvidia-cuda-cupti-cu11-11.7.101 nvidia-cuda-nvrtc-cu11-11.7.99 nvidia-cuda-runtime-cu11-11.7.99 nvidia-cudnn-cu11-8.5.0.96 nvidia-cufft-cu11-10.9.0.58 nvidia-curand-cu11-10.2.10.91 nvidia-cusolver-cu11-11.4.0.1 nvidia-cusparse-cu11-11.7.4.91 nvidia-nccl-cu11-2.14.3 nvidia-nvtx-cu11-11.7.91 peft-0.4.0 pillow-10.4.0 pydub-0.25.1 python-multipart-0.0.20 referencing-0.36.2 rpds-py-0.26.0 ruff-0.12.2 scikit-learn-1.2.2 semantic-version-2.10.0 sentencepiece-0.1.99 shellingham-1.5.4 shortuuid-1.0.13 starlette-0.46.2 svgwrite-1.4.3 timm-0.6.

In [2]:
# LLaVA-Torch OCR Implementation
import os
import glob
import torch
from PIL import Image
import time

def process_images_with_llava_torch(images_folder="part_2_images", output_folder="extracted_text/llava_torch"):
    """
    Process images using llava-torch package for OCR
    """
    try:
        # Import llava-torch components
        from llava.model.builder import load_pretrained_model
        from llava.mm_utils import get_model_name_from_path, process_images, tokenizer_image_token
        from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
        from llava.conversation import conv_templates, SeparatorStyle
        
        print("✅ llava-torch imported successfully!")
        
    except ImportError as e:
        print(f"❌ Failed to import llava-torch: {e}")
        print("Make sure llava-torch is installed: pip install llava-torch")
        return
    
    # Create output directory
    os.makedirs(output_folder, exist_ok=True)
    print(f"Output folder: {output_folder}")
    
    # Setup device
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")
    
    try:
        # Load model - using a smaller model path that should work with llava-torch
        model_path = "liuhaotian/llava-v1.5-7b"  # This should work with llava-torch
        model_name = get_model_name_from_path(model_path)
        
        print(f"Loading model: {model_path}")
        print("This may take a few minutes for first-time download...")
        
        tokenizer, model, image_processor, context_len = load_pretrained_model(
            model_path=model_path,
            model_base=None,
            model_name=model_name,
            load_8bit=False,
            load_4bit=False,
            device=device
        )
        
        print("✅ Model loaded successfully!")
        
    except Exception as e:
        print(f"❌ Failed to load model: {e}")
        print("Trying alternative model...")
        
        # Try alternative model paths
        alternative_models = [
            "liuhaotian/llava-v1.5-13b",
            "liuhaotian/llava-v1.6-mistral-7b",
            "liuhaotian/llava-v1.6-vicuna-7b"
        ]
        
        model_loaded = False
        for alt_model in alternative_models:
            try:
                print(f"Trying {alt_model}...")
                model_name = get_model_name_from_path(alt_model)
                tokenizer, model, image_processor, context_len = load_pretrained_model(
                    model_path=alt_model,
                    model_base=None,
                    model_name=model_name,
                    load_8bit=False,
                    load_4bit=False,
                    device=device
                )
                print(f"✅ Successfully loaded {alt_model}!")
                model_loaded = True
                break
            except Exception as alt_e:
                print(f"❌ Failed {alt_model}: {alt_e}")
                continue
        
        if not model_loaded:
            print("❌ All model loading attempts failed.")
            return
    
    # Get all image files
    image_extensions = ["*.jpg", "*.jpeg", "*.png", "*.bmp", "*.tiff"]
    image_files = []
    
    for ext in image_extensions:
        image_files.extend(glob.glob(os.path.join(images_folder, ext)))
        image_files.extend(glob.glob(os.path.join(images_folder, ext.upper())))
    
    if not image_files:
        print(f"No images found in {images_folder}")
        return
    
    print(f"Found {len(image_files)} images to process")
    
    # Process each image
    processed_count = 0
    
    for i, image_path in enumerate(image_files, 1):
        image_filename = os.path.basename(image_path)
        base_name = os.path.splitext(image_filename)[0]
        text_file_path = os.path.join(output_folder, f"{base_name}.txt")
        
        # Check if already processed
        if os.path.exists(text_file_path):
            print(f"[{i}/{len(image_files)}] Skipping {image_filename} - already processed")
            continue
        
        print(f"[{i}/{len(image_files)}] Processing {image_filename}...")
        
        try:
            # Load and process image
            image = Image.open(image_path).convert("RGB")
            
            # Resize if too large
            max_size = 512  # Conservative size for llava-torch
            if max(image.size) > max_size:
                image.thumbnail((max_size, max_size), Image.Resampling.LANCZOS)
                print(f"Resized image to {image.size}")
            
            # Process image
            image_tensor = process_images([image], image_processor, model.config)
            if type(image_tensor) is list:
                image_tensor = [image.to(dtype=torch.float16, device=device) for image in image_tensor]
            else:
                image_tensor = image_tensor.to(dtype=torch.float16, device=device)
            
            # Prepare conversation
            conv_mode = "llava_v1"  # Default conversation mode
            conv = conv_templates[conv_mode].copy()
            
            # OCR prompt
            inp = "Extract all text from this image. Provide only the extracted text without any additional commentary."
            
            if model.config.mm_use_im_start_end:
                inp = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + inp
            else:
                inp = DEFAULT_IMAGE_TOKEN + '\n' + inp
            
            conv.append_message(conv.roles[0], inp)
            conv.append_message(conv.roles[1], None)
            prompt = conv.get_prompt()
            
            # Tokenize
            input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(device)
            
            # Generate
            start_time = time.time()
            
            with torch.inference_mode():
                output_ids = model.generate(
                    input_ids,
                    images=image_tensor,
                    image_sizes=[image.size],
                    do_sample=False,
                    temperature=0.1,
                    max_new_tokens=512,
                    use_cache=True
                )
            
            # Decode output
            outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
            
            # Clean up output - remove the input prompt
            if prompt in outputs:
                extracted_text = outputs.replace(prompt, "").strip()
            else:
                extracted_text = outputs.strip()
            
            end_time = time.time()
            print(f"OCR completed in {end_time - start_time:.2f} seconds")
            
            # Save result
            with open(text_file_path, 'w', encoding='utf-8') as f:
                f.write(extracted_text)
            
            print(f"Saved text to: {text_file_path}")
            if extracted_text:
                preview = extracted_text[:100] + "..." if len(extracted_text) > 100 else extracted_text
                print(f"Preview: {preview}")
            else:
                print("No text extracted")
            
            processed_count += 1
            
            # Clear GPU memory
            if device == "cuda":
                torch.cuda.empty_cache()
            
        except Exception as e:
            print(f"Error processing {image_filename}: {e}")
            # Save error file
            with open(text_file_path, 'w', encoding='utf-8') as f:
                f.write(f"ERROR: {str(e)}")
            continue
    
    print(f"\n=== LLaVA-Torch Processing Complete ===")
    print(f"Processed: {processed_count} images")
    print(f"Results saved to: {output_folder}")

# Run the processing
if __name__ == "__main__":
    process_images_with_llava_torch()

  from .autonotebook import tqdm as notebook_tqdm
The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.
0it [00:00, ?it/s]
2025-07-11 02:26:04.685752: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752182764.698357   18694 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752182764.701981   18694 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1752182764.712514   18694 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid l

✅ llava-torch imported successfully!
Output folder: extracted_text/llava_torch
Using device: cuda
Loading model: liuhaotian/llava-v1.5-7b
This may take a few minutes for first-time download...


You are using a model of type llava to instantiate a model of type llava_llama. This is not supported for all configurations of models and can yield errors.
Downloading shards:   0%|          | 0/2 [07:29<?, ?it/s]


KeyboardInterrupt: 