# Simple TTS using pyttsx3 Gpu based:

In [1]:
import os
import glob
import easyocr
import torch
import pyttsx3
from langdetect import detect
import cv2
from PIL import Image
import numpy as np
import time
import gc

# Force GPU usage and optimization for RTX 2060
os.environ['CUDA_VISIBLE_DEVICES'] = '0'  # Use first GPU
torch.backends.cudnn.benchmark = True  # Enable cudnn autotuner for performance

# Check GPU and setup
if not torch.cuda.is_available():
    print("ERROR: CUDA not available. This script requires GPU acceleration.")
    print("Please check your NVIDIA drivers and PyTorch installation.")
    exit(1)

# Display GPU information
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA version: {torch.version.cuda}")
print(f"GPU detected: {torch.cuda.get_device_name(0)}")
print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

# Clear GPU memory before starting
torch.cuda.empty_cache()
gc.collect()

# Initialize EasyOCR reader with explicit GPU settings
print("Initializing EasyOCR with GPU acceleration...")
reader = easyocr.Reader(
    ["ar", "ur", "en"], 
    gpu=True,
    verbose=False,
    # For RTX 2060, set reasonable batch size and model parameters
    detector=True,
    recognizer=True
)

# Initialize TTS engine
tts_engine = pyttsx3.init()

def log_gpu_memory():
    """Log current GPU memory usage"""
    allocated = torch.cuda.memory_allocated(0) / 1e9
    reserved = torch.cuda.memory_reserved(0) / 1e9
    print(f"GPU Memory: {allocated:.2f} GB allocated, {reserved:.2f} GB reserved")

def process_images_ocr_save_text(images_folder, output_folder):
    """
    Process all images in the folder with GPU-accelerated OCR and save to text files
    """
    # Create output folder if it doesn't exist
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
        print(f"Created output folder: {output_folder}")
    
    # Get all image files in the folder
    image_extensions = ["*.jpg", "*.jpeg", "*.png", "*.bmp", "*.tiff"]
    image_files = []

    for ext in image_extensions:
        image_files.extend(glob.glob(os.path.join(images_folder, ext)))
        image_files.extend(glob.glob(os.path.join(images_folder, ext.upper())))

    if not image_files:
        print(f"No images found in {images_folder}")
        return

    print(f"Found {len(image_files)} images to process")
    
    # Report initial GPU memory
    log_gpu_memory()

    # Process files
    for i, image_path in enumerate(image_files, 1):
        image_filename = os.path.basename(image_path)
        base_name = os.path.splitext(image_filename)[0]
        text_file_path = os.path.join(output_folder, f"{base_name}.txt")
        
        print(f"\n--- Processing Image {i}/{len(image_files)}: {image_filename} ---")

        try:
            # Read image
            image = cv2.imread(image_path)
            if image is None:
                print(f"Could not read image: {image_path}")
                continue

            # Optimize image for GPU processing (resize large images)
            h, w = image.shape[:2]
            max_dim = 2000  # Optimal for RTX 2060 memory
            if max(h, w) > max_dim:
                scale = max_dim / max(h, w)
                image = cv2.resize(image, (int(w * scale), int(h * scale)))
                print(f"Resized image to {image.shape[1]}x{image.shape[0]} to optimize GPU memory")
            
            # Report GPU memory before OCR
            log_gpu_memory()

            # Extract text using GPU-accelerated EasyOCR
            print("Extracting text with GPU acceleration...")
            start_time = time.time()
            
            # For RTX 2060, use appropriate batch size
            results = reader.readtext(
                image,
                batch_size=2,  # Adjust based on your GPU memory
                paragraph=True,  # Group text into paragraphs
                detail=0  # 0 for more accuracy
            )

            end_time = time.time()
            print(f"OCR completed in {end_time - start_time:.2f} seconds")

            # Combine all detected text with confidence filtering
            extracted_text = " ".join(
                [result[1] for result in results if result[2] > 0.5]
            )  # confidence > 0.5

            if not extracted_text.strip():
                print("No text detected in this image")
                continue

            print(f"Extracted text: {extracted_text}")
            
            # Try to detect language
            try:
                detected_lang = detect(extracted_text)
                print(f"Detected language: {detected_lang}")
                
                # Save text with language information to file
                with open(text_file_path, 'w', encoding='utf-8') as text_file:
                    text_file.write(f"LANG:{detected_lang}\n")
                    text_file.write(extracted_text)
                
                print(f"Saved text to: {text_file_path}")
                
            except Exception as lang_error:
                print(f"Language detection failed: {lang_error}")
                # Save text without language information
                with open(text_file_path, 'w', encoding='utf-8') as text_file:
                    text_file.write(f"LANG:unknown\n")
                    text_file.write(extracted_text)
                
                print(f"Saved text to: {text_file_path} (language unknown)")

            # Clear GPU memory after each image
            torch.cuda.empty_cache()
            
        except Exception as e:
            print(f"Error processing {image_path}: {e}")
            # Clear GPU memory on error
            torch.cuda.empty_cache()
            continue

    # Final GPU memory cleanup
    torch.cuda.empty_cache()
    gc.collect()
    print("\n--- OCR Processing Complete ---")
    log_gpu_memory()
    
    return output_folder

def read_text_files_aloud(text_folder):
    """
    Read all text files in the folder aloud using TTS, one by one with clear separation
    """
    # Get all text files
    text_files = glob.glob(os.path.join(text_folder, "*.txt"))
    
    if not text_files:
        print(f"No text files found in {text_folder}")
        return
    
    print(f"\nFound {len(text_files)} text files to read")
    
    # Language mapping for TTS
    lang_mapping = {
        "en": "english",
        "ar": "arabic",
        "ur": "urdu",
        "hi": "hindi",
        "fa": "persian",
        "ps": "pashto"
    }
    
    # Sort text files alphabetically to ensure consistent reading order
    text_files.sort()
    
    for i, text_file_path in enumerate(text_files, 1):
        file_name = os.path.basename(text_file_path)
        print(f"\n===== Reading File {i}/{len(text_files)}: {file_name} =====")
        
        # Announce the file being read (optional)
        tts_engine.setProperty("rate", 150)
        announcement = f"Reading file {i} of {len(text_files)}: {os.path.splitext(file_name)[0]}"
        print(announcement)
        tts_engine.say(announcement)
        tts_engine.runAndWait()
        
        # Pause between announcement and content
        time.sleep(1)
        
        try:
            # Read text file
            with open(text_file_path, 'r', encoding='utf-8') as file:
                lines = file.readlines()
                
            if not lines:
                print(f"File is empty: {text_file_path}")
                continue
                
            # Extract language information from first line
            lang_line = lines[0].strip()
            if lang_line.startswith("LANG:"):
                detected_lang = lang_line[5:]
                print(f"Language: {detected_lang}")
                # Remove the language line
                content = "".join(lines[1:])
            else:
                # No language information, treat all lines as content
                detected_lang = "unknown"
                content = "".join(lines)
            
            if not content.strip():
                print("No content to read")
                continue
                
            # Print a preview of the content
            content_preview = content[:100] + "..." if len(content) > 100 else content
            print(f"Text content: {content_preview}")
            
            # Get available voices
            voices = tts_engine.getProperty("voices")
            
            # Try to set appropriate voice based on language
            voice_set = False
            for voice in voices:
                tts_lang = lang_mapping.get(detected_lang, "english")
                if (tts_lang.lower() in voice.name.lower() or 
                    detected_lang in voice.id.lower()):
                    tts_engine.setProperty("voice", voice.id)
                    voice_set = True
                    print(f"Using voice: {voice.name}")
                    break
            
            if not voice_set:
                print(f"No specific voice found for {detected_lang}, using default")
            
            # Adjust speech rate based on language
            if detected_lang in ["ar", "ur"]:
                # Slower for Arabic and Urdu
                tts_engine.setProperty("rate", 130)
            else:
                tts_engine.setProperty("rate", 150)
            
            # Read the text aloud
            print(f"Reading text aloud...")
            tts_engine.say(content)
            tts_engine.runAndWait()
            
            # Pause between files to clearly separate them
            print("Finished reading file.")
            time.sleep(2)
            
        except Exception as e:
            print(f"Error reading {text_file_path}: {e}")
            continue
    
    print("\n===== All Text Files Have Been Read =====")

def get_user_confirmation():
    """
    Ask the user if they want to proceed to the TTS reading phase
    """
    while True:
        response = input("\nOCR processing complete. Proceed with reading text files? (y/n): ").lower()
        if response in ['y', 'yes']:
            return True
        elif response in ['n', 'no']:
            return False
        else:
            print("Please enter 'y' or 'n'")

# Main process
if __name__ == "__main__":
    try:
        # Define folders
        images_folder = "part_2_images"
        output_folder = "extracted_text"
        
        # Print GPU optimization message
        print("=== Running GPU-Optimized OCR for RTX 2060 ===")
        
        # Step 1: Process images with OCR and save text
        start_time = time.time()
        text_folder = process_images_ocr_save_text(images_folder, output_folder)
        end_time = time.time()
        
        print(f"OCR processing completed in {end_time - start_time:.2f} seconds")
        
        # Optional: Ask for user confirmation before proceeding to TTS
        if get_user_confirmation():
            # Step 2: Read the saved text files aloud one by one
            read_text_files_aloud(text_folder)
        else:
            print("TTS reading canceled. Text files are saved in the output folder.")
        
        print("Process completed successfully!")
        
    except KeyboardInterrupt:
        print("\nProcess interrupted by user")
        # Clean up GPU memory on interrupt
        torch.cuda.empty_cache()
    except Exception as e:
        print(f"An error occurred: {e}")
        # Clean up GPU memory on error
        torch.cuda.empty_cache()
    finally:
        # Final cleanup
        torch.cuda.empty_cache()
        gc.collect()

PyTorch version: 2.7.1+cu118
CUDA version: 11.8
GPU detected: NVIDIA GeForce RTX 2060
GPU memory: 12.88 GB
Initializing EasyOCR with GPU acceleration...
=== Running GPU-Optimized OCR for RTX 2060 ===
Found 6 images to process
GPU Memory: 0.30 GB allocated, 0.56 GB reserved

--- Processing Image 1/6: IMG_20250629_214324_528.jpg ---
Resized image to 1500x2000 to optimize GPU memory
GPU Memory: 0.30 GB allocated, 0.56 GB reserved
Extracting text with GPU acceleration...
OCR completed in 3.44 seconds
Error processing part_2_images\IMG_20250629_214324_528.jpg: '>' not supported between instances of 'str' and 'float'

--- Processing Image 2/6: IMG_20250629_214514_439.jpg ---
Resized image to 2000x1500 to optimize GPU memory
GPU Memory: 0.31 GB allocated, 0.34 GB reserved
Extracting text with GPU acceleration...
OCR completed in 3.30 seconds
Error processing part_2_images\IMG_20250629_214514_439.jpg: '>' not supported between instances of 'str' and 'float'

--- Processing Image 3/6: textbook_