In [1]:
# Check CUDA installation and GPU detection
import torch

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda}")

# Check if nvidia-smi works
import subprocess

try:
    result = subprocess.run(["nvidia-smi"], capture_output=True, text=True)
    if result.returncode == 0:
        print("NVIDIA GPU detected by system:")
        print(result.stdout.split("\n")[8])  # GPU info line
    else:
        print("nvidia-smi failed - no NVIDIA driver detected")
except FileNotFoundError:
    print("nvidia-smi not found - NVIDIA drivers may not be installed")


import easyocr
reader = easyocr.Reader(['en'], gpu=True)
# If no errors, GPU should be working with EasyOCR

PyTorch version: 2.7.1+cpu
CUDA available: False
CUDA version: None
NVIDIA GPU detected by system:
|   0  NVIDIA GeForce RTX 2060      WDDM  |   00000000:01:00.0  On |                  N/A |


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Downloading recognition model, please wait. This may take several minutes depending upon your network connection.


Progress: |██████████████████████████████████████████████████| 100.0% Complete

# 1. With Python Script

In [7]:
import os
import glob
import easyocr
import torch
import pyttsx3
from langdetect import detect
import cv2
from PIL import Image
import numpy as np

# Check for GPU availability
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Initialize EasyOCR reader with GPU support
reader = easyocr.Reader(["ar", "ur", "en"], gpu=torch.cuda.is_available())

# Initialize TTS engine
tts_engine = pyttsx3.init()


def process_images_with_ocr_tts(images_folder):
    """
    Process all images in the folder, extract text, detect language, and read aloud
    """
    # Get all image files in the folder
    image_extensions = ["*.jpg", "*.jpeg", "*.png", "*.bmp", "*.tiff"]
    image_files = []

    for ext in image_extensions:
        image_files.extend(glob.glob(os.path.join(images_folder, ext)))
        image_files.extend(glob.glob(os.path.join(images_folder, ext.upper())))

    if not image_files:
        print(f"No images found in {images_folder}")
        return

    print(f"Found {len(image_files)} images to process")

    for i, image_path in enumerate(image_files, 1):
        print(
            f"\n--- Processing Image {i}/{len(image_files)}: {os.path.basename(image_path)} ---"
        )

        try:
            # Read image
            image = cv2.imread(image_path)
            if image is None:
                print(f"Could not read image: {image_path}")
                continue

            # Extract text using EasyOCR
            print("Extracting text...")
            results = reader.readtext(image)

            # Combine all detected text
            extracted_text = " ".join(
                [result[1] for result in results if result[2] > 0.5]
            )  # confidence > 0.5

            if not extracted_text.strip():
                print("No text detected in this image")
                continue

            print(f"Extracted text: {extracted_text}")

            # Detect language
            try:
                detected_lang = detect(extracted_text)
                print(f"Detected language: {detected_lang}")

                # Language mapping for TTS
                lang_mapping = {
                    "en": "english",
                    "ar": "arabic",
                    "ur": "urdu",
                    "hi": "hindi",
                }

                # Set TTS language if available
                tts_lang = lang_mapping.get(detected_lang, "english")

                # Get available voices
                voices = tts_engine.getProperty("voices")

                # Try to set appropriate voice based on language
                voice_set = False
                for voice in voices:
                    if (
                        tts_lang.lower() in voice.name.lower()
                        or detected_lang in voice.id.lower()
                    ):
                        tts_engine.setProperty("voice", voice.id)
                        voice_set = True
                        break

                if not voice_set:
                    print(f"No specific voice found for {detected_lang}, using default")

                # Adjust speech rate
                tts_engine.setProperty("rate", 150)  # Adjust speed as needed

                # Read the text aloud
                print(f"Reading text aloud in {detected_lang}...")
                tts_engine.say(extracted_text)
                tts_engine.runAndWait()

            except Exception as lang_error:
                print(f"Language detection failed: {lang_error}")
                print("Reading with default language...")
                tts_engine.say(extracted_text)
                tts_engine.runAndWait()

        except Exception as e:
            print(f"Error processing {image_path}: {e}")
            continue

    print("\n--- Processing Complete ---")


# Process images in the part_2_images folder
images_folder = "part_2_images"
process_images_with_ocr_tts(images_folder)



Using device: cpu
Found 6 images to process

--- Processing Image 1/6: IMG_20250629_214324_528.jpg ---
Extracting text...
Extracted text: 4: Employee Hierarchy ٥. Employee ( parent ): 0 Manager, like name, id; method like get_ role specifie methods Code: lass Employee : def init ( self, self .name _i0): name self . emp_id emp_id def get_details(self): print (f"Name : {self name}, I0: {self ,emp_id}" ) tlass Manager ( Employee)= def calculatesalary(self): def assign_task(self): print ( "Manager assigns tasks ) rlass Developer ( Employee ) def calculatesalary(self): 6000  U$D") def assign_task(self): Intern Employee ) : def calculatesalary(self): print ( Intern salary: 2000 U$0") def assign task(self): print( "Intern assists in tasks" ) Manager ( "Alice 101) Lelculate_salary( 10?) Task attributes Developer , implement assign_ name , dass
Detected language: en
Reading text aloud in en...

--- Processing Image 2/6: IMG_20250629_214328_880.jpg ---
Extracting text...
Extracted text: 11 Pyiio

KeyboardInterrupt: 

# V2 saving in file and reading aloud from there

In [3]:
import os
import glob
import easyocr
import torch
import pyttsx3
from langdetect import detect
import cv2
from PIL import Image
import numpy as np
import time

# Check for GPU availability
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Initialize EasyOCR reader with GPU support
reader = easyocr.Reader(["ar", "ur", "en"], gpu=torch.cuda.is_available())

# Initialize TTS engine
tts_engine = pyttsx3.init()

def process_images_ocr_save_text(images_folder, output_folder):
    """
    Process all images in the folder, extract text with OCR, and save to text files
    """
    # Create output folder if it doesn't exist
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
        print(f"Created output folder: {output_folder}")
    
    # Get all image files in the folder
    image_extensions = ["*.jpg", "*.jpeg", "*.png", "*.bmp", "*.tiff"]
    image_files = []

    for ext in image_extensions:
        image_files.extend(glob.glob(os.path.join(images_folder, ext)))
        image_files.extend(glob.glob(os.path.join(images_folder, ext.upper())))

    if not image_files:
        print(f"No images found in {images_folder}")
        return

    print(f"Found {len(image_files)} images to process")

    for i, image_path in enumerate(image_files, 1):
        image_filename = os.path.basename(image_path)
        base_name = os.path.splitext(image_filename)[0]
        text_file_path = os.path.join(output_folder, f"{base_name}.txt")
        
        print(f"\n--- Processing Image {i}/{len(image_files)}: {image_filename} ---")

        try:
            # Read image
            image = cv2.imread(image_path)
            if image is None:
                print(f"Could not read image: {image_path}")
                continue

            # Extract text using EasyOCR
            print("Extracting text...")
            results = reader.readtext(image)

            # Combine all detected text
            extracted_text = " ".join(
                [result[1] for result in results if result[2] > 0.5]
            )  # confidence > 0.5

            if not extracted_text.strip():
                print("No text detected in this image")
                continue

            print(f"Extracted text: {extracted_text}")
            
            # Try to detect language
            try:
                detected_lang = detect(extracted_text)
                print(f"Detected language: {detected_lang}")
                
                # Save text with language information to file
                with open(text_file_path, 'w', encoding='utf-8') as text_file:
                    text_file.write(f"LANG:{detected_lang}\n")
                    text_file.write(extracted_text)
                
                print(f"Saved text to: {text_file_path}")
                
            except Exception as lang_error:
                print(f"Language detection failed: {lang_error}")
                # Save text without language information
                with open(text_file_path, 'w', encoding='utf-8') as text_file:
                    text_file.write(f"LANG:unknown\n")
                    text_file.write(extracted_text)
                
                print(f"Saved text to: {text_file_path} (language unknown)")

        except Exception as e:
            print(f"Error processing {image_path}: {e}")
            continue

    print("\n--- OCR Processing Complete ---")
    return output_folder

def read_text_files_aloud(text_folder):
    """
    Read all text files in the folder aloud using TTS
    """
    # Get all text files
    text_files = glob.glob(os.path.join(text_folder, "*.txt"))
    
    if not text_files:
        print(f"No text files found in {text_folder}")
        return
    
    print(f"\nFound {len(text_files)} text files to read")
    
    # Language mapping for TTS
    lang_mapping = {
        "en": "english",
        "ar": "arabic",
        "ur": "urdu",
        "hi": "hindi",
    }
    
    for i, text_file_path in enumerate(text_files, 1):
        file_name = os.path.basename(text_file_path)
        print(f"\n--- Reading File {i}/{len(text_files)}: {file_name} ---")
        
        try:
            # Read text file
            with open(text_file_path, 'r', encoding='utf-8') as file:
                lines = file.readlines()
                
            if not lines:
                print(f"File is empty: {text_file_path}")
                continue
                
            # Extract language information from first line
            lang_line = lines[0].strip()
            if lang_line.startswith("LANG:"):
                detected_lang = lang_line[5:]
                print(f"Language: {detected_lang}")
                # Remove the language line
                content = "".join(lines[1:])
            else:
                # No language information, treat all lines as content
                detected_lang = "unknown"
                content = "".join(lines)
            
            if not content.strip():
                print("No content to read")
                continue
                
            print(f"Text content: {content[:100]}..." if len(content) > 100 else f"Text content: {content}")
            
            # Get available voices
            voices = tts_engine.getProperty("voices")
            
            # Try to set appropriate voice based on language
            voice_set = False
            for voice in voices:
                tts_lang = lang_mapping.get(detected_lang, "english")
                if (tts_lang.lower() in voice.name.lower() or 
                    detected_lang in voice.id.lower()):
                    tts_engine.setProperty("voice", voice.id)
                    voice_set = True
                    print(f"Using voice: {voice.name}")
                    break
            
            if not voice_set:
                print(f"No specific voice found for {detected_lang}, using default")
            
            # Adjust speech rate
            tts_engine.setProperty("rate", 150)  # Adjust speed as needed
            
            # Read the text aloud
            print(f"Reading text aloud...")
            tts_engine.say(content)
            tts_engine.runAndWait()
            
        except Exception as e:
            print(f"Error reading {text_file_path}: {e}")
            continue
    
    print("\n--- TTS Reading Complete ---")

# Main process
if __name__ == "__main__":
    # Define folders
    images_folder = "part_2_images"
    output_folder = "extracted_text"
    
    # Step 1: Process images with OCR and save text
    text_folder = process_images_ocr_save_text(images_folder, output_folder)
    
    # Step 2: Read the saved text files aloud
    read_text_files_aloud(text_folder)
    
    print("Process completed successfully!")

Using CPU. Note: This module is much faster with a GPU.


Using device: cpu
Found 6 images to process

--- Processing Image 1/6: IMG_20250629_214324_528.jpg ---
Extracting text...
Extracted text: 4: Employee Hierarchy ٥. Employee ( parent ): 0 Manager, like name, id; method like get_ role specifie methods Code: lass Employee : def init ( self, self .name _i0): name self . emp_id emp_id def get_details(self): print (f"Name : {self name}, I0: {self ,emp_id}" ) tlass Manager ( Employee) = def calculatesalary(self): def assign_task(self): print ( "Manager assigns tasks ) rlass Developer ( Employee ) def calculatesalary(self): 6000  U$D") def assign_task(self): print( "Developer writes code") Intern Employee ) : def calculatesalary(self): print ( Intern salary: 2000 U$0") def assign task(self): print( "Intern assists in tasks" ) Manager ( "Alice 101) Lelculate_salary( 10?) Task attributes Developer , implement assign_ name , dass
Detected language: en
Saved text to: extracted_text\IMG_20250629_214324_528.txt

--- Processing Image 2/6: IMG_20250629