In [1]:
# Check CUDA installation and GPU detection
import subprocess

import torch

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
# torch.version.cuda is reported independently of torch.cuda.is_available(),
# so a version can be printed here even when no usable GPU is detected.
print(f"CUDA version: {torch.version.cuda}")

# Check if nvidia-smi works.
# The machine-readable --query-gpu interface is used instead of picking a fixed
# line out of the human-readable table: the table layout differs between driver
# versions, so a hard-coded line index can print the wrong row or raise IndexError.
try:
    result = subprocess.run(
        [
            "nvidia-smi",
            "--query-gpu=name,driver_version,memory.total",
            "--format=csv,noheader",
        ],
        capture_output=True,
        text=True,
    )
except FileNotFoundError:
    # The executable is not on PATH at all.
    print("nvidia-smi not found - NVIDIA drivers may not be installed")
else:
    # One non-empty CSV line is expected per GPU.
    gpu_lines = [line.strip() for line in result.stdout.splitlines() if line.strip()]
    if result.returncode == 0 and gpu_lines:
        print("NVIDIA GPU detected by system:")
        for gpu_line in gpu_lines:
            print(gpu_line)
    else:
        print("nvidia-smi failed - no NVIDIA driver detected")

PyTorch version: 2.7.1+cu126
CUDA available: False
CUDA version: 12.6
nvidia-smi not found - NVIDIA drivers may not be installed


# 1. With Python Script

In [2]:
import os
import glob
import easyocr
import torch
import pyttsx3
from langdetect import detect
import cv2
from PIL import Image
import numpy as np

# Pick the compute device: CUDA when PyTorch reports a usable GPU, otherwise CPU
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
print(f"Using device: {device}")

# Build the EasyOCR reader for Arabic, Urdu and English; GPU mode follows `device`
reader = easyocr.Reader(["ar", "ur", "en"], gpu=(device == "cuda"))

# Create the pyttsx3 text-to-speech engine
tts_engine = pyttsx3.init()


def _collect_image_files(images_folder):
    """Return a sorted, de-duplicated list of image paths directly inside ``images_folder``."""
    image_extensions = ["*.jpg", "*.jpeg", "*.png", "*.bmp", "*.tiff"]
    found = set()
    for ext in image_extensions:
        # Both spellings are globbed so upper-case extensions are found where
        # matching is case-sensitive; the set removes the duplicates this
        # produces where glob matching is case-insensitive (e.g. Windows).
        found.update(glob.glob(os.path.join(images_folder, ext)))
        found.update(glob.glob(os.path.join(images_folder, ext.upper())))
    # Sorting gives a stable processing order across runs.
    return sorted(found)


def _find_voice_id(voices, tts_lang, detected_lang):
    """Return the id of the first voice matching the language, or ``None`` if none matches."""
    wanted_name = tts_lang.lower()
    # Region suffixes such as "zh-cn" are reduced to the primary code ("zh").
    lang_code = detected_lang.lower().split("-")[0]
    for voice in voices or []:
        voice_name = (getattr(voice, "name", None) or "").lower()
        voice_id_text = str(getattr(voice, "id", None) or "").lower()
        # Compare the language code against whole tokens of the id. A plain
        # substring test gives false positives: "ar" is inside "software" and
        # "hi" is inside "machine", both of which occur in registry-style ids.
        id_tokens = "".join(
            ch if ch.isalnum() else " " for ch in voice_id_text
        ).split()
        if wanted_name in voice_name or lang_code in id_tokens:
            return voice.id
    return None


def process_images_with_ocr_tts(images_folder, min_confidence=0.5, speech_rate=150):
    """
    Process all images in the folder, extract text, detect language, and read aloud.

    Relies on the module-level ``reader`` (EasyOCR) and ``tts_engine`` (pyttsx3)
    objects. Errors raised while handling one image are printed and the loop
    moves on to the next image.

    Args:
        images_folder: Folder searched (non-recursively) for jpg, jpeg, png,
            bmp and tiff files.
        min_confidence: OCR fragments whose confidence is not strictly above
            this value are dropped.
        speech_rate: Value passed to the TTS engine's ``rate`` property.

    Returns:
        None. Progress and results are printed; extracted text is spoken.
    """
    image_files = _collect_image_files(images_folder)

    if not image_files:
        print(f"No images found in {images_folder}")
        return

    print(f"Found {len(image_files)} images to process")

    # Language mapping for TTS (language code -> word searched for in voice names)
    lang_mapping = {
        "en": "english",
        "ar": "arabic",
        "ur": "urdu",
        "hi": "hindi",
    }

    # Voice that was active before this call changed anything. It is captured
    # lazily inside the per-image try block so that a TTS engine failure is
    # reported per image instead of aborting the whole run.
    default_voice_id = None

    for i, image_path in enumerate(image_files, 1):
        print(
            f"\n--- Processing Image {i}/{len(image_files)}: {os.path.basename(image_path)} ---"
        )

        try:
            # Read image
            image = cv2.imread(image_path)
            if image is None:
                print(f"Could not read image: {image_path}")
                continue

            # Extract text using EasyOCR
            print("Extracting text...")
            results = reader.readtext(image)

            # Combine all sufficiently confident fragments; each result is
            # indexed as [1] = text and [2] = confidence.
            extracted_text = " ".join(
                result[1] for result in results if result[2] > min_confidence
            )

            if not extracted_text.strip():
                print("No text detected in this image")
                continue

            print(f"Extracted text: {extracted_text}")

            if default_voice_id is None:
                default_voice_id = tts_engine.getProperty("voice")

            # Only language detection is guarded here, so that a TTS failure is
            # not misreported as a detection failure (and the speech retried).
            try:
                detected_lang = detect(extracted_text)
            except Exception as lang_error:
                print(f"Language detection failed: {lang_error}")
                print("Reading with default language...")
                detected_lang = None

            voice_id = None
            if detected_lang is not None:
                print(f"Detected language: {detected_lang}")
                tts_lang = lang_mapping.get(detected_lang, "english")
                voice_id = _find_voice_id(
                    tts_engine.getProperty("voices"), tts_lang, detected_lang
                )
                if voice_id is None:
                    print(f"No specific voice found for {detected_lang}, using default")

            # Fall back to the original voice so a voice chosen for an earlier
            # image does not silently carry over to this one.
            if voice_id is None:
                voice_id = default_voice_id
            if voice_id is not None:
                tts_engine.setProperty("voice", voice_id)

            # Adjust speech rate
            tts_engine.setProperty("rate", speech_rate)

            # Read the text aloud
            if detected_lang is not None:
                print(f"Reading text aloud in {detected_lang}...")
            tts_engine.say(extracted_text)
            tts_engine.runAndWait()

        except Exception as e:
            print(f"Error processing {image_path}: {e}")
            continue

    print("\n--- Processing Complete ---")


# Process images in the part_2_images folder.
# The path is relative, so it is resolved against the kernel's current working
# directory; the call prints its progress and speaks any text it extracts.
images_folder = "part_2_images"
process_images_with_ocr_tts(images_folder)

Using CPU. Note: This module is much faster with a GPU.


Using device: cpu
Found 3 images to process

--- Processing Image 1/3: IMG_20250629_214514_439.jpg ---
Extracting text...


KeyboardInterrupt: 

# 2. With Groq + LiveKit

In [None]:
# import os
# import cv2
# import pytesseract
# import requests

# # Load secrets from environment variables or a .env file; the values below are placeholders - never commit real keys
# GROQ_API_URL = "https://api.groq.com/openai/v1/chat/completions"
# GROQ_API_KEY = "your_groq_api_key"
# LIVEKIT_TTS_URL = "https://your-livekit-server.com/tts"
# LIVEKIT_API_KEY = "your_livekit_api_key"

# img_folder = "part_2_images"
# output_folder = "tts_outputs"
# os.makedirs(output_folder, exist_ok=True)


# def detect_language_groq(text):
#     prompt = f"Detect the language of the following text and respond with only the ISO 639-1 language code:\n\n{text}"
#     headers = {"Authorization": f"Bearer {GROQ_API_KEY}"}
#     data = {
#         "model": "llama3-8b-8192",  # Or another Groq-supported model
#         "messages": [{"role": "user", "content": prompt}],
#     }
#     response = requests.post(GROQ_API_URL, headers=headers, json=data)
#     lang_code = response.json()["choices"][0]["message"]["content"].strip()
#     return lang_code


# for img_file in os.listdir(img_folder):
#     if img_file.lower().endswith((".jpg", ".jpeg", ".png")):
#         img_path = os.path.join(img_folder, img_file)
#         img = cv2.imread(img_path)
#         text = pytesseract.image_to_string(img)
#         print(f"\nText from {img_file}:\n{text}")

#         if text.strip():
#             try:
#                 lang = detect_language_groq(text)
#                 print(f"Detected language (Groq): {lang}")

#                 # --- LiveKit TTS API Call ---
#                 tts_payload = {"text": text, "voice": "default", "lang": lang}
#                 headers = {"Authorization": f"Bearer {LIVEKIT_API_KEY}"}
#                 response = requests.post(
#                     LIVEKIT_TTS_URL, json=tts_payload, headers=headers
#                 )
#                 if response.status_code == 200:
#                     audio_path = os.path.join(
#                         output_folder, f"{os.path.splitext(img_file)[0]}_{lang}.mp3"
#                     )
#                     with open(audio_path, "wb") as f:
#                         f.write(response.content)
#                     print(f"TTS audio saved: {audio_path}")
#                 else:
#                     print(f"LiveKit TTS error: {response.status_code} {response.text}")

#             except Exception as e:
#                 print(f"Error processing {img_file}: {e}")
#         else:
#             print(f"No text detected in {img_file}.")

TesseractNotFoundError: tesseract is not installed or it's not in your PATH. See README file for more information.