In [None]:
!pip install torch whisper gradio gtts deep-translator
!pip install langdetect
!pip install deep-translator
!pip install gtts
!pip install gradio




In [None]:
import cv2
from transformers import pipeline, AutoImageProcessor, AutoModelForImageClassification
from PIL import Image
import numpy as np

In [None]:
# Load the image processor (input preprocessing) and the classification model
processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")
model = AutoModelForImageClassification.from_pretrained("google/vit-base-patch16-224")

# High-level pipeline for classification.
# `image_processor` is the pipeline keyword meant for image processors;
# `feature_extractor` is the older keyword for the same role.
pipe = pipeline("image-classification", model=model, image_processor=processor)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Fast image processor class <class 'transformers.models.vit.image_processing_vit_fast.ViTImageProcessorFast'> is available for this model. Using slow image processor class. To use the fast image processor class set `use_fast=True`.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [None]:
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM

# Ready-made text-to-text generation pipeline built on the flan-t5-large checkpoint
pipe1 = pipeline(
    task="text2text-generation",
    model="google/flan-t5-large",
)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [None]:
# Load the flan-t5-large tokenizer first, then the seq2seq model, from the same checkpoint
tokenizer1, model1 = (
    loader.from_pretrained("google/flan-t5-large")
    for loader in (AutoTokenizer, AutoModelForSeq2SeqLM)
)

In [None]:
def get_brief_description(word):
    """
    Ask the flan-t5-large model for a short explanation of ``word``.

    Args:
    - word (str): The term to be explained.

    Returns:
    - str: The decoded text of the single generated sequence.
    """
    # Tokenise the instruction prompt into PyTorch tensors
    encoded = tokenizer1(f"explain what is  {word}", return_tensors="pt")

    # Generate without sampling, asking for one sequence with max_length=50
    generated = model1.generate(
        encoded["input_ids"],
        max_length=50,
        num_return_sequences=1,
        do_sample=False,
    )

    # Only one sequence was requested: decode it and drop special tokens
    first_sequence = generated[0]
    return tokenizer1.decode(first_sequence, skip_special_tokens=True)

In [None]:
from langdetect import detect  # For language detection
from deep_translator import GoogleTranslator  # For language translation
from gtts import gTTS  # For text-to-speech
import gradio as gr  # For creating the interface

In [None]:
def multilingual_tts_runtime(text):
    """
    Prompt for a target language code, then launch a Gradio app that translates
    ``text`` into that language and offers the result as generated speech.

    Args:
    - text (str): The text to translate and convert to speech.

    Returns:
    - None. The function finishes by launching the Gradio interface.
    """
    # The language code is read once from the console, before the app is built
    target_language = input("Enter the target language code (e.g., 'en' for English): ").strip()

    def translate_and_speak():
        """Return (detected language, translation, mp3 path), or an error triple on failure."""
        try:
            # Detect the language the text is written in
            source_lang = detect(text)
            print(f"Detected Language: {source_lang}")

            # Translate from the detected language into the requested one
            translator = GoogleTranslator(source=source_lang, target=target_language)
            translation = translator.translate(text)
            print(f"Translated Text: {translation}")

            # Synthesise speech for the translation and write it to disk
            mp3_path = "output.mp3"
            gTTS(translation, lang=target_language).save(mp3_path)
        except Exception as exc:
            # Any failure is reported through the text output; no audio is produced
            return "Error", f"An error occurred: {str(exc)}", None
        else:
            return source_lang, translation, mp3_path

    def render_outputs():
        """Shape the translation results into the (text, audio) pair the app displays."""
        lang, translation, audio_path = translate_and_speak()
        summary = f"Detected Language: {lang}\nTranslated Text: {translation}"
        return summary, audio_path

    # One textbox for the details and one audio player for the speech file
    output_components = [
        gr.Textbox(label="Translation Details"),
        gr.Audio(label="Generated Speech"),
    ]

    # The app takes no user inputs; everything it needs is captured above
    app = gr.Interface(
        fn=lambda: render_outputs(),
        inputs=None,
        outputs=output_components,
        title="Multilingual Text-to-Speech Generator",
        description="This example takes predefined input and displays the results.",
    )

    app.launch()



In [None]:
def classify_real_time_image():
    """
    Show the live feed from camera 0 and classify frames on demand.

    Pressing 'c' classifies the current frame with the module-level ``pipe``
    and prints the label of the first prediction; pressing 'q' ends the loop.
    The camera and the OpenCV window are released when the loop ends, including
    when classification raises or the loop is interrupted.

    Returns:
    - None. Results and errors are printed.
    """
    # Initialize the camera
    cap = cv2.VideoCapture(0)
    if not cap.isOpened():
        print("Error: Could not open the camera.")
        return

    print("Press 'c' to capture an image for classification, or 'q' to quit.")
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                print("Error: Could not read frame.")
                break

            # Show the camera feed
            cv2.imshow("Camera Feed", frame)

            # Keep only the low byte of the key code so it compares cleanly with ord()
            key = cv2.waitKey(1) & 0xFF
            if key == ord('c'):  # 'c' to capture
                # The frame is converted from BGR to RGB before building the PIL image
                image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
                # Perform classification
                results = pipe(image)
                print("Predicted class:", results[0]['label'])
            elif key == ord('q'):  # 'q' to quit
                break
    finally:
        # Always release the camera and close the OpenCV window, even on error
        cap.release()
        cv2.destroyAllWindows()

In [None]:
def classify_user_image(image_path):
    """
    Classify an image file, describe the predicted label, and hand the
    description to the translation / text-to-speech flow.

    Args:
    - image_path (str): Path of the image file to classify.

    Returns:
    - None.
    """
    # Read the file and convert it to RGB before classification
    rgb_image = Image.open(image_path).convert("RGB")
    predictions = pipe(rgb_image)

    # Use the label of the first prediction returned by the pipeline
    top_label = predictions[0]['label']

    # Generate an explanation for the label, then translate and speak it
    multilingual_tts_runtime(get_brief_description(top_label))

# Example usage: choose between live camera classification and classifying a file
if __name__ == "__main__":
    print("Select an option:")
    print("1: Classify real-time image from camera")
    print("2: Classify image from file")
    # Strip surrounding whitespace so an answer such as "2 " is still recognised,
    # matching how the target language code prompt is handled
    choice = input("Enter your choice (1/2): ").strip()

    if choice == "1":
        classify_real_time_image()
    elif choice == "2":
        # Stray spaces around a typed or pasted path would make the file lookup fail
        image_path = input("Enter the path to the image: ").strip()
        classify_user_image(image_path)
    else:
        print("Invalid choice!")

Select an option:
1: Classify real-time image from camera
2: Classify image from file
Enter your choice (1/2): 2
Enter the path to the image: /content/peacock.jpg
Enter the target language code (e.g., 'en' for English): ta
Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://b46e684dc3cf6d26c2.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
