In [10]:
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
from IPython.display import Audio as IPythonAudio
from IPython.display import display
from PIL import Image
from scipy.io.wavfile import write
from transformers import pipeline

# Read the input picture from disk.
image = Image.open("images/image_1.jpeg")

# Step-1: object detection -- build the detector, then run it on the picture.
object_detection = pipeline("object-detection", model="facebook/detr-resnet-50")
detected_objects = object_detection(image)

# Step-2: turn the detection results (a list of dicts with "label" and "score")
# into a plain-English sentence.
def detr_pipeline_to_text(detections, conf_threshold=0.7):
    """Summarise object detections as a single English sentence.

    Args:
        detections: Iterable of mappings, each providing a ``"label"`` string
            and a numeric ``"score"``.
        conf_threshold: Minimum score (inclusive) a detection needs in order
            to be mentioned.

    Returns:
        A sentence such as ``"The image contains 2 buses and an apple."``.
        Labels appear in order of first occurrence. If no detection reaches
        the threshold, a fixed "nothing detected" sentence is returned.
    """
    # Counter keeps first-seen order, so the sentence follows detection order.
    counts = Counter(
        det["label"] for det in detections if det["score"] >= conf_threshold
    )

    if not counts:
        return "No confident objects were detected in the image."

    parts = [
        _with_article(label) if count == 1 else f"{count} {_pluralize(label)}"
        for label, count in counts.items()
    ]

    if len(parts) == 1:
        return f"The image contains {parts[0]}."
    return "The image contains " + ", ".join(parts[:-1]) + " and " + parts[-1] + "."


_VOWELS = frozenset("aeiou")

# Plurals that the suffix rules in _pluralize would get wrong.
_IRREGULAR_PLURALS = {
    "person": "people",
    "knife": "knives",
    "mouse": "mice",
    "sheep": "sheep",
    "skis": "skis",
    "scissors": "scissors",
}


def _with_article(label):
    """Prefix ``label`` with "a" or "an", chosen by its first letter."""
    # Spelling-based heuristic: it does not handle sound-based exceptions
    # (e.g. "hour", "unicorn").
    article = "an" if label[:1].lower() in _VOWELS else "a"
    return f"{article} {label}"


def _pluralize(label):
    """Return a plural form of ``label`` using a few common English rules."""
    if label in _IRREGULAR_PLURALS:
        return _IRREGULAR_PLURALS[label]

    lowered = label.lower()
    # bus -> buses, sandwich -> sandwiches, toothbrush -> toothbrushes
    if lowered.endswith(("s", "x", "z", "ch", "sh")):
        return label + "es"
    # consonant + y -> ies (berry -> berries); vowel + y just takes "s"
    if len(lowered) > 1 and lowered.endswith("y") and lowered[-2] not in _VOWELS:
        return label[:-1] + "ies"
    return label + "s"
        
# Build the sentence that will be narrated.
result_text = detr_pipeline_to_text(detected_objects, conf_threshold=0.7)

# Step-3: text-to-speech
narrator = pipeline(
    task="text-to-speech",
    model="suno/bark-small",
)

# Synthesize speech; the result is read below via its "audio" and
# "sampling_rate" entries.
narrated_text = narrator(result_text)
audio = narrated_text["audio"]
sr = narrated_text["sampling_rate"]

# Play the audio. Only the last expression of a cell is rendered automatically,
# so the player has to be handed to display() explicitly to show up here.
display(IPythonAudio(audio, rate=sr))

# Save the audio to disk as float32.
# scipy's writer treats axis 0 as samples, so drop any length-1 axes first.
# NOTE(review): assumes mono output that may carry a leading batch axis,
# e.g. shape (1, n_samples) -- confirm against the installed transformers version.
audio = np.squeeze(np.asarray(audio, dtype=np.float32))
write("output1.wav", sr, audio)


Loading weights:   0%|          | 0/530 [00:00<?, ?it/s]

[1mDetrForObjectDetection LOAD REPORT[0m from: facebook/detr-resnet-50
Key                                                            | Status     |  | 
---------------------------------------------------------------+------------+--+-
model.backbone.model.layer2.0.downsample.1.num_batches_tracked | UNEXPECTED |  | 
model.backbone.model.layer1.0.downsample.1.num_batches_tracked | UNEXPECTED |  | 
model.backbone.model.layer4.0.downsample.1.num_batches_tracked | UNEXPECTED |  | 
model.backbone.model.layer3.0.downsample.1.num_batches_tracked | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Loading weights:   0%|          | 0/542 [00:00<?, ?it/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.
Both `max_new_tokens` (=768) and `max_length`(=20) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=60) and `max_length`(=20) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=60) and `max_length`(=20) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both

In [9]:
import gc

# Drop the notebook's references to both pipelines so the objects can be
# reclaimed if nothing else still points at them. pop() with a default keeps
# this cell re-runnable: a plain `del` raises NameError once the names are
# gone, or when the cell runs before they were ever created.
for _name in ("object_detection", "narrator"):
    globals().pop(_name, None)

gc.collect()

3252