In [3]:
# Step 1: Install necessary libraries
!pip install torch torchvision transformers gradio pydub gtts speechrecognition

# Step 2: Import required libraries
import torch
from transformers import CLIPProcessor, CLIPModel, VisionEncoderDecoderModel, AutoTokenizer, pipeline
import gradio as gr
from PIL import Image
import logging
from pydub import AudioSegment
from gtts import gTTS
import speech_recognition as sr

# Setup logging
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(message)s')

# Step 3: Initialize models
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
caption_model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
caption_tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
qa_model = pipeline("question-answering", model="deepset/roberta-base-squad2")

def generate_caption(image):
    inputs = clip_processor(images=image, return_tensors="pt")
    pixel_values = inputs.pixel_values
    generated_ids = caption_model.generate(pixel_values)
    caption = caption_tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    return caption

def answer_question(caption, question):
    qa_input = {
        "question": question,
        "context": caption
    }
    answer = qa_model(qa_input)
    return answer['answer']

def vqa_pipeline(image, question_text, question_audio, input_type):
    try:
        question = ""
        if input_type == "Text":
            question = question_text
        else:
            # Convert audio question to text
            recognizer = sr.Recognizer()
            with sr.AudioFile(question_audio) as source:
                audio_data = recognizer.record(source)
                question = recognizer.recognize_google(audio_data)
        
        # Generate a caption for the image
        caption = generate_caption(image)
        logging.debug(f"Generated caption: {caption}")
        
        # Answer the question based on the caption
        answer = answer_question(caption, question)
        logging.debug(f"Answer: {answer}")
        
        # Convert text answer to audio
        if answer:
            tts = gTTS(text=answer, lang='en')
            tts.save("output.mp3")
            return answer, "output.mp3"
        else:
            return "No answer found", None
    except Exception as e:
        logging.error(f"Error in VQA pipeline: {e}")
        return str(e), None

def toggle_inputs(input_type):
    if input_type == "Text":
        return gr.update(visible=True), gr.update(visible=False)
    else:
        return gr.update(visible=False), gr.update(visible=True)

# Step 5: Create Gradio interface
with gr.Blocks() as iface:
    input_type = gr.Radio(["Text", "Audio"], label="Input Type", value="Text")
    with gr.Row():
        image_input = gr.Image(type="pil", label="Upload Image")
    with gr.Row():
        text_input = gr.Textbox(label="Ask a question about the image (Text Input)", visible=True)
        audio_input = gr.Audio(type="filepath", label="Ask a question about the image (Audio Input)", visible=False)
    submit_button = gr.Button("Submit")
    answer_output = gr.Textbox(label="Answer")
    audio_output = gr.Audio(type="filepath", label="Voice Output")
    
    input_type.change(toggle_inputs, inputs=input_type, outputs=[text_input, audio_input])
    submit_button.click(vqa_pipeline, inputs=[image_input, text_input, audio_input, input_type], outputs=[answer_output, audio_output])

# Step 6: Launch the interface
iface.launch(share=True)


Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cpu


2024-08-02 09:56:45,907 - DEBUG - https://huggingface.co:443 "HEAD /openai/clip-vit-base-patch32/resolve/main/config.json HTTP/11" 200 0
2024-08-02 09:56:45,965 - DEBUG - https://huggingface.co:443 "HEAD /openai/clip-vit-base-patch32/resolve/main/config.json HTTP/11" 200 0
2024-08-02 09:56:45,974 - DEBUG - Starting new HTTPS connection (1): huggingface.co:443
2024-08-02 09:56:46,065 - DEBUG - https://huggingface.co:443 "HEAD /openai/clip-vit-base-patch32/resolve/main/model.safetensors HTTP/11" 404 0
2024-08-02 09:56:46,395 - DEBUG - https://huggingface.co:443 "HEAD /openai/clip-vit-base-patch32/resolve/main/preprocessor_config.json HTTP/11" 200 0
2024-08-02 09:56:46,449 - DEBUG - https://huggingface.co:443 "HEAD /openai/clip-vit-base-patch32/resolve/main/tokenizer_config.json HTTP/11" 200 0
2024-08-02 09:56:46,597 - DEBUG - https://huggingface.co:443 "HEAD /openai/clip-vit-base-patch32/resolve/main/processor_config.json HTTP/11" 404 0
2024-08-02 09:56:46,730 - DEBUG - https://huggingfa

Running on local URL:  http://127.0.0.1:7862


2024-08-02 09:56:51,590 - DEBUG - start_tls.complete return_value=<httpcore._backends.sync.SyncStream object at 0x7eff94287250>
2024-08-02 09:56:51,591 - DEBUG - send_request_headers.started request=<Request [b'GET']>
2024-08-02 09:56:51,593 - DEBUG - send_request_headers.complete
2024-08-02 09:56:51,594 - DEBUG - send_request_body.started request=<Request [b'GET']>
2024-08-02 09:56:51,595 - DEBUG - send_request_body.complete
2024-08-02 09:56:51,595 - DEBUG - receive_response_headers.started request=<Request [b'GET']>
2024-08-02 09:56:51,629 - DEBUG - start_tls.complete return_value=<httpcore._backends.sync.SyncStream object at 0x7f00c6e76e20>
2024-08-02 09:56:51,631 - DEBUG - send_request_headers.started request=<Request [b'GET']>
2024-08-02 09:56:51,632 - DEBUG - send_request_headers.complete
2024-08-02 09:56:51,633 - DEBUG - send_request_body.started request=<Request [b'GET']>
2024-08-02 09:56:51,634 - DEBUG - send_request_body.complete
2024-08-02 09:56:51,635 - DEBUG - receive_resp

Running on public URL: https://6c557bb7ce44d067c0.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


2024-08-02 09:56:52,308 - DEBUG - connect_tcp.complete return_value=<httpcore._backends.sync.SyncStream object at 0x7eff951268b0>
2024-08-02 09:56:52,309 - DEBUG - start_tls.started ssl_context=<ssl.SSLContext object at 0x7eff942f42c0> server_hostname='6c557bb7ce44d067c0.gradio.live' timeout=3
2024-08-02 09:56:52,442 - DEBUG - start_tls.complete return_value=<httpcore._backends.sync.SyncStream object at 0x7eff94295ac0>
2024-08-02 09:56:52,443 - DEBUG - send_request_headers.started request=<Request [b'HEAD']>
2024-08-02 09:56:52,445 - DEBUG - send_request_headers.complete
2024-08-02 09:56:52,446 - DEBUG - send_request_body.started request=<Request [b'HEAD']>
2024-08-02 09:56:52,447 - DEBUG - send_request_body.complete
2024-08-02 09:56:52,447 - DEBUG - receive_response_headers.started request=<Request [b'HEAD']>
2024-08-02 09:56:52,581 - DEBUG - receive_response_headers.complete return_value=(b'HTTP/1.1', 200, b'OK', [(b'Date', b'Fri, 02 Aug 2024 09:56:52 GMT'), (b'Content-Type', b'text/

2024-08-02 09:56:52,609 - DEBUG - Starting new HTTPS connection (1): huggingface.co:443




2024-08-02 09:56:52,664 - DEBUG - https://huggingface.co:443 "HEAD /api/telemetry/gradio/launched HTTP/11" 200 0
2024-08-02 09:57:17,170 - DEBUG - Calling on_part_begin with no data
2024-08-02 09:57:17,172 - DEBUG - Calling on_header_field with data[42:61]
2024-08-02 09:57:17,173 - DEBUG - Calling on_header_value with data[63:140]
2024-08-02 09:57:17,174 - DEBUG - Calling on_header_end with no data
2024-08-02 09:57:17,175 - DEBUG - Calling on_header_field with data[142:154]
2024-08-02 09:57:17,175 - DEBUG - Calling on_header_value with data[156:165]
2024-08-02 09:57:17,176 - DEBUG - Calling on_header_end with no data
2024-08-02 09:57:17,177 - DEBUG - Calling on_headers_finished with no data
2024-08-02 09:57:17,180 - DEBUG - Calling on_part_data with data[169:16375]
2024-08-02 09:57:17,183 - DEBUG - Calling on_part_data with data[0:2614]
2024-08-02 09:57:17,184 - DEBUG - Calling on_part_data with data[0:1]
2024-08-02 09:57:17,184 - DEBUG - Calling on_part_data with data[2615:16375]
2024