# In [1]:
#
# 1. A simple Python application that opens the web camera and displays its frames.
# 2. The frames that are read from the camera are then sent to Google GenAI.
# 3. The only change that is needed is to plug in your Google API key and then run.
#

# import cv2
import threading
from PIL import Image, ImageTk
import tkinter as tk
import cv2
import io
from dotenv import load_dotenv
import os
import google.generativeai as genai
import google.ai.generativelanguage as glm

# Load variables from a local .env file (if present) BEFORE the key is read, so
# that a GOOGLE_API_KEY entry defined there is visible to os.getenv() below.
load_dotenv()

# Plug in the API key to run the code: export GOOGLE_API_KEY in the environment
# or put a `GOOGLE_API_KEY=...` line in the .env file.
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
genai.configure(api_key=GOOGLE_API_KEY)

# Shared multimodal model used by ContentDescriber.describe_content().
model = genai.GenerativeModel("gemini-pro-vision")


class ContentDescriber:
    """Send the current webcam frame plus the user's prompt to the model.

    The request runs on a worker thread so the GUI stays responsive. Every
    piece of text destined for the user (streamed answer chunks, status and
    error messages) is handed to the Tk event loop via ``root.after`` and
    appended to ``message_var``.
    """

    def __init__(self, root, user_input, video_handler):
        """Store the collaborators and create the variable backing the label.

        Args:
            root: Tk root window; used to schedule GUI updates.
            user_input: Entry-like widget whose ``get()`` returns the prompt.
            video_handler: Object exposing ``get_current_frame()``.
        """
        self.root = root
        self.user_input = user_input
        self.video_handler = video_handler
        self.message_var = tk.StringVar()

    def _post_message(self, text):
        """Queue ``text`` to be appended to the message area by the Tk loop."""
        self.root.after(0, self.update_message, text)

    def describe_content(self, user_request=None):
        """Ask the model about the current frame and stream the answer.

        Args:
            user_request: Prompt text to send with the image. When ``None``
                (the original calling convention) the prompt is read from
                ``user_input``.

        Any exception raised while grabbing the frame, encoding it or talking
        to the model is reported in the message area instead of silently
        terminating the worker thread.
        """
        try:
            current_frame = self.video_handler.get_current_frame()
            if current_frame is None:
                self._post_message("No frame available")
                return

            if user_request is None:
                user_request = self.user_input.get()

            # The frame is converted BGR -> RGB for PIL, then encoded as JPEG
            # bytes for the request payload.
            pil_image = Image.fromarray(cv2.cvtColor(current_frame, cv2.COLOR_BGR2RGB))
            img_byte_arr = io.BytesIO()
            pil_image.save(img_byte_arr, format="JPEG")
            blob = glm.Blob(mime_type="image/jpeg", data=img_byte_arr.getvalue())

            response = model.generate_content([user_request, blob], stream=True)
            for chunk in response:
                self._post_message(chunk.text)
        except Exception as exc:  # top-level boundary of the worker thread
            self._post_message(f"Error: {exc}")

    def threaded_describe_content(self):
        """Start ``describe_content`` on a background thread (button handler).

        The prompt is read here, on the thread that runs the button callback,
        so the worker never has to touch the Entry widget. The thread is a
        daemon so an in-flight request cannot keep the process alive after the
        window has been closed.
        """
        user_request = self.user_input.get()
        describe_thread = threading.Thread(
            target=self.describe_content, args=(user_request,), daemon=True
        )
        describe_thread.start()

    def update_message(self, new_text):
        """Append ``new_text`` and a newline to the text held by ``message_var``."""
        current_text = self.message_var.get()
        updated_text = current_text + new_text + "\n"
        self.message_var.set(updated_text)


class VideoStreamHandler:
    """Capture webcam frames on a worker thread and draw them on a canvas.

    Only the capture loop (``video_stream``) runs on the background thread.
    All Tk work (building the ``PhotoImage`` and updating the canvas) happens
    in ``_render_frame``, which is scheduled with ``root.after`` and therefore
    runs inside the Tk event loop.
    """

    # Delay between two canvas refresh attempts, in milliseconds.
    _RENDER_INTERVAL_MS = 15
    # Pause after a failed read so the capture loop does not busy-spin.
    _READ_RETRY_SECONDS = 0.01
    # Upper bound on how long stop_video() waits for the capture thread.
    _JOIN_TIMEOUT_SECONDS = 1.0

    def __init__(self, root, canvas):
        """Open capture device 0 and initialise the streaming state.

        Args:
            root: Tk root window; used for scheduling and destroyed on stop.
            canvas: Tk canvas the frames are drawn on.
        """
        self.root = root
        self.canvas = canvas
        self.cap = cv2.VideoCapture(0)
        # Reference to the displayed PhotoImage; Tk does not keep one itself.
        self.photo = None
        # Most recent frame returned by cap.read(), or None before the first.
        self.current_frame = None
        self._stop_event = threading.Event()
        self._thread = None
        self._render_job = None  # id returned by root.after for the next draw
        self._image_item = None  # single canvas item reused for every frame
        self._shown_frame = None  # frame currently displayed on the canvas

    def video_stream(self):
        """Capture loop: keep ``current_frame`` updated until stopped.

        Runs until the stop event is set or the capture device is no longer
        open. It deliberately performs no Tk calls so that it is safe to run
        on a background thread.
        """
        while not self._stop_event.is_set() and self.cap.isOpened():
            ret, frame = self.cap.read()
            if ret:
                self.current_frame = frame
            else:
                # Failed read: back off briefly (returns early once stopped).
                self._stop_event.wait(self._READ_RETRY_SECONDS)

    def _render_frame(self):
        """Draw the newest frame on the canvas and schedule the next draw."""
        frame = self.current_frame
        if frame is not None and frame is not self._shown_frame:
            self._shown_frame = frame
            cv2image = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            self.photo = ImageTk.PhotoImage(image=Image.fromarray(cv2image))
            if self._image_item is None:
                self._image_item = self.canvas.create_image(
                    0, 0, image=self.photo, anchor=tk.NW
                )
            else:
                # Reuse the existing item instead of stacking one per frame.
                self.canvas.itemconfig(self._image_item, image=self.photo)
        self._render_job = self.root.after(self._RENDER_INTERVAL_MS, self._render_frame)

    def start_stream(self):
        """Start the capture thread and the canvas refresh loop.

        Calling it again while the capture thread is alive does nothing.
        """
        if self._thread is not None and self._thread.is_alive():
            return
        self._stop_event.clear()
        self._thread = threading.Thread(target=self.video_stream, daemon=True)
        self._thread.start()
        self._render_frame()

    def stop_video(self):
        """Stop streaming, release the capture device and destroy the window."""
        self._stop_event.set()
        if self._render_job is not None:
            self.root.after_cancel(self._render_job)
            self._render_job = None
        worker = self._thread
        if worker is not None and worker is not threading.current_thread():
            # Let an in-flight read() finish before the device is released.
            worker.join(timeout=self._JOIN_TIMEOUT_SECONDS)
        if self.cap.isOpened():
            self.cap.release()
        self.root.destroy()

    def get_current_frame(self):
        """Return the most recently captured frame, or ``None`` if none yet."""
        return self.current_frame


# Main GUI setup and button handlers
root = tk.Tk()
root.title("Webcam Stream")

# Prompt that is sent to the model together with the current frame.
user_input = tk.Entry(root, width=50)
user_input.pack()

# Drawing surface for the live video.
canvas = tk.Canvas(root, width=640, height=480)
canvas.pack()

video_handler = VideoStreamHandler(root, canvas)
content_describer = ContentDescriber(root, user_input, video_handler)

# Closing the window through the window manager must go through the same
# shutdown path as the "Stop" button, otherwise the camera is never released.
root.protocol("WM_DELETE_WINDOW", video_handler.stop_video)

button = tk.Button(root, text="Stop", width=50, command=video_handler.stop_video)
button.pack(anchor=tk.CENTER, expand=True)

describe_button = tk.Button(
    root, text="Describe the frame", width=50, command=content_describer.threaded_describe_content
)
describe_button.pack(anchor=tk.CENTER, expand=True)

# Label bound to the describer's message variable; shows the model's answers.
message_label = tk.Label(root, textvariable=content_describer.message_var, wraplength=500)
message_label.pack()

video_handler.start_stream()

root.mainloop()

# Notebook output residue: from .autonotebook import tqdm as notebook_tqdm
