In [None]:
import gradio as gr
import cv2
import numpy as np
import random
import platform
import io
import base64
from PIL import Image, ImageDraw, ImageFont
import requests
import re

def random_color():
    """Return a random RGB colour as a tuple of three ints, each in 0..255."""
    # One independent draw per channel, in R, G, B order.
    return tuple(random.randint(0, 255) for _ in range(3))

def get_font():
    """Return the PIL font used to draw detection labels.

    On macOS ("Darwin") and Windows a platform-specific TrueType font file is
    requested by name; on every other platform Pillow's default font is used.
    The font size is fixed at 50.

    Returns:
        A PIL font object usable as the ``font`` argument of
        ``ImageDraw.text``.
    """
    font_size = 50
    system_name = platform.system()

    if system_name == "Darwin":
        font_file = "AppleGothic.ttf"
    elif system_name == "Windows":
        font_file = "malgun.ttf"
    else:
        font_file = None

    if font_file is not None:
        try:
            # The original Windows branch called the non-existent
            # ``ImageFont.trutype`` and always raised AttributeError.
            return ImageFont.truetype(font_file, size=font_size)
        except OSError:
            # The named font file is not installed on this machine; use the
            # default font below rather than failing every drawn frame.
            pass

    return ImageFont.load_default(size=font_size)


# Locations of the YOLOv3 weights, the network config and the class-name file.
weights_path = "yolo실습자료/yolov3.weights"
config_path = "yolo실습자료/yolov3.cfg"
names_path = "yolo실습자료/coco_korean.names"

# Build the detection network once at load time; detect_objects() reuses it.
net = cv2.dnn.readNet(weights_path, config_path)

# One label per line; a label's position in the list is its class index.
with open(names_path, encoding='utf-8') as file:
    names_text = file.read()
label_list = names_text.strip().split('\n')

###################################################
# Open AI
###################################################
def request_gpt(image_array):
    """Ask the Azure OpenAI chat deployment to describe the detections in an image.

    The image is PNG-encoded, wrapped in a base64 data URL and posted together
    with a Korean system prompt and user prompt to the chat-completions
    endpoint built from the module-level ``OPENAI_ENDPOINT`` and
    ``DEPLOYMENT_NAME``; ``OPENAI_API_KEY`` is sent in the ``api-key`` header.
    Those three names are not defined in this section and must already exist
    when the function is called.

    Args:
        image_array: NumPy image data accepted by ``PIL.Image.fromarray``.

    Returns:
        The first choice's message content when the service answers with
        HTTP 200, the raw response body for any other status, or a short
        Korean error message when the request itself fails (for example a
        connection error or a timeout).
    """
    endpoint = "{}openai/deployments/{}/chat/completions?api-version=2025-01-01-preview".format(
        OPENAI_ENDPOINT, DEPLOYMENT_NAME
    )
    headers = {
        "Content-Type": "application/json",
        "api-key": OPENAI_API_KEY,
    }

    # numpy -> PIL -> in-memory PNG bytes -> base64 text for the data URL.
    image = Image.fromarray(image_array)
    buffered_io = io.BytesIO()
    image.save(buffered_io, format='png')
    base64_image = base64.b64encode(buffered_io.getvalue()).decode('utf-8')

    # Prompt typos from the original ("속제서", "물체데", "바운딩박스,안에",
    # "감지왼") are corrected so the model receives well-formed instructions.
    message_list = [
        {
            "role": "system",
            "content": [{
                "type": "text",
                "text": """
            너는 사진 속에서 감지된 물체를 분석하는 봇이야.
            무조건 분석 결과를 한국어로 답변해줘
            """
            }],
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": """
            너는 물체를 감지하는 YOLO 모델이야.
            이 사진에서 감지된 물체에 대해 감지 확률과 함께 자세한 설명을 붙여줘.
            반드시 감지된 물체, 바운딩박스 안에 있는 물체에 대해서만 설명해줘
            부연 설명은 필요없고 감지된 물체에 대해서만 설명해줘야 해
            """
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": "data:image/png;base64,{}".format(base64_image)
                    }
                },
            ],
        },
    ]

    body = {
        "messages": message_list,
        "temperature": 0.7,
        "top_p": 0.95,
        "max_tokens": 16000,
    }

    try:
        # requests never times out by default, so a stalled connection would
        # block the UI handler forever; bound connect and read time instead.
        response = requests.post(endpoint, headers=headers, json=body, timeout=(10, 120))
    except requests.RequestException as error:
        # Same convention as the non-200 branch below: hand the failure text
        # back to the caller as the content.
        print(error)
        return "GPT 요청에 실패했습니다: {}".format(error)

    print(response.status_code, response.text)

    if response.status_code == 200:
        response_json = response.json()
        content = response_json['choices'][0]['message']['content']
    else:
        content = response.text

    return content


#####################################
# TTS
#####################################
def request_tts(text):
    """Synthesize ``text`` with the speech service and save the audio to disk.

    The text is wrapped in an SSML document for the ``ko-KR-GookMinNeural``
    voice and posted to the module-level ``SPEECH_ENDPOINT`` using
    ``SPEECH_API_KEY``; neither name is defined in this section, so both must
    already exist when the function is called.

    Args:
        text: Plain text to be spoken.

    Returns:
        The name of the written audio file on HTTP 200, otherwise ``None``
        (empty text, transport failure or a non-200 status).
    """
    # Local import keeps this block self-contained (stdlib only).
    from xml.sax.saxutils import escape

    if not text or not text.strip():
        # Nothing to speak; skip the request instead of sending empty SSML.
        return None

    endpoint = SPEECH_ENDPOINT
    headers = {
        "Ocp-Apim-Subscription-Key": SPEECH_API_KEY,
        "Content-Type": "application/ssml+xml",
        "X-Microsoft-OutputFormat": "audio-16khz-128kbitrate-mono-mp3",
    }
    # escape() keeps characters such as '&' or '<' in the text from producing
    # malformed SSML. The voice element is tagged ko-KR to match the Korean
    # voice it names (it was tagged en-US).
    body = f""" 
        <speak version='1.0' xml:lang='ko-KR'>
            <voice xml:lang='ko-KR' xml:gender='Female' name='ko-KR-GookMinNeural'>
                <prosody rate="50%">
                    {escape(text)}
                </prosody>
            </voice>
        </speak>
    """

    try:
        # Send explicit UTF-8 bytes: a str body can be encoded as latin-1 by
        # the HTTP stack, which fails on Korean text.
        response = requests.post(
            endpoint, headers=headers, data=body.encode("utf-8"), timeout=(10, 60)
        )
    except requests.RequestException as error:
        print(error)
        return None

    if response.status_code != 200:
        # Only error bodies are text; a successful body is binary audio and
        # is not worth printing.
        print(response.status_code, response.text)
        return None

    print(response.status_code)

    # The requested output format is MP3, so the extension says so too.
    file_name = "response_audio.mp3"
    with open(file_name, "wb") as audio_file:
        audio_file.write(response.content)

    return file_name



def detect_objects(image, score_threshold=0.5, nms_threshold=0.4):
    """Run the module-level YOLO network on ``image`` and draw the detections.

    Uses the module-level ``net`` and ``label_list`` plus the ``get_font`` and
    ``random_color`` helpers. Every box that survives non-maximum suppression
    is drawn in a random colour with its label and confidence percentage.

    Args:
        image: Image as a NumPy array with height and width as its first two
            dimensions. It is handed to ``PIL.Image.fromarray`` for drawing,
            so 3-channel data is treated as RGB.
        score_threshold: Minimum class score a detection needs to be kept
            (defaults to the 0.5 previously hard-coded in the NMS call).
        nms_threshold: Overlap threshold passed to ``cv2.dnn.NMSBoxes``
            (defaults to the previously hard-coded 0.4).

    Returns:
        A ``PIL.Image`` copy of the input with boxes and labels drawn on it,
        or ``None`` when ``image`` is ``None``.
    """
    if image is None:
        # No frame available yet; there is nothing to analyse or draw.
        return None

    drawn_image = Image.fromarray(image.copy())
    draw = ImageDraw.Draw(drawn_image)

    # Original frame size, needed to scale the normalised box coordinates.
    height, width = image.shape[:2]

    # The frame is treated as RGB everywhere in this function (it is drawn via
    # PIL above), so the channels are passed through unswapped. swapRB=True is
    # only appropriate for BGR input such as the result of cv2.imread.
    blob = cv2.dnn.blobFromImage(image, 1 / 255.0, (416, 416), swapRB=False, crop=False)
    net.setInput(blob=blob)

    # Names of the output layers; asking for the names directly avoids
    # indexing getLayerNames() with values whose shape differs between
    # OpenCV versions (scalars vs. 1-element arrays).
    output_layer_list = net.getUnconnectedOutLayersNames()

    # Forward pass: one prediction array per output layer.
    detection_list = net.forward(output_layer_list)

    bounding_box_list = []
    confidence_list = []
    class_index_list = []

    scale = np.array([width, height, width, height])

    for output in detection_list:
        # Each row is one detection: x, y, w, h, confidence, then one score
        # per class name.
        class_scores = output[:, 5:]
        class_ids = np.argmax(class_scores, axis=1)
        confidences = class_scores[np.arange(len(class_scores)), class_ids]

        # NMS would discard anything at or below score_threshold anyway, so
        # filter here and keep the Python-level loop short.
        keep = confidences > score_threshold
        boxes = (output[keep, :4] * scale).astype('int')

        for (center_x, center_y, w, h), confidence, class_index in zip(
            boxes, confidences[keep], class_ids[keep]
        ):
            # Convert centre-based coordinates to a top-left corner and clamp
            # it to the image.
            x = max(int(center_x - w / 2), 0)
            y = max(int(center_y - h / 2), 0)

            bounding_box_list.append([x, y, int(w), int(h)])
            confidence_list.append(float(confidence))
            class_index_list.append(int(class_index))

    # Remove overlapping duplicates.
    extracted_index_list = cv2.dnn.NMSBoxes(
        bounding_box_list, confidence_list, score_threshold, nms_threshold
    )
    # Depending on the OpenCV version the result is a flat array, an Nx1
    # array, or an empty tuple; flatten to a uniform 1-D sequence.
    extracted_index_list = np.array(extracted_index_list).flatten()

    if extracted_index_list.size == 0:
        return drawn_image

    # Load the font once per frame instead of once per box.
    font = get_font()

    for extracted_index in extracted_index_list:
        extracted_index = int(extracted_index)
        x, y, w, h = bounding_box_list[extracted_index]
        confidence = confidence_list[extracted_index]
        label = label_list[class_index_list[extracted_index]]

        color = random_color()

        draw.rectangle((x, y, x + w, y + h), outline=color, width=3)
        draw.text(
            (x + 5, y + 5),
            text="{} : {:0.2f}%".format(label, confidence * 100),
            fill=color,
            font=font,
        )

    return drawn_image

with gr.Blocks() as demo:

    def stream_webcam(image):
        """Return the webcam frame with detections drawn on it."""
        if image is None:
            # No frame delivered (e.g. the webcam is not running yet).
            return None
        return detect_objects(image)

    def click_capture(image):
        """Copy the current detection image into the capture panel."""
        return image

    def click_send_gpt(image_array, histories):
        """Send the captured image to GPT and append the exchange to the chat.

        Raises:
            gr.Error: When no image has been captured yet.
        """
        if image_array is None:
            # Without this guard the request would fail inside PIL with an
            # unhelpful error; tell the user what to do instead.
            raise gr.Error("먼저 화면을 캡쳐해 주세요.")

        histories = histories or []
        content = request_gpt(image_array)
        histories.append({"role" : "user", "content" : gr.Image(label="감지화면", value=image_array)})
        histories.append({"role" : "assistant", "content" : content})

        return histories

    def change_chatbot(histories):
        """Read the latest chat message aloud.

        Returns:
            The audio file name from ``request_tts``, or ``None`` when there
            is no text message to speak.
        """
        if not histories:
            # The change event also fires for an empty chat (e.g. after it is
            # cleared); indexing [-1] would raise IndexError.
            return None

        content = histories[-1]['content']
        if not isinstance(content, str):
            # Only text can be cleaned by the regex and spoken.
            return None

        print(content)
        # Keep Hangul, Latin letters, whitespace, digits and "% , ." only.
        pattern = r'[^가-힣a-zA-Z\s%,\.\d]'
        cleaned_content = re.sub(pattern, '', content)
        print(cleaned_content)

        # tts
        file_name = request_tts(cleaned_content)
        return file_name

    with gr.Row():
        # webcam, stream output image, capture image component

        webcam_input = gr.Image(label="실시간 화면", sources="webcam", width=480, height=270, mirror_webcam=False)
        output_image = gr.Image(label="검출 화면", type="pil", interactive=False)
        output_capture_image = gr.Image(label="캡쳐 화면", interactive=False)


    with gr.Row():
        # Capture button and analyse (send to GPT) button.
        capture_button = gr.Button("화면 캡쳐")
        send_gpt_button = gr.Button("GPT로 전송")


    with gr.Column():
        # Chatbot showing the analysis, and the audio player that reads it.
        chatbot = gr.Chatbot(label="분석 결과", type="messages")
        chatbot_audio = gr.Audio(label="GPT", interactive=False, autoplay=True)


    # Event wiring: stream -> detect, capture -> copy, send -> GPT, chat
    # change -> TTS.
    webcam_input.stream(stream_webcam, inputs=[webcam_input], outputs=[output_image])
    capture_button.click(click_capture, inputs=[output_image], outputs=[output_capture_image])
    send_gpt_button.click(click_send_gpt, inputs=[output_capture_image, chatbot], outputs=[chatbot])
    chatbot.change(change_chatbot, inputs=[chatbot], outputs=[chatbot_audio])

# Start the Gradio app assembled in the Blocks context above.
demo.launch()
# Sample image for the manual checks below; every call is currently commented out.
image = cv2.imread("./yolo실습자료/traffic-7033509_1280.jpg")
# detect_objects(image)
# request_gpt(image)
#request_tts("안녕하세요")

* Running on local URL:  http://127.0.0.1:7876

To create a public link, set `share=True` in `launch()`.




In [55]:
import numpy as np

# Scratch check: element-wise scaling yields floats, and astype('int')
# truncates them to whole numbers (as done for box coordinates).
scale_factors = np.array([1.123123123, 2.123123, 3.213123, 4.123123])
bounding_box = scale_factors * [1, 2, 3, 4]
print(bounding_box)
bounding_box.astype('int')

[ 1.12312312  4.246246    9.639369   16.492492  ]


array([ 1,  4,  9, 16])