In [None]:
# Install the third-party packages imported by the cells below (Ultralytics YOLO, OpenCV, PyYAML).
%pip install ultralytics opencv-python pyyaml

## Vision inferencing with a local model

In this first exercise, we will try out a basic [computer vision](https://www.microsoft.com/en-us/research/research-area/computer-vision/) inferencing task using a popular AI model called [YOLOv8](https://docs.ultralytics.com/models/yolov8/).

To get started, we will initialize the model via the Ultralytics Python library, which automatically downloads the model. Different sizes of the YOLOv8 model can be specified, depending on the workload, to balance accuracy against speed. Once the model is initialized in our code, we can label the detected objects using the [COCO dataset](https://cocodataset.org/#overview) class labels. The class labels can be viewed [here](../artifacts/coco.yaml), where you can see the different types of objects that can potentially be identified.

Click on the Play icon to the left of the cell below to initialize the model.

In [1]:
import os
from pprint import pprint

import cv2, yaml
from ultralytics import YOLO

# Initialize the detector; per the walkthrough text, the Ultralytics library downloads the model automatically.
model = YOLO('yolov8n.pt')  # Swap in 'yolov8s.pt', 'yolov8m.pt', etc. to use a different model size

def load_class_names(yaml_file):
    """Load the class labels from a dataset YAML file.

    Args:
        yaml_file: Path to a YAML file that has a top-level ``names`` entry
            (for example the COCO dataset definition used in this notebook).

    Returns:
        The value stored under ``names``, exactly as parsed by
        ``yaml.safe_load``.

    Raises:
        OSError: If the file cannot be opened.
        KeyError: If the parsed mapping has no ``names`` entry.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default text encoding (e.g. cp1252 on Windows).
    with open(yaml_file, 'r', encoding='utf-8') as f:
        data = yaml.safe_load(f)
    return data['names']

# Adjust the path if your copy of the dataset YAML file lives somewhere else.
class_names = load_class_names('../artifacts/coco.yaml')

# Show every class id and its label.
pprint(class_names)


{0: 'person',
 1: 'bicycle',
 2: 'car',
 3: 'motorcycle',
 4: 'airplane',
 5: 'bus',
 6: 'train',
 7: 'truck',
 8: 'boat',
 9: 'traffic light',
 10: 'fire hydrant',
 11: 'stop sign',
 12: 'parking meter',
 13: 'bench',
 14: 'bird',
 15: 'cat',
 16: 'dog',
 17: 'horse',
 18: 'sheep',
 19: 'cow',
 20: 'elephant',
 21: 'bear',
 22: 'zebra',
 23: 'giraffe',
 24: 'backpack',
 25: 'umbrella',
 26: 'handbag',
 27: 'tie',
 28: 'suitcase',
 29: 'frisbee',
 30: 'skis',
 31: 'snowboard',
 32: 'sports ball',
 33: 'kite',
 34: 'baseball bat',
 35: 'baseball glove',
 36: 'skateboard',
 37: 'surfboard',
 38: 'tennis racket',
 39: 'bottle',
 40: 'wine glass',
 41: 'cup',
 42: 'fork',
 43: 'knife',
 44: 'spoon',
 45: 'bowl',
 46: 'banana',
 47: 'apple',
 48: 'sandwich',
 49: 'orange',
 50: 'broccoli',
 51: 'carrot',
 52: 'hot dog',
 53: 'pizza',
 54: 'donut',
 55: 'cake',
 56: 'chair',
 57: 'couch',
 58: 'potted plant',
 59: 'bed',
 60: 'dining table',
 61: 'toilet',
 62: 'tv',
 63: 'laptop',
 64: 'mou

### Basic object detection on a static image

The next code block will load an image from disk using the Python [OpenCV](https://opencv.org/) library and send it to the model for basic object detection. Any detected objects will be annotated with a box drawn around them.

In [2]:
# Load image
image_path = '../media/image/people_on_street.jpg'
image = cv2.imread(image_path)
# cv2.imread does not raise on a missing/unreadable file, it returns None.
# Fail here with a clear message instead of an obscure error inside the model.
if image is None:
    raise FileNotFoundError(f'Could not read image: {image_path}')

# Perform basic detection
results = model(image)

# Draw bounding boxes on the image and label objects by referencing the class names
for result in results:
    for box in result.boxes:
        class_id = int(box.cls[0])
        x1, y1, x2, y2 = map(int, box.xyxy[0])
        confidence = float(box.conf[0])
        label = f'{class_names[class_id]} {confidence:.2f}'
        cv2.rectangle(image, (x1, y1), (x2, y2), (0, 255, 0), 2)
        # Keep the label inside the image when the box touches the top edge.
        label_y = max(y1 - 10, 15)
        cv2.putText(image, label, (x1, label_y), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

# Save or display the image
# To also write the annotated image to disk, uncomment the next line:
#cv2.imwrite('output_image.jpg', image)
try:
    cv2.imshow('Press Q to close', image)
    # waitKey(0) blocks until a key is pressed in the image window.
    cv2.waitKey(0)
finally:
    # Close the window even if the cell is interrupted while waiting.
    cv2.destroyAllWindows()


0: 448x640 10 persons, 2 handbags, 39.0ms
Speed: 3.4ms preprocess, 39.0ms inference, 131.2ms postprocess per image at shape (1, 3, 448, 640)


### YOLO with video

Now let's try the same technique on video: first on a recorded sample file, and then on a live camera stream.

In [None]:
# Load the YOLOv8 model
model = YOLO('yolov8n.pt')

# Load video
video_path = '../media/video/sample.mp4'
cap = cv2.VideoCapture(video_path)
# VideoCapture does not raise when the source cannot be opened; check explicitly
# so a bad path is reported instead of silently producing an empty output file.
if not cap.isOpened():
    cap.release()
    raise RuntimeError(f'Could not open video source: {video_path}')

# Milliseconds to wait for a key press between frames
delay = 1

# Get video writer initialized to save the output video.
# Match the source's frame size and frame rate so the saved video plays at the
# original speed; fall back to 20 FPS when the source does not report a usable rate.
frame_size = (int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)))
source_fps = cap.get(cv2.CAP_PROP_FPS)
fps = source_fps if source_fps > 0 else 20.0
fourcc = cv2.VideoWriter_fourcc(*'XVID')
out = cv2.VideoWriter('output_video.avi', fourcc, fps, frame_size)

try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            # No more frames (end of file or read failure)
            break

        # Perform detection
        results = model(frame)

        # Draw bounding boxes on the frame
        for result in results:
            for box in result.boxes:
                class_id = int(box.cls[0])
                x1, y1, x2, y2 = map(int, box.xyxy[0])
                confidence = float(box.conf[0])
                label = f'{class_names[class_id]} {confidence:.2f}'
                cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
                # Keep the label inside the frame when the box touches the top edge.
                label_y = max(y1 - 10, 15)
                cv2.putText(frame, label, (x1, label_y), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

        # Write the frame into the output video
        out.write(frame)

        # Display the frame until q is pressed
        cv2.imshow('Detected People (press Q to exit)', frame)
        if cv2.waitKey(delay) & 0xFF == ord('q'):
            break
finally:
    # Always release the capture, finalize the output file and close the
    # window, even if the cell is interrupted or detection raises.
    cap.release()
    out.release()
    cv2.destroyAllWindows()


0: 384x640 6 persons, 65.2ms
Speed: 3.8ms preprocess, 65.2ms inference, 1.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 8 persons, 1 boat, 10.1ms
Speed: 2.2ms preprocess, 10.1ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 6 persons, 1 boat, 5.4ms
Speed: 1.4ms preprocess, 5.4ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 5 persons, 5.8ms
Speed: 1.7ms preprocess, 5.8ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 persons, 11.7ms
Speed: 2.7ms preprocess, 11.7ms inference, 2.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 6 persons, 1 apple, 5.1ms
Speed: 2.0ms preprocess, 5.1ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 6 persons, 2 apples, 5.3ms
Speed: 1.5ms preprocess, 5.3ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 5 persons, 2 apples, 5.2ms
Speed: 1.4ms preprocess, 5.2ms inference, 1.6

In [None]:
# Load the YOLOv8 model
model = YOLO('yolov8n.pt')

# Load video
# The camera URL embeds a username and password, so it must not be stored in
# the notebook. Provide it through the RTSP_URL environment variable instead.
video_path = os.environ.get('RTSP_URL')
if not video_path:
    raise RuntimeError(
        'Set the RTSP_URL environment variable to your camera stream, e.g. '
        'rtsp://<user>:<password>@<camera-ip>:554/cam/realmonitor?channel=1&subtype=0'
    )
cap = cv2.VideoCapture(video_path)
# VideoCapture does not raise when the stream cannot be opened; check explicitly.
# The URL is deliberately left out of the message because it contains credentials.
if not cap.isOpened():
    cap.release()
    raise RuntimeError('Could not open the RTSP stream given in RTSP_URL')

# Get video writer initialized to save the output video.
# Match the stream's frame size and frame rate; fall back to 20 FPS when the
# stream does not report a usable rate.
frame_size = (int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)))
source_fps = cap.get(cv2.CAP_PROP_FPS)
fps = source_fps if source_fps > 0 else 20.0
fourcc = cv2.VideoWriter_fourcc(*'XVID')
out = cv2.VideoWriter('output_video.avi', fourcc, fps, frame_size)

try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            # The stream ended or a frame could not be read
            break

        # Perform detection
        results = model(frame)

        # Draw bounding boxes on the frame
        for result in results:
            for box in result.boxes:
                class_id = int(box.cls[0])
                x1, y1, x2, y2 = map(int, box.xyxy[0])
                confidence = float(box.conf[0])
                label = f'{class_names[class_id]} {confidence:.2f}'
                cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
                # Keep the label inside the frame when the box touches the top edge.
                label_y = max(y1 - 10, 15)
                cv2.putText(frame, label, (x1, label_y), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

        # Write the frame into the output video
        out.write(frame)

        # Display the frame until q is pressed
        cv2.imshow('Detected People (press Q to exit)', frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always release the stream, finalize the output file and close the
    # window, even if the cell is interrupted or detection raises.
    cap.release()
    out.release()
    cv2.destroyAllWindows()


0: 384x640 1 person, 1 suitcase, 1 tv, 1 laptop, 21.5ms
Speed: 5.0ms preprocess, 21.5ms inference, 2.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 suitcase, 1 tv, 6.1ms
Speed: 1.9ms preprocess, 6.1ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 suitcase, 1 tv, 11.7ms
Speed: 1.5ms preprocess, 11.7ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 suitcase, 1 tv, 11.6ms
Speed: 1.8ms preprocess, 11.6ms inference, 2.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 suitcase, 1 tv, 6.9ms
Speed: 1.8ms preprocess, 6.9ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 suitcase, 1 tv, 6.1ms
Speed: 1.5ms preprocess, 6.1ms inference, 1.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 1 suitcase, 1 tv, 6.2ms
Speed: 1.8ms preprocess, 6.2ms inference, 1.4ms postprocess per image at shape (1, 3, 

## Vision inferencing with Azure OpenAI and GPT-4o

In [None]:
import os
import requests
import base64
from pprint import pprint

# Read the key and endpoint from the environment so real secrets never have to
# be typed into (and saved with) the notebook. The original placeholder values
# are kept as defaults.
API_KEY = os.environ.get("AZURE_OPENAI_API_KEY", "xxxxxx")
ENDPOINT = os.environ.get(
    "AZURE_OPENAI_ENDPOINT",
    "https://jsextoai.openai.azure.com/openai/deployments/gpt-4/chat/completions?api-version=2024-08-01-preview",
)

# Headers reused by every request to the endpoint
headers = {
    "Content-Type": "application/json",
    "api-key": API_KEY,
}

# Payload for the request: a text-only chat with just a system message.
# No image is attached here, so nothing is read from disk in this cell.
payload = {
  "messages": [
    {
      "role": "system",
      "content": [
        {
          "type": "text",
          "text": "You are an AI assistant that helps people find information."
        }
      ]
    }
  ],
  "temperature": 0.7,
  "top_p": 0.95,
  "max_tokens": 800
}

# Send request
try:
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    response = requests.post(ENDPOINT, headers=headers, json=payload, timeout=60)
    response.raise_for_status()  # Will raise an HTTPError if the HTTP request returned an unsuccessful status code
except requests.RequestException as e:
    raise SystemExit(f"Failed to make the request. Error: {e}")

# Handle the response as needed (e.g., print or process)
response_json = response.json()
pprint(response_json)

In [None]:
import json
from pprint import pprint

def image_to_base64(image_path):
    """Return the contents of the file at ``image_path`` as a base64 text string."""
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')


# Image to describe; it is sent inline as a base64 data URL.
IMAGE_PATH = "./images/columns.png"
image_base64 = image_to_base64(IMAGE_PATH)

# Chat payload with one user message that combines a text prompt and the image.
payload = {
  "messages": [
    { "role": "system", "content": "You are a helpful assistant." },
        { "role": "user", "content": [
            {
                "type": "text",
                "text": "Describe this picture:"
            },
            {
                "type": "image_url",
                "image_url": {
                    # The MIME type is fixed to PNG to match IMAGE_PATH above.
                    "url": "data:image/png;base64," + image_base64
                }
            }
        ] }
  ],
  "temperature": 0.7,
  "top_p": 0.95,
  "max_tokens": 800
}

# Send the request with the same error handling as the text-only request:
# fail loudly on HTTP errors instead of printing an error body as if it were
# a result, and do not wait forever on an unresponsive server.
try:
    response = requests.post(ENDPOINT, headers=headers, json=payload, timeout=60)
    response.raise_for_status()
except requests.RequestException as e:
    raise SystemExit(f"Failed to make the request. Error: {e}")
response_json = response.json()

# Pretty print the JSON response
pprint(response_json)