# Примеры решения различных задач машинного зрения

Как Вы знаете из лекции, к основным задачам относятся:
1. *Классификация изображений* - определение категории или метки всего изображения.
2. *Детекция объектов* - поиск и локализация объектов на изображении с указанием их координат.
3. *Сегментация изображений*
   - *Семантическая сегментация* - классификация каждого пикселя изображения на основе принадлежности к определённому классу.
   - *Инстанс-сегментация* - обнаружение отдельных экземпляров объектов с учетом их границ.
4. *Распознавание объектов (лиц)* - детекция и идентификация объектов (по сути, совмещение задач классификации и детекции)
5. *Работа с пространственными данными (облаками точек)* - определение глубины изображения и построение трёхмерной модели.
6. *Анализ и обработка видео* - трекинг объектов, детектирование движений и распознавание действий.

Больше примеров можно найти [тут](https://github.com/Charmve/computer-vision-in-action/tree/main/notebooks).

In [None]:
import os
import json

import cv2
import torch
import torchvision.transforms as transforms
import torchvision.models as models

from segment_anything import (SamPredictor, 
                              sam_model_registry)

from sentence_transformers import (SentenceTransformer, 
                                   util)

from transformers import (BlipProcessor, 
                          BlipForConditionalGeneration)

import numpy as np

import urllib.request
import requests
from PIL import Image
import matplotlib.pyplot as plt

In [2]:
def download_file(url, 
                  local_path,
                  chunk_size=8192,
                  timeout=60):
    """Download ``url`` to ``local_path`` unless the file already exists.

    The payload is streamed to a temporary ``<local_path>.part`` file and
    moved into place only after the transfer finished, so an interrupted
    download never leaves a truncated file that a later call would treat
    as "already exists".

    Args:
        url: Address of the file to fetch.
        local_path: Destination path on the local file system.
        chunk_size: Number of bytes requested per streamed chunk.
        timeout: Seconds to wait for the server (passed to ``requests.get``).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
    """
    if os.path.exists(local_path):
        print(f"{local_path} already exists.")
        return

    print(f"Downloading {local_path} from {url} ...")
    tmp_path = f"{local_path}.part"
    try:
        # The context manager releases the connection even if writing fails.
        with requests.get(url, stream=True, timeout=timeout) as r:
            # Without this check an HTML error page would be saved as the file.
            r.raise_for_status()
            with open(tmp_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=chunk_size):
                    if chunk:
                        f.write(chunk)
        os.replace(tmp_path, local_path)
    finally:
        # After a successful replace the temp file is gone; otherwise clean up.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
    print(f"Downloaded {local_path}")

In [None]:
# Download the sample picture and decode it into an RGB array for later cells.
image_url = "https://djl.ai/examples/src/test/resources/dog_bike_car.jpg"
response = requests.get(image_url, timeout=30)
# Fail on HTTP errors instead of trying to decode an error page as an image.
response.raise_for_status()
# View the downloaded bytes as a uint8 buffer without an extra copy.
image_data = np.frombuffer(response.content, dtype=np.uint8)
image = cv2.imdecode(image_data, cv2.IMREAD_COLOR)
if image is None:
    # imdecode signals undecodable data by returning None rather than raising.
    raise ValueError(f"Could not decode an image from {image_url}")
# OpenCV decodes to BGR channel order; convert to RGB for matplotlib display.
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
plt.imshow(image)
plt.axis("off")
plt.show()

## Classification with ResNet50

In [None]:
# Classify the whole image with an ImageNet-pretrained ResNet50 and show the
# five best-scoring labels in the plot title.
labels_url = "https://raw.githubusercontent.com/anishathalye/imagenet-simple-labels/master/imagenet-simple-labels.json"

labels_path = "imagenet_labels.json"

download_file(labels_url, labels_path)

# Request the weights explicitly: passing a bool to ``weights`` is deprecated.
model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
model.eval()

preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

img = Image.fromarray(image)
img_tensor = preprocess(img)
# Add the leading batch dimension expected by the model.
img_tensor = img_tensor.unsqueeze(0)

with torch.no_grad():
    outputs = model(img_tensor)
    # Indices of the 5 highest-scoring classes (raw scores, no softmax applied).
    _, predicted = torch.topk(outputs, 5)

predicted = predicted[0].tolist()

with open(labels_path, "r", encoding="utf-8") as f:
    labels = json.load(f)

# Join with a separator so the title does not end with a dangling ", ".
predicted_label = ", ".join(str(labels[label]) for label in predicted)

plt.figure(figsize=(8, 6))
plt.imshow(img)
plt.title(f"Predicted: {predicted_label}")
plt.axis("off")
plt.show()

## Object Detection with Yolo V3

In [None]:
# Detect objects with YOLOv3 through OpenCV's DNN module and draw the boxes
# that survive non-maximum suppression.
yolo_cfg_url = "https://raw.githubusercontent.com/pjreddie/darknet/master/cfg/yolov3.cfg"
yolo_weights_url = "https://pjreddie.com/media/files/yolov3.weights"
coco_names_url = "https://raw.githubusercontent.com/pjreddie/darknet/master/data/coco.names"

yolo_cfg_path = "yolov3.cfg"
yolo_weights_path = "yolov3.weights"
coco_names_path = "coco.names"

download_file(yolo_cfg_url, yolo_cfg_path)
download_file(yolo_weights_url, yolo_weights_path)
download_file(coco_names_url, coco_names_path)

with open(coco_names_path, "r") as f:
    classes = [line.strip() for line in f]

net = cv2.dnn.readNetFromDarknet(yolo_cfg_path, yolo_weights_path)

layer_names = net.getLayerNames()
# Ask OpenCV for the output layer names directly: getUnconnectedOutLayers()
# returns differently shaped arrays across OpenCV versions, which breaks
# manual indexing into layer_names.
output_layers = net.getUnconnectedOutLayersNames()

# ``image`` was already converted to RGB when it was loaded, so the channels
# must NOT be swapped again here (swapRB=True would feed BGR to the network).
blob = cv2.dnn.blobFromImage(image, 1/255.0, (416, 416), swapRB=False, crop=False)
net.setInput(blob)
outs = net.forward(output_layers)

image_copy = image.copy()
height, width, _ = image_copy.shape
class_ids = []
confidences = []
boxes = []

for out in outs:
    for detection in out:
        # Entries from index 5 onward are the per-class scores.
        scores = detection[5:]
        class_id = int(np.argmax(scores))
        confidence = scores[class_id]
        if confidence > 0.5:
            # The first four entries are scaled by the image size to get the
            # box centre and size in pixels.
            center_x = int(detection[0] * width)
            center_y = int(detection[1] * height)
            w = int(detection[2] * width)
            h = int(detection[3] * height)
            # Convert centre-based coordinates to the top-left corner.
            x = int(center_x - w / 2)
            y = int(center_y - h / 2)
            boxes.append([x, y, w, h])
            confidences.append(float(confidence))
            class_ids.append(class_id)

indices = cv2.dnn.NMSBoxes(boxes, confidences, 0.5, 0.4)

# NMSBoxes may return an empty tuple, a flat array or an Nx1 array depending
# on the OpenCV version; flatten to a plain sequence of ints.
for i in np.asarray(indices, dtype=int).reshape(-1):
    x, y, w, h = boxes[i]
    label = str(classes[class_ids[i]])
    conf = confidences[i]
    cv2.rectangle(image_copy, (x, y), (x+w, y+h), (128, 0, 128), 2)
    cv2.putText(image_copy, f"{label} {conf:.2f}", (x, y - 10),
                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (128, 0, 128), 2)

plt.figure(figsize=(12, 8))
plt.imshow(image_copy)
plt.axis("off")
plt.show()


## Instance segmentation with Segment Anything Model (SAM)

[Пример](https://github.com/facebookresearch/segment-anything/blob/main/notebooks/predictor_example.ipynb) от авторов.

In [None]:
# !pip install git+https://github.com/facebookresearch/segment-anything.git -q
# !curl -O https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth

In [None]:
# Segment the object under a single prompt point with Segment Anything (SAM)
# and overlay the candidate masks on the image.
sam = sam_model_registry["vit_h"](checkpoint="sam_vit_h_4b8939.pth")
predictor = SamPredictor(sam)

predictor.set_image(image)

# One (x, y) prompt point; label 1 marks it as a foreground point.
input_point = np.array([[200, 375]])
input_label = np.array([1])

masks, scores, logits = predictor.predict(
    point_coords=input_point,
    point_labels=input_label,
    multimask_output=True,
)

plt.figure(figsize=(12, 8))
plt.imshow(image)
for mask in masks:
    # Draw only the pixels inside the mask: colouring the background as well
    # would tint the whole picture once per mask and hide the image.
    overlay = np.ma.masked_equal(mask.astype(np.uint8), 0)
    plt.imshow(overlay, alpha=0.5, cmap='jet', vmin=0, vmax=1)
plt.scatter(input_point[:, 0], input_point[:, 1], color='red', s=100)
plt.axis('off')
plt.show()

## Zero-shot classification with CLIP

In [None]:
# Zero-shot classification: embed the image and the candidate labels with CLIP
# and pick the label whose embedding is most similar to the image.
model = SentenceTransformer('clip-ViT-B-32')

img_emb = model.encode(Image.fromarray(image))

# Candidate labels; 'Husky' was misspelled as 'Haskey', which hurts the
# quality of its text embedding.
texts = ['Husky', 'Dog', 'Cat', 'Bicycle']
text_emb = model.encode(texts)

# cos_sim returns a matrix; row 0 holds the image-vs-each-text similarities.
cos_scores = util.cos_sim(img_emb, text_emb)[0]
print(cos_scores)

predicted_text = texts[torch.argmax(cos_scores).item()]
print(f"Best description: {predicted_text}")

## VLM description generation with BLIP

In [None]:
# Generate a caption for the image with the BLIP captioning model.
blip_checkpoint = "Salesforce/blip-image-captioning-base"
processor = BlipProcessor.from_pretrained(blip_checkpoint)
model = BlipForConditionalGeneration.from_pretrained(blip_checkpoint)

# The processor turns the PIL image into PyTorch tensors for the model.
pil_image = Image.fromarray(image)
inputs = processor(images=pil_image, return_tensors="pt")

outputs = model.generate(**inputs)
# Decode the first generated sequence, dropping the special tokens.
first_sequence = outputs[0]
caption = processor.decode(first_sequence, skip_special_tokens=True)

print(f"Generated caption: {caption}")