In [None]:
import time
import urllib
import urllib.request  # explicit: a bare `import urllib` does not guarantee the submodule is loaded
from io import BytesIO

import cv2
import ipywidgets as widgets
import torch
import torchvision
from IPython import display
from PIL import Image, ImageFont, ImageDraw

print("Opencv version: "+cv2.__version__)
print("Torch version: "+torch.__version__)
print("Torchvision version: "+torchvision.__version__)

## Localización y clasificación de entidades en una imagen

Descargamos una imagen cualquiera de internet.

In [None]:
# Download a sample image and decode it entirely in memory. This avoids
# writing a temporary file and deleting it with a shell command while PIL
# still holds it open (Image.open is lazy and does not read pixel data).
# `filename` is kept only so that any later code referring to it still works.
url, filename = ("http://4.bp.blogspot.com/_PED_9yjYLVs/S6-fsdvtV9I/AAAAAAAAAGY/3py429MtnQ4/s1600/perros+vagos.jpg", "dogs.jpg")
with urllib.request.urlopen(url, timeout=30) as response:
    img = Image.open(BytesIO(response.read()))
# Force decoding now and normalise to 3-channel RGB, which is what the
# detector expects (guards against grayscale / palette / RGBA sources).
img = img.convert('RGB')
display.display(img)

Usaremos un modelo Faster R-CNN implementado en PyTorch y entrenado en el dataset [COCO](https://github.com/nightrome/cocostuff/blob/master/labels.md).

In [None]:
# Numeric label id -> display name for the subset of classes we want to draw.
# Detections whose label is not listed here are ignored when drawing.
label2name = {1: 'persona', 2: 'bicicleta', 3: 'auto', 4: 'moto', 8: 'camioneta', 18: 'perro'}

from torchvision.models.detection import fasterrcnn_resnet50_fpn
# Download a pre-trained detector model
model = fasterrcnn_resnet50_fpn(pretrained=True)
# Switch to inference mode so the model returns detections
model.eval()
# Prefer the first GPU, but fall back to the CPU so the notebook still runs
# on machines without CUDA.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

El modelo retorna, para cada detección:
- las coordenadas de la detección (bounding box, clave `boxes`)
- la etiqueta (label) de la detección (clave `labels`)
- la probabilidad (score de confianza) de la detección (clave `scores`)

In [None]:
# Convert the PIL image into a float tensor the detector can consume.
transform = torchvision.transforms.ToTensor()
# Put the input on whatever device the model currently lives on, instead of
# hard-coding a CUDA device.
device = next(model.parameters()).device
img_tensor = transform(img).to(device)
# Inference only: disable autograd so no computation graph is built.
with torch.no_grad():
    # The model takes a batch; we send a batch of one and keep its single result.
    result = model(img_tensor.unsqueeze(0))[0]

def filter_results(result, threshold=0.9):
    """Keep only the detections whose score exceeds ``threshold``.

    Parameters
    ----------
    result : dict
        Detector output for one image, with tensor entries ``'scores'``,
        ``'boxes'`` and ``'labels'`` aligned along their first dimension.
    threshold : float, optional
        Minimum score (exclusive) a detection needs in order to be kept.

    Returns
    -------
    bbox : numpy.ndarray
        Boxes of the detections that passed the threshold.
    lbls : numpy.ndarray
        Labels of the detections that passed the threshold.
    """
    # Use the caller-supplied threshold (it used to be hard-coded to 0.9,
    # silently ignoring the argument).
    mask = result['scores'] > threshold
    # detach + cpu so the tensors can be converted to NumPy regardless of
    # device or autograd state.
    bbox = result['boxes'][mask].detach().cpu().numpy()
    lbls = result['labels'][mask].detach().cpu().numpy()
    return bbox, lbls

In [None]:
# Font used for the text labels; draw_rectangles reads this global.
# Arial is not installed on every system, so fall back to Pillow's built-in
# default font instead of failing with OSError.
try:
    fnt = ImageFont.truetype("arial.ttf", 30)
except OSError:
    fnt = ImageFont.load_default()

def draw_rectangles(img, bbox, lbls):
    """Draw labelled boxes on ``img`` in place.

    Only detections whose label appears in the global ``label2name`` mapping
    are drawn; each gets a white outline and its name written at the box's
    first corner using the global font ``fnt``.

    Parameters
    ----------
    img : PIL.Image.Image
        Image to annotate (modified in place).
    bbox : sequence
        One box per detection; each is passed as-is to ``ImageDraw.rectangle``.
    lbls : sequence
        Label id per detection, aligned with ``bbox``.
    """
    canvas = ImageDraw.Draw(img)
    for idx, box in enumerate(bbox):
        label = lbls[idx]
        # Skip classes we have no display name for.
        if label not in label2name:
            continue
        canvas.rectangle(box, fill=None, outline='white', width=4)
        # Text anchor: the first two box coordinates, truncated to integers.
        anchor = [int(coord) for coord in box[:2]]
        canvas.text(anchor, label2name[label], font=fnt, fill='white')

# Keep the confident detections and draw them on a copy, so `img` stays
# untouched: re-running this cell (or re-applying the detector to `img`)
# never sees previously drawn boxes.
bbox, lbls = filter_results(result)
img_annotated = img.copy()
draw_rectangles(img_annotated, bbox, lbls)
display.display(img_annotated)

In [None]:
# Run the detector on a video file and stream the annotated frames into an
# output widget. Interrupt the kernel to stop early.
# Reference: https://github.com/NicksonYap/Jupyter-Webcam/blob/master/Realtime_video_ipython_py3.ipynb

# Larger font for the video frames (draw_rectangles reads the global `fnt`);
# fall back to Pillow's default font when Arial is not installed.
try:
    fnt = ImageFont.truetype("arial.ttf", 40)
except OSError:
    fnt = ImageFont.load_default()
out = widgets.Output(layout=widgets.Layout(height='480px', width = '720px', border='none'))
display.display(out)
vid = cv2.VideoCapture('valdivia.mp4')
# Feed frames to the model on whatever device it currently lives on.
device = next(model.parameters()).device

try:
    if not vid.isOpened():
        raise IOError("Could not open video file 'valdivia.mp4'")
    while True:
        # Read two frames and keep only the second one (drops one frame).
        for k in range(2):
            ret, frame = vid.read()
        # `ret` is False once the video is exhausted (or a read fails);
        # stop cleanly instead of passing an invalid frame to cvtColor.
        if not ret:
            break
        # OpenCV decodes to BGR; convert to RGB for the model and for PIL.
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        torch_frame = transform(frame).to(device)
        # Inference only: do not build an autograd graph for every frame.
        with torch.no_grad():
            result = model(torch_frame.unsqueeze(0))[0]
        bbox, lbls = filter_results(result, threshold=0.1)
        img = Image.fromarray(frame)
        draw_rectangles(img, bbox, lbls)
        with out:
            # Encode the annotated frame as JPEG bytes and show it in the widget.
            buffer = BytesIO()
            img.save(buffer, format='JPEG')
            display.display(display.Image(data=buffer.getvalue()))
            # wait=True defers the clear until the next frame is shown,
            # which avoids flicker.
            display.clear_output(wait=True)
        # To throttle playback, uncomment: time.sleep(0.1)
except KeyboardInterrupt:
    # Manual stop requested by the user; nothing else to do.
    pass
finally:
    # Always release the capture: on interrupt, end of video, or any error.
    vid.release()