In [3]:
from pathlib import Path

import numpy as np
from skimage import io
from skimage.transform import rescale, resize
from scipy.ndimage import gaussian_filter
from scipy import ndimage

import torch
from torchvision import transforms
# from transformers import Dinov2Backbone
from torchvision.models import resnet18, resnet50

In [4]:
code_dir = Path.home() / "work/brox/python-example-droid-dataset"
backbone = resnet50(pretrained=True)
children = list(backbone.children())
newmodel = torch.nn.Sequential(*children[0:10])
newmodel



Sequential(
  (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (2): ReLU(inplace=True)
  (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (4): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)


In [3]:
imginfo = lambda img: print(type(img), img.dtype, img.shape, img.min(), img.max())

# === load first touch coors
import json
with open(code_dir / "data/single_log.json") as f:
    episode_log = json.load(f)
first_x, first_y, first_z = map(int, episode_log["first_touch"])
cropw = 200

In [4]:
# === first image VS max distant image during grip
data_path = code_dir / "data/frames"
file_list = sorted(data_path.glob("center*jpg"))
first_image = io.imread(file_list[0])
mid_image = io.imread(file_list[-1])
max_image = io.imread(data_path / "max_image.jpg")

h, w = first_image.shape[:2]
if (0 <= first_y - cropw and first_y + cropw < h and
    0 <= first_x - cropw and first_x + cropw < w):
    good_episode = True

In [16]:
def to_batch(image):
    downscaled = resize(image, (224, 224))
    #batch = torch.from_numpy(downscaled)[None, :, :, :].to(torch.float32) / 255 # [h, w, c] -> [1, h, w, c]
    #batch = batch.permute(0, 3, 1, 2)  # [1, h, w, c] -> [1, c, h, w]

    batch = torch.from)image / 255

    preprocess = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])
    batch = preprocess(batch)
    imginfo(batch)
    
    return batch

def cut_crop(image):
    """
    :param image: [h, w, c]
    """
    return image[first_y-cropw:first_y+cropw, first_x-cropw:first_x+cropw]

save_images = {
    "crop_first": cut_crop(first_image),
    "crop_max": cut_crop(max_image)
}

for name, image in save_images.items():
    io.imsave(data_path / (name + ".jpg"), image.astype(np.uint8))

In [17]:
# === Run resnet on crops
dino1 = newmodel(to_batch(cut_crop(first_image))).detach() # [1, c, h, w] range [0, 1] torch.float32
dino2 = newmodel(to_batch(cut_crop(max_image))).detach()

def cos_sim(vec1: np.ndarray, vec2: np.ndarray):
    # (vec1, vec2) = |vec1||vec2|cos alpha
    return np.sum(vec1 * vec2) / np.linalg.norm(vec1) / np.linalg.norm(vec2)

v1 = dino1.flatten().numpy()
v2 = dino2.flatten().numpy()
cos_sim(v1, v2)

TypeError: Unexpected type <class 'numpy.ndarray'>

In [8]:
s = 50
x, y = 795, 363
crop = first_image[y-s:y+s, x-s:x+s]
#io.imsave(data_path / ("small.jpg"), crop.astype(np.uint8))

In [34]:
s = 100
x, y = 795, 363
crop = first_image[y-s:y+s, x-s:x+s]
dino1 = newmodel(to_batch(crop)).detach()

s = 100
x, y = 750, 360
crop = first_image[y-s:y+s, x-s:x+s]
dino2 = newmodel(to_batch(crop)).detach()

v1 = dino1.flatten().numpy()
v2 = dino2.flatten().numpy()
cos_sim(v1, v2)

0.9982982

In [15]:
from torch.nn import functional as F

# === just run resnet
image = io.imread(code_dir / "data/penguin.jpg")
h, w, c = image.shape
image = image[:, w//2-h//2:w//2+h//2]

v = backbone(to_batch(image)).detach()
pred_probs = F.softmax(v[0], dim=0).cpu().data.numpy()

<class 'torch.Tensor'> torch.float32 torch.Size([1, 3, 224, 224]) tensor(-2.1179) tensor(-1.7870)


In [13]:
sorted(list(enumerate(list(pred_probs))), key=lambda tup: tup[1], reverse=True)

[(600, 0.006743107),
 (852, 0.005591981),
 (731, 0.0055306284),
 (463, 0.005397659),
 (733, 0.0042607603),
 (700, 0.004144021),
 (792, 0.0037841764),
 (837, 0.0037685703),
 (898, 0.0037080306),
 (618, 0.0035728852),
 (845, 0.0035567062),
 (836, 0.0035521074),
 (899, 0.0034810852),
 (784, 0.003459974),
 (523, 0.0034291914),
 (902, 0.0034145112),
 (515, 0.003413358),
 (728, 0.0032963085),
 (987, 0.0032383718),
 (587, 0.0032284337),
 (428, 0.0031915458),
 (447, 0.0031775096),
 (696, 0.0031581838),
 (488, 0.0031161313),
 (813, 0.0031005843),
 (851, 0.0030727931),
 (879, 0.0030680099),
 (676, 0.0030454095),
 (469, 0.0030447233),
 (412, 0.003011957),
 (769, 0.002968858),
 (758, 0.0029618328),
 (910, 0.0029563888),
 (409, 0.0029318084),
 (541, 0.0029209664),
 (151, 0.0028280187),
 (929, 0.0028185362),
 (840, 0.0027814151),
 (778, 0.0027719901),
 (749, 0.0027434537),
 (456, 0.0027335056),
 (818, 0.0027277102),
 (764, 0.0026953153),
 (722, 0.002686507),
 (868, 0.0026332168),
 (596, 0.0025742203

In [None]:
io.imsave(data_path / ("pen.jpg"), image.astype(np.uint8))

## Segmentation

In [8]:
#from torchvision.io.image import read_image
from PIL import Image
from torchvision.models.segmentation import fcn_resnet50, FCN_ResNet50_Weights
from torchvision.transforms.functional import to_pil_image

img = Image.open(code_dir / "data" / 'dog.jpg')

# Step 1: Initialize model with the best available weights
weights = FCN_ResNet50_Weights.DEFAULT
model = fcn_resnet50(weights=weights)
model.eval()

# Step 2: Initialize the inference transforms
preprocess = weights.transforms()

# Step 3: Apply inference preprocessing transforms
batch = preprocess(img).unsqueeze(0)

# Step 4: Use the model and visualize the prediction
prediction = model(batch)["out"]
normalized_masks = prediction.softmax(dim=1)
class_to_idx = {cls: idx for (idx, cls) in enumerate(weights.meta["categories"])}
mask = normalized_masks[0, class_to_idx["dog"]]
to_pil_image(mask).show()


In [6]:
weights.meta["categories"]

['__background__',
 'aeroplane',
 'bicycle',
 'bird',
 'boat',
 'bottle',
 'bus',
 'car',
 'cat',
 'chair',
 'cow',
 'diningtable',
 'dog',
 'horse',
 'motorbike',
 'person',
 'pottedplant',
 'sheep',
 'sofa',
 'train',
 'tvmonitor']

In [1]:
import torch
model = torch.hub.load('pytorch/vision:v0.10.0', 'resnet18', pretrained=True)
# or any of these variants
# model = torch.hub.load('pytorch/vision:v0.10.0', 'resnet34', pretrained=True)
# model = torch.hub.load('pytorch/vision:v0.10.0', 'resnet50', pretrained=True)
# model = torch.hub.load('pytorch/vision:v0.10.0', 'resnet101', pretrained=True)
# model = torch.hub.load('pytorch/vision:v0.10.0', 'resnet152', pretrained=True)
model.eval()


Downloading: "https://github.com/pytorch/vision/zipball/v0.10.0" to /home/iternal/.cache/torch/hub/v0.10.0.zip


ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [2]:
# Download an example image from the pytorch website
import urllib
url, filename = ("https://github.com/pytorch/hub/raw/master/images/dog.jpg", "dog.jpg")
try: urllib.URLopener().retrieve(url, filename)
except: urllib.request.urlretrieve(url, filename)

In [22]:
filename = "crop2.jpg"

In [23]:
# sample execution (requires torchvision)
from PIL import Image
from torchvision import transforms
input_image = Image.open(filename)
preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
input_tensor = preprocess(input_image)
input_batch = input_tensor.unsqueeze(0) # create a mini-batch as expected by the model

# move the input and model to GPU for speed if available
if torch.cuda.is_available():
    input_batch = input_batch.to('cuda')
    model.to('cuda')

with torch.no_grad():
    output = model(input_batch)
# Tensor of shape 1000, with confidence scores over ImageNet's 1000 classes
# print(output[0])
# The output has unnormalized scores. To get probabilities, you can run a softmax on it.
probabilities = torch.nn.functional.softmax(output[0], dim=0)
# print(probabilities)


In [4]:
# Download ImageNet labels
!wget https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt

--2024-07-13 19:40:07--  https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10472 (10K) [text/plain]
Saving to: ‘imagenet_classes.txt’


2024-07-13 19:40:07 (39.0 MB/s) - ‘imagenet_classes.txt’ saved [10472/10472]



In [21]:
# Read the categories
with open("imagenet_classes.txt", "r") as f:
    categories = [s.strip() for s in f.readlines()]
# Show top categories per image
top5_prob, top5_catid = torch.topk(probabilities, 5)
for i in range(top5_prob.size(0)):
    print(categories[top5_catid[i]], top5_prob[i].item())

washbasin 0.4116014838218689
soap dispenser 0.09876617789268494
toilet seat 0.06358987838029861
bathtub 0.04791147634387016
can opener 0.025523997843265533
