## **Kaggle Notebook Info**
> This Python 3 environment comes with many helpful analytics libraries installed
> It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
> For example, here are several helpful packages to load
```python
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
```
> Input data files are available in the read-only "../input/" directory
> For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
```python
import os
# Walk /kaggle/input recursively and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
```
> You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
> You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Notebook shell command: install the `supabase` package imported by the next cell.
!pip install supabase

In [None]:
from dataclasses import dataclass
from io import BytesIO, BufferedReader

from kaggle_secrets import UserSecretsClient
from supabase import create_client, Client
import numpy as np
import ray
import requests
import torch
from torchvision import transforms
from torchvision.models.detection import fasterrcnn_resnet50_fpn_v2, FasterRCNN_ResNet50_FPN_V2_Weights
from torchvision.transforms.functional import convert_image_dtype, to_pil_image, to_tensor
from torchvision.utils import draw_bounding_boxes
from PIL import Image

In [None]:
# Start a local Ray runtime, first asking for 2 CPUs and 1 GPU.
try:
    ray.init(num_cpus=2, num_gpus=1)
except RuntimeError:
    # Presumably raised when Ray is already running (e.g. the cell was re-run) — TODO confirm.
    # NOTE(review): the retry requests different resources (4 CPUs / 2 GPUs) than the
    # first attempt; confirm which setting matches the available hardware.
    ray.shutdown()
    ray.init(num_cpus=4, num_gpus=2)


In [None]:
# Read the Supabase URL and key from the Kaggle secrets store and build the
# module-level client that store_data_in_bucket() uses for storage calls.
user_secrets = UserSecretsClient()
data_store_url = user_secrets.get_secret("SUPABASE_URL")
data_store_key = user_secrets.get_secret("SUPABASE_KEY")
supabase_client: Client = create_client(data_store_url, data_store_key)


def store_data_in_bucket(data, file_path, bucket_name="test_bucket"):
    """Upload ``data`` to ``file_path`` inside a Supabase storage bucket.

    When a file with the same name is already listed in the destination
    folder the object is updated in place; otherwise it is uploaded as new.

    Args:
        data: Bytes-like payload. It is wrapped in an in-memory buffered
            reader before being handed to the storage client.
        file_path: Destination path inside the bucket, "/"-separated.
        bucket_name: Name of the bucket to write to.
    """
    storage = supabase_client.storage

    def to_binary_stream(d):
        # BytesIO(d) starts at offset 0, so no explicit seek is required.
        return BufferedReader(BytesIO(d))

    def file_exists():
        # Split "folder/sub/name" into its folder part and final file name.
        folder, _, file_name = file_path.rpartition("/")
        return any(
            entry["name"] == file_name
            for entry in storage.from_(bucket_name).list(path=folder)
        )

    # NOTE(review): some storage client versions may raise instead of returning
    # a falsy value for a missing bucket — confirm against the installed client.
    if not storage.get_bucket(bucket_name):
        storage.create_bucket(bucket_name)

    file_options = {
        "content-type": "application/vnd.apache.parquet",
        "cache-control": "3600",
        "upsert": "true",
    }
    # Target the bucket the caller asked for rather than a hard-coded name.
    bucket = storage.from_(bucket_name)
    send = bucket.update if file_exists() else bucket.upload
    send(
        file=to_binary_stream(data),
        path=file_path,
        file_options=file_options,
    )


def preprocess(data: dict[str, np.ndarray]):
    """Attach the model-ready version of a row's image.

    Args:
        data: Row mapping that holds the raw image under the "image" key.

    Returns:
        A dict with the untouched "image" and, under "transformed", the
        result of ``ToTensor`` followed by the default weights' transforms.
    """
    to_model_input: transforms.Compose = transforms.Compose(
        [
            transforms.ToTensor(),
            FasterRCNN_ResNet50_FPN_V2_Weights.DEFAULT.transforms(),
        ]
    )
    raw_image = data["image"]
    row = {"image": raw_image}
    row["transformed"] = to_model_input(raw_image)
    return row


@dataclass
class ObjectDetectionModel:
    """Faster R-CNN (ResNet-50 FPN v2) detector used as a batch UDF.

    Instances are callable: they take a batch holding "image" and
    "transformed" columns and return the images together with the predicted
    labels and boxes. On construction the model is switched to evaluation
    mode and moved to the GPU when one is available.
    """

    # Pretrained weights; also the source of category names and input transforms.
    weights = FasterRCNN_ResNet50_FPN_V2_Weights.DEFAULT
    # Detector shared at class level, built with box_score_thresh=0.9.
    model = fasterrcnn_resnet50_fpn_v2(weights=weights, box_score_thresh=0.9)
    preprocessor: transforms.Compose = transforms.Compose(
        [transforms.ToTensor(), weights.transforms()]
    )

    def __post_init__(self):
        # Without this the model stays in training mode on the CPU, which
        # breaks inference calls that pass no targets.
        self._eval()

    def _device(self):
        """Return the device the model parameters currently live on."""
        return next(self.model.parameters()).device

    def __call__(self, input_batch: dict[str, np.ndarray]):
        """Run detection on one batch.

        Args:
            input_batch: Mapping with the original images under "image" and
                the preprocessed arrays under "transformed".

        Returns:
            A dict with "image" (passed through), "labels" and "boxes"; the
            latter two are lists of NumPy arrays, one entry per image.
        """
        device = self._device()
        # Keep the inputs on the same device as the model.
        batch = [
            torch.from_numpy(img).to(device) for img in input_batch["transformed"]
        ]
        with torch.no_grad():  # inference only; no autograd graph needed
            predictions = self.model(batch)
        return {
            "image": input_batch["image"],
            "labels": [p["labels"].detach().cpu().numpy() for p in predictions],
            "boxes": [p["boxes"].detach().cpu().numpy() for p in predictions],
        }

    def _eval(self):
        """Move the model to the GPU when available, set eval mode, return self."""
        if torch.cuda.is_available():
            self.model = self.model.cuda()
        self.model.eval()
        return self

    def visualize_detection(self, image_object):
        """Detect objects in a PIL image and display it with red boxes drawn.

        Args:
            image_object: PIL image to run the detector on.
        """
        image_tensor = transforms.PILToTensor()(image_object)
        model_input = self.weights.transforms()(image_tensor).to(self._device())
        with torch.no_grad():
            # The detector expects a list of images, not a bare tensor.
            prediction = self.model([model_input])[0]
        categories = self.weights.meta["categories"]
        labels = [categories[i] for i in prediction["labels"].tolist()]
        annotated = draw_bounding_boxes(
            image_tensor,
            boxes=prediction["boxes"].detach().cpu(),  # drawing happens on the CPU image
            labels=labels,
            colors="red",
            width=4
        )
        display(to_pil_image(annotated.detach()))


def process_data(concurrency=4, batch_size=4, num_gpus=1):
    """Build the Ray Data pipeline: read images, preprocess, then detect.

    Args:
        concurrency: Number of parallel ``ObjectDetectionModel`` actors.
        batch_size: Batch size passed to ``map_batches``.
        num_gpus: GPU count passed to ``map_batches``.

    Returns:
        The dataset produced by the detection ``map_batches`` stage.
    """
    source_uri = "s3://anonymous@air-example-data/AnimalDetection/JPEGImages"
    # Function-based UDFs run as short-running Ray "tasks".
    per_row = ray.data.read_images(source_uri).map(preprocess)
    # Class-based UDFs run as long-running Ray "actors".
    inference_options = {
        "concurrency": concurrency,
        "batch_size": batch_size,
        "num_gpus": num_gpus,
    }
    return per_row.map_batches(ObjectDetectionModel, **inference_options)


def show_sample(dataset, batch_size=2):
    """Display a few images from ``dataset`` with their detections drawn.

    Args:
        dataset: Dataset whose batches expose "image", "labels" and "boxes".
        batch_size: Number of rows to fetch and display.
    """
    # Category names come from the pretrained weights' metadata; the original
    # referred to a module-level `weights` name that was never defined.
    categories = FasterRCNN_ResNet50_FPN_V2_Weights.DEFAULT.meta["categories"]
    batch = dataset.take_batch(batch_size=batch_size)
    for image, label_ids, boxes in zip(batch["image"], batch["labels"], batch["boxes"]):
        img = convert_image_dtype(to_tensor(image), torch.uint8)
        label_names = [categories[i] for i in label_ids]
        annotated = draw_bounding_boxes(
            img, torch.from_numpy(boxes), labels=label_names, width=4
        )
        display(to_pil_image(annotated))


In [None]:
def main(concurrency=4, batch_size=4, num_gpus=1):
    """Run the detection pipeline, show a sample, and try to upload the result.

    The upload is best-effort: any failure is printed and the dataset is
    still returned.

    Args:
        concurrency: Forwarded to ``process_data``.
        batch_size: Forwarded to ``process_data``.
        num_gpus: Forwarded to ``process_data``.

    Returns:
        The dataset produced by ``process_data``.
    """
    import tempfile
    from pathlib import Path

    dataset = process_data(
        concurrency=concurrency,
        batch_size=batch_size,
        num_gpus=num_gpus
    )
    show_sample(dataset)
    try:
        # The original call omitted the required `data` argument, so the
        # upload could never succeed. Serialize the dataset to Parquet first.
        # NOTE(review): assumes a single-block write yields exactly one part
        # file and that the whole dataset fits in memory — confirm.
        with tempfile.TemporaryDirectory() as tmp_dir:
            dataset.repartition(1).write_parquet(tmp_dir)
            part_files = sorted(Path(tmp_dir).glob("*.parquet"))
            if len(part_files) != 1:
                raise RuntimeError(
                    f"expected one parquet file, found {len(part_files)}"
                )
            payload = part_files[0].read_bytes()
        store_data_in_bucket(
            data=payload,
            file_path="data/object_detection_dataset.parquet"
        )
    except Exception as e:
        print(f"Could not upload file: {e}")
    # Returning after the try block (not inside `finally`) keeps
    # KeyboardInterrupt and similar exceptions from being silently swallowed.
    return dataset


In [None]:
# Run the full pipeline with two model actors, batches of two, one GPU per actor.
transformed_dataset = main(concurrency=2, batch_size=2, num_gpus=1)