# Install libraries

In [None]:
%%bash
pip install fiftyone


In [None]:
from google.colab import drive
# Mount Google Drive at /gdrive; the last cells copy the exported files to /gdrive/MyDrive.
drive.mount('/gdrive')
# Make the mount point the working directory.
%cd /gdrive

In [None]:
# @title Import libraries
import tensorflow as tf
import urllib.request
from shutil import copyfile
import tensorflow_datasets as tfds
import tensorflow_hub as hub
import zipfile
import glob
import fiftyone
import fiftyone.zoo as foz
import os
import json
import numpy as np


To make the code easier to read, we are going to define a function that downloads the datasets:

In [None]:
def download_dataset(classes, max_samples, split, seed=51):
  """Load a shuffled subset of "open-images-v6" from the FiftyOne dataset zoo.

  Args:
    classes: forwarded as `classes` to the zoo loader.
    max_samples: forwarded as `max_samples`.
    split: forwarded as `split` (this notebook uses 'train' and 'validation').
    seed: forwarded as `seed` together with `shuffle=True`; defaults to 51.

  Returns:
    Whatever `fiftyone.zoo.load_zoo_dataset` returns for these arguments.
  """
  # Detections and classifications are requested under label_field
  # 'ground_truth'; later cells read the `ground_truth_detections` field.
  zoo_options = dict(
    split=split,
    classes=classes,
    max_samples=max_samples,
    shuffle=True,
    seed=seed,
    label_types=["detections", "classifications"],
    label_field='ground_truth',
  )
  return fiftyone.zoo.load_zoo_dataset("open-images-v6", **zoo_options)
  

In [None]:
# Request up to 1000 'Car'/'Cat' samples from the 'train' split (seed keeps its default of 51).
dataset_train= download_dataset(['Car', 'Cat'], 1000, 'train')

In [None]:
# Open the FiftyOne App on the training samples for visual inspection.
fiftyone.launch_app(dataset_train)

In [None]:
# Request up to 250 'Car'/'Cat' samples from the 'validation' split; they serve as the test set.
dataset_test=download_dataset(['Car', 'Cat'], 250, split='validation')

Now we are going to filter the labels in order to reduce the number of classes: although we restricted the classes when downloading the dataset, FiftyOne still loads the labels of the other classes that appear in the downloaded images.

In [None]:
# Keep only the labels in `ground_truth_detections` whose `label` is 'Cat' or 'Car'.
# The name `dataset_train` is rebound to the result of filter_labels().
dataset_train = dataset_train.filter_labels("ground_truth_detections", fiftyone.ViewField("label").is_in(["Cat", "Car"]))

In [None]:
# Keep only the labels in `ground_truth_detections` whose `label` is 'Cat' or 'Car'.
# The name `dataset_test` is rebound to the result of filter_labels().
dataset_test = dataset_test.filter_labels("ground_truth_detections", fiftyone.ViewField("label").is_in(["Cat", "Car"]))

Now let's export the datasets in the **COCO** format

In [None]:
# Export to /content/dataset_train_tensorflow as a COCODetectionDataset, taking labels from `ground_truth_detections`.
dataset_train.export('/content/dataset_train_tensorflow', dataset_type=fiftyone.types.COCODetectionDataset, label_field='ground_truth_detections')

In [None]:
# Export to /content/dataset_test_tensorflow as a COCODetectionDataset, taking labels from `ground_truth_detections`.
dataset_test.export('/content/dataset_test_tensorflow', dataset_type=fiftyone.types.COCODetectionDataset, label_field='ground_truth_detections')







Now let's create zip archives of the images contained in each dataset

In [None]:
%%bash
# Recursively archive the `data` folder of each exported split into /content.
zip -r /content/train_images.zip /content/dataset_train_tensorflow/data
zip -r /content/test_images.zip /content/dataset_test_tensorflow/data

In order to reduce the complexity of the labels, we are going to parse the **JSON** objects and save the bounding boxes in dictionaries, so that we can pass them to TensorFlow when training the model. Let's define a function that parses the JSON files and returns the dictionaries

In [None]:
def get_dictionaries(json_path):
  """Read a labels JSON file and return its annotation and image records.

  Args:
    json_path: path to a JSON file with top-level 'annotations' and
      'images' keys.

  Returns:
    A tuple `(annotations, data_image)`: the values stored under
    'annotations' and 'images', in that order.

  Raises:
    KeyError: if either key is missing from the file.
  """
  # The `with` block closes the file on exit, so no explicit close() is needed.
  # JSON text is UTF-8; do not depend on the platform's default encoding.
  with open(json_path, 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)
  return data['annotations'], data['images']

Now we are going to parse the data for the train and validation split with the function that we have defined before

In [None]:
# Annotation records and image records from the exported train labels file.
train_annotations, train_data_image = get_dictionaries('/content/dataset_train_tensorflow/labels.json')

In [None]:
# Annotation records and image records from the exported test labels file.
test_annotations, test_data_image = get_dictionaries('/content/dataset_test_tensorflow/labels.json')

Let's create a dictionary that contains the bounding boxes for each image. The general structure of this dictionary will be the following:

datos={

    'image_id':[b_boxes]
}

The model from the Object Detection API expects the bounding boxes to be normalized, so we first read the shape of each image and store it in a dictionary, and then use it to normalize the bounding boxes.

In [None]:
def bounding_boxes_normalized(dict_with_boxes, dict_with_image_shape):
  """Group box values per image, scaled by that image's width and height.

  Args:
    dict_with_boxes: sequence of records with 'image_id' and a four-value
      'bbox'.
    dict_with_image_shape: sequence of records with 'id', 'width' and
      'height'.

  Returns:
    Dict mapping each image id to ONE flat list with four scaled values per
    box (1st and 3rd divided by the width, 2nd and 4th by the height), in
    the order the boxes appear in `dict_with_boxes`.
  """
  # image id -> [width, height]; the first record seen for an id wins.
  size_by_image = {}
  for image in dict_with_image_shape:
    image_id = image['id']
    if image_id in size_by_image:
      continue
    size_by_image[image_id] = [image['width'], image['height']]

  boxes_by_image = {}
  for annotation in dict_with_boxes:
    image_id = annotation['image_id']
    x, y, w, h = annotation['bbox']
    img_w, img_h = size_by_image[image_id]
    scaled = [x / img_w, y / img_h, w / img_w, h / img_h]
    # Values go into a single flat list per image, not one sub-list per box.
    boxes_by_image.setdefault(image_id, []).extend(scaled)

  return boxes_by_image


In [None]:
# image id -> flat list of normalized box values (train split).
train_bboxes=bounding_boxes_normalized(train_annotations, train_data_image )

In [None]:
# image id -> flat list of normalized box values (test split).
test_bboxes=bounding_boxes_normalized(test_annotations, test_data_image )

Now, let's create another json object that contains the class id in each image:

This dictionary has the following format:

    image_id:[classes contained in the image]

In [None]:
def get_dict_with_classes(dictionary):
  """Collect the category ids found in each image.

  Args:
    dictionary: sequence of records with 'image_id' and 'category_id'.

  Returns:
    Dict mapping each image id to the list of its category ids, in input
    order and with repeats kept.
  """
  categories_by_image = {}
  for annotation in dictionary:
    image_id = annotation['image_id']
    category_id = annotation['category_id']
    categories_by_image.setdefault(image_id, []).append(category_id)
  return categories_by_image

In [None]:
# image id -> list of category ids (train split).
train_classes=get_dict_with_classes(train_annotations)

In [None]:
# image id -> list of category ids (test split).
test_classes=get_dict_with_classes(test_annotations)

We have the boxes and we have the classes. The only thing we still need is another dictionary whose **key** is the id of the image and whose **value** is the file name of the image. We need this dictionary to load the images from their paths, which we will do in another notebook.
The structure of this dictionary will be the following:


      paths{
        id:'name_of_the_image.jpg'
      }

Now let's define the function that builds this dictionary of image file names

In [None]:
def get_image_path(dictionary_with_path):
  """Map each image id to its file name.

  Args:
    dictionary_with_path: sequence of records with 'id' and 'file_name'.

  Returns:
    Dict of image id -> file name; when an id repeats, the file name of the
    first record is kept.
  """
  file_name_by_id = {}
  for image in dictionary_with_path:
    image_id = image['id']
    if image_id in file_name_by_id:
      # Already recorded: later duplicates are ignored.
      continue
    file_name_by_id[image_id] = image['file_name']
  return file_name_by_id

In [None]:
# image id -> image file name (train split).
train_images_id=get_image_path(train_data_image)

In [None]:
# image id -> image file name (test split).
test_images_id=get_image_path(test_data_image)

Finally, we have all the data stored in dictionaries; now we will export them as JSON files so they can be saved in Google Drive

In [None]:
def save_dicts(abs_path, dictionary):
  """Write `dictionary` to `abs_path` as indented JSON.

  The object is serialized exactly once, so `json.load` on the resulting
  file gives back a dict. (Serializing with `json.dumps` and then passing
  that string to `json.dump` stores one quoted string instead of an object.)
  Non-string keys, such as integer image ids, are written as strings because
  JSON object keys are always strings.

  Args:
    abs_path: destination file path; an existing file is overwritten.
    dictionary: JSON-serializable object to store.
  """
  # Plain 'w' is enough: the file is only written, never read back here.
  with open(abs_path, 'w', encoding='utf-8') as json_file:
    json.dump(dictionary, json_file, indent=4)


In [None]:
# Write each dictionary of each split to its own JSON file under /content.
save_dicts('/content/train_images_id.json', train_images_id)
save_dicts('/content/test_images_id.json', test_images_id)
save_dicts('/content/train_bboxes.json', train_bboxes)
save_dicts('/content/test_bboxes.json', test_bboxes)
save_dicts('/content/train_classes.json', train_classes)
save_dicts('/content/test_classes.json', test_classes)


In [None]:
!rm -r /gdrive/MyDrive/Test
!rm -r /gdrive/MyDrive/Train








We already have the JSON files and zip archives containing the data for the train and test datasets; now let's copy them to Google Drive








In [None]:
%%bash
# -p makes the cell re-runnable: mkdir no longer errors if the folder already exists.
mkdir -p /gdrive/MyDrive/Train
# /content/train* matches the train zip archive and the train_*.json files.
cp /content/train*  /gdrive/MyDrive/Train
mkdir -p /gdrive/MyDrive/Test
# /content/test* matches the test zip archive and the test_*.json files.
cp /content/test*  /gdrive/MyDrive/Test


