Creation of a subset of Coco dataset



In [1]:
# download coco's annotations trainval2017 and by using coco API, create a subset

!wget http://images.cocodataset.org/annotations/annotations_trainval2017.zip
!unzip annotations_trainval2017.zip
!rm -rf annotations_trainval2017.zip

# keep only instances_train/val.json 
%cd /content/annotations/
!rm captions_train2017.json
!rm captions_val2017.json
!rm person_keypoints_train2017.json
!rm person_keypoints_val2017.json

--2021-03-24 17:53:46--  http://images.cocodataset.org/annotations/annotations_trainval2017.zip
Resolving images.cocodataset.org (images.cocodataset.org)... 52.216.228.104
Connecting to images.cocodataset.org (images.cocodataset.org)|52.216.228.104|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 252907541 (241M) [application/zip]
Saving to: ‘annotations_trainval2017.zip’


2021-03-24 17:53:48 (98.2 MB/s) - ‘annotations_trainval2017.zip’ saved [252907541/252907541]

Archive:  annotations_trainval2017.zip
  inflating: annotations/instances_train2017.json  
  inflating: annotations/instances_val2017.json  
  inflating: annotations/captions_train2017.json  
  inflating: annotations/captions_val2017.json  
  inflating: annotations/person_keypoints_train2017.json  
  inflating: annotations/person_keypoints_val2017.json  
/content/annotations


In [2]:
# use of coco API

from pycocotools.coco import COCO
import requests
import random

# take 45 images with their annotations for each of 4 classes for training and another 10 for testing, respectively
classes = ['person', 'horse', 'cow', 'sheep']

imagesTrain = []
imagesVal = []

#unique_imagesTrain = []
#unique_imagesVal = []

subimagesTrain = []
subimagesVal = []

cocoTrain = COCO('/content/annotations/instances_train2017.json')
cocoVal = COCO('/content/annotations/instances_val2017.json')

for className in classes:

  # images for training
  catIdsTrain = cocoTrain.getCatIds(catNms=className)
  imgIdsTrain = cocoTrain.getImgIds(catIds=catIdsTrain)
  imagesTrain = cocoTrain.loadImgs(imgIdsTrain)

  # filter out the repeated images    
  #for k in range(len(imagesTrain)):
  #  if imagesTrain[k] not in unique_imagesTrain:
  #    unique_imagesTrain.append(imagesTrain[k])

  # take 45 of them randomly
  subimagesTrain += random.sample(imagesTrain, 45)

  # images for testing
  catIdsVal = cocoVal.getCatIds(catNms=className)
  imgIdsVal = cocoVal.getImgIds(catIds=catIdsVal)
  imagesVal = cocoVal.loadImgs(imgIdsVal)

  # filter out the repeated images
  #for l in range(len(imagesVal)):
  #  if imagesVal[l] not in unique_imagesVal:
  #    unique_imagesVal.append(imagesVal[l])

  # take 10 of them randomly
  subimagesVal += random.sample(imagesVal, 10)



loading annotations into memory...
Done (t=15.32s)
creating index...
index created!
loading annotations into memory...
Done (t=0.50s)
creating index...
index created!


In [3]:
# save images to dir
!mkdir train_images
!mkdir test_images

for im in subimagesTrain:
  #print("im: ", im)
  img_data = requests.get(im['coco_url']).content
  with open('train_images/' + im['file_name'], 'wb') as handler:
    handler.write(img_data)

for im2 in subimagesVal:
  #print("im: ", im)
  img_data2 = requests.get(im2['coco_url']).content
  with open('test_images/' + im2['file_name'], 'wb') as handler:
    handler.write(img_data2)


In [4]:
# check the lengths of dirs
import os

print(len(os.listdir('/content/annotations/train_images')))
print(len(os.listdir('/content/annotations/test_images')))

180
39


In [5]:
# keep annotations of the above train/test images at .csv and .json data files

# csv

import csv
header = ['filename', 'x', 'y', 'width', 'height', 'category_id' ]

with open('train_annotations'  + '.csv', mode='w', newline='') as annot:
  annot_writer = csv.writer(annot)
  annot_writer.writerow(header)
  for im in subimagesTrain:    
    annIdsTrain = cocoTrain.getAnnIds(imgIds=im['id'])
    annsTrain = cocoTrain.loadAnns(annIdsTrain)
    for i in range(len(annsTrain)):
      #annot_writer.writerow([im['coco_url'], anns[i]['bbox'][0], anns[i]['bbox'][1], anns[i]['bbox'][0] + anns[i]['bbox'][2], anns[i]['bbox'][1] + anns[i]['bbox'][3], anns[i].get('category_id')])
      annot_writer.writerow([im['file_name'], int(round(annsTrain[i]['bbox'][0])), int(round(annsTrain[i]['bbox'][1])), int(round(annsTrain[i]['bbox'][0] + annsTrain[i]['bbox'][2])), int(round(annsTrain[i]['bbox'][1] + annsTrain[i]['bbox'][3])), annsTrain[i].get('category_id')])
annot.close()

with open('test_annotations'  + '.csv', mode='w', newline='') as annot2:
  annot_writer2 = csv.writer(annot2)
  annot_writer2.writerow(header)
  for im2 in subimagesVal:    
    annIdsVal = cocoVal.getAnnIds(imgIds=im2['id'])
    annsVal = cocoVal.loadAnns(annIdsVal)
    for i2 in range(len(annsVal)):
      #annot_writer.writerow([im['coco_url'], anns[i]['bbox'][0], anns[i]['bbox'][1], anns[i]['bbox'][0] + anns[i]['bbox'][2], anns[i]['bbox'][1] + anns[i]['bbox'][3], anns[i2].get('category_id')])
      annot_writer2.writerow([im2['file_name'], int(round(annsVal[i2]['bbox'][0])), int(round(annsVal[i2]['bbox'][1])), int(round(annsVal[i2]['bbox'][0] + annsVal[i2]['bbox'][2])), int(round(annsVal[i2]['bbox'][1] + annsVal[i2]['bbox'][3])), annsVal[i2].get('category_id')])
annot2.close()


In [6]:
# json
import json 

def csv_to_json(csvFilePath, jsonFilePath):
    jsonArray = []
      
    #read csv file
    with open(csvFilePath, encoding='utf-8') as csvf: 
        #load csv file data using csv library's dictionary reader
        csvReader = csv.DictReader(csvf) 

        #convert each csv row into python dict
        for row in csvReader: 
            #add this python dict to json array
            jsonArray.append(row)
  
    #convert python jsonArray to JSON String and write to file
    with open(jsonFilePath, 'w', encoding='utf-8') as jsonf: 
        jsonString = json.dumps(jsonArray, indent=4)
        jsonf.write(jsonString)

csvFilePathTrain = r'/content/annotations/train_annotations.csv'
jsonFilePathTrain = r'train_annotations.json'
csv_to_json(csvFilePathTrain, jsonFilePathTrain)

csvFilePathVal = r'/content/annotations/test_annotations.csv'
jsonFilePathVal = r'test_annotations.json'
csv_to_json(csvFilePathVal, jsonFilePathVal)


In [7]:
# all in one file
%cd /content/
os.rename('annotations', 'images_set2')


/content


images_set2 is now a subset of Coco dataset, combrised of 4 classes with 45 images each for training and 10 for testing.

In [16]:
import shutil
shutil.make_archive('images_set2', 'zip', '/content/')

'/content/images_set2.zip'