In [1]:
%matplotlib inline
from pycocotools.coco import COCO
import numpy as np
import skimage.io as io
import matplotlib.pyplot as plt
import pylab
import json
pylab.rcParams['figure.figsize'] = (8.0, 10.0)

In [2]:
# Root directory of the local COCO dataset. The cells below read annotation
# files from '<dataDir>/annotations/'.
dataDir='/data/dataset/coco'

### Adding some helper functions

In [3]:
def getCocoFromAnnotations(annFile):
    """Build a COCO API object from an annotation file.

    Parameters
    ----------
    annFile
        Location of the annotation file; handed to ``COCO`` unchanged.

    Returns
    -------
    The ``COCO`` instance constructed from ``annFile``.
    """
    return COCO(annFile)

def showImage(imgId):
    """Fetch an image from its ``'coco_url'`` and display it without axes.

    Parameters
    ----------
    imgId : dict
        Image record containing a ``'coco_url'`` entry. Despite the parameter
        name, the body needs the whole record (it reads the URL from it), not
        a numeric id.

    Notes
    -----
    The image is read from the URL, not from the local ``dataDir`` copy.
    """
    # The previous body ignored its argument and always read the notebook-level
    # ``img`` variable. Use the argument when it is an image record; keep the
    # global lookup only as a fallback for callers that pass something else.
    record = imgId if isinstance(imgId, dict) else img
    pixels = io.imread(record['coco_url'])
    plt.axis('off')
    plt.imshow(pixels)
    plt.show()
    
def writeToFile(outdoorImageIds, dataType):
    """Serialise ``outdoorImageIds`` as JSON into the file ``'<dataType>_outdoor'``.

    The file is opened in write mode relative to the current working
    directory, so an existing file with the same name is overwritten.

    Parameters
    ----------
    outdoorImageIds
        Any object accepted by ``json.dumps``.
    dataType : str
        Prefix of the output file name.
    """
    filename = f'{dataType}_outdoor'
    print(f'\n Writing OutdoorImageIds to file {filename}')
    # The context manager closes (and therefore flushes) the handle even if
    # serialisation or the write raises; the previous version never closed it.
    with open(filename, 'w') as outFile:
        outFile.write(json.dumps(outdoorImageIds))

### OutdoorDatasetGenerator: used to get the IDs of outdoor images

In [4]:
class OutdoorDatasetGenerator:
    """Select the images of a COCO dataset that contain outdoor objects.

    An image counts as outdoor when ``coco.getImgIds`` returns it for at least
    one category whose supercategory is listed in ``OUTDOOR_SUPERCATEGORIES``.
    """

    # Supercategories whose member categories are treated as outdoor objects.
    OUTDOOR_SUPERCATEGORIES = ('outdoor', 'vehicle')

    def __init__(self, coco):
        """Store the COCO API object that every query goes through."""
        self.coco = coco

    def getCocoCategories(self):
        """Return all category records of the COCO object.

        Prints the number of categories as a side effect.
        """
        cats = self.coco.loadCats(self.coco.getCatIds())
        print(f'There are {len(cats)} coco categories')
        return cats

    def getSuperCategories(self):
        """Return the set of distinct ``'supercategory'`` values.

        Prints the category count (via ``getCocoCategories``) and the
        supercategory count as side effects.
        """
        cats = self.getCocoCategories()
        supercats = {cat['supercategory'] for cat in cats}
        print(f'There are {len(supercats)} coco supercategories')
        return supercats

    def getOutdoorCategories(self):
        """Return the names of the categories that count as outdoor.

        The names keep the order in which ``getCocoCategories`` returns the
        category records. Prints the resulting list as a side effect.
        """
        cats = self.getCocoCategories()
        outdoorCats = [
            category['name']
            for category in cats
            if category['supercategory'] in self.OUTDOOR_SUPERCATEGORIES
        ]
        print('Outdoor Categories are', outdoorCats)
        return outdoorCats

    def getOutdoorImageIds(self):
        """Return the ids of all images containing at least one outdoor category.

        Returns
        -------
        list
            Unique image ids in ascending order. Sorting makes the result
            reproducible; a plain ``list(set(...))`` has no defined order.
        """
        outdoorImageIds = set()
        for cat in self.getOutdoorCategories():
            catId = self.coco.getCatIds(catNms=[cat])[0]
            # Query one category at a time and union the results.
            # NOTE(review): pycocotools appears to intersect the image sets
            # when several catIds are passed in one call - confirm before
            # batching this loop into a single query.
            outdoorImageIds.update(self.coco.getImgIds(catIds=catId))
        outdoorImageIds = sorted(outdoorImageIds)
        print(f'Total outdoor images are {len(outdoorImageIds)}')
        return outdoorImageIds

    def getImage(self, imgId):
        """Return the first record that ``coco.loadImgs`` yields for ``imgId``."""
        return self.coco.loadImgs(imgId)[0]

In [7]:
def GetOutdoorImages(dataType):
    """Load the image records of every outdoor image in one COCO split.

    The instances annotation file is read from
    ``'<dataDir>/annotations/instances_<dataType>.json'``, where ``dataDir`` is
    the notebook-level setting.

    Parameters
    ----------
    dataType : str
        Split name used in the annotation file name, e.g. ``'val2017'``.

    Returns
    -------
    list
        One image record per id returned by
        ``OutdoorDatasetGenerator.getOutdoorImageIds``, in that order.
    """
    print(f'Generating outdoor image dataset for {dataType}\n')
    print('Initialising COCO object for Instances data')
    annotationPath = '{}/annotations/instances_{}.json'.format(dataDir, dataType)
    outdoorGenerator = OutdoorDatasetGenerator(getCocoFromAnnotations(annotationPath))
    outdoorIds = outdoorGenerator.getOutdoorImageIds()
    outdoorRecords = []
    for outdoorId in outdoorIds:
        outdoorRecords.append(outdoorGenerator.getImage(outdoorId))
    return outdoorRecords

In [30]:
# Build the outdoor subset for the 'val2017' split and save it to disk.
dataType='val2017'
outdoorImages = GetOutdoorImages(dataType)
# NOTE: the list written here holds full image records (dicts), not bare ids,
# even though the helper's message says "OutdoorImageIds".
writeToFile(outdoorImages, dataType)

Generating outdoor image dataset for val2017

Initialising COCO object for Instances data
loading annotations into memory...
Done (t=0.46s)
creating index...
index created!
There are 80 coco categories
Outdoor Categories are ['bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench']
Total outdoor images are 1424

 Writing OutdoorImageIds to file val2017_outdoor


### Captions Viewer can be used to get the captions for a given image

In [24]:
class CaptionsViewer:
    """Look up the annotations of an image through a COCO API object."""

    def __init__(self, coco):
        """Keep a reference to the COCO object used for the lookups."""
        self.coco = coco

    def showCaptions(self, img):
        """Return the annotation records attached to ``img``.

        Despite the name, nothing is displayed; the records are only returned.

        Parameters
        ----------
        img : dict
            Image record; only its ``'id'`` entry is read.

        Returns
        -------
        Whatever ``coco.loadAnns`` returns for the annotation ids of the image.
        """
        captionIds = self.coco.getAnnIds(imgIds=img['id'])
        return self.coco.loadAnns(captionIds)

In [29]:
# Show the captions of one randomly chosen outdoor image.
# dataType='train2017'
dataType='val2017'
# Rebuilds the outdoor image list for the selected split (reloads the
# instances annotation file).
outdoorImages = GetOutdoorImages(dataType)

print('\nGetting random outdoor image')
# No seed is set in the visible cells, so a different image may be drawn on
# every run.
img = outdoorImages[np.random.randint(0,len(outdoorImages))]

# Captions are read from a separate annotation file, so a second COCO object
# is created for them.
captionsFile='{}/annotations/captions_{}.json'.format(dataDir, dataType)
captionsDataCoco = getCocoFromAnnotations(captionsFile)
print(f'\nCaptions for Image: {img} are\n')
captionsViewer = CaptionsViewer(captionsDataCoco)
print(captionsViewer.showCaptions(img))

Generating outdoor image dataset for val2017

Initialising COCO object for Instances data
loading annotations into memory...
Done (t=0.45s)
creating index...
index created!
There are 80 coco categories
Outdoor Categories are ['bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench']
Total outdoor images are 1424

Getting random outdoor image
loading annotations into memory...
Done (t=0.04s)
creating index...
index created!

Captions for Image: {'license': 3, 'file_name': '000000472030.jpg', 'coco_url': 'http://images.cocodataset.org/val2017/000000472030.jpg', 'height': 480, 'width': 640, 'date_captured': '2013-11-16 19:50:55', 'flickr_url': 'http://farm4.staticflickr.com/3340/3570719653_be3552d961_z.jpg', 'id': 472030} are

[{'image_id': 472030, 'id': 503953, 'caption': 'A lush green forest filled with lots of trees.'}, {'image_id': 472030, 'id': 507424, 'caption': 'A jacket is draped on a bench 

In [25]:
# Draw another random outdoor image and print its caption annotations.
# Relies on `outdoorImages` and `captionsDataCoco` created by earlier cells.
img = outdoorImages[np.random.randint(0,len(outdoorImages))]
captionsViewer = CaptionsViewer(captionsDataCoco)
anns = captionsViewer.showCaptions(img)
print(anns)

[{'image_id': 238117, 'id': 829360, 'caption': 'A sky blue aircraft is displayed in front of a building, with clouds overhead.'}, {'image_id': 238117, 'id': 829473, 'caption': 'A large blue airplane parked in a stationary position.'}, {'image_id': 238117, 'id': 829657, 'caption': 'A big fighter jet sits on the te'}, {'image_id': 238117, 'id': 829695, 'caption': 'A jet aircraft strangely painted like the sky with clouds on display at an airport'}, {'image_id': 238117, 'id': 829719, 'caption': 'A blue camouflage airplane is on a runway.'}]


In [32]:
# Count the outdoor images for which the caption lookup returns nothing
nocaps = 0
for image in outdoorImages:
    anns = captionsViewer.showCaptions(image)
    if len(anns) == 0:
        nocaps += 1

print(nocaps)

0


In [22]:
# NOTE(review): the saved output below (33799) disagrees with the 1424 val2017
# images reported by the cells above. This cell's execution count (22) precedes
# theirs, so it presumably ran against a different dataType - re-run to confirm.
len(outdoorImages)

33799