In [1]:
import os

import glob
import pandas as pd

# Root folders for the datasets explored in this notebook. The defaults are the
# original shared-mount locations; set OPEN_IMAGES_DIRPATH and/or
# OPEN_IMAGES_COMPETITION_DIRPATH in the environment before starting the kernel
# to point the notebook at a copy of the data stored elsewhere.
DIRPATH = os.environ.get(
    'OPEN_IMAGES_DIRPATH',
    '/mnt/ml-team/open-images-v4/bounding-boxes',
)
DIRPATH_COMPETITION = os.environ.get(
    'OPEN_IMAGES_COMPETITION_DIRPATH',
    '/mnt/ml-team/minerva/open-solutions/googleai-object-detection/data',
)

In [2]:
# List every entry directly under the bounding-boxes root folder.
root_pattern = '{}/*'.format(DIRPATH)
glob.glob(root_pattern)

['/mnt/ml-team/open-images-v4/bounding-boxes/boxes',
 '/mnt/ml-team/open-images-v4/bounding-boxes/image-labels',
 '/mnt/ml-team/open-images-v4/bounding-boxes/imageIds',
 '/mnt/ml-team/open-images-v4/bounding-boxes/metadata',
 '/mnt/ml-team/open-images-v4/bounding-boxes/validation',
 '/mnt/ml-team/open-images-v4/bounding-boxes/train',
 '/mnt/ml-team/open-images-v4/bounding-boxes/test',
 '/mnt/ml-team/open-images-v4/bounding-boxes/test_challenge_2018']

# boxes

In [3]:
# List the annotation files stored in the boxes/ subfolder.
boxes_pattern = '{}/boxes/*'.format(DIRPATH)
glob.glob(boxes_pattern)

['/mnt/ml-team/open-images-v4/bounding-boxes/boxes/train-annotations-bbox.csv',
 '/mnt/ml-team/open-images-v4/bounding-boxes/boxes/validation-annotations-bbox.csv',
 '/mnt/ml-team/open-images-v4/bounding-boxes/boxes/test-annotations-bbox.csv']

In [None]:
# Read the train bounding-box annotations and preview the first rows.
train_boxes_path = os.path.join(DIRPATH, 'boxes', 'train-annotations-bbox.csv')
train_boxes = pd.read_csv(train_boxes_path)
train_boxes.head()

In [None]:
# Read the validation bounding-box annotations and preview the first rows.
valid_boxes_path = os.path.join(DIRPATH, 'boxes', 'validation-annotations-bbox.csv')
valid_boxes = pd.read_csv(valid_boxes_path)
valid_boxes.head()

## Source
Let's look at the data sources in train and valid.

In [None]:
# Frequency of each value in the Source column of the train boxes.
train_box_sources = train_boxes['Source']
train_box_sources.value_counts()

In [None]:
# Frequency of each value in the Source column of the validation boxes.
valid_box_sources = valid_boxes['Source']
valid_box_sources.value_counts()

There are different sources used in train and valid, which could affect how the labels were collected

## Confidence

In [None]:
# Summary statistics of the Confidence column for the train boxes.
train_box_confidence = train_boxes['Confidence']
train_box_confidence.describe()

In [None]:
# Summary statistics of the Confidence column for the validation boxes.
valid_box_confidence = valid_boxes['Confidence']
valid_box_confidence.describe()

It seems that in both train and valid, the confidence of every bbox in every image is `1`.

Why have this flag then?

## LabelName

In [None]:
# Number of distinct LabelName values, displayed as a (train, valid) tuple.
n_train_labels = train_boxes['LabelName'].nunique()
n_valid_labels = valid_boxes['LabelName'].nunique()
(n_train_labels, n_valid_labels)

In [None]:
# How many train boxes carry each LabelName.
train_label_names = train_boxes['LabelName']
train_label_names.value_counts()

There are `559` labels, each with between `4` and `1438128` examples in the train set.

Sounds like a lot of fun :)

# image-labels

In [None]:
# List the files stored in the image-labels/ subfolder.
image_labels_pattern = '{}/image-labels/*'.format(DIRPATH)
glob.glob(image_labels_pattern)

In [None]:
# Read the human-verified, boxable image-level labels for train and preview them.
train_image_labels_path = os.path.join(
    DIRPATH, 'image-labels', 'train-annotations-human-imagelabels-boxable.csv'
)
train_image_labels = pd.read_csv(train_image_labels_path)
train_image_labels.head()

In [None]:
# Read the human-verified, boxable image-level labels for validation and preview them.
valid_image_labels_path = os.path.join(
    DIRPATH, 'image-labels', 'validation-annotations-human-imagelabels-boxable.csv'
)
valid_image_labels = pd.read_csv(valid_image_labels_path)
valid_image_labels.head()

In [None]:
# Frequency of each value in the Source column of the train image labels.
train_image_label_sources = train_image_labels['Source']
train_image_label_sources.value_counts()

In [None]:
# Frequency of each value in the Source column of the validation image labels.
valid_image_label_sources = valid_image_labels['Source']
valid_image_label_sources.value_counts()

In [None]:
# Frequency of each Confidence value among the train image labels.
train_image_label_confidence = train_image_labels['Confidence']
train_image_label_confidence.value_counts()

In [None]:
# Frequency of each Confidence value among the validation image labels.
valid_image_label_confidence = valid_image_labels['Confidence']
valid_image_label_confidence.value_counts()

Ok, this looks a bit more reasonable.

# imageIds

Just some information about the origin of the data: authors, licence, and similar details.

In [None]:
# List the files stored in the imageIds/ subfolder.
image_ids_pattern = '{}/imageIds/*'.format(DIRPATH)
glob.glob(image_ids_pattern)

In [None]:
# Read the per-image information table for the boxable train images and preview it.
train_image_ids_path = os.path.join(
    DIRPATH, 'imageIds', 'train-images-boxable-with-rotation.csv'
)
train_imageIds = pd.read_csv(train_image_ids_path)
train_imageIds.head()

In [None]:
# Read the per-image information table for the validation images and preview it.
valid_image_ids_path = os.path.join(
    DIRPATH, 'imageIds', 'validation-images-with-rotation.csv'
)
valid_imageIds = pd.read_csv(valid_image_ids_path)
valid_imageIds.head()

# metadata

This is just the label-code to label-name mapping.

In [None]:
# List the files stored in the metadata/ subfolder.
metadata_pattern = '{}/metadata/*'.format(DIRPATH)
glob.glob(metadata_pattern)

In [None]:
# Read the boxable class-descriptions file and preview the first rows.
metadata_path = os.path.join(DIRPATH, 'metadata', 'class-descriptions-boxable.csv')
metadata = pd.read_csv(metadata_path)
metadata.head()

# train, validation, test, test_challenge

These are just folders with images.

In [None]:
# Show only the first 10 paths found in the test_challenge_2018 folder.
test_challenge_pattern = '{}/test_challenge_2018/*'.format(DIRPATH)
glob.glob(test_challenge_pattern)[:10]

# Competition data

In [4]:
# List every entry directly under the competition data folder.
competition_root_pattern = '{}/*'.format(DIRPATH_COMPETITION)
glob.glob(competition_root_pattern)

['/mnt/ml-team/minerva/open-solutions/googleai-object-detection/data/sample_submission.csv.zip',
 '/mnt/ml-team/minerva/open-solutions/googleai-object-detection/data/sample_submission.csv',
 '/mnt/ml-team/minerva/open-solutions/googleai-object-detection/data/annotations',
 '/mnt/ml-team/minerva/open-solutions/googleai-object-detection/data/imageIds',
 '/mnt/ml-team/minerva/open-solutions/googleai-object-detection/data/metadata']

## annotations

In [5]:
# List the files stored in the competition's annotations/ subfolder.
annotations_pattern = '{}/annotations/*'.format(DIRPATH_COMPETITION)
glob.glob(annotations_pattern)

['/mnt/ml-team/minerva/open-solutions/googleai-object-detection/data/annotations/challenge-2018-train-annotations-bbox.csv',
 '/mnt/ml-team/minerva/open-solutions/googleai-object-detection/data/annotations/challenge-2018-train-annotations-human-imagelabels.csv']

In [6]:
# Read the competition's train bounding-box annotations and preview the first rows.
annotations_bbox_path = os.path.join(
    DIRPATH_COMPETITION, 'annotations', 'challenge-2018-train-annotations-bbox.csv'
)
annotations_bbox = pd.read_csv(annotations_bbox_path)
annotations_bbox.head()

Unnamed: 0,ImageID,Source,LabelName,Confidence,XMin,XMax,YMin,YMax,IsOccluded,IsTruncated,IsGroupOf,IsDepiction,IsInside
0,8d6dec80235b6fea,xclick,/m/09j5n,1,0.76,0.778125,0.645892,0.673277,0,0,0,0,0
1,8d6dec80235b6fea,xclick,/m/09j5n,1,0.8175,0.831875,0.628895,0.661945,0,0,0,0,0
2,8d6dec80235b6fea,xclick,/m/09j5n,1,0.843125,0.87,0.619452,0.645892,0,0,0,0,0
3,8d6dec80235b6fea,xclick,/m/09j5n,1,0.8675,0.891875,0.597734,0.625118,0,0,0,0,0
4,8d6dec80235b6fea,xclick,/m/09j5n,1,0.895625,0.91125,0.625118,0.65628,0,0,0,0,0


In [None]:
# Frequency of each Confidence value among the competition bounding boxes.
bbox_confidence = annotations_bbox['Confidence']
bbox_confidence.value_counts()

In [None]:
# Read the competition's human-verified image-level labels and preview the first rows.
annotations_label_path = os.path.join(
    DIRPATH_COMPETITION,
    'annotations',
    'challenge-2018-train-annotations-human-imagelabels.csv',
)
annotations_label = pd.read_csv(annotations_label_path)
annotations_label.head()

## imageIds

In [None]:
# List the files stored in the competition's imageIds/ subfolder.
competition_image_ids_pattern = '{}/imageIds/*'.format(DIRPATH_COMPETITION)
glob.glob(competition_image_ids_pattern)

In [None]:
# Read the competition's table of boxable train images and preview the first rows.
image_ids_path = os.path.join(
    DIRPATH_COMPETITION, 'imageIds', 'train-images-boxable-with-rotation.csv'
)
imageIds = pd.read_csv(image_ids_path)
imageIds.head()

## metadata

In [None]:
# List the files stored in the competition's metadata/ subfolder.
competition_metadata_pattern = '{}/metadata/*'.format(DIRPATH_COMPETITION)
glob.glob(competition_metadata_pattern)

In [None]:
# Read the competition's class-descriptions file and preview the first rows.
class_descriptions_path = os.path.join(
    DIRPATH_COMPETITION, 'metadata', 'challenge-2018-class-descriptions-500.csv'
)
# header=None: the first line of the file is treated as data, not as column names.
class_descriptions = pd.read_csv(class_descriptions_path, header=None)
class_descriptions.head()

In [None]:
# Read the label-hierarchy JSON into a DataFrame and preview the first rows.
hierarchy_path = os.path.join(
    DIRPATH_COMPETITION, 'metadata', 'bbox_labels_500_hierarchy.json'
)
hierarchy = pd.read_json(hierarchy_path)
hierarchy.head()

In [None]:
# Read the competition's validation-set image-ID file and preview the first rows.
val_ids_path = os.path.join(
    DIRPATH_COMPETITION, 'metadata', 'challenge-2018-image-ids-valset-od.csv'
)
val_ids = pd.read_csv(val_ids_path)
val_ids.head()

In [None]:
# Display the (rows, columns) dimensions of the validation-ID table.
val_ids.shape

In [None]:
# ImageIDs that appear both in the competition train-image table and in the validation-ID list.
competition_train_ids = set(imageIds['ImageID'])
competition_train_ids.intersection(set(val_ids['ImageID']))

In [None]:
# Show only the first 3 paths found in the test_challenge_2018 folder.
test_challenge_dir = os.path.join(DIRPATH, 'test_challenge_2018')
glob.glob('{}/*'.format(test_challenge_dir))[:3]