In [16]:
import os

import glob
import pandas as pd

DIRPATH = '/mnt/ml-team/open-images-v4/bounding-boxes'

In [10]:
glob.glob('{}/*'.format(DIRPATH))

['/mnt/ml-team/open-images-v4/bounding-boxes/boxes',
 '/mnt/ml-team/open-images-v4/bounding-boxes/image-labels',
 '/mnt/ml-team/open-images-v4/bounding-boxes/imageIds',
 '/mnt/ml-team/open-images-v4/bounding-boxes/metadata',
 '/mnt/ml-team/open-images-v4/bounding-boxes/validation',
 '/mnt/ml-team/open-images-v4/bounding-boxes/train',
 '/mnt/ml-team/open-images-v4/bounding-boxes/test',
 '/mnt/ml-team/open-images-v4/bounding-boxes/test_challenge_2018']

# boxes

In [22]:
glob.glob('{}/boxes/*'.format(DIRPATH))

['/mnt/ml-team/open-images-v4/bounding-boxes/boxes/train-annotations-bbox.csv',
 '/mnt/ml-team/open-images-v4/bounding-boxes/boxes/validation-annotations-bbox.csv',
 '/mnt/ml-team/open-images-v4/bounding-boxes/boxes/test-annotations-bbox.csv']

In [45]:
train_boxes = pd.read_csv(os.path.join(DIRPATH,'boxes','train-annotations-bbox.csv'))
train_boxes.head()

Unnamed: 0,ImageID,Source,LabelName,Confidence,XMin,XMax,YMin,YMax,IsOccluded,IsTruncated,IsGroupOf,IsDepiction,IsInside
0,000002b66c9c498e,xclick,/m/01g317,1,0.0125,0.195312,0.148438,0.5875,0,1,0,0,0
1,000002b66c9c498e,xclick,/m/01g317,1,0.025,0.276563,0.714063,0.948438,0,1,0,0,0
2,000002b66c9c498e,xclick,/m/01g317,1,0.151562,0.310937,0.198437,0.590625,1,0,0,0,0
3,000002b66c9c498e,xclick,/m/01g317,1,0.25625,0.429688,0.651563,0.925,1,0,0,0,0
4,000002b66c9c498e,xclick,/m/01g317,1,0.257812,0.346875,0.235938,0.385938,1,0,0,0,0


In [46]:
valid_boxes = pd.read_csv(os.path.join(DIRPATH,'boxes','validation-annotations-bbox.csv'))
valid_boxes.head()

Unnamed: 0,ImageID,Source,LabelName,Confidence,XMin,XMax,YMin,YMax,IsOccluded,IsTruncated,IsGroupOf,IsDepiction,IsInside
0,0001eeaf4aed83f9,freeform,/m/0cmf2,1,0.022464,0.964178,0.070656,0.800164,0,0,0,0,0
1,000595fe6fee6369,freeform,/m/02wbm,1,0.0,1.0,0.000233,1.0,0,0,1,0,0
2,000595fe6fee6369,freeform,/m/02xwb,1,0.14103,0.180277,0.676262,0.732455,0,0,0,0,0
3,000595fe6fee6369,freeform,/m/02xwb,1,0.213781,0.253028,0.298764,0.354956,1,0,0,0,0
4,000595fe6fee6369,freeform,/m/02xwb,1,0.232926,0.288447,0.488954,0.545146,1,0,0,0,0


## Source
Let's look at the data sources in train and valid

In [34]:
train_boxes['Source'].value_counts()

xclick       13050532
activemil     1559697
Name: Source, dtype: int64

In [35]:
valid_boxes['Source'].value_counts()

freeform    204621
Name: Source, dtype: int64

There are different sources used in train and valid, which could affect how the labels were collected

## Confidence

In [38]:
train_boxes['Confidence'].describe()

count    14610229.0
mean            1.0
std             0.0
min             1.0
25%             1.0
50%             1.0
75%             1.0
max             1.0
Name: Confidence, dtype: float64

In [39]:
valid_boxes['Confidence'].describe()

count    204621.0
mean          1.0
std           0.0
min           1.0
25%           1.0
50%           1.0
75%           1.0
max           1.0
Name: Confidence, dtype: float64

It seems that both in train and valid confidence on every bbox of every image is `1`.

Why have this flag then?

## LabelName

In [43]:
train_boxes['LabelName'].nunique(), valid_boxes['LabelName'].nunique(), 

(599, 559)

In [44]:
train_boxes['LabelName'].value_counts() 

/m/09j2d      1438128
/m/04yx4      1418594
/m/07j7r      1051344
/m/0dzct      1037710
/m/01g317     1034721
/m/03bt1vf     767337
/m/09j5n       744474
/m/0d4v4       503467
/m/0c9ph5      345296
/m/083wq       340639
/m/05s2s       267913
/m/0k4j        248075
/m/03q69       234057
/m/0dzf4       208982
/m/04hgtk      201633
/m/05r655      197155
/m/0cgh4       178634
/m/02p0tk3     175244
/m/04rky       156154
/m/03jm5       136152
/m/01mzpv      132483
/m/0h9mv       122615
/m/01xyhv      110848
/m/0463sg       91024
/m/02wbm        88422
/m/01bl7v       87555
/m/04bcr3       85691
/m/079cl        81261
/m/01prls       81108
/m/019jd        79113
               ...   
/m/025fsf          59
/m/0km7z           56
/m/0xfy            56
/m/027rl48         54
/m/02fh7f          53
/m/07mcwg          45
/m/02cvgx          43
/m/0f8s22          41
/m/03qjg           38
/m/0j496           36
/m/03wym           35
/m/03qhv5          35
/m/0_dqb           33
/m/04z4wx          31
/m/02xqq  

`559` labels that have from `4` to `1438128` examples in the train set. 

Sounds like a lot of fun :)
# image-labels

In [42]:
glob.glob('{}/image-labels/*'.format(DIRPATH))

['/mnt/ml-team/open-images-v4/bounding-boxes/image-labels/train-annotations-human-imagelabels-boxable.csv',
 '/mnt/ml-team/open-images-v4/bounding-boxes/image-labels/validation-annotations-human-imagelabels-boxable.csv',
 '/mnt/ml-team/open-images-v4/bounding-boxes/image-labels/test-annotations-human-imagelabels-boxable.csv']

In [47]:
train_image_labels = pd.read_csv(os.path.join(DIRPATH,'image-labels','train-annotations-human-imagelabels-boxable.csv'))
train_image_labels.head()

Unnamed: 0,ImageID,Source,LabelName,Confidence
0,000002b66c9c498e,verification,/m/014j1m,0
1,000002b66c9c498e,verification,/m/014sv8,1
2,000002b66c9c498e,verification,/m/01599,0
3,000002b66c9c498e,verification,/m/015p6,0
4,000002b66c9c498e,verification,/m/015x4r,0


In [48]:
valid_image_labels = pd.read_csv(os.path.join(DIRPATH,'image-labels','validation-annotations-human-imagelabels-boxable.csv'))
valid_image_labels.head()

Unnamed: 0,ImageID,Source,LabelName,Confidence
0,0001eeaf4aed83f9,verification,/m/0k5j,1
1,0001eeaf4aed83f9,verification,/m/07yv9,1
2,0001eeaf4aed83f9,verification,/m/0cmf2,1
3,0004886b7d043cfd,verification,/m/0463sg,0
4,0004886b7d043cfd,verification,/m/04hgtk,0


In [49]:
train_image_labels['Source'].value_counts()

verification                8659710
crowdsource-verification     337085
Name: Source, dtype: int64

In [50]:
valid_image_labels['Source'].value_counts()

verification                209880
crowdsource-verification      1900
Name: Source, dtype: int64

In [51]:
train_image_labels['Confidence'].value_counts()

1    6622219
0    2374576
Name: Confidence, dtype: int64

In [52]:
valid_image_labels['Confidence'].value_counts()

1    141785
0     69995
Name: Confidence, dtype: int64

Ok, this looks a bit more reasonable.

# imageIds

Just some info on the origin of data, authors, licence and stuff like that

In [61]:
glob.glob('{}/imageIds/*'.format(DIRPATH))

['/mnt/ml-team/open-images-v4/bounding-boxes/imageIds/train-images-boxable-with-rotation.csv',
 '/mnt/ml-team/open-images-v4/bounding-boxes/imageIds/validation-images-with-rotation.csv',
 '/mnt/ml-team/open-images-v4/bounding-boxes/imageIds/test-images-with-rotation.csv']

In [62]:
train_imageIds = pd.read_csv(os.path.join(DIRPATH,'imageIds','train-images-boxable-with-rotation.csv'))
train_imageIds.head()

Unnamed: 0,ImageID,Subset,OriginalURL,OriginalLandingURL,License,AuthorProfileURL,Author,Title,OriginalSize,OriginalMD5,Thumbnail300KURL,Rotation
0,4fa8054781a4c382,train,https://farm3.staticflickr.com/5310/5898076654...,https://www.flickr.com/photos/michael-beat/589...,https://creativecommons.org/licenses/by/2.0/,https://www.flickr.com/people/michael-beat/,Michael Beat,...die FNF-Kerze,4405052,KFukvivpCM5QXl5SqKe41g==,https://c1.staticflickr.com/6/5310/5898076654_...,0.0
1,b37f763ae67d0888,train,https://c1.staticflickr.com/1/67/197493648_628...,https://www.flickr.com/photos/drstarbuck/19749...,https://creativecommons.org/licenses/by/2.0/,https://www.flickr.com/people/drstarbuck/,Karen,Three boys on a hill,494555,9IzEn38GRNsVpATuv7gzEA==,https://c3.staticflickr.com/1/67/197493648_628...,0.0
2,7e8584b0f487cb9e,train,https://c7.staticflickr.com/8/7056/7143870979_...,https://www.flickr.com/photos/circasassy/71438...,https://creativecommons.org/licenses/by/2.0/,https://www.flickr.com/people/circasassy/,CircaSassy,A Christmas carol and The cricket on the heart...,2371584,3hQwu0iSzY1VIoXiwp0/Mg==,https://c7.staticflickr.com/8/7056/7143870979_...,0.0
3,86638230febe21c4,train,https://farm5.staticflickr.com/5128/5301868579...,https://www.flickr.com/photos/ajcreencia/53018...,https://creativecommons.org/licenses/by/2.0/,https://www.flickr.com/people/ajcreencia/,Alex,Abbey and Kenny,949267,onB+rCZnGQg5PRX7xOs18Q==,https://c4.staticflickr.com/6/5128/5301868579_...,
4,249086e72671397d,train,https://c6.staticflickr.com/4/3930/15342460029...,https://www.flickr.com/photos/codnewsroom/1534...,https://creativecommons.org/licenses/by/2.0/,https://www.flickr.com/people/codnewsroom/,COD Newsroom,Suburban Law Enforcement Academy 20th Annivers...,6541758,MjpaAVbMAWbCusSaxI1D7w==,https://c1.staticflickr.com/4/3930/15342460029...,0.0


In [63]:
valid_imageIds = pd.read_csv(os.path.join(DIRPATH,'imageIds','validation-images-with-rotation.csv'))
valid_imageIds.head()

Unnamed: 0,ImageID,Subset,OriginalURL,OriginalLandingURL,License,AuthorProfileURL,Author,Title,OriginalSize,OriginalMD5,Thumbnail300KURL,Rotation
0,fe600639ac5f36c1,validation,https://farm2.staticflickr.com/5612/1534025949...,https://www.flickr.com/photos/118815643@N04/15...,https://creativecommons.org/licenses/by/2.0/,https://www.flickr.com/people/118815643@N04/,LabHacker CD,_GUT6674,242145,0jBpbNION09+r02xkTIBcA==,https://c8.staticflickr.com/6/5612/15340259497...,
1,ba82c70cc6cdf449,validation,https://farm4.staticflickr.com/3859/1527670200...,https://www.flickr.com/photos/125612851@N04/15...,https://creativecommons.org/licenses/by/2.0/,https://www.flickr.com/people/125612851@N04/,Stockholms stadsarkiv,SE_SSA_0870_Ofört_029,1658910,qMzF2UrrpPdjO+Nhhl6pvQ==,https://c7.staticflickr.com/4/3859/15276702002...,0.0
2,e3ffa4c868b11b15,validation,https://farm6.staticflickr.com/5336/1717037798...,https://www.flickr.com/photos/101125222@N02/17...,https://creativecommons.org/licenses/by/2.0/,https://www.flickr.com/people/101125222@N02/,Ismaïl Taha,ct26,1085575,MXOLnofbdq410OH34woSeg==,https://c3.staticflickr.com/6/5336/17170377986...,0.0
3,7d00af2927a57eeb,validation,https://c4.staticflickr.com/7/6181/6085189538_...,https://www.flickr.com/photos/66833578@N02/608...,https://creativecommons.org/licenses/by/2.0/,https://www.flickr.com/people/66833578@N02/,Crycks,Hummingbird,126933,m0dy5a3Pjhbzx15MbScpxg==,https://c5.staticflickr.com/7/6181/6085189538_...,0.0
4,914dd6fb5eb17e85,validation,https://farm6.staticflickr.com/3324/4621899397...,https://www.flickr.com/photos/damo1977/4621899397,https://creativecommons.org/licenses/by/2.0/,https://www.flickr.com/people/damo1977/,damo1977,"Christmas 1994 Hoticulture, Nuts & Dried Fruit...",855508,xeHPBJGWTrDVbOufALMFGw==,https://c6.staticflickr.com/4/3324/4621899397_...,


# metadata

This is just label-code to label-name mapping

In [64]:
glob.glob('{}/metadata/*'.format(DIRPATH))

['/mnt/ml-team/open-images-v4/bounding-boxes/metadata/class-descriptions-boxable.csv']

In [66]:
metadata = pd.read_csv(os.path.join(DIRPATH,'metadata','class-descriptions-boxable.csv'))
metadata.head()

Unnamed: 0,/m/011k07,Tortoise
0,/m/011q46kg,Container
1,/m/012074,Magpie
2,/m/0120dh,Sea turtle
3,/m/01226z,Football
4,/m/012n7d,Ambulance


# train, validation, test, test_challenge

Those are just folders with images

In [69]:
glob.glob('{}/test_challenge_2018/*'.format(DIRPATH))[:10]

['/mnt/ml-team/open-images-v4/bounding-boxes/test_challenge_2018/0146dac8bdf414b1.jpg',
 '/mnt/ml-team/open-images-v4/bounding-boxes/test_challenge_2018/033d32130a350cf8.jpg',
 '/mnt/ml-team/open-images-v4/bounding-boxes/test_challenge_2018/0487e566f0f96052.jpg',
 '/mnt/ml-team/open-images-v4/bounding-boxes/test_challenge_2018/048a9324d4ec2101.jpg',
 '/mnt/ml-team/open-images-v4/bounding-boxes/test_challenge_2018/06880fcc7b44c18b.jpg',
 '/mnt/ml-team/open-images-v4/bounding-boxes/test_challenge_2018/0a023dad20897ae9.jpg',
 '/mnt/ml-team/open-images-v4/bounding-boxes/test_challenge_2018/0cf965850dc569d7.jpg',
 '/mnt/ml-team/open-images-v4/bounding-boxes/test_challenge_2018/0e39a30fea285028.jpg',
 '/mnt/ml-team/open-images-v4/bounding-boxes/test_challenge_2018/0f1a34412957c8dd.jpg',
 '/mnt/ml-team/open-images-v4/bounding-boxes/test_challenge_2018/110a23555ede8535.jpg']