**How to Query the Open Images Dataset (BigQuery)**

In [1]:
import bq_helper
from bq_helper import BigQueryHelper
# https://www.kaggle.com/sohier/introduction-to-the-bq-helper-package
open_images = bq_helper.BigQueryHelper(active_project="bigquery-public-data",
                                   dataset_name="open_images")

In [2]:
bq_assistant = BigQueryHelper("bigquery-public-data", "open_images")
bq_assistant.list_tables()

['annotations_bbox', 'dict', 'images', 'labels']

In [3]:
bq_assistant.head("images", num_rows=3)

Unnamed: 0,image_id,subset,original_url,original_landing_url,license,author_profile_url,author,title,original_size,original_md5,thumbnail_300k_url
0,248e87e437808ed3,train,https://farm4.staticflickr.com/7162/6707198107...,https://www.flickr.com/photos/spreadshirt/6707...,https://creativecommons.org/licenses/by/2.0/,https://www.flickr.com/people/spreadshirt/,Spreadshirt,Men’s Pique Polo (NA),513517,VYDCiO0fog76g7undQPcKA==,https://c8.staticflickr.com/8/7162/6707198107_...
1,248f1485f9cc476f,train,https://c8.staticflickr.com/4/3676/11012143656...,https://www.flickr.com/photos/pawel_pacholec/1...,https://creativecommons.org/licenses/by/2.0/,https://www.flickr.com/people/pawel_pacholec/,Pawel Pacholec,Ladybug on green leaf,3052937,hMoii6JGMfikAihKlmWZKA==,https://c2.staticflickr.com/4/3676/11012143656...
2,2490466ff9064c4b,train,https://c4.staticflickr.com/9/8157/7599640374_...,https://www.flickr.com/photos/biodivlibrary/75...,https://creativecommons.org/licenses/by/2.0/,https://www.flickr.com/people/biodivlibrary/,Biodiversity Heritage Library,n53_w1150,1454466,zson/PTZCYyXcEglDBd2dw==,https://c6.staticflickr.com/9/8157/7599640374_...


In [4]:
bq_assistant.table_schema("images")

[SchemaField('image_id', 'STRING', 'NULLABLE', None, ()),
 SchemaField('subset', 'STRING', 'NULLABLE', None, ()),
 SchemaField('original_url', 'STRING', 'NULLABLE', None, ()),
 SchemaField('original_landing_url', 'STRING', 'NULLABLE', None, ()),
 SchemaField('license', 'STRING', 'NULLABLE', None, ()),
 SchemaField('author_profile_url', 'STRING', 'NULLABLE', None, ()),
 SchemaField('author', 'STRING', 'NULLABLE', None, ()),
 SchemaField('title', 'STRING', 'NULLABLE', None, ()),
 SchemaField('original_size', 'INTEGER', 'NULLABLE', None, ()),
 SchemaField('original_md5', 'STRING', 'NULLABLE', None, ()),
 SchemaField('thumbnail_300k_url', 'STRING', 'NULLABLE', None, ())]

Which labels are in the dataset?

In [5]:
query1 = """SELECT
  *
FROM
  `bigquery-public-data.open_images.dict`
LIMIT
  10;
        """
response1 = open_images.query_to_pandas_safe(query1)
response1.head(10)

Unnamed: 0,label_name,label_display_name
0,/m/01www,CD
1,/m/0h989,Go
2,/m/03bx7vb,Ox
3,/m/01lgkm,RV
4,/m/012wxt,Vj
5,/m/05c2lbh,Ada
6,/m/0c5jxs,Ade
7,/m/0m09,Ale
8,/m/0_k2,Ant
9,/m/01hf_2,Ape


Which labels have "bus" in their display names?

In [6]:
query2 = """SELECT
  *
FROM
  `bigquery-public-data.open_images.dict`
WHERE
  label_display_name LIKE '%bus%'
LIMIT
  20;
        """
response2 = open_images.query_to_pandas_safe(query2)
response2.head(10)

Unnamed: 0,label_name,label_display_name
0,/m/0f454,Rubus
1,/m/015zfz,Airbus
2,/m/02539r,Sorbus
3,/m/0f3k6,Colobus
4,/m/045jsc,Minibus
5,/m/0c5q0q,Mi rebus
6,/m/05jlh5,Saltbush
7,/m/016_bh,Shadbush
8,/m/08hv3x,Kabusecha
9,/m/04lzj7,Bottlebush


How many images of a trolleybus are in the dataset?

In [7]:
query3 = """SELECT
  COUNT(*)
FROM
  `bigquery-public-data.open_images.labels` a
INNER JOIN
  `bigquery-public-data.open_images.images` b
ON
  a.image_id = b.image_id
WHERE
  a.label_name='/m/0f6pl'
  AND a.confidence > 0.5;
        """
response3 = open_images.query_to_pandas_safe(query3, max_gb_scanned=10)
response3.head(10)

Unnamed: 0,f0_
0,3595


What are some landing pages of images with a trolleybus?

In [8]:
query4 = """SELECT
  original_landing_url,
  confidence
FROM
  `bigquery-public-data.open_images.labels` l
INNER JOIN
  `bigquery-public-data.open_images.images` i
ON
  l.image_id = i.image_id
WHERE
  label_name='/m/0f6pl'
  AND confidence = 1
  AND subset='validation'
LIMIT
  10;
        """
response4 = open_images.query_to_pandas_safe(query4, max_gb_scanned=10)
response4.head(10)

Unnamed: 0,original_landing_url,confidence
0,https://www.flickr.com/photos/sergejf/8706867707,1.0


In [9]:
query5 = """SELECT
  original_landing_url,
  confidence
FROM
  `bigquery-public-data.open_images.labels` l
INNER JOIN
  `bigquery-public-data.open_images.images` i
ON
  l.image_id = i.image_id
WHERE
  label_name='/m/0f6pl'
  AND confidence = 1
  AND subset='validation'
LIMIT
  10;
        """
response5 = open_images.query_to_pandas_safe(query5, max_gb_scanned=10)
response5.head(10)

Unnamed: 0,original_landing_url,confidence
0,https://www.flickr.com/photos/sergejf/8706867707,1.0


Which images with cherries are in the training set?

In [10]:
query6 = """SELECT
  i.image_id AS image_id,
  original_url,
  confidence
FROM
  `bigquery-public-data.open_images.labels` l
INNER JOIN
  `bigquery-public-data.open_images.images` i
ON
  l.image_id = i.image_id
WHERE
  label_name='/m/0f8sw'
  AND confidence >= 0.85
  AND Subset='train'
LIMIT
  10;
        """
response6 = open_images.query_to_pandas_safe(query6, max_gb_scanned=10)
response6.head(10)

Unnamed: 0,image_id,original_url,confidence
0,11e30021dcffd987,https://farm8.staticflickr.com/2887/1103445203...,1.0
1,24ed494492b89654,https://farm5.staticflickr.com/3045/2639644807...,0.9
2,24ed494492b89654,https://farm5.staticflickr.com/3045/2639644807...,1.0
3,2571e3a90ce98756,https://farm7.staticflickr.com/6086/6053410905...,1.0
4,62a0a29805ad46c5,https://farm6.staticflickr.com/8220/8430826778...,1.0
5,62b28ea801205720,https://farm1.staticflickr.com/3344/3563052385...,1.0
6,62b28ea801205720,https://farm1.staticflickr.com/3344/3563052385...,0.9
7,b3a26e8e164b7026,https://farm7.staticflickr.com/2945/1529974103...,1.0
8,b3a26e8e164b7026,https://farm7.staticflickr.com/2945/1529974103...,0.9
9,d923fb3fdb415915,https://farm6.staticflickr.com/8352/8303394799...,0.9
