# Open Images Dataset

*This notebook builds a table of image URLs and their labels from the Open Images dataset, then downloads the images for a chosen set of labels.*

In [1]:
import os
import requests
import json
import pandas as pd

# Do not truncate cell contents when displaying frames, so long values such as
# image URLs are shown in full.
pd.set_option('display.max_colwidth', None)

## 1. Get image URLs

In [2]:
# Parse only the two columns this notebook uses: the train-image index has
# several million rows, so skipping the other columns saves time and memory.
images = (
    pd.read_csv(
        "https://storage.googleapis.com/openimages/v6/oidv6-train-images-with-labels-with-rotation.csv",
        usecols=["ImageID", "OriginalURL"],
    )
    # usecols keeps the file's column order; pin the order explicitly.
    .loc[:, ["ImageID", "OriginalURL"]]
    # Rename by name rather than by position.
    .rename(columns={"ImageID": "image_id", "OriginalURL": "image_url"})
)

images

Unnamed: 0,image_id,image_url
0,4fa8054781a4c382,https://farm3.staticflickr.com/5310/5898076654_51085e157c_o.jpg
1,d05c3e451f79174d,https://c6.staticflickr.com/8/7375/13791738583_ceb93ee070_o.jpg
2,5bde9ad21e82deed,https://c6.staticflickr.com/1/223/495658260_8947fc0be8_o.jpg
3,b37f763ae67d0888,https://c1.staticflickr.com/1/67/197493648_628a7cb2ee_o.jpg
4,3231e75bd7120b7a,https://c3.staticflickr.com/4/3654/3619297168_9d7ec37be3_o.jpg
...,...,...
7337072,319fbd7d6fad9993,https://farm4.staticflickr.com/8663/16074403674_becd15c9a4_o.jpg
7337073,318b6e4322650cf3,https://farm4.staticflickr.com/4080/4803266278_b98b6eefed_o.jpg
7337074,757bac4b7c064925,https://farm6.staticflickr.com/3835/15022727040_f6b680d8bc_o.jpg
7337075,8a23c798ea665d15,https://farm6.staticflickr.com/7180/6973157007_6d633ffef3_o.jpg


In [3]:
# Load the human image-level label annotations and give the columns short
# names (assigned by position, in the order they appear in the file).
labels = pd.read_csv(
    "https://storage.googleapis.com/openimages/v7/oidv7-train-annotations-human-imagelabels.csv"
).set_axis(["image_id", "source", "label_id", "confidence"], axis="columns")

# Keep only human verified labels (confidence == 1), and only the two id
# columns needed for the merges below.
labels = labels[labels["confidence"] == 1][["image_id", "label_id"]]

labels

Unnamed: 0,image_id,label_id
5,000002b66c9c498e,/m/011xdj
11,000002b66c9c498e,/m/012mj
15,000002b66c9c498e,/m/012yh1
27,000002b66c9c498e,/m/014sv8
31,000002b66c9c498e,/m/015c4z
...,...,...
58783029,fffffdaec951185d,/m/01g317
58783030,fffffdaec951185d,/m/01xyhv
58783031,fffffdaec951185d,/m/03bt1vf
58783032,fffffdaec951185d,/m/09j2d


In [4]:
# Lookup table from label id (e.g. "/m/0c4936") to a human-readable name.
# Column names are assigned by position.
label_names = pd.read_csv(
    "https://storage.googleapis.com/openimages/v7/oidv7-class-descriptions.csv"
).set_axis(["label_id", "label_name"], axis="columns")

label_names

Unnamed: 0,label_id,label_name
0,/m/0c4936,'Nduja
1,/m/06w6y06,10 cane
2,/m/079zcf,100 metres hurdles
3,/m/02pv3hz,100plus
4,/m/0bb154,110 metres hurdles
...,...,...
20926,/m/05_5x82,Škoda rapid
20927,/m/0d96wb,Škoda roomster
20928,/m/03vql3,Škoda superb
20929,/m/05_5ppm,Škoda yeti


### Merges

In [5]:
# Attach the readable name to every (image_id, label_id) pair.
labels = labels.merge(label_names, on="label_id", how="inner")

# Attach the labels to the image URLs and keep only the two columns needed
# for downloading. Rows are deliberately NOT de-duplicated on image_url:
# an image carrying several labels appears once per label.
images = images.merge(labels, on="image_id", how="inner")[["image_url", "label_name"]]

images

Unnamed: 0,image_url,label_name
0,https://farm3.staticflickr.com/5310/5898076654_51085e157c_o.jpg,Adult
1,https://farm3.staticflickr.com/5310/5898076654_51085e157c_o.jpg,Lighting
2,https://farm3.staticflickr.com/5310/5898076654_51085e157c_o.jpg,Person
3,https://farm3.staticflickr.com/5310/5898076654_51085e157c_o.jpg,Event
4,https://farm3.staticflickr.com/5310/5898076654_51085e157c_o.jpg,Temple (Body Part)
...,...,...
21144170,https://farm6.staticflickr.com/7180/6973157007_6d633ffef3_o.jpg,Dance
21144171,https://c7.staticflickr.com/1/23/27246359_825538fa3e_o.jpg,Sky
21144172,https://c7.staticflickr.com/1/23/27246359_825538fa3e_o.jpg,Boat
21144173,https://c7.staticflickr.com/1/23/27246359_825538fa3e_o.jpg,Lake


In [6]:
# Save the (image_url, label_name) table to disk, without the index column.
images.to_csv("open_images.csv", index=False)

## 2. Download images

In [7]:
# Load the (image_url, label_name) table from disk. The resulting module-level
# `images` frame is what download_images_with_label filters on.
images = pd.read_csv("open_images.csv")

In [8]:
def download_image(image_url, image_name, destination):
    """Download a single image and save it as ``image_name`` in ``destination``.

    Parameters
    ----------
    image_url : str
        URL of the image to fetch.
    image_name : str
        File name to write inside ``destination``.
    destination : str
        Target directory; a trailing path separator is optional.

    Raises
    ------
    requests.RequestException
        On timeout, connection failure, or a 4xx/5xx HTTP status.
    OSError
        If the target file cannot be written.
    """
    response = requests.get(image_url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=3)
    # Fail on HTTP errors instead of saving the error page under a .jpg name
    # (which would also be counted as a successful download by the caller).
    response.raise_for_status()
    with open(os.path.join(destination, image_name), 'wb') as f:
        f.write(response.content)

In [9]:
def download_images_with_label(label, limit, destination, random_state=None):
    """Download up to ``limit`` images carrying ``label`` into ``destination``.

    Reads the module-level ``images`` DataFrame (columns ``image_url`` and
    ``label_name``). If more than ``limit`` rows match, a random sample of
    ``limit`` rows is taken. Downloads are best-effort: a failing URL is
    skipped and does not consume a file index, so saved files are numbered
    ``{label}_0.jpg``, ``{label}_1.jpg``, ... without gaps.

    Parameters
    ----------
    label : str
        Value of ``label_name`` to select; also used as the file-name prefix.
    limit : int
        Maximum number of download attempts.
    destination : str
        Directory the files are written to.
    random_state : optional
        Forwarded to ``DataFrame.sample`` to make the sample reproducible.
        The default ``None`` keeps the previous unseeded behaviour.
    """
    selected_images = images[images["label_name"] == label]
    print(f"- {label}: {len(selected_images)} images found", end=" - ")
    if len(selected_images) > limit:
        selected_images = selected_images.sample(limit, random_state=random_state)
    image_idx = 0
    for image_url in selected_images["image_url"]:
        try:
            download_image(image_url, f"{label}_{image_idx}.jpg", destination)
        except Exception:
            # Best-effort: skip this URL. Catching Exception rather than using
            # a bare `except` lets KeyboardInterrupt stop the long-running loop.
            continue
        image_idx += 1
    print(f"{image_idx} images downloaded.")

In [10]:
# Mapping of class name -> list of Open Images label names to download for it.
# The encoding is explicit because label names can contain non-ASCII characters
# and the platform default encoding is not always UTF-8.
with open("keywords/open_images_labels.json", encoding="utf-8") as f:
    keywords = json.load(f)

print(f"Number of classes: {len(keywords)}")
print(f"Number of search keywords: {sum(len(category) for category in keywords.values())}")

Number of classes: 30
Number of search keywords: 92


In [11]:
# Very long! Up to 2000 download attempts for every keyword of every class.
for category in keywords:

    print(f"\n¤ ¤ ¤ ¤ ¤ Class {category} ¤ ¤ ¤ ¤ ¤")

    # One folder per class; exist_ok makes the cell safe to re-run and avoids
    # the check-then-create race of a separate os.path.exists() test.
    category_path = f"../../data/open_images/{category}/"
    os.makedirs(category_path, exist_ok=True)

    for keyword in keywords[category]:
        download_images_with_label(
            label=keyword,
            limit=2000,
            destination=category_path
        )


¤ ¤ ¤ ¤ ¤ Class art ¤ ¤ ¤ ¤ ¤
- Art exhibition: 469 images found - 464 images downloaded.
- Paint tools: 134 images found - 132 images downloaded.

¤ ¤ ¤ ¤ ¤ Class barbecue ¤ ¤ ¤ ¤ ¤
- Barbecue: 612 images found - 607 images downloaded.
- Barbecue grill: 534 images found - 528 images downloaded.

¤ ¤ ¤ ¤ ¤ Class bathroom ¤ ¤ ¤ ¤ ¤
- Bathroom: 14598 images found - 1990 images downloaded.
- Bathroom accessory: 878 images found - 872 images downloaded.
- Bathroom sink: 465 images found - 457 images downloaded.
- Shower: 2482 images found - 1986 images downloaded.

¤ ¤ ¤ ¤ ¤ Class beach ¤ ¤ ¤ ¤ ¤
- Beach: 241473 images found - 1987 images downloaded.
- Beach towel: 109 images found - 109 images downloaded.

¤ ¤ ¤ ¤ ¤ Class bedroom ¤ ¤ ¤ ¤ ¤
- Bed: 4417 images found - 1986 images downloaded.
- Bedroom: 691 images found - 688 images downloaded.
- Sleep: 538 images found - 537 images downloaded.

¤ ¤ ¤ ¤ ¤ Class beer ¤ ¤ ¤ ¤ ¤
- Beer: 7917 images found - 1990 images downloaded.
- Beer glass: