# Open Images Dataset

*This notebook builds a table of image URLs and labels from the Open Images dataset, then downloads the images that match a chosen set of labels.*

In [1]:
import os
import requests
from io import BytesIO
import json
from tqdm import tqdm
import pandas as pd
from PIL import Image

# Disable column-width truncation in DataFrame displays so long image URLs are shown in full.
pd.set_option('display.max_colwidth', None)

## 1. Get image URLs

In [2]:
# Load the Open Images V6 training image index. Only the two columns used
# below are parsed, so any other column of this multi-million-row CSV is
# never held in memory.
images = pd.read_csv(
    "https://storage.googleapis.com/openimages/v6/oidv6-train-images-with-labels-with-rotation.csv",
    usecols=["ImageID", "OriginalURL"],
)

# `usecols` keeps the file's own column order, so fix the order explicitly
# before assigning the snake_case names (positional assignment).
images = images[["ImageID", "OriginalURL"]]
images.columns = ["image_id", "image_url"]

images

Unnamed: 0,image_id,image_url
0,4fa8054781a4c382,https://farm3.staticflickr.com/5310/5898076654_51085e157c_o.jpg
1,d05c3e451f79174d,https://c6.staticflickr.com/8/7375/13791738583_ceb93ee070_o.jpg
2,5bde9ad21e82deed,https://c6.staticflickr.com/1/223/495658260_8947fc0be8_o.jpg
3,b37f763ae67d0888,https://c1.staticflickr.com/1/67/197493648_628a7cb2ee_o.jpg
4,3231e75bd7120b7a,https://c3.staticflickr.com/4/3654/3619297168_9d7ec37be3_o.jpg
...,...,...
7337072,319fbd7d6fad9993,https://farm4.staticflickr.com/8663/16074403674_becd15c9a4_o.jpg
7337073,318b6e4322650cf3,https://farm4.staticflickr.com/4080/4803266278_b98b6eefed_o.jpg
7337074,757bac4b7c064925,https://farm6.staticflickr.com/3835/15022727040_f6b680d8bc_o.jpg
7337075,8a23c798ea665d15,https://farm6.staticflickr.com/7180/6973157007_6d633ffef3_o.jpg


In [3]:
# Image-level labels of the Open Images V7 training split, one row per
# (image, label) annotation. Columns are renamed positionally.
# The row filter keeps only annotations with confidence == 1, which the
# notebook treats as the human-verified labels.
# NOTE(review): confidence 0 rows are presumably labels that annotators
# rejected — confirm against the Open Images documentation.
labels = (
    pd.read_csv("https://storage.googleapis.com/openimages/v7/oidv7-train-annotations-human-imagelabels.csv")
    .set_axis(["image_id", "source", "label_id", "confidence"], axis=1)
    .loc[
        lambda frame: frame["confidence"] == 1,
        ["image_id", "label_id"],
    ]
)

labels

Unnamed: 0,image_id,label_id
5,000002b66c9c498e,/m/011xdj
11,000002b66c9c498e,/m/012mj
15,000002b66c9c498e,/m/012yh1
27,000002b66c9c498e,/m/014sv8
31,000002b66c9c498e,/m/015c4z
...,...,...
58783029,fffffdaec951185d,/m/01g317
58783030,fffffdaec951185d,/m/01xyhv
58783031,fffffdaec951185d,/m/03bt1vf
58783032,fffffdaec951185d,/m/09j2d


In [4]:
# Lookup table giving the readable name of each label id
# (the two columns are renamed positionally).
label_names = pd.read_csv(
    "https://storage.googleapis.com/openimages/v7/oidv7-class-descriptions.csv"
).set_axis(["label_id", "label_name"], axis=1)

label_names

Unnamed: 0,label_id,label_name
0,/m/0c4936,'Nduja
1,/m/06w6y06,10 cane
2,/m/079zcf,100 metres hurdles
3,/m/02pv3hz,100plus
4,/m/0bb154,110 metres hurdles
...,...,...
20926,/m/05_5x82,Škoda rapid
20927,/m/0d96wb,Škoda roomster
20928,/m/03vql3,Škoda superb
20929,/m/05_5ppm,Škoda yeti


### Merges

In [5]:
# Attach the readable label name to every (image_id, label_id) annotation.
labels = labels.merge(label_names, how="inner", on="label_id")

# Attach the annotations to the image URLs and keep only the two columns
# used afterwards: one row per (image URL, label name) pair.
# De-duplicating on image_url is left disabled here: it would keep a single
# label per image.
images = images.merge(labels, how="inner", on="image_id")[["image_url", "label_name"]]

images

Unnamed: 0,image_url,label_name
0,https://farm3.staticflickr.com/5310/5898076654_51085e157c_o.jpg,Adult
1,https://farm3.staticflickr.com/5310/5898076654_51085e157c_o.jpg,Lighting
2,https://farm3.staticflickr.com/5310/5898076654_51085e157c_o.jpg,Person
3,https://farm3.staticflickr.com/5310/5898076654_51085e157c_o.jpg,Event
4,https://farm3.staticflickr.com/5310/5898076654_51085e157c_o.jpg,Temple (Body Part)
...,...,...
21144170,https://farm6.staticflickr.com/7180/6973157007_6d633ffef3_o.jpg,Dance
21144171,https://c7.staticflickr.com/1/23/27246359_825538fa3e_o.jpg,Sky
21144172,https://c7.staticflickr.com/1/23/27246359_825538fa3e_o.jpg,Boat
21144173,https://c7.staticflickr.com/1/23/27246359_825538fa3e_o.jpg,Lake


In [6]:
# Persist the (image_url, label_name) table to disk, without the row index,
# so the download section can reload it from this file.
images.to_csv("open_images.csv", index=False)

## 2. Download images

In [7]:
# Reload the (image_url, label_name) table from the CSV written in section 1.
images = pd.read_csv("open_images.csv")

In [8]:
def download_image(image_url, image_name, destination, size=(500, 500), timeout=30):
    """Download one image, resize it and save it inside ``destination``.

    Args:
        image_url: URL of the image to fetch.
        image_name: File name to save as; Pillow infers the output format
            from its extension.
        destination: Directory the file is written to (it must already exist).
        size: ``(width, height)`` the image is resized to. The aspect ratio
            is not preserved.
        timeout: Seconds to wait for the server, so that a stalled
            connection cannot hang a long batch run.

    Raises:
        requests.RequestException: On network failure, timeout, or any
            response whose status is not 200.
        OSError: If the payload is not a readable image or cannot be saved.
    """
    response = requests.get(image_url, timeout=timeout)
    # Fail loudly instead of returning silently: callers that count
    # successful downloads must not count a response that saved nothing.
    if response.status_code != 200:
        raise requests.HTTPError(
            f"Unexpected HTTP status {response.status_code} for {image_url}",
            response=response,
        )

    with Image.open(BytesIO(response.content)) as image:
        resized_image = image.resize(size)

    # JPEG cannot store alpha or palette data; Pillow raises when asked to
    # write such modes, so convert them to RGB first. Modes that JPEG can
    # already store are left untouched.
    jpeg_writable_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")
    saving_as_jpeg = image_name.lower().endswith((".jpg", ".jpeg"))
    if saving_as_jpeg and resized_image.mode not in jpeg_writable_modes:
        resized_image = resized_image.convert("RGB")

    # os.path.join works whether or not `destination` ends with a separator.
    resized_image.save(os.path.join(destination, image_name))

In [9]:
def download_images_with_label(label, limit, destination, random_state=None):
    """Download up to ``limit`` images carrying ``label`` into ``destination``.

    Reads the notebook-level ``images`` DataFrame (columns ``image_url`` and
    ``label_name``). Files are named ``<label>_<n>.jpg``, where ``<label>`` is
    the lower-cased label with spaces and hyphens replaced by underscores and
    ``<n>`` counts the successfully saved images from 0. Progress and the
    final count are printed.

    Args:
        label: Label name to select; matched exactly against ``label_name``.
        limit: Maximum number of images to attempt. When more rows match,
            a random sample of this size is drawn.
        destination: Directory passed on to ``download_image``.
        random_state: Optional seed forwarded to ``DataFrame.sample`` to make
            the random selection reproducible. ``None`` keeps the sampling
            unseeded.
    """
    selected_images = images[images["label_name"] == label]
    print(f"- {label}: {len(selected_images)} images found", end=" - ")
    if len(selected_images) > limit:
        selected_images = selected_images.sample(limit, random_state=random_state)
    label_name = label.lower().replace(" ", "_").replace("-", "_")
    image_idx = 0
    for image_url in tqdm(selected_images["image_url"]):
        try:
            download_image(image_url, f"{label_name}_{image_idx}.jpg", destination)
        except Exception:
            # Best effort: skip images that cannot be fetched, decoded or
            # saved. Catching Exception (not a bare except) lets
            # KeyboardInterrupt stop this long-running loop.
            continue
        # Only advance the index after a success, so file numbers stay
        # contiguous and the final count reflects saved images.
        image_idx += 1
    print(f"{image_idx} images downloaded.")

In [10]:
# Load the mapping of class name -> Open Images label names to download.
# The encoding is set explicitly: JSON files are UTF-8, whereas open()
# otherwise falls back to a platform-dependent default that can misread
# non-ASCII label names.
with open("keywords/open_images_labels.json", encoding="utf-8") as f:
    keywords = json.load(f)

print(f"Number of classes: {len(keywords)}")
print(f"Number of search keywords: {len([search for category in keywords.values() for search in category])}")

Number of classes: 30
Number of search keywords: 91


In [11]:
# Long-running cell: every selected image is fetched over the network,
# one after the other, for each label of each class.
for category in keywords:

    print(f"\n¤ ¤ ¤ ¤ ¤ Class {category} ¤ ¤ ¤ ¤ ¤")

    # One output folder per class. exist_ok=True makes re-runs safe without
    # a separate existence check.
    category_path = f"../../data/open_images/{category}/"
    os.makedirs(category_path, exist_ok=True)

    # Download at most 2000 images for each label listed under this class.
    for keyword in keywords[category]:
        download_images_with_label(
            label=keyword,
            limit=2000,
            destination=category_path
        )


¤ ¤ ¤ ¤ ¤ Class art ¤ ¤ ¤ ¤ ¤
- Art exhibition: 469 images found - 

100%|██████████| 469/469 [01:38<00:00,  4.78it/s]


469 images downloaded.
- Paint tools: 134 images found - 

100%|██████████| 134/134 [00:26<00:00,  5.00it/s]


134 images downloaded.

¤ ¤ ¤ ¤ ¤ Class barbecue ¤ ¤ ¤ ¤ ¤
- Barbecue: 612 images found - 

100%|██████████| 612/612 [02:01<00:00,  5.05it/s]


610 images downloaded.
- Barbecue grill: 534 images found - 

100%|██████████| 534/534 [06:32<00:00,  1.36it/s]


532 images downloaded.

¤ ¤ ¤ ¤ ¤ Class bathroom ¤ ¤ ¤ ¤ ¤
- Bathroom: 14598 images found - 

100%|██████████| 2000/2000 [20:48<00:00,  1.60it/s] 


1998 images downloaded.
- Bathroom accessory: 878 images found - 

100%|██████████| 878/878 [02:33<00:00,  5.71it/s]


874 images downloaded.
- Bathroom sink: 465 images found - 

100%|██████████| 465/465 [01:24<00:00,  5.50it/s]


465 images downloaded.
- Shower: 2482 images found - 

100%|██████████| 2000/2000 [13:37<00:00,  2.45it/s]


1993 images downloaded.

¤ ¤ ¤ ¤ ¤ Class beach ¤ ¤ ¤ ¤ ¤
- Beach: 241473 images found - 

100%|██████████| 2000/2000 [24:26<00:00,  1.36it/s]


1996 images downloaded.
- Beach towel: 109 images found - 

100%|██████████| 109/109 [00:21<00:00,  5.19it/s]


109 images downloaded.

¤ ¤ ¤ ¤ ¤ Class bedroom ¤ ¤ ¤ ¤ ¤
- Bed: 4417 images found - 

100%|██████████| 2000/2000 [11:20<00:00,  2.94it/s]


1997 images downloaded.
- Bedroom: 691 images found - 

100%|██████████| 691/691 [02:08<00:00,  5.36it/s]


690 images downloaded.
- Sleep: 538 images found - 

100%|██████████| 538/538 [01:32<00:00,  5.81it/s]


537 images downloaded.

¤ ¤ ¤ ¤ ¤ Class beer ¤ ¤ ¤ ¤ ¤
- Beer: 7917 images found - 

100%|██████████| 2000/2000 [16:34<00:00,  2.01it/s]


1997 images downloaded.
- Beer glass: 791 images found - 

100%|██████████| 791/791 [02:26<00:00,  5.39it/s]


790 images downloaded.
- Beer bottle: 625 images found - 

100%|██████████| 625/625 [02:01<00:00,  5.16it/s]


624 images downloaded.

¤ ¤ ¤ ¤ ¤ Class board_games ¤ ¤ ¤ ¤ ¤
- Board game: 837 images found - 

100%|██████████| 837/837 [03:21<00:00,  4.16it/s]


836 images downloaded.
- Card game: 407 images found - 

100%|██████████| 407/407 [01:32<00:00,  4.40it/s]


406 images downloaded.

¤ ¤ ¤ ¤ ¤ Class cleaning ¤ ¤ ¤ ¤ ¤
- Cleanliness: 766 images found - 

100%|██████████| 766/766 [03:34<00:00,  3.57it/s]


765 images downloaded.
- Household cleaning supply: 205 images found - 

100%|██████████| 205/205 [01:54<00:00,  1.79it/s]


205 images downloaded.
- Vacuum cleaner: 176 images found - 

100%|██████████| 176/176 [04:58<00:00,  1.70s/it]


176 images downloaded.
- Cleaner: 160 images found - 

100%|██████████| 160/160 [03:04<00:00,  1.16s/it]


160 images downloaded.

¤ ¤ ¤ ¤ ¤ Class coffee ¤ ¤ ¤ ¤ ¤
- Coffee cup: 4355 images found - 

100%|██████████| 2000/2000 [19:58<00:00,  1.67it/s] 


2000 images downloaded.
- Coffee (drink): 2394 images found - 

100%|██████████| 2000/2000 [20:32<00:00,  1.62it/s]  


1999 images downloaded.
- Tea: 1341 images found - 

100%|██████████| 1341/1341 [04:57<00:00,  4.51it/s]


1338 images downloaded.
- Drip coffee maker: 134 images found - 

100%|██████████| 134/134 [00:27<00:00,  4.84it/s]


134 images downloaded.

¤ ¤ ¤ ¤ ¤ Class cooking ¤ ¤ ¤ ¤ ¤
- Cooking: 966 images found - 

100%|██████████| 966/966 [03:30<00:00,  4.59it/s]


960 images downloaded.
- Cookware and bakeware: 614 images found - 

100%|██████████| 614/614 [02:11<00:00,  4.68it/s]


612 images downloaded.
- Cook: 556 images found - 

100%|██████████| 556/556 [02:29<00:00,  3.71it/s]


556 images downloaded.
- Kitchen appliance: 2338 images found - 

100%|██████████| 2000/2000 [09:35<00:00,  3.47it/s]  


1999 images downloaded.
- Kitchen utensil: 1680 images found - 

100%|██████████| 1680/1680 [15:53<00:00,  1.76it/s] 


1675 images downloaded.
- Kitchen: 1311 images found - 

100%|██████████| 1311/1311 [05:12<00:00,  4.20it/s]


1310 images downloaded.
- Kitchen stove: 585 images found - 

100%|██████████| 585/585 [02:23<00:00,  4.08it/s]


585 images downloaded.
- Kitchenware: 202 images found - 

100%|██████████| 202/202 [00:53<00:00,  3.77it/s]


202 images downloaded.
- Refrigerator: 1213 images found - 

100%|██████████| 1213/1213 [04:40<00:00,  4.33it/s]


1211 images downloaded.

¤ ¤ ¤ ¤ ¤ Class flight ¤ ¤ ¤ ¤ ¤
- Airplane: 2383 images found - 

100%|██████████| 2000/2000 [13:14<00:00,  2.52it/s]


1999 images downloaded.
- Airport: 721 images found - 

100%|██████████| 721/721 [02:36<00:00,  4.61it/s]


720 images downloaded.

¤ ¤ ¤ ¤ ¤ Class france ¤ ¤ ¤ ¤ ¤
- Bread: 2561 images found - 

100%|██████████| 2000/2000 [10:31<00:00,  3.17it/s]


1996 images downloaded.
- Bakery: 608 images found - 

100%|██████████| 608/608 [01:59<00:00,  5.08it/s]


606 images downloaded.

¤ ¤ ¤ ¤ ¤ Class garden ¤ ¤ ¤ ¤ ¤
- Garden: 918 images found - 

100%|██████████| 918/918 [03:48<00:00,  4.02it/s]


917 images downloaded.
- Botanical garden: 564 images found - 

100%|██████████| 564/564 [02:44<00:00,  3.43it/s]


563 images downloaded.
- Gardening: 321 images found - 

100%|██████████| 321/321 [01:20<00:00,  4.00it/s]


321 images downloaded.
- Gardener: 281 images found - 

100%|██████████| 281/281 [01:18<00:00,  3.59it/s]


281 images downloaded.

¤ ¤ ¤ ¤ ¤ Class gym ¤ ¤ ¤ ¤ ¤
- Gym: 487 images found - 

100%|██████████| 487/487 [04:04<00:00,  1.99it/s]


484 images downloaded.
- Strength training: 551 images found - 

100%|██████████| 551/551 [02:16<00:00,  4.03it/s]


549 images downloaded.
- Weight training: 268 images found - 

100%|██████████| 268/268 [00:58<00:00,  4.61it/s]


267 images downloaded.
- Bodybuilding: 392 images found - 

100%|██████████| 392/392 [01:23<00:00,  4.70it/s]


392 images downloaded.
- Weights: 418 images found - 

100%|██████████| 418/418 [01:37<00:00,  4.29it/s]


418 images downloaded.

¤ ¤ ¤ ¤ ¤ Class hot ¤ ¤ ¤ ¤ ¤
- Sex toy: 23 images found - 

100%|██████████| 23/23 [00:05<00:00,  4.27it/s]


23 images downloaded.

¤ ¤ ¤ ¤ ¤ Class italy ¤ ¤ ¤ ¤ ¤
- Pizza: 1991 images found - 

100%|██████████| 1991/1991 [07:16<00:00,  4.56it/s]


1991 images downloaded.
- Pasta: 1051 images found - 

100%|██████████| 1051/1051 [03:55<00:00,  4.47it/s]


1050 images downloaded.

¤ ¤ ¤ ¤ ¤ Class japan ¤ ¤ ¤ ¤ ¤
- Sushi: 1185 images found - 

100%|██████████| 1185/1185 [04:23<00:00,  4.50it/s]


1184 images downloaded.
- Japanese architecture: 582 images found - 

100%|██████████| 582/582 [03:00<00:00,  3.22it/s]


582 images downloaded.

¤ ¤ ¤ ¤ ¤ Class metro ¤ ¤ ¤ ¤ ¤
- Metro: 470 images found - 

100%|██████████| 470/470 [05:27<00:00,  1.43it/s]


470 images downloaded.
- Metro station: 417 images found - 

100%|██████████| 417/417 [03:17<00:00,  2.11it/s]


416 images downloaded.
- Bus: 44458 images found - 

100%|██████████| 2000/2000 [25:50<00:00,  1.29it/s]


1998 images downloaded.

¤ ¤ ¤ ¤ ¤ Class movie ¤ ¤ ¤ ¤ ¤
- Movie: 325 images found - 

100%|██████████| 325/325 [00:50<00:00,  6.38it/s]


324 images downloaded.
- Movie theater: 137 images found - 

100%|██████████| 137/137 [00:30<00:00,  4.45it/s]


137 images downloaded.

¤ ¤ ¤ ¤ ¤ Class nature ¤ ¤ ¤ ¤ ¤
- Nature: 1635 images found - 

100%|██████████| 1635/1635 [08:36<00:00,  3.17it/s]


1632 images downloaded.
- Hiking: 64296 images found - 

100%|██████████| 2000/2000 [30:48<00:00,  1.08it/s]


2000 images downloaded.
- Hiking equipment: 3256 images found - 

100%|██████████| 2000/2000 [15:31<00:00,  2.15it/s]


1996 images downloaded.
- Forest: 3034 images found - 

100%|██████████| 2000/2000 [16:39<00:00,  2.00it/s]


1996 images downloaded.

¤ ¤ ¤ ¤ ¤ Class party ¤ ¤ ¤ ¤ ¤
- Party: 598 images found - 

100%|██████████| 598/598 [02:03<00:00,  4.83it/s]


596 images downloaded.
- Alcoholic beverage: 1147 images found - 

100%|██████████| 1147/1147 [03:41<00:00,  5.18it/s]


1144 images downloaded.
- Alcohol: 843 images found - 

100%|██████████| 843/843 [02:31<00:00,  5.56it/s]


838 images downloaded.
- Bar: 766 images found - 

100%|██████████| 766/766 [02:39<00:00,  4.81it/s]


763 images downloaded.

¤ ¤ ¤ ¤ ¤ Class piano ¤ ¤ ¤ ¤ ¤
- Piano: 12320 images found - 

100%|██████████| 2000/2000 [21:25<00:00,  1.56it/s]


1996 images downloaded.
- Sheet music: 357 images found - 

100%|██████████| 357/357 [01:16<00:00,  4.67it/s]


356 images downloaded.

¤ ¤ ¤ ¤ ¤ Class pool ¤ ¤ ¤ ¤ ¤
- Swimming pool: 5122 images found - 

100%|██████████| 2000/2000 [17:02<00:00,  1.96it/s]


1994 images downloaded.
- Swimming: 33876 images found - 

100%|██████████| 2000/2000 [22:11<00:00,  1.50it/s]


1996 images downloaded.
- Swimwear: 3773 images found - 

100%|██████████| 2000/2000 [12:57<00:00,  2.57it/s]


1996 images downloaded.

¤ ¤ ¤ ¤ ¤ Class rainy ¤ ¤ ¤ ¤ ¤
- Rain: 520 images found - 

100%|██████████| 520/520 [01:49<00:00,  4.73it/s]


519 images downloaded.

¤ ¤ ¤ ¤ ¤ Class reading ¤ ¤ ¤ ¤ ¤
- Book: 74809 images found - 

100%|██████████| 2000/2000 [24:42<00:00,  1.35it/s]


1997 images downloaded.
- E-book readers: 337 images found - 

100%|██████████| 337/337 [01:05<00:00,  5.14it/s]


337 images downloaded.
- Library: 539 images found - 

100%|██████████| 539/539 [02:12<00:00,  4.08it/s]


538 images downloaded.
- Magazine: 620 images found - 

100%|██████████| 620/620 [02:11<00:00,  4.70it/s]


620 images downloaded.

¤ ¤ ¤ ¤ ¤ Class running ¤ ¤ ¤ ¤ ¤
- Long-distance running: 1286 images found - 

100%|██████████| 1286/1286 [07:47<00:00,  2.75it/s]


1285 images downloaded.
- Running: 1150 images found - 

100%|██████████| 1150/1150 [05:14<00:00,  3.66it/s]


1147 images downloaded.
- Jogging: 351 images found - 

100%|██████████| 351/351 [01:29<00:00,  3.91it/s]


351 images downloaded.
- Running shoe: 518 images found - 

100%|██████████| 518/518 [01:48<00:00,  4.77it/s]


518 images downloaded.

¤ ¤ ¤ ¤ ¤ Class sport ¤ ¤ ¤ ¤ ¤
- Sports equipment: 41369 images found - 

100%|██████████| 2000/2000 [23:31<00:00,  1.42it/s]


1998 images downloaded.
- Sports: 1001 images found - 

100%|██████████| 1001/1001 [04:02<00:00,  4.12it/s]


1000 images downloaded.

¤ ¤ ¤ ¤ ¤ Class train ¤ ¤ ¤ ¤ ¤
- Train: 53673 images found - 

100%|██████████| 2000/2000 [24:08<00:00,  1.38it/s]


1994 images downloaded.
- Train station: 669 images found - 

100%|██████████| 669/669 [02:31<00:00,  4.41it/s]


668 images downloaded.

¤ ¤ ¤ ¤ ¤ Class traveling ¤ ¤ ¤ ¤ ¤
- Travel: 1261 images found - 

100%|██████████| 1261/1261 [04:21<00:00,  4.83it/s]


1259 images downloaded.
- Luggage and bags: 2689 images found - 

100%|██████████| 2000/2000 [11:36<00:00,  2.87it/s]


1997 images downloaded.

¤ ¤ ¤ ¤ ¤ Class working ¤ ¤ ¤ ¤ ¤
- Job: 1163 images found - 

100%|██████████| 1163/1163 [05:28<00:00,  3.54it/s]


1161 images downloaded.
- Office supplies: 2570 images found - 

100%|██████████| 2000/2000 [10:54<00:00,  3.06it/s]


1996 images downloaded.
- Office equipment: 1070 images found - 

100%|██████████| 1070/1070 [03:30<00:00,  5.08it/s]


1068 images downloaded.
- White-collar worker: 883 images found - 

100%|██████████| 883/883 [03:15<00:00,  4.53it/s]


882 images downloaded.
- Computer: 119034 images found - 

100%|██████████| 2000/2000 [23:42<00:00,  1.41it/s]

1998 images downloaded.



