# Unsplash Dataset Builder

## 1. Setup

In [None]:
#@markdown 📦 install dependencies.

!pip -q install datasets pillow requests tqdm huggingface_hub

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [1]:
#@markdown 🧩 import dependencies.

from dataclasses import dataclass
import json
from typing import Iterable
from urllib.parse import quote

from datasets import Dataset, load_dataset
from huggingface_hub import notebook_login
from PIL import Image
import requests
from tqdm.notebook import tqdm

## 2. Dataset

In [18]:
#@markdown 📚 create the dataset.

# Search endpoint template: first placeholder is the page number, second is
# the URL-quoted query string.
UNSPLASH_SEARCH_URL = 'https://unsplash.com/napi/search/photos?page={}&per_page=10&plus=none&query={}&xp=semantic-search%3Aexperiment'

# Seconds to wait for a single search request before treating it as failed.
REQUEST_TIMEOUT = 30

# Give up on a query after this many failed/unparseable pages in a row, so a
# blocked or rate-limited endpoint cannot keep the scraping loop running
# forever.
MAX_CONSECUTIVE_FAILURES = 3

queries = "portrait, flash photography, people" # @param {type:"string"}
queries = queries.split(',')
queries = [query.strip() for query in queries]

limits = "10, 10, 10" # @param {type:"string"}
limits = limits.split(',')
limits = [int(limit.strip()) for limit in limits]

# Queries and limits are paired positionally; zip() drops the unpaired tail,
# so make that visible instead of silently skipping entries.
if len(queries) != len(limits):
    print(
        f'🙊 warning: got {len(queries)} queries but {len(limits)} limits, '
        'unpaired entries are ignored...'
    )

query_total = min(len(queries), len(limits))

# Column-oriented storage consumed by Dataset.from_dict below.
dataset_dictionary = {'url': [], 'text': []}
# URLs already collected (across all queries), used to skip duplicates.
result_urls = {}

for i, (query, limit) in enumerate(zip(queries, limits), start=1):
    page = 0
    result_count = 0
    consecutive_failures = 0

    # The context manager closes the bar on every exit path (limit reached,
    # results exhausted, or too many failures).
    with tqdm(total=limit) as progress_bar:
        print(f'🔎 scraping query {i:03d}/{query_total:03d} ({query})...')

        while result_count < limit:
            requested_page = page
            page += 1

            try:
                response = requests.get(
                    UNSPLASH_SEARCH_URL.format(requested_page, quote(query)),
                    timeout=REQUEST_TIMEOUT,
                )
                results = json.loads(response.text)['results']
            except (requests.RequestException, ValueError, KeyError, TypeError):
                # Network error, non-JSON body, or JSON without a 'results'
                # entry: skip this page, but stop once failures pile up.
                consecutive_failures += 1
                print(f'🙊 warning: page {requested_page:04d} could not be fetched or parsed, skipping...')

                if consecutive_failures >= MAX_CONSECUTIVE_FAILURES:
                    print(f'🙊 warning: giving up on query ({query}) after {consecutive_failures} failed pages in a row.')
                    break

                continue

            consecutive_failures = 0

            # An empty page means the search has no more results.
            if not results:
                break

            for result in results:
                # Skip entries that do not have the expected structure
                # rather than relying on a catch-all exception handler.
                if not isinstance(result, dict):
                    continue

                urls = result.get('urls')
                result_url = urls.get('regular') if isinstance(urls, dict) else None
                result_text = result.get('alt_description')

                # alt_description may be missing or null for some photos.
                if not isinstance(result_text, str):
                    continue

                result_text = result_text.strip()

                if not result_url or not result_text:
                    continue

                if result_url in result_urls:
                    continue

                dataset_dictionary['url'].append(result_url)
                dataset_dictionary['text'].append(result_text)
                result_urls[result_url] = True
                result_count += 1
                progress_bar.update(1)

                if result_count >= limit:
                    break

print('📚 creating dataset...')

dataset = Dataset.from_dict(dataset_dictionary, split='train')
# NOTE(review): the predicate returns the example itself rather than a bool,
# so this presumably keeps every row — confirm the intended filter.
dataset = dataset.filter(lambda example: example)
dataset = dataset.shuffle()

print('✨ finished!')
print(dataset)

In [17]:
#@markdown ⏫ upload the dataset to Hugging Face.

# Target repository on the Hugging Face Hub.
dataset_path = 'username/repo'  #@param {type:"string"}

# Publish the dataset built above as the 'train' split of that repository.
dataset.push_to_hub(repo_id=dataset_path, split='train')