In [22]:
!pip install transformers sentence_transformers evaluate open_clip_torch pillow faiss-cpu


Collecting faiss-cpu
  Downloading faiss_cpu-1.13.2-cp310-abi3-macosx_14_0_arm64.whl.metadata (7.6 kB)
Downloading faiss_cpu-1.13.2-cp310-abi3-macosx_14_0_arm64.whl (3.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m89.2 kB/s[0m eta [36m0:00:00[0m00:01[0m00:04[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.13.2


In [19]:
import requests
import os
import pandas as pd
import time
from tqdm import tqdm
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Download up to IMAGES_PER_QUERY photos per landmark query from the Pixabay
# API into SAVE_DIR and write one metadata row per saved image to
# SAVE_DIR/metadata.csv.

# The API key is a credential: it is read from the environment so that it is
# never stored in the notebook itself.
API_KEY = os.environ.get('PIXABAY_API_KEY')
if not API_KEY:
    raise RuntimeError('Set the PIXABAY_API_KEY environment variable before running this cell')

QUERIES = ['Tianzi Mountains', 'Grand Canyon', 'Plitvice Lakes', 'Cappadocia', 'Niagara Falls',
           'Coliseum', 'The Eiffel Tower', 'Burj Khalifa', 'Pyramids of Giza',
           'Machu Picchu', 'Kremlin Moscow', 'st petersburg hermitage', 'Zhangye Danxia Colorful Rocks']

API_URL = 'https://pixabay.com/api/'
PER_PAGE = 200
IMAGES_PER_QUERY = 120
REQUEST_TIMEOUT = 30  # seconds; applied to both API calls and image downloads
SAVE_DIR = 'dataset_showplace'
os.makedirs(SAVE_DIR, exist_ok=True)

metadata = []   # one dict per successfully saved image
downloaded = 0  # running total across all queries

# One shared session: connection reuse plus automatic retry with backoff on
# rate limiting (429) and transient server errors (5xx).
session = requests.Session()
retries = Retry(total=5, backoff_factor=2, status_forcelist=[429, 500, 502, 503, 504])
session.mount('https://', HTTPAdapter(max_retries=retries))

for query in tqdm(QUERIES):
    downloaded_per_query = 0
    page = 1

    while downloaded_per_query < IMAGES_PER_QUERY:
        # Let requests build and URL-encode the query string (queries contain spaces).
        api_params = {
            'key': API_KEY,
            'q': query,
            'image_type': 'photo',
            'category': 'travel',
            'per_page': PER_PAGE,
            'page': page,
            'safesearch': 'true',
        }
        try:
            response = session.get(API_URL, params=api_params, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as e:
            # Retries exhausted or connection failure: skip to the next query
            # instead of aborting the run and losing the collected metadata.
            print(f'Ошибка api для {query}: {e}')
            break
        if response.status_code != 200:
            print(f'Ошибка api для {query}: {response.text}')
            break
        data = response.json()

        hits = data.get('hits')
        if not hits:
            print(f'Нет больше фото для "{query}"')
            break
        for hit in hits:
            if downloaded_per_query >= IMAGES_PER_QUERY:
                break
            img_url = hit['largeImageURL']
            img_id = hit['id']
            filename = f'{SAVE_DIR}/{img_id}_{query.replace(" ", "_")}.jpg'

            try:
                # Build the record first so that a missing field does not leave
                # an image on disk without a metadata row.
                record = {
                    'filename': os.path.basename(filename),
                    'query': query,
                    'source_url': hit['pageURL'],
                    'tags': hit['tags'],
                    'views': hit['views']
                }
                # Use the retrying session with a timeout, and verify the status
                # so an HTTP error body is never written to disk as a .jpg.
                img_response = session.get(img_url, timeout=REQUEST_TIMEOUT)
                img_response.raise_for_status()
                with open(filename, 'wb') as f:
                    f.write(img_response.content)
                print(f'Скачано [{downloaded+1}]: {filename}')

                metadata.append(record)
                downloaded += 1
                downloaded_per_query += 1
            except (requests.RequestException, OSError, KeyError) as e:
                # Best effort: report the failed image and continue with the next hit.
                print(f'Ошибка: {e}')

        page += 1
        time.sleep(2)  # pause between result pages to stay under the API rate limit

pd.DataFrame(metadata).to_csv(f'{SAVE_DIR}/metadata.csv', index=False)
print(f'{downloaded} скачано')

  0%|          | 0/13 [00:00<?, ?it/s]

Скачано [1]: dataset_showplace/455338_Tianzi_Mountains.jpg
Скачано [2]: dataset_showplace/2125148_Tianzi_Mountains.jpg
Скачано [3]: dataset_showplace/862870_Tianzi_Mountains.jpg
Скачано [4]: dataset_showplace/5819652_Tianzi_Mountains.jpg
Скачано [5]: dataset_showplace/9307295_Tianzi_Mountains.jpg
Скачано [6]: dataset_showplace/3010323_Tianzi_Mountains.jpg
Скачано [7]: dataset_showplace/1868299_Tianzi_Mountains.jpg
Скачано [8]: dataset_showplace/3609863_Tianzi_Mountains.jpg
Скачано [9]: dataset_showplace/6815304_Tianzi_Mountains.jpg
Скачано [10]: dataset_showplace/4646744_Tianzi_Mountains.jpg
Скачано [11]: dataset_showplace/1852955_Tianzi_Mountains.jpg
Скачано [12]: dataset_showplace/5669423_Tianzi_Mountains.jpg
Скачано [13]: dataset_showplace/6692712_Tianzi_Mountains.jpg
Скачано [14]: dataset_showplace/7704594_Tianzi_Mountains.jpg
Скачано [15]: dataset_showplace/2297961_Tianzi_Mountains.jpg
Скачано [16]: dataset_showplace/440520_Tianzi_Mountains.jpg
Скачано [17]: dataset_showplace/2179

  8%|▊         | 1/13 [11:08<2:13:40, 668.38s/it]

Скачано [121]: dataset_showplace/2179282_Grand_Canyon.jpg
Скачано [122]: dataset_showplace/1836412_Grand_Canyon.jpg
Скачано [123]: dataset_showplace/1867005_Grand_Canyon.jpg
Скачано [124]: dataset_showplace/3341112_Grand_Canyon.jpg
Скачано [125]: dataset_showplace/5946657_Grand_Canyon.jpg
Скачано [126]: dataset_showplace/4273791_Grand_Canyon.jpg
Скачано [127]: dataset_showplace/7924473_Grand_Canyon.jpg
Скачано [128]: dataset_showplace/3736557_Grand_Canyon.jpg
Скачано [129]: dataset_showplace/4064748_Grand_Canyon.jpg
Скачано [130]: dataset_showplace/7499755_Grand_Canyon.jpg
Скачано [131]: dataset_showplace/5642455_Grand_Canyon.jpg
Скачано [132]: dataset_showplace/293776_Grand_Canyon.jpg
Скачано [133]: dataset_showplace/2647092_Grand_Canyon.jpg
Скачано [134]: dataset_showplace/55643_Grand_Canyon.jpg
Скачано [135]: dataset_showplace/2835690_Grand_Canyon.jpg
Скачано [136]: dataset_showplace/1956139_Grand_Canyon.jpg
Скачано [137]: dataset_showplace/2410449_Grand_Canyon.jpg
Скачано [138]: da

 15%|█▌        | 2/13 [22:54<2:06:33, 690.34s/it]

Скачано [241]: dataset_showplace/5005213_Plitvice_Lakes.jpg
Скачано [242]: dataset_showplace/5060754_Plitvice_Lakes.jpg
Скачано [243]: dataset_showplace/939606_Plitvice_Lakes.jpg
Скачано [244]: dataset_showplace/4581231_Plitvice_Lakes.jpg
Скачано [245]: dataset_showplace/5140665_Plitvice_Lakes.jpg
Скачано [246]: dataset_showplace/4575023_Plitvice_Lakes.jpg
Скачано [247]: dataset_showplace/4575030_Plitvice_Lakes.jpg
Скачано [248]: dataset_showplace/868547_Plitvice_Lakes.jpg
Скачано [249]: dataset_showplace/190807_Plitvice_Lakes.jpg
Скачано [250]: dataset_showplace/1272663_Plitvice_Lakes.jpg
Скачано [251]: dataset_showplace/4575026_Plitvice_Lakes.jpg
Скачано [252]: dataset_showplace/4575027_Plitvice_Lakes.jpg
Скачано [253]: dataset_showplace/5140666_Plitvice_Lakes.jpg
Скачано [254]: dataset_showplace/1272662_Plitvice_Lakes.jpg
Скачано [255]: dataset_showplace/5274951_Plitvice_Lakes.jpg
Скачано [256]: dataset_showplace/1272668_Plitvice_Lakes.jpg
Скачано [257]: dataset_showplace/1272660_Pl

 23%|██▎       | 3/13 [35:18<1:59:11, 715.16s/it]

Скачано [361]: dataset_showplace/6771879_Cappadocia.jpg
Скачано [362]: dataset_showplace/5678553_Cappadocia.jpg
Скачано [363]: dataset_showplace/6757939_Cappadocia.jpg
Скачано [364]: dataset_showplace/7393437_Cappadocia.jpg
Скачано [365]: dataset_showplace/7217175_Cappadocia.jpg
Скачано [366]: dataset_showplace/765498_Cappadocia.jpg
Скачано [367]: dataset_showplace/1773468_Cappadocia.jpg
Скачано [368]: dataset_showplace/5630493_Cappadocia.jpg
Скачано [369]: dataset_showplace/4561263_Cappadocia.jpg
Скачано [370]: dataset_showplace/805624_Cappadocia.jpg
Скачано [371]: dataset_showplace/6551466_Cappadocia.jpg
Скачано [372]: dataset_showplace/6468301_Cappadocia.jpg
Скачано [373]: dataset_showplace/805626_Cappadocia.jpg
Скачано [374]: dataset_showplace/4561264_Cappadocia.jpg
Скачано [375]: dataset_showplace/4561274_Cappadocia.jpg
Скачано [376]: dataset_showplace/4561273_Cappadocia.jpg
Скачано [377]: dataset_showplace/2660659_Cappadocia.jpg
Скачано [378]: dataset_showplace/7723447_Cappadocia

 31%|███       | 4/13 [47:05<1:46:47, 711.89s/it]

Нет больше фото для "Cappadocia"
Скачано [478]: dataset_showplace/2760499_Niagara_Falls.jpg
Скачано [479]: dataset_showplace/84669_Niagara_Falls.jpg
Скачано [480]: dataset_showplace/1678290_Niagara_Falls.jpg
Скачано [481]: dataset_showplace/2811005_Niagara_Falls.jpg
Скачано [482]: dataset_showplace/4259511_Niagara_Falls.jpg
Скачано [483]: dataset_showplace/3750836_Niagara_Falls.jpg
Скачано [484]: dataset_showplace/6093740_Niagara_Falls.jpg
Скачано [485]: dataset_showplace/224935_Niagara_Falls.jpg
Скачано [486]: dataset_showplace/5389525_Niagara_Falls.jpg
Скачано [487]: dataset_showplace/113525_Niagara_Falls.jpg
Скачано [488]: dataset_showplace/4416170_Niagara_Falls.jpg
Скачано [489]: dataset_showplace/3993912_Niagara_Falls.jpg
Скачано [490]: dataset_showplace/1579909_Niagara_Falls.jpg
Скачано [491]: dataset_showplace/55119_Niagara_Falls.jpg
Скачано [492]: dataset_showplace/55118_Niagara_Falls.jpg
Скачано [493]: dataset_showplace/4008993_Niagara_Falls.jpg
Скачано [494]: dataset_showplac

 38%|███▊      | 5/13 [59:29<1:36:27, 723.44s/it]

Скачано [598]: dataset_showplace/4357098_Coliseum.jpg
Скачано [599]: dataset_showplace/5047352_Coliseum.jpg
Скачано [600]: dataset_showplace/173469_Coliseum.jpg
Скачано [601]: dataset_showplace/5274597_Coliseum.jpg
Скачано [602]: dataset_showplace/596696_Coliseum.jpg
Скачано [603]: dataset_showplace/7246970_Coliseum.jpg
Скачано [604]: dataset_showplace/2800128_Coliseum.jpg
Скачано [605]: dataset_showplace/1799675_Coliseum.jpg
Скачано [606]: dataset_showplace/7246901_Coliseum.jpg
Скачано [607]: dataset_showplace/4798229_Coliseum.jpg
Скачано [608]: dataset_showplace/601763_Coliseum.jpg
Скачано [609]: dataset_showplace/5274632_Coliseum.jpg
Скачано [610]: dataset_showplace/7246920_Coliseum.jpg
Скачано [611]: dataset_showplace/7244812_Coliseum.jpg
Скачано [612]: dataset_showplace/2111899_Coliseum.jpg
Скачано [613]: dataset_showplace/3456781_Coliseum.jpg
Скачано [614]: dataset_showplace/3736108_Coliseum.jpg
Скачано [615]: dataset_showplace/7246910_Coliseum.jpg
Скачано [616]: dataset_showplac

 46%|████▌     | 6/13 [1:08:49<1:17:56, 668.01s/it]

Нет больше фото для "Coliseum"
Скачано [683]: dataset_showplace/103417_The_Eiffel_Tower.jpg
Скачано [684]: dataset_showplace/9082149_The_Eiffel_Tower.jpg
Скачано [685]: dataset_showplace/1839974_The_Eiffel_Tower.jpg
Скачано [686]: dataset_showplace/9064240_The_Eiffel_Tower.jpg
Скачано [687]: dataset_showplace/4011964_The_Eiffel_Tower.jpg
Скачано [688]: dataset_showplace/3862939_The_Eiffel_Tower.jpg
Скачано [689]: dataset_showplace/4353082_The_Eiffel_Tower.jpg
Скачано [690]: dataset_showplace/1744574_The_Eiffel_Tower.jpg
Скачано [691]: dataset_showplace/1838626_The_Eiffel_Tower.jpg
Скачано [692]: dataset_showplace/4562306_The_Eiffel_Tower.jpg
Скачано [693]: dataset_showplace/6510643_The_Eiffel_Tower.jpg
Скачано [694]: dataset_showplace/1853365_The_Eiffel_Tower.jpg
Скачано [695]: dataset_showplace/4416700_The_Eiffel_Tower.jpg
Скачано [696]: dataset_showplace/2789943_The_Eiffel_Tower.jpg
Скачано [697]: dataset_showplace/4517342_The_Eiffel_Tower.jpg
Скачано [698]: dataset_showplace/6491734

 54%|█████▍    | 7/13 [1:20:38<1:08:08, 681.40s/it]

Скачано [803]: dataset_showplace/6887737_Burj_Khalifa.jpg
Скачано [804]: dataset_showplace/5295131_Burj_Khalifa.jpg
Скачано [805]: dataset_showplace/2212978_Burj_Khalifa.jpg
Скачано [806]: dataset_showplace/2650364_Burj_Khalifa.jpg
Скачано [807]: dataset_showplace/1085058_Burj_Khalifa.jpg
Скачано [808]: dataset_showplace/4654434_Burj_Khalifa.jpg
Скачано [809]: dataset_showplace/5082317_Burj_Khalifa.jpg
Скачано [810]: dataset_showplace/2292779_Burj_Khalifa.jpg
Скачано [811]: dataset_showplace/4922311_Burj_Khalifa.jpg
Скачано [812]: dataset_showplace/4926554_Burj_Khalifa.jpg
Скачано [813]: dataset_showplace/4044174_Burj_Khalifa.jpg
Скачано [814]: dataset_showplace/2292836_Burj_Khalifa.jpg
Скачано [815]: dataset_showplace/4922313_Burj_Khalifa.jpg
Скачано [816]: dataset_showplace/4926555_Burj_Khalifa.jpg
Скачано [817]: dataset_showplace/4926556_Burj_Khalifa.jpg
Скачано [818]: dataset_showplace/4926557_Burj_Khalifa.jpg
Скачано [819]: dataset_showplace/4922310_Burj_Khalifa.jpg
Скачано [820]:

 62%|██████▏   | 8/13 [1:30:00<53:35, 643.17s/it]  

Нет больше фото для "Burj Khalifa"
Скачано [883]: dataset_showplace/3217765_Pyramids_of_Giza.jpg
Скачано [884]: dataset_showplace/2371501_Pyramids_of_Giza.jpg
Скачано [885]: dataset_showplace/3333792_Pyramids_of_Giza.jpg
Скачано [886]: dataset_showplace/1839616_Pyramids_of_Giza.jpg
Скачано [887]: dataset_showplace/4762581_Pyramids_of_Giza.jpg
Скачано [888]: dataset_showplace/6880311_Pyramids_of_Giza.jpg
Скачано [889]: dataset_showplace/1175825_Pyramids_of_Giza.jpg
Скачано [890]: dataset_showplace/1051378_Pyramids_of_Giza.jpg
Скачано [891]: dataset_showplace/4194196_Pyramids_of_Giza.jpg
Скачано [892]: dataset_showplace/1175822_Pyramids_of_Giza.jpg
Скачано [893]: dataset_showplace/1051380_Pyramids_of_Giza.jpg
Скачано [894]: dataset_showplace/4796260_Pyramids_of_Giza.jpg
Скачано [895]: dataset_showplace/4886378_Pyramids_of_Giza.jpg
Скачано [896]: dataset_showplace/5340136_Pyramids_of_Giza.jpg
Скачано [897]: dataset_showplace/3343460_Pyramids_of_Giza.jpg
Скачано [898]: dataset_showplace/47

 69%|██████▉   | 9/13 [1:41:24<43:44, 656.04s/it]

Скачано [1003]: dataset_showplace/43387_Machu_Picchu.jpg
Скачано [1004]: dataset_showplace/143875_Machu_Picchu.jpg
Скачано [1005]: dataset_showplace/2197157_Machu_Picchu.jpg
Скачано [1006]: dataset_showplace/772356_Machu_Picchu.jpg
Скачано [1007]: dataset_showplace/2913005_Machu_Picchu.jpg
Скачано [1008]: dataset_showplace/772355_Machu_Picchu.jpg
Скачано [1009]: dataset_showplace/43384_Machu_Picchu.jpg
Скачано [1010]: dataset_showplace/5223959_Machu_Picchu.jpg
Скачано [1011]: dataset_showplace/43381_Machu_Picchu.jpg
Скачано [1012]: dataset_showplace/2577070_Machu_Picchu.jpg
Скачано [1013]: dataset_showplace/43394_Machu_Picchu.jpg
Скачано [1014]: dataset_showplace/43389_Machu_Picchu.jpg
Скачано [1015]: dataset_showplace/3889867_Machu_Picchu.jpg
Скачано [1016]: dataset_showplace/43396_Machu_Picchu.jpg
Скачано [1017]: dataset_showplace/43392_Machu_Picchu.jpg
Скачано [1018]: dataset_showplace/2773629_Machu_Picchu.jpg
Скачано [1019]: dataset_showplace/15903_Machu_Picchu.jpg
Скачано [1020]: 

 77%|███████▋  | 10/13 [1:50:26<31:02, 620.80s/it]

Нет больше фото для "Machu Picchu"
Скачано [1095]: dataset_showplace/5077128_Kremlin_Moscow.jpg
Скачано [1096]: dataset_showplace/3872941_Kremlin_Moscow.jpg
Скачано [1097]: dataset_showplace/5077131_Kremlin_Moscow.jpg
Скачано [1098]: dataset_showplace/6649913_Kremlin_Moscow.jpg
Скачано [1099]: dataset_showplace/5349470_Kremlin_Moscow.jpg
Скачано [1100]: dataset_showplace/1029667_Kremlin_Moscow.jpg
Скачано [1101]: dataset_showplace/4575243_Kremlin_Moscow.jpg
Скачано [1102]: dataset_showplace/3937159_Kremlin_Moscow.jpg
Скачано [1103]: dataset_showplace/2861719_Kremlin_Moscow.jpg
Скачано [1104]: dataset_showplace/3937163_Kremlin_Moscow.jpg
Скачано [1105]: dataset_showplace/4332695_Kremlin_Moscow.jpg
Скачано [1106]: dataset_showplace/5077140_Kremlin_Moscow.jpg
Скачано [1107]: dataset_showplace/3937158_Kremlin_Moscow.jpg
Скачано [1108]: dataset_showplace/3937168_Kremlin_Moscow.jpg
Скачано [1109]: dataset_showplace/3937167_Kremlin_Moscow.jpg
Скачано [1110]: dataset_showplace/5213085_Kremlin_

 85%|████████▍ | 11/13 [2:01:49<21:19, 639.94s/it]

Скачано [1215]: dataset_showplace/4064263_st_petersburg_hermitage.jpg
Скачано [1216]: dataset_showplace/3539208_st_petersburg_hermitage.jpg
Скачано [1217]: dataset_showplace/2612663_st_petersburg_hermitage.jpg
Скачано [1218]: dataset_showplace/3711586_st_petersburg_hermitage.jpg
Скачано [1219]: dataset_showplace/3287636_st_petersburg_hermitage.jpg
Скачано [1220]: dataset_showplace/1168136_st_petersburg_hermitage.jpg
Скачано [1221]: dataset_showplace/3438539_st_petersburg_hermitage.jpg
Скачано [1222]: dataset_showplace/3438537_st_petersburg_hermitage.jpg
Скачано [1223]: dataset_showplace/5077420_st_petersburg_hermitage.jpg
Скачано [1224]: dataset_showplace/4524343_st_petersburg_hermitage.jpg
Скачано [1225]: dataset_showplace/6902548_st_petersburg_hermitage.jpg
Скачано [1226]: dataset_showplace/6902540_st_petersburg_hermitage.jpg
Скачано [1227]: dataset_showplace/4805295_st_petersburg_hermitage.jpg
Скачано [1228]: dataset_showplace/8171206_st_petersburg_hermitage.jpg
Скачано [1229]: data

 92%|█████████▏| 12/13 [2:14:22<11:14, 674.29s/it]

Скачано [1335]: dataset_showplace/3203029_Zhangye_Danxia_Colorful_Rocks.jpg
Скачано [1336]: dataset_showplace/4847687_Zhangye_Danxia_Colorful_Rocks.jpg
Скачано [1337]: dataset_showplace/2653785_Zhangye_Danxia_Colorful_Rocks.jpg
Скачано [1338]: dataset_showplace/7866915_Zhangye_Danxia_Colorful_Rocks.jpg
Скачано [1339]: dataset_showplace/2670762_Zhangye_Danxia_Colorful_Rocks.jpg
Скачано [1340]: dataset_showplace/1773468_Zhangye_Danxia_Colorful_Rocks.jpg
Скачано [1341]: dataset_showplace/4657956_Zhangye_Danxia_Colorful_Rocks.jpg
Скачано [1342]: dataset_showplace/9215914_Zhangye_Danxia_Colorful_Rocks.jpg
Скачано [1343]: dataset_showplace/4561264_Zhangye_Danxia_Colorful_Rocks.jpg
Скачано [1344]: dataset_showplace/4840080_Zhangye_Danxia_Colorful_Rocks.jpg
Скачано [1345]: dataset_showplace/4549396_Zhangye_Danxia_Colorful_Rocks.jpg
Скачано [1346]: dataset_showplace/5173654_Zhangye_Danxia_Colorful_Rocks.jpg
Скачано [1347]: dataset_showplace/6514351_Zhangye_Danxia_Colorful_Rocks.jpg
Скачано [134

100%|██████████| 13/13 [2:27:12<00:00, 679.43s/it]

1454 скачано





In [20]:
# Preview only the first few records; dumping the full list would flood the
# notebook output with one dict per downloaded image.
metadata[:5]

[{'filename': '455338_Tianzi_Mountains.jpg',
  'query': 'Tianzi Mountains',
  'source_url': 'https://pixabay.com/photos/mountaineering-man-trail-path-455338/',
  'tags': 'mountaineering, man, trail, path, mountains, male mountaineer, wanderer, backpacking, backpack, hike, away, mountain hiking, hiking, adventure, activity, trekking, traveler, tourist, lifestyle, mountain, nature, landscape, mountain landscape, mountain path, mountaineer, mountain hike, urlaub, erholung, freizeit, entspannung, man, man, man, man, man, hiking, hiking, hiking, mountain',
  'views': 668258},
 {'filename': '2125148_Tianzi_Mountains.jpg',
  'query': 'Tianzi Mountains',
  'source_url': 'https://pixabay.com/photos/climb-mountaineering-mountains-2125148/',
  'tags': 'climb, mountaineering, mountains, mountain ranges, mountaineer, mountain climber, mountain climbing, cliffs, silhouette, rock formations, nature, ice ax, summit, sunrise, sunbeam',
  'views': 199478},
 {'filename': '862870_Tianzi_Mountains.jpg',
  