In [27]:
import os
import time
import requests
import hashlib
import shutil
import pandas as pd
from IPython.display import display
from selenium import webdriver
from selenium.webdriver.common.by import By
# from selenium.webdriver.common.keys import Keys
# from selenium.webdriver.support.select import Select
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.wait import WebDriverWait
# from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.service import Service

In [28]:
# .\venv\Scripts\activate
# deactivate
# working click version
# aprox 23 min from downloading all
# time with added copy train is aprox

def download_images_from_yandex(query, num_images, path):
    # Create csv file for DataFrame if it doesn't exist already
    if not os.path.exists('dataset.csv'):
        # Creating an empty DataFrame with selected columns
        columns = ['Name', 'Path', 'Url']
        df = pd.DataFrame(columns=columns)
        print('Created empty file dataset.csv')
    else:
        # If it exists load DataFrame from file
        df = pd.read_csv('dataset.csv')

    # Create a base folder named 'data'
    images_folder = os.path.join(path, 'images')
    labels_folder = os.path.join(path, 'labels')
    query_folder = os.path.join(images_folder, query.replace(" ", "_"))
    if not os.path.exists(labels_folder):
        os.makedirs(labels_folder)
    if not os.path.exists(images_folder):
        os.makedirs(images_folder)
    if not os.path.exists(query_folder):
        os.makedirs(query_folder)

    # Set Chrome options
    chrome_options = Options()
    # chrome_options.add_argument("--headless")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--window-size=1920,1080")

    # Initialize the WebDriver
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    driver.get('https://yandex.ru/images/')

    # Find the search box and perform the search
    search_box = driver.find_element(By.NAME, "text")
    search_box.send_keys(query)
    search_box.submit()

    # Wait for images to load with a longer timeout
    try:
        WebDriverWait(driver, 30).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'img.cbir-intent__thumb')))
    except Exception as e:
        print(f"Error during waiting for images: {e}")
        driver.quit()
        return

    # Allow for scrolling to load more images
    scrolls_needed = 10  # Number of scrolls to attempt
    for _ in range(scrolls_needed):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(1)  # Adjust as necessary for loading time

    # Download images by clicking on them
    count = 0
    while count < num_images:
        # Fetch the image elements again before processing 
        # Fixed issue with clicking on image-links for related searches
        image_elements = driver.find_elements(By.CSS_SELECTOR, 'img.ContentImage-Image.ContentImage-Image_clickable:not(.SearchRelatedGallery-Items img)')

        if not image_elements:  # If no more images are available, break
            print("No more images found.")
            break
        
        try:
            # Click the image to open a larger version
            img = image_elements[count]  # Get the current image to click
            # Get SHA256 name for file based on their thumbnail link
            img_name = img.get_attribute('src')
            hashed_name = hashlib.sha256(img_name.encode('utf-8')).hexdigest()
            # print(hashed_name)
            jpg_file_path = os.path.join(query_folder, f"{hashed_name}.jpg")
            png_file_path = os.path.join(query_folder, f"{hashed_name}.png")
            # print(png_file_path)
            # Check if certain images have been already downloaded
            if not os.path.exists(jpg_file_path) and not os.path.exists(png_file_path):
                # Execute clicking on tgumbnails to download full-size images
                driver.execute_script("arguments[0].click();", img)
                # img.click()
                time.sleep(2)  # Wait for the larger image to load
                
                # Find the larger image element
                large_image = WebDriverWait(driver, 5).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, 'img.MMImage-Origin'))
                )
                img_url = large_image.get_attribute('src') 
                # Download full image
                if img_url:
                    img_data = requests.get(img_url).content
                    ext = 'jpg' if 'jpeg' in img_url else 'png'
                    file_path = os.path.join(query_folder, f'{hashed_name}.{ext}')
                    if df[(df['Name'] == hashed_name)].empty:
                        # Add new line to the DataFrame if a new image was downloaded
                        new_row = pd.DataFrame({'Name': [hashed_name], 'Path': [file_path], 'Url': [img_url]})
                        # print(f"Added new row: {new_row['Name']}")
                        df = pd.concat([df, new_row], ignore_index=True)
                        # Save new line to DataFrame csv file
                        df.to_csv('dataset.csv', index=False)
                    with open(file_path, 'wb') as f:
                        f.write(img_data)
                        print(f"Downloaded {file_path} from URL.")
                        count += 1
                # Close the enlarged image to return to the search results
                driver.back()
                time.sleep(2)  # Allow time for the previous page to load again
            else:
                # If image was already downloaded skip to next item on web-page
                print(f"image {hashed_name} has been already downloaded.") 
                count += 1
                
        except Exception as e:
            print(f"Error clicking on image or downloading: {e}")
            # Attempt to go back if there was an error
            try:
                driver.back()
                time.sleep(2)  # Allow time for the page to load again
            except Exception as inner_e:
                print(f"Error navigating back: {inner_e}")
    # Print DataFrame using IPython.display
    display(df)
    driver.quit()
    # Print total number of elements
    print(f"Downloaded {count} images for query '{query}'.")
    createValidationDataset(query, 20, query_folder, os.path.join('dataset', 'val'))

def createValidationDataset(query, num_images, source_path, destination_path):
    # Create a base folder named 'data'
    images_folder = os.path.join(destination_path, 'images')
    labels_folder = os.path.join(destination_path, 'labels')
    destination_path = os.path.join(images_folder, query.replace(" ", "_"))

    # Create necessary directories if they don't exist
    os.makedirs(labels_folder, exist_ok=True)
    os.makedirs(images_folder, exist_ok=True)
    os.makedirs(destination_path, exist_ok=True)

    # List images in the source directory
    files = os.listdir(source_path)

    # Filter out only image files (customize the extensions as needed)
    image_files = [f for f in files if f.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.bmp'))]

    # Limiting the number of files to move
    num_files_to_move = min(num_images, len(image_files))  # Move up to num_images, but not more than available
    files_to_move = image_files[:num_files_to_move]  # Get the first num_images images

    # Move each image file to the destination directory
    for file_name in files_to_move:
        source_file = os.path.join(source_path, file_name)
        destination_file = os.path.join(destination_path, file_name)
        shutil.move(source_file, destination_file)

    print(f'Moved {len(files_to_move)} images to {destination_path}')


# Usage
download_images_from_yandex("паспорт рф", 100, os.path.join('dataset', 'train'))  # Adjust the keyword and query number as needed
download_images_from_yandex("картинки", 100, os.path.join('dataset', 'train'))

Created empty file dataset.csv
Downloaded dataset\train\images\паспорт_рф\89c8bfe6af38f668815f6c34080b038e5e93c0e845197d46e8ad259b6c6208ff.png from URL.
Downloaded dataset\train\images\паспорт_рф\fc3a20c81a82f70f99578071f68ac26bbf93c4bbbd995bc6b2eb617f03ba9063.png from URL.
Downloaded dataset\train\images\паспорт_рф\f41cbf4a0ea62bf66321c881fa0483808dc53cefef9d45ebe8cfee6ea69d2322.png from URL.
Downloaded dataset\train\images\паспорт_рф\c070299b386ecd2de9c178275992b2938dabc8495716de24660fff9d5f5aa9a3.png from URL.
Downloaded dataset\train\images\паспорт_рф\71918b1385fbbd8d70a9d43cbb5854460b9a06759f7929d03241f62fe039242c.png from URL.
Downloaded dataset\train\images\паспорт_рф\cafdcf7a147a9e529a2161e8a7069b465e1c96d284e8cb4aee1ca57299e40655.png from URL.
Downloaded dataset\train\images\паспорт_рф\24e578ac5e41fab20ec719a67f25bd4bb6d71148313a3fc54a04ef728a8ff139.png from URL.
Downloaded dataset\train\images\паспорт_рф\d2874316e590da6185dad79330b6773674c328a0abf501d78e41232db8998a0e.png fro

Unnamed: 0,Name,Path,Url
0,89c8bfe6af38f668815f6c34080b038e5e93c0e845197d...,dataset\train\images\паспорт_рф\89c8bfe6af38f6...,https://res.cloudinary.com/dbb6h31sb/image/upl...
1,fc3a20c81a82f70f99578071f68ac26bbf93c4bbbd995b...,dataset\train\images\паспорт_рф\fc3a20c81a82f7...,https://avatars.mds.yandex.net/i?id=a5adb7c517...
2,f41cbf4a0ea62bf66321c881fa0483808dc53cefef9d45...,dataset\train\images\паспорт_рф\f41cbf4a0ea62b...,https://avatars.mds.yandex.net/i?id=a205555f68...
3,c070299b386ecd2de9c178275992b2938dabc8495716de...,dataset\train\images\паспорт_рф\c070299b386ecd...,https://avatars.mds.yandex.net/i?id=e8e1efcdf6...
4,71918b1385fbbd8d70a9d43cbb5854460b9a06759f7929...,dataset\train\images\паспорт_рф\71918b1385fbbd...,https://avatars.mds.yandex.net/i?id=e536524296...
...,...,...,...
95,dcb9828b0b7cb34e8777f5d81be29b335dca956b8e0911...,dataset\train\images\паспорт_рф\dcb9828b0b7cb3...,https://avatars.mds.yandex.net/i?id=00ff0261a0...
96,cf59fcc7d2794a7d30cf49f9a12cacad9714534088c09f...,dataset\train\images\паспорт_рф\cf59fcc7d2794a...,https://otvet.imgsmail.ru/download/245070603_d...
97,0495df3d3a5e1418beca0615e688d5df8dd0a00aaeade8...,dataset\train\images\паспорт_рф\0495df3d3a5e14...,https://avatars.mds.yandex.net/i?id=7f9fea39f4...
98,b7c360bb77c185a8f419fae1686ac032a0f74fac6e7894...,dataset\train\images\паспорт_рф\b7c360bb77c185...,https://avatars.mds.yandex.net/i?id=38b2c8781d...


Downloaded 100 images for query 'паспорт рф'.
Moved 20 images to dataset\val\images\паспорт_рф
Downloaded dataset\train\images\картинки\31014cd9f4c8b3f7d4022520abb8608510db95b3eae55048a3cfba5a2b894749.png from URL.
Downloaded dataset\train\images\картинки\397498b6e0b6610ba608de5ca61afd09a640bb73ff5ff0b6f2d67ebcf9675313.png from URL.
Downloaded dataset\train\images\картинки\4fbc02a7a091e2d1c3dc4c7d46941872ae53dbe695fcae41107bb2645b63c7d5.png from URL.
Downloaded dataset\train\images\картинки\d775c7b79b6ee99493cdaace31d64af7a577c5ca6fa0de6a7d3e8e961c5ac034.png from URL.
Downloaded dataset\train\images\картинки\e6423aac1af345e91012266c2faecb3a34bd5db9da5563634e4e3990fe03f754.png from URL.
Downloaded dataset\train\images\картинки\b63339870e11e1532caa7526512a8d1711ef590d4c631f917543e43dafbc1576.png from URL.
Downloaded dataset\train\images\картинки\f5991fcb11377220a694c1c18434b82dcb06549b30bc0a5e130226848f238f33.png from URL.
Downloaded dataset\train\images\картинки\2df110affb2e0eb9410a64c4

Unnamed: 0,Name,Path,Url
0,89c8bfe6af38f668815f6c34080b038e5e93c0e845197d...,dataset\train\images\паспорт_рф\89c8bfe6af38f6...,https://res.cloudinary.com/dbb6h31sb/image/upl...
1,fc3a20c81a82f70f99578071f68ac26bbf93c4bbbd995b...,dataset\train\images\паспорт_рф\fc3a20c81a82f7...,https://avatars.mds.yandex.net/i?id=a5adb7c517...
2,f41cbf4a0ea62bf66321c881fa0483808dc53cefef9d45...,dataset\train\images\паспорт_рф\f41cbf4a0ea62b...,https://avatars.mds.yandex.net/i?id=a205555f68...
3,c070299b386ecd2de9c178275992b2938dabc8495716de...,dataset\train\images\паспорт_рф\c070299b386ecd...,https://avatars.mds.yandex.net/i?id=e8e1efcdf6...
4,71918b1385fbbd8d70a9d43cbb5854460b9a06759f7929...,dataset\train\images\паспорт_рф\71918b1385fbbd...,https://avatars.mds.yandex.net/i?id=e536524296...
...,...,...,...
195,bcb3d45d05bf0bf4b17518459e8e1ab17b22411d725aae...,dataset\train\images\картинки\bcb3d45d05bf0bf4...,https://avatars.mds.yandex.net/i?id=7bbd4551d2...
196,d02b37246f6ddaf25d9b8874dab2ee0e654efa353e9a53...,dataset\train\images\картинки\d02b37246f6ddaf2...,https://avatars.mds.yandex.net/i?id=358a3ff136...
197,f0879b9cbd2041da5445b20c6c48b40ef5acf1f6794f24...,dataset\train\images\картинки\f0879b9cbd2041da...,https://avatars.mds.yandex.net/i?id=e512a28a7e...
198,210c984abec5d297971fbe7e74d50306a17c35da7c3206...,dataset\train\images\картинки\210c984abec5d297...,https://www.sunhome.ru/i/wallpapers/125/vodopa...


Downloaded 100 images for query 'картинки'.
Moved 20 images to dataset\val\images\картинки


In [None]:
# code to run BEFORE labeling train and validation dataset
# this will remove directories named as query and store all images in images/
# AFTER running this block you can start labeling images

# make update dataset.csv path after rebuilding tree
# make storing classes after processing by model

# Define list for directories to delete
subdirs_to_delete = []

def rebuildDirectoriesTree(base_dir):
    # Iterate through the subdirectories
    for subdir, _, files in os.walk(base_dir):
        # Flag to check if we moved at least one file from this subdir
        moved_any_file = False
        for file in files:
            # Check if the file is an image (simple check based on common image extensions)
            if file.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff', '.svg')):
                # Define source file path
                src_file_path = os.path.join(subdir, file)
                
                # Define destination file path
                dst_file_path = os.path.join(base_dir, file)
                
                # Move the file to the base directory
                try:
                    # If file already exists, you can choose to overwrite or rename it
                    if os.path.exists(dst_file_path):
                        print(f"File {dst_file_path} already exists. Skipping...")
                    else:
                        shutil.move(src_file_path, dst_file_path)
                        print(f'Moved: {src_file_path} to {dst_file_path}')
                        moved_any_file = True  # Mark that we moved a file
                except Exception as e:
                    print(f"Error moving file {src_file_path}: {e}")

        # Check if we moved any file and if the directory is empty after moving
        if moved_any_file or not os.listdir(subdir):
            # Check if the subdir is empty after moving files
            subdirs_to_delete.append(subdir)

        # Now delete the empty subdirectories
        for subdir in subdirs_to_delete:
            try:
                shutil.rmtree(subdir)
                print(f'Deleted empty directory: {subdir}')
            except Exception as e:
                print(f"Error deleting directory {subdir}: {e}")

    print("All images have been moved and empty subdirectories deleted.")

# Define the base dataset directory
base_dir_train = os.path.join('dataset', 'train', 'images')
base_dir_val = os.path.join('dataset', 'val', 'images')

# Usage
rebuildDirectoriesTree(base_dir_train)
rebuildDirectoriesTree(base_dir_val)


File dataset\train\images\004ec4f3ed3c81e1ac14424fd668a879af5fdb9fdb1e8c494884bd6f9702c297.png already exists. Skipping...
File dataset\train\images\0355025ae5f440b43aa09ecc722e98ccf8693a92adfada85d604608e31952e21.png already exists. Skipping...
File dataset\train\images\0495df3d3a5e1418beca0615e688d5df8dd0a00aaeade8d8bd56a4962c4aca68.png already exists. Skipping...
File dataset\train\images\049e804385129e2690b5c3516e64538347b2fd5816a8d7e9f03eeb6c90ccefb5.png already exists. Skipping...
File dataset\train\images\06af9a536603f31297a24426897420d1525deba004e40b7e302e0267e9eca143.png already exists. Skipping...
File dataset\train\images\070941c15650a2f103ed93d9ca16c2718c493db99f539be803f70ad5726f5ee6.png already exists. Skipping...
File dataset\train\images\071cac05764ad5a318facca290a04311a6270dd8c837a07ac6a569a4ed2d4450.png already exists. Skipping...
File dataset\train\images\07eac7bf971785a361f54982503d8d95a4e5121aff07762bbaf2282d49c14855.png already exists. Skipping...
File dataset\tra

In [None]:
# working download thumbnails option
# SerpItem
# make clicking work
# locate enlarged image after clicking
# <a href="https://sun9-13.userapi.com/impg/PshPESmU2uloudQfFOtPEI4UnWikwb9M7_cmxw/1z9w2gOUf5s.jpg?size=559x811&amp;quality=96&amp;sign=e7f0537a6dd5994e07a4622cdccbbaa1&amp;type=album" target="_blank" type="button" class="Button2 Button2_pin_circle-brick Button2_size_xl Button2_link Button2_view_default OpenImageButton-Save MMViewerButtons-OpenImage MMViewerButtons-Button" autocomplete="off"><span class="Icon Icon_type_downloadOutline24 Button2-Icon Button2-Icon_side_left" aria-hidden="true"></span><span class="Button2-Text">Открыть</span></a>
# <img class="MMImage-Origin" src="//avatars.mds.yandex.net/i?id=593913c21abff56cc260d5552945e029_l-4097709-images-thumbs&amp;n=13" alt="Picture background" aria-hidden="true">

def download_images_from_yandex(query, num_images):
    # Create a base folder named 'data'
    base_folder = 'data'
    if not os.path.exists(base_folder):
        os.makedirs(base_folder)

    # Create a subfolder for the specific query inside 'data'
    folder_name = os.path.join(base_folder, query.replace(" ", "_"))
    
    if not os.path.exists(folder_name):
        os.makedirs(folder_name)

    # Set Chrome options
    chrome_options = Options()
    # Uncomment the following line for headless mode
    # chrome_options.add_argument("--headless")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--window-size=1920,1080")

    # Initialize the WebDriver
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    driver.get('https://yandex.ru/images/')

    # Find the search box and perform the search
    search_box = driver.find_element(By.NAME, "text")
    search_box.send_keys(query)
    search_box.submit()

    # Wait for images to load with a longer timeout
    try:
        WebDriverWait(driver, 30).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'img.cbir-intent__thumb')))
    except Exception as e:
        print(f"Error during waiting for images: {e}")
        driver.quit()
        return

    # Allow for scrolling to load more images
    scrolls_needed = 5  # Number of scrolls to attempt
    for _ in range(scrolls_needed):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(1)  # Adjust as necessary for loading time

    # Locate image elements on the page after scrolling
    image_elements = driver.find_elements(By.CSS_SELECTOR, 'img.ContentImage-Image.ContentImage-Image_clickable')

    print(f"Found {len(image_elements)} images.")

    # Download images
    count = 0
    for img in image_elements:
        img_url = img.get_attribute('src')
        
        if img_url:
            try:
                img_data = requests.get(img_url).content
                ext = 'jpg' if 'jpeg' in img_url else 'png'
                file_path = os.path.join(folder_name, f'{query}_{count}.{ext}')
                with open(file_path, 'wb') as f:
                    f.write(img_data)
                    print(f"Downloaded {file_path} from URL.")
                count += 1
                if count >= num_images:
                    break
            except Exception as e:
                print(f"Could not download {img_url}: {e}")

    driver.quit()
    print(f"Downloaded {count} images for query '{query}'.")

# Usage
download_images_from_yandex("котята", 5)  # Adjust the keyword as needed


Found 287 images.
Downloaded data\котята\котята_0.png from URL.
Downloaded data\котята\котята_1.png from URL.
Downloaded data\котята\котята_2.png from URL.
Downloaded data\котята\котята_3.png from URL.
Downloaded data\котята\котята_4.png from URL.
Downloaded data\котята\котята_5.png from URL.
Downloaded data\котята\котята_6.png from URL.
Downloaded data\котята\котята_7.png from URL.
Downloaded data\котята\котята_8.png from URL.
Downloaded data\котята\котята_9.png from URL.
Downloaded data\котята\котята_10.png from URL.
Downloaded data\котята\котята_11.png from URL.
Downloaded data\котята\котята_12.png from URL.
Downloaded data\котята\котята_13.png from URL.
Downloaded data\котята\котята_14.png from URL.
Downloaded data\котята\котята_15.png from URL.
Downloaded data\котята\котята_16.png from URL.
Downloaded data\котята\котята_17.png from URL.
Downloaded data\котята\котята_18.png from URL.
Downloaded data\котята\котята_19.png from URL.
Downloaded data\котята\котята_20.png from URL.
Downl

KeyboardInterrupt: 