In [10]:
import os
import time
import requests
import hashlib
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
# from selenium.webdriver.common.keys import Keys
# from selenium.webdriver.support.select import Select
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.wait import WebDriverWait
# from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.service import Service

In [None]:
# .\venv\Scripts\activate
# deactivate
# working click version

def download_images_from_yandex(query, num_images):
    if not os.path.exists('data.csv'):
        # Creating an empty DataFrame from the lists
        columns = ['Name', 'Path', 'Url']
        df = pd.DataFrame(columns=columns)
        print('Created empty file data.csv')
    else:
        df = pd.read_csv('data.csv')

    # Create a base folder named 'data'
    base_folder = 'data'
    if not os.path.exists(base_folder):
        os.makedirs(base_folder)

    # Create a subfolder for the specific query inside 'data'
    # folder_name = os.path.join(base_folder, query.replace(" ", "_"))
    
    # if not os.path.exists(folder_name):
    #     os.makedirs(folder_name)

    # Set Chrome options
    chrome_options = Options()
    # Uncomment the following line for headless mode
    # chrome_options.add_argument("--headless")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--window-size=1920,1080")

    # Initialize the WebDriver
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    driver.get('https://yandex.ru/images/')

    # Find the search box and perform the search
    search_box = driver.find_element(By.NAME, "text")
    search_box.send_keys(query)
    search_box.submit()

    # Wait for images to load with a longer timeout
    try:
        WebDriverWait(driver, 30).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'img.cbir-intent__thumb')))
    except Exception as e:
        print(f"Error during waiting for images: {e}")
        driver.quit()
        return

    # Allow for scrolling to load more images
    scrolls_needed = 5  # Number of scrolls to attempt
    for _ in range(scrolls_needed):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(1)  # Adjust as necessary for loading time

    # Download images by clicking on them
    count = 0
    while count < num_images:
        # Fetch the image elements again before processing
        image_elements = driver.find_elements(By.CSS_SELECTOR, 'img.ContentImage-Image.ContentImage-Image_clickable')

        if not image_elements:  # If no more images are available, break
            print("No more images found.")
            break
        
        try:
            # Click the image to open a larger version
            img = image_elements[count]  # Get the current image to click
            driver.execute_script("arguments[0].click();", img)
            # img.click()
            time.sleep(2)  # Wait for the larger image to load
            
            # Find the larger image element
            large_image = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, 'img.MMImage-Origin'))
            )
            img_url = large_image.get_attribute('src')
            hashed_url = hashlib.sha256(img_url.encode('utf-8')).hexdigest()
            # print(hashed_url)
            jpg_file_path = os.path.join(base_folder, f"{hashed_url}.jpg")
            png_file_path = os.path.join(base_folder, f"{hashed_url}.png")
            # print(png_file_path)
            if not os.path.exists(jpg_file_path) and not os.path.exists(png_file_path):
                if img_url:
                    img_data = requests.get(img_url).content
                    ext = 'jpg' if 'jpeg' in img_url else 'png'
                    file_path = os.path.join(base_folder, f'{hashed_url}.{ext}')
                    if df[(df['Name'] == hashed_url)].empty:
                        new_row = pd.DataFrame({'Name': [hashed_url], 'Path': [file_path], 'Url': [img_url]})
                        print(f"Added new row: {new_row['Name']}")
                        df = pd.concat([df, new_row], ignore_index=True)
                    with open(file_path, 'wb') as f:
                        f.write(img_data)
                        print(f"Downloaded {file_path} from URL.")
                        count += 1
            else:
                print(f"image {hashed_url} is already downloaded.") 
                count += 1
                
            # Close the enlarged image to return to the search results
            driver.back()
            time.sleep(2)  # Allow time for the previous page to load again

        except Exception as e:
            print(f"Error clicking on image or downloading: {e}")
            # Attempt to go back if there was an error
            try:
                driver.back()
                time.sleep(2)  # Allow time for the page to load again
            except Exception as inner_e:
                print(f"Error navigating back: {inner_e}")
    print(df)
    df.to_csv('data.csv', index=False)
    driver.quit()
    print(f"Downloaded {count} images for query '{query}'.")

# Usage
download_images_from_yandex("паспорт рф", 10)  # Adjust the keyword as needed


Added new row:                                                 Name  \
0  7cc66736ef17019fcf275704dcbdacf2f4ba685db33ff5...   

                                                Path  \
0  data\7cc66736ef17019fcf275704dcbdacf2f4ba685db...   

                                                 Url  
0  https://res.cloudinary.com/dbb6h31sb/image/upl...  
Downloaded data\7cc66736ef17019fcf275704dcbdacf2f4ba685db33ff5d5b308e449a8e37cd5.png from URL.
Added new row:                                                 Name  \
0  a59cd6990ddcc709d994ed80d6abfe3d8293cea68bd03b...   

                                                Path  \
0  data\a59cd6990ddcc709d994ed80d6abfe3d8293cea68...   

                                                 Url  
0  https://avatars.mds.yandex.net/i?id=a5adb7c517...  
Downloaded data\a59cd6990ddcc709d994ed80d6abfe3d8293cea68bd03bfe28d97bd523c11e94.png from URL.
Added new row:                                                 Name  \
0  bb18a84b538c249f5b1442f5603bdfcd44

In [None]:
# working download thumbnails option
# SerpItem
# make clicking work
# locate enlarged image after clicking
# <a href="https://sun9-13.userapi.com/impg/PshPESmU2uloudQfFOtPEI4UnWikwb9M7_cmxw/1z9w2gOUf5s.jpg?size=559x811&amp;quality=96&amp;sign=e7f0537a6dd5994e07a4622cdccbbaa1&amp;type=album" target="_blank" type="button" class="Button2 Button2_pin_circle-brick Button2_size_xl Button2_link Button2_view_default OpenImageButton-Save MMViewerButtons-OpenImage MMViewerButtons-Button" autocomplete="off"><span class="Icon Icon_type_downloadOutline24 Button2-Icon Button2-Icon_side_left" aria-hidden="true"></span><span class="Button2-Text">Открыть</span></a>
# <img class="MMImage-Origin" src="//avatars.mds.yandex.net/i?id=593913c21abff56cc260d5552945e029_l-4097709-images-thumbs&amp;n=13" alt="Picture background" aria-hidden="true">

def download_images_from_yandex(query, num_images):
    # Create a base folder named 'data'
    base_folder = 'data'
    if not os.path.exists(base_folder):
        os.makedirs(base_folder)

    # Create a subfolder for the specific query inside 'data'
    folder_name = os.path.join(base_folder, query.replace(" ", "_"))
    
    if not os.path.exists(folder_name):
        os.makedirs(folder_name)

    # Set Chrome options
    chrome_options = Options()
    # Uncomment the following line for headless mode
    # chrome_options.add_argument("--headless")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--window-size=1920,1080")

    # Initialize the WebDriver
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    driver.get('https://yandex.ru/images/')

    # Find the search box and perform the search
    search_box = driver.find_element(By.NAME, "text")
    search_box.send_keys(query)
    search_box.submit()

    # Wait for images to load with a longer timeout
    try:
        WebDriverWait(driver, 30).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'img.cbir-intent__thumb')))
    except Exception as e:
        print(f"Error during waiting for images: {e}")
        driver.quit()
        return

    # Allow for scrolling to load more images
    scrolls_needed = 5  # Number of scrolls to attempt
    for _ in range(scrolls_needed):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(1)  # Adjust as necessary for loading time

    # Locate image elements on the page after scrolling
    image_elements = driver.find_elements(By.CSS_SELECTOR, 'img.ContentImage-Image.ContentImage-Image_clickable')

    print(f"Found {len(image_elements)} images.")

    # Download images
    count = 0
    for img in image_elements:
        img_url = img.get_attribute('src')
        
        if img_url:
            try:
                img_data = requests.get(img_url).content
                ext = 'jpg' if 'jpeg' in img_url else 'png'
                file_path = os.path.join(folder_name, f'{query}_{count}.{ext}')
                with open(file_path, 'wb') as f:
                    f.write(img_data)
                    print(f"Downloaded {file_path} from URL.")
                count += 1
                if count >= num_images:
                    break
            except Exception as e:
                print(f"Could not download {img_url}: {e}")

    driver.quit()
    print(f"Downloaded {count} images for query '{query}'.")

# Usage
download_images_from_yandex("котята", 5)  # Adjust the keyword as needed


Found 287 images.
Downloaded data\котята\котята_0.png from URL.
Downloaded data\котята\котята_1.png from URL.
Downloaded data\котята\котята_2.png from URL.
Downloaded data\котята\котята_3.png from URL.
Downloaded data\котята\котята_4.png from URL.
Downloaded data\котята\котята_5.png from URL.
Downloaded data\котята\котята_6.png from URL.
Downloaded data\котята\котята_7.png from URL.
Downloaded data\котята\котята_8.png from URL.
Downloaded data\котята\котята_9.png from URL.
Downloaded data\котята\котята_10.png from URL.
Downloaded data\котята\котята_11.png from URL.
Downloaded data\котята\котята_12.png from URL.
Downloaded data\котята\котята_13.png from URL.
Downloaded data\котята\котята_14.png from URL.
Downloaded data\котята\котята_15.png from URL.
Downloaded data\котята\котята_16.png from URL.
Downloaded data\котята\котята_17.png from URL.
Downloaded data\котята\котята_18.png from URL.
Downloaded data\котята\котята_19.png from URL.
Downloaded data\котята\котята_20.png from URL.
Downl

KeyboardInterrupt: 