# Buddhist Image Scraper for LoRA Training

This notebook scrapes high-quality Buddhist-themed images and processes them to be suitable for LoRA training.
The script downloads images that are at least 1200 pixels on each side and center-crops them to exactly 1024x1024.
Images will be saved in your Google Drive under Loras/[project_name]/dataset/

In [None]:
# Install required packages: requests (HTTP downloads), Pillow (image decoding
# and cropping), duckduckgo_search (image search) and tqdm (progress bar).
!pip install requests Pillow duckduckgo_search tqdm

In [None]:
import os
import requests
from PIL import Image
from io import BytesIO
from duckduckgo_search import DDGS
import time
from tqdm.notebook import tqdm
import hashlib
from concurrent.futures import ThreadPoolExecutor
import logging

# Mount Google Drive at /content/drive; the scraper saves images beneath
# /content/drive/MyDrive/Loras, so this must run before scraping.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
class ImageScraper:
    """Search DuckDuckGo for images, center-crop them and save them to Drive.

    Attributes:
        target_size: (width, height) every saved image must have. The crop
            logic produces square images, so both values should be equal.
        min_dimension: Minimum width and height (in pixels) a source image
            must have to be processed.
        max_workers: Number of images downloaded and processed in parallel.
    """

    def __init__(self):
        self.target_size = (1024, 1024)
        self.min_dimension = 1200
        self.max_workers = 4
        self.setup_logging()

    def setup_logging(self):
        """Configure root logging at INFO level and create ``self.logger``."""
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s'
        )
        self.logger = logging.getLogger(__name__)

    def create_save_directory(self, project_name):
        """Create (if needed) and return Loras/<project_name>/dataset on Drive."""
        save_path = os.path.join('/content/drive/MyDrive/Loras', project_name, 'dataset')
        os.makedirs(save_path, exist_ok=True)
        return save_path

    def crop_center(self, image):
        """Return a centered square crop of ``image``.

        The image is first cropped to the largest centered square. If that
        square is larger than the target side (``self.target_size[0]``), the
        centered target-sized square is taken from it. A square that is
        already at or below the target side is returned as is, so the result
        can be smaller than the target; callers must check the final size.

        Args:
            image: Object exposing ``size`` as (width, height) and
                ``crop((left, top, right, bottom))``, e.g. a PIL image.

        Returns:
            The cropped image.
        """
        width, height = image.size

        # Largest centered square: trim the longer dimension equally on both sides.
        side = min(width, height)
        left = (width - side) // 2
        top = (height - side) // 2
        cropped = image.crop((left, top, left + side, top + side))

        # Shrink the square to the target side by trimming an equal margin.
        target_side = self.target_size[0]
        if side > target_side:
            margin = (side - target_side) // 2
            cropped = cropped.crop(
                (margin, margin, margin + target_side, margin + target_side)
            )

        return cropped

    def process_image(self, image_url, save_path, index):
        """Download one image, crop it and save it as a JPEG.

        Args:
            image_url: URL of the image to download.
            save_path: Directory the JPEG is written to.
            index: Number embedded in the file name together with a content hash.

        Returns:
            True if the image was saved; False if the download failed, the
            image was too small, the crop did not match ``self.target_size``
            or any error occurred (errors are printed, not raised).
        """
        try:
            response = requests.get(image_url, timeout=10)
            if response.status_code != 200:
                return False

            # Image.open only reads the header, so checking the size first
            # avoids fully decoding images that are rejected anyway. The
            # context manager releases the underlying image file afterwards.
            with Image.open(BytesIO(response.content)) as source:
                width, height = source.size

                # Skip if image is too small
                if width < self.min_dimension or height < self.min_dimension:
                    return False

                image = source.convert('RGB')

            # Crop center of the image to the target size
            cropped_image = self.crop_center(image)

            # Double check we have exactly the target size
            if cropped_image.size != tuple(self.target_size):
                return False

            # Generate unique filename from the index and a hash of the raw bytes
            image_hash = hashlib.md5(response.content).hexdigest()[:10]
            filename = f"image_{index}_{image_hash}.jpg"
            file_path = os.path.join(save_path, filename)

            # Save the image with high quality
            cropped_image.save(file_path, "JPEG", quality=95)
            print(f"Saved {filename} (original size: {width}x{height})")
            return True

        except Exception as e:
            # Best effort: one bad URL or corrupt file must not stop the run.
            print(f"Error processing image {image_url}: {str(e)}")
            return False

    def search_duckduckgo(self, keyword, max_results):
        """Return image URLs found on DuckDuckGo for ``keyword``.

        Requests ``max_results * 3`` results from the search API so that
        enough candidates remain after filtering. Search errors are printed
        and whatever URLs were collected so far are returned.

        Args:
            keyword: Search query.
            max_results: Base result count; tripled before querying.

        Returns:
            List of image URLs (possibly empty).
        """
        image_urls = []
        try:
            with DDGS() as ddgs:
                results = ddgs.images(
                    keyword,
                    max_results=max_results * 3
                )
                for r in results:
                    # .get() so one result without an 'image' key is skipped
                    # instead of aborting collection of the remaining ones.
                    url = r.get('image')
                    if url:
                        image_urls.append(url)
        except Exception as e:
            print(f"Error searching DuckDuckGo: {str(e)}")
        return image_urls

    def scrape_images(self, project_name, keyword, num_images):
        """Search for "buddhism <keyword>" images and save up to ``num_images``.

        Candidates are processed in parallel batches of at most
        ``self.max_workers`` images. Each batch is also capped at the number
        of images still needed, so no more than ``num_images`` are saved.

        Args:
            project_name: Folder name under Loras/ on Google Drive.
            keyword: Search term appended to "buddhism".
            num_images: Number of images to save.

        Returns:
            Number of images actually saved.
        """
        save_path = self.create_save_directory(project_name)
        print(f"Saving images to: {save_path}")
        print(
            f"Note: Only processing images larger than "
            f"{self.min_dimension}x{self.min_dimension} pixels"
        )

        # Collect image URLs; over-fetch because many candidates get rejected
        # (search_duckduckgo triples this count again when querying).
        image_urls = self.search_duckduckgo(f"buddhism {keyword}", num_images * 3)
        print(f"Found {len(image_urls)} potential images to process")

        # Process images with progress bar
        successful_downloads = 0
        next_index = 0
        with tqdm(total=num_images) as pbar:
            with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
                while successful_downloads < num_images and next_index < len(image_urls):
                    # Never submit more tasks than images still needed, so the
                    # requested count cannot be exceeded.
                    batch_size = min(self.max_workers, num_images - successful_downloads)
                    batch = image_urls[next_index:next_index + batch_size]
                    futures = [
                        executor.submit(self.process_image, url, save_path, i)
                        for i, url in enumerate(batch, start=next_index)
                    ]
                    next_index += len(batch)

                    for future in futures:
                        if future.result():
                            successful_downloads += 1
                            pbar.update(1)

        print(f"Successfully downloaded {successful_downloads} images")
        print(f"Images are saved in: {save_path}")
        return successful_downloads

## Run the Image Scraper

Execute the cell below to start scraping images. The images will be saved to your Google Drive in the following structure:
```
Google Drive/
  └── Loras/
      └── [project_name]/
          └── dataset/
              └── images...
```

In [None]:
# Initialize the scraper
scraper = ImageScraper()

# Set your parameters. Inputs are stripped so stray whitespace does not end up
# in the Drive folder name or the search query.
project_name = input("Enter project name (will be used as folder name): ").strip()
keyword = input("Enter search keyword (will be combined with 'buddhism'): ").strip()
num_images_text = input("Enter number of images to download: ").strip()

# Fail early with a clear message instead of writing into Loras/dataset
# (empty project name) or raising a cryptic int() error.
if not project_name:
    raise ValueError("Project name must not be empty")
try:
    num_images = int(num_images_text)
except ValueError:
    raise ValueError(
        f"Number of images must be a whole number, got {num_images_text!r}"
    ) from None
if num_images <= 0:
    raise ValueError("Number of images must be greater than zero")

# Start scraping (the cell displays the number of images saved)
scraper.scrape_images(project_name, keyword, num_images)