# 🖼️ Advanced Image Crawler & Downloader

Notebook untuk crawling dan download gambar dari berbagai situs dengan dukungan **gallery-dl** untuk resolusi penuh.

## ✨ Fitur
- 🔐 Autentikasi untuk akses resolusi penuh (Pixiv, Twitter, Instagram, dll)
- 🎨 Integrasi gallery-dl
- 📷 Download gambar resolusi tinggi
- 🔄 Fallback ke manual crawler

---

## 📦 1. Install Dependencies

Jalankan cell di bawah untuk menginstall library yang diperlukan.

In [None]:
# Install dependencies
!pip install requests beautifulsoup4 lxml gallery-dl -q

print("‚úÖ Dependencies berhasil diinstall!")

## ⚙️ 2. Import Libraries & Setup

In [None]:
import hashlib
import html
import json
import os
import re
import shutil
import subprocess
import sys
import threading
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from typing import Set, List, Optional, Tuple, Dict, Any
from urllib.parse import urljoin, urlparse, unquote

import ipywidgets as widgets
import requests
from bs4 import BeautifulSoup
from IPython.display import display, HTML, Image as IPImage, clear_output

print("‚úÖ Libraries imported successfully!")

## 🔧 3. Konfigurasi

### 3.1 Pengaturan Dasar

In [None]:
# ============================================================
# BASIC CONFIGURATION - edit as needed
# These module-level names are read by the crawler/downloader cells below.
# ============================================================

# Root output folder where downloaded images are stored
OUTPUT_DIR = "./downloaded_images"

# Crawl depth (0 = only the start page, 1-3 = also follow links)
MAX_DEPTH = 1

# Delay between requests (seconds) - helps avoid rate limiting
REQUEST_DELAY = 0.5

# Request timeout (seconds)
REQUEST_TIMEOUT = 30

# Number of threads used for parallel downloads
MAX_WORKERS = 5

# Use gallery-dl when it is available
USE_GALLERY_DL = True

# Fall back to the manual crawler when gallery-dl fails
FALLBACK_TO_CRAWLER = True

print("‚úÖ Konfigurasi dasar sudah diset!")
print(f"   üìÅ Output folder: {OUTPUT_DIR}")
print(f"   üîç Max depth: {MAX_DEPTH}")

### 3.2 Kredensial Login (Opsional)

⚠️ **PENTING**: Isi kredensial di bawah untuk akses resolusi penuh. Jangan share notebook yang sudah berisi kredensial!

Pilih situs yang ingin Anda gunakan dan isi kredensialnya:

In [None]:
# ============================================================
# CREDENTIALS - fill in as needed (leave empty if you have none)
# Keys are site names; each value holds the fields read by
# GalleryDLDownloader.generate_config for that site.
# ============================================================

CREDENTIALS = {
    # ===== PIXIV =====
    # Option 1: username & password
    # Option 2: refresh token (safer; obtain with: pip install gppt && gppt login)
    # NOTE(review): recent gallery-dl releases appear to accept only a refresh
    # token for Pixiv - confirm before relying on username/password.
    "pixiv": {
        "username": "",           # Pixiv e-mail
        "password": "",           # Pixiv password
        "refresh_token": "",      # OR use a refresh token
    },

    # ===== TWITTER/X =====
    # Option 1: export cookies from the browser (extension: "Get cookies.txt")
    # Option 2: auth token from Developer Tools > Application > Cookies
    "twitter": {
        "cookies_file": "",       # Path to a cookies.txt file
        "auth_token": "",         # OR the auth_token value from browser cookies
    },

    # ===== INSTAGRAM =====
    # Option 1: username & password
    # Option 2: session ID from browser cookies
    "instagram": {
        "username": "",
        "password": "",
        "session_id": "",         # OR the sessionid value from browser cookies
    },

    # ===== DEVIANTART =====
    # Obtain from: https://www.deviantart.com/developers/
    "deviantart": {
        "client_id": "",
        "client_secret": "",
    },

    # ===== DANBOORU =====
    # API key from the profile settings
    "danbooru": {
        "username": "",
        "api_key": "",
    },

    # ===== IMGUR =====
    # Obtain from: https://api.imgur.com/oauth2/addclient
    "imgur": {
        "client_id": "",
    },

    # ===== REDDIT =====
    # Create an app at: https://www.reddit.com/prefs/apps
    "reddit": {
        "client_id": "",
        "client_secret": "",
        "user_agent": "ImageCrawler/2.0",
    },
}

# Per-site gallery-dl options
GALLERY_DL_OPTIONS = {
    "pixiv": {
        "ugoira": True,       # Download ugoira animations
        "metadata": True,     # Save metadata
    },
    "twitter": {
        "retweets": False,    # Include retweets
        "videos": True,       # Download videos as well
    },
    "instagram": {
        "stories": True,      # Download stories
        "highlights": True,   # Download highlights
        "videos": True,
    },
}

print("‚úÖ Kredensial sudah dikonfigurasi!")

# Show, per site, whether any credential has been filled in.
print("\nüìã Status Kredensial:")
for site, creds in CREDENTIALS.items():
    # 'user_agent' ships with a non-empty default and is not a secret, so it
    # must not make a site look like it has credentials configured.
    has_creds = any(value for key, value in creds.items() if key != 'user_agent')
    status = "‚úÖ Tersedia" if has_creds else "‚¨ú Kosong"
    print(f"   {site.capitalize()}: {status}")

## 🔨 4. Core Functions

Jalankan cell di bawah untuk memuat semua fungsi yang diperlukan.

In [None]:
# ============================================================
# CONSTANTS
# ============================================================

# Lower-case file suffixes (with leading dot) that are treated as images.
IMAGE_EXTENSIONS = {
    '.jpg', '.jpeg', '.png', '.gif', '.webp',
    '.bmp', '.svg', '.ico', '.tiff', '.tif'
}

# Site name -> domains used to recognise the site and the gallery-dl
# extractor name associated with it.
SUPPORTED_SITES = {
    'pixiv': {
        'domains': ['pixiv.net', 'www.pixiv.net', 'i.pximg.net'],
        'extractor': 'pixiv',
    },
    'twitter': {
        'domains': ['twitter.com', 'x.com', 'pbs.twimg.com'],
        'extractor': 'twitter',
    },
    'instagram': {
        'domains': ['instagram.com', 'www.instagram.com'],
        'extractor': 'instagram',
    },
    'deviantart': {
        'domains': ['deviantart.com', 'www.deviantart.com'],
        'extractor': 'deviantart',
    },
    'artstation': {
        'domains': ['artstation.com', 'www.artstation.com'],
        'extractor': 'artstation',
    },
    'danbooru': {
        'domains': ['danbooru.donmai.us'],
        'extractor': 'danbooru',
    },
    'imgur': {
        'domains': ['imgur.com', 'i.imgur.com'],
        'extractor': 'imgur',
    },
    'reddit': {
        'domains': ['reddit.com', 'www.reddit.com', 'i.redd.it'],
        'extractor': 'reddit',
    },
}

# To download from more sites, add entries to SUPPORTED_SITES above by hand;
# see the gallery-dl documentation for the extractor names.
# Default HTTP headers sent by the manual crawler's requests session.
DEFAULT_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.9',
}

print("‚úÖ Constants loaded!")

In [None]:
# ============================================================
# HELPER FUNCTIONS
# ============================================================

def detect_site(url: str) -> Optional[str]:
    """Return the SUPPORTED_SITES key whose domain matches the URL's host.

    A host matches a configured domain when it is equal to it or is a
    subdomain of it. Plain substring matching is deliberately avoided:
    it would classify e.g. ``netflix.com`` as twitter because the text
    ``x.com`` occurs inside it.

    Args:
        url: Absolute URL to classify.

    Returns:
        The site name, or None when the host is missing or unknown.
    """
    # .hostname drops any port / userinfo and is already lower-cased.
    host = urlparse(url).hostname or ''
    if not host:
        return None

    for site, info in SUPPORTED_SITES.items():
        for site_domain in info['domains']:
            if host == site_domain or host.endswith('.' + site_domain):
                return site
    return None


def is_valid_image_url(url: str) -> bool:
    """Return True when the URL's path ends with a known image extension.

    Only the path component is inspected (case-insensitively), so query
    strings and fragments do not affect the result.
    """
    lowered_path = urlparse(url).path.lower()
    for extension in IMAGE_EXTENSIONS:
        if lowered_path.endswith(extension):
            return True
    return False


def upgrade_to_high_res(img_url: str) -> str:
    """Rewrite a known thumbnail URL so it points at the full-size image.

    URLs from hosts that are not recognised are returned unchanged.

    Args:
        img_url: Absolute image URL.

    Returns:
        The (possibly rewritten) URL.
    """
    # Pixiv
    if 'pximg.net' in img_url:
        # Drop the "/c/<WxH...>/" resize prefix, then retarget the thumbnail
        # tree to the original tree and strip the size suffix.
        # NOTE(review): the original upload may use a different extension
        # (e.g. .png) than the thumbnail; that cannot be derived from the URL.
        img_url = re.sub(r'/c/\d+x\d+[^/]*/', '/', img_url)
        img_url = re.sub(r'/(?:img-master|custom-thumb)/', '/img-original/', img_url)
        img_url = re.sub(r'_(?:square|master|custom)\d+', '', img_url)
    # Twitter
    elif 'twimg.com' in img_url:
        base, _, query = img_url.partition('?')
        # Extension-less media URLs carry the file type in "format=";
        # it has to survive the rewrite or the URL no longer resolves.
        fmt = re.search(r'(?:^|&)format=([^&]+)', query)
        if fmt:
            img_url = f"{base}?format={fmt.group(1)}&name=orig"
        else:
            img_url = f"{base}?name=orig"
    # Imgur
    elif 'imgur.com' in img_url:
        # Remove a trailing s/m/h/l thumbnail letter in front of the extension.
        img_url = re.sub(r'([a-zA-Z0-9]+)[smhl]\.', r'\1.', img_url)
    # DeviantArt
    elif 'wixmp.com' in img_url:
        # Remove the "/v1/fill/<params>/" resize segment.
        img_url = re.sub(r'/v1/fill/.*?/', '/', img_url)

    return img_url


def get_filename_from_url(url: str, response=None) -> str:
    """Derive a filesystem-safe file name for a downloaded URL.

    The last path segment of the URL is used when it looks like a file
    name. Otherwise a name of the form ``image_<12 hex chars><ext>`` is
    generated from the MD5 of the URL.

    Args:
        url: URL the file was fetched from.
        response: Optional response object exposing ``headers``; its
            Content-Type is used to pick the extension of generated names.

    Returns:
        A file name with characters that are invalid on common
        filesystems replaced by underscores.
    """
    parsed = urlparse(url)
    filename = os.path.basename(unquote(parsed.path))

    # Names without an extension, and names made only of dots such as ".."
    # (which would point outside the output folder), are replaced.
    if not filename.strip('. ') or '.' not in filename:
        url_hash = hashlib.md5(url.encode()).hexdigest()[:12]
        ext = '.jpg'
        # Compare against None: a requests.Response is falsy for non-2xx
        # status codes, which would silently skip the Content-Type lookup.
        if response is not None:
            content_type = response.headers.get('content-type', '').lower()
            if 'png' in content_type:
                ext = '.png'
            elif 'gif' in content_type:
                ext = '.gif'
            elif 'webp' in content_type:
                ext = '.webp'
        filename = f"image_{url_hash}{ext}"

    return re.sub(r'[<>:"/\\|?*]', '_', filename)


print("‚úÖ Helper functions loaded!")

In [None]:
# ============================================================
# GALLERY-DL WRAPPER
# ============================================================

class GalleryDLDownloader:
    """Thin wrapper that drives the ``gallery-dl`` command-line tool.

    The executable is looked up on PATH once, when the wrapper is created.
    """

    def __init__(self):
        # None when gallery-dl is not installed / not on PATH.
        self.gallery_dl_path = shutil.which('gallery-dl')

    def is_available(self) -> bool:
        """Return True when the gallery-dl executable was found."""
        return self.gallery_dl_path is not None

    def generate_config(self, site: str) -> dict:
        """Build the gallery-dl configuration dict for one site.

        Credentials come from CREDENTIALS[site] and options from
        GALLERY_DL_OPTIONS[site]; sites without a dedicated branch get
        only the shared downloader settings.

        Args:
            site: Site name (a key of SUPPORTED_SITES).

        Returns:
            A JSON-serialisable dict with "extractor" and "downloader" keys.
        """
        creds = CREDENTIALS.get(site, {})
        options = GALLERY_DL_OPTIONS.get(site, {})

        config = {
            "extractor": {},
            "downloader": {"rate": "1M", "retries": 3, "timeout": 30},
        }

        if site == 'pixiv':
            config["extractor"]["pixiv"] = {
                "filename": "{id}_{title}_{num}.{extension}",
                "directory": ["pixiv", "{user[name]}"],
            }
            # A refresh token takes precedence over username/password.
            if creds.get('refresh_token'):
                config["extractor"]["pixiv"]["refresh-token"] = creds['refresh_token']
            elif creds.get('username') and creds.get('password'):
                config["extractor"]["pixiv"]["username"] = creds['username']
                config["extractor"]["pixiv"]["password"] = creds['password']
            if options.get('ugoira'):
                config["extractor"]["pixiv"]["ugoira"] = True

        elif site == 'twitter':
            config["extractor"]["twitter"] = {
                "filename": "{tweet_id}_{num}.{extension}",
                "directory": ["twitter", "{user[name]}"],
                "retweets": options.get('retweets', False),
                "videos": options.get('videos', True),
            }
            # An explicit auth token takes precedence over a cookies file.
            if creds.get('auth_token'):
                config["extractor"]["twitter"]["cookies"] = {"auth_token": creds['auth_token']}
            elif creds.get('cookies_file') and os.path.exists(creds['cookies_file']):
                config["extractor"]["twitter"]["cookies"] = creds['cookies_file']

        elif site == 'instagram':
            config["extractor"]["instagram"] = {
                "filename": "{shortcode}_{num}.{extension}",
                "directory": ["instagram", "{username}"],
                "stories": options.get('stories', True),
                "highlights": options.get('highlights', True),
                # Previously the configured "videos" option was ignored.
                "videos": options.get('videos', True),
            }
            if creds.get('session_id'):
                config["extractor"]["instagram"]["cookies"] = {"sessionid": creds['session_id']}
            elif creds.get('username') and creds.get('password'):
                config["extractor"]["instagram"]["username"] = creds['username']
                config["extractor"]["instagram"]["password"] = creds['password']

        elif site == 'deviantart':
            config["extractor"]["deviantart"] = {
                "filename": "{index}_{title}.{extension}",
                "directory": ["deviantart", "{author[username]}"],
                "original": True,
            }
            if creds.get('client_id') and creds.get('client_secret'):
                config["extractor"]["deviantart"]["client-id"] = creds['client_id']
                config["extractor"]["deviantart"]["client-secret"] = creds['client_secret']

        elif site == 'danbooru':
            # Danbooru credentials were configured but never passed on.
            # gallery-dl expects the API key in the "password" field.
            if creds.get('username') and creds.get('api_key'):
                config["extractor"]["danbooru"] = {
                    "username": creds['username'],
                    "password": creds['api_key'],
                }

        elif site == 'imgur':
            config["extractor"]["imgur"] = {
                "filename": "{id}_{num}.{extension}",
                "directory": ["imgur"],
                "mp4": True,
            }
            if creds.get('client_id'):
                config["extractor"]["imgur"]["client-id"] = creds['client_id']

        elif site == 'reddit':
            config["extractor"]["reddit"] = {
                "filename": "{id}_{num}.{extension}",
                "directory": ["reddit", "{subreddit}"],
            }
            if creds.get('client_id') and creds.get('client_secret'):
                config["extractor"]["reddit"]["client-id"] = creds['client_id']
                config["extractor"]["reddit"]["client-secret"] = creds['client_secret']

        return config

    def download(self, url: str, output_dir: str, site: Optional[str] = None,
                 timeout: Optional[float] = 300) -> Tuple[bool, str, list]:
        """Run gallery-dl for one URL.

        A temporary config file (which may contain credentials) is written
        into ``output_dir`` and is always removed again, including when the
        run times out or raises.

        Args:
            url: Page / profile URL to download from.
            output_dir: Destination directory (created if missing).
            site: Site name; detected from the URL when omitted.
            timeout: Seconds to wait for gallery-dl; None waits indefinitely.

        Returns:
            ``(success, message, files)`` where ``files`` are the stdout
            lines of gallery-dl that end in an image extension.
        """
        if not self.is_available():
            return False, "gallery-dl tidak tersedia", []

        if not site:
            site = detect_site(url)

        if not site:
            return False, "Situs tidak didukung", []

        config = self.generate_config(site)
        os.makedirs(output_dir, exist_ok=True)
        config_path = Path(output_dir) / ".gallery-dl-temp.conf"

        cmd = [
            self.gallery_dl_path,
            '--config', str(config_path),
            '--dest', output_dir,
            url
        ]

        print(f"üîÑ Downloading dari {site}...")
        print(f"   URL: {url}")

        try:
            # Create the file owner-only: it can hold passwords and tokens.
            fd = os.open(config_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
            with os.fdopen(fd, 'w', encoding='utf-8') as f:
                json.dump(config, f, indent=2)

            result = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
        except subprocess.TimeoutExpired:
            return False, "Timeout", []
        except Exception as e:
            return False, str(e), []
        finally:
            # Never leave credentials behind, whatever happened above.
            config_path.unlink(missing_ok=True)

        # Match on the suffix: a substring test would also accept paths
        # that merely contain e.g. ".ico" somewhere in a directory name.
        image_suffixes = tuple(IMAGE_EXTENSIONS)
        downloaded = []
        for line in result.stdout.split('\n'):
            entry = line.strip()
            if entry and entry.lower().endswith(image_suffixes):
                downloaded.append(entry)

        if result.returncode == 0:
            return True, f"Berhasil download dari {site}", downloaded
        return False, f"Error: {result.stderr[:200]}", downloaded


gallery_dl = GalleryDLDownloader()
print(f"‚úÖ GalleryDL Wrapper loaded!")
print(f"   gallery-dl available: {'‚úÖ Yes' if gallery_dl.is_available() else '‚ùå No'}")

In [None]:
# ============================================================
# MANUAL CRAWLER
# ============================================================

class ManualCrawler:
    """Fallback crawler that scrapes image URLs from HTML and downloads them.

    Instance state:
      * ``visited_urls``      - pages already fetched (fragment removed).
      * ``image_urls``        - every image URL discovered so far.
      * ``downloaded_hashes`` - MD5 digests of saved payloads, used to skip
        byte-identical duplicates.
    """

    _META_IMAGE_KEYS = ('og:image', 'twitter:image')
    _STYLE_URL_PATTERN = r'url\(["\']?([^"\')\s]+)["\']?\)'
    _SCRIPT_URL_PATTERNS = (
        r'"(?:image|thumbnail|original)[Uu]rl?"\s*:\s*"([^"]+)"',
        r'"url"\s*:\s*"(https?://[^"]+\.(?:jpg|jpeg|png|gif|webp))"',
    )

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update(DEFAULT_HEADERS)
        self.visited_urls: Set[str] = set()
        self.image_urls: Set[str] = set()
        self.downloaded_hashes: Set[str] = set()
        # Image URL -> page it was found on; sent as Referer when downloading.
        self._image_referers: Dict[str, str] = {}
        # Serialises duplicate check + file naming + write across threads.
        self._lock = threading.Lock()

    def reset(self):
        """Forget all crawl and download state (the HTTP session is kept)."""
        self.visited_urls.clear()
        self.image_urls.clear()
        self.downloaded_hashes.clear()
        self._image_referers.clear()

    @staticmethod
    def _resolve_image_url(page_url: str, raw_url: str) -> Optional[str]:
        """Make a scraped URL absolute and upgrade it; None if not HTTP(S)."""
        raw_url = raw_url.strip()
        if not raw_url:
            return None
        absolute = urljoin(page_url, raw_url)
        # Skips data:, blob:, javascript: ... which cannot be downloaded.
        if urlparse(absolute).scheme not in ('http', 'https'):
            return None
        return upgrade_to_high_res(absolute)

    def _extract_image_urls(self, soup, page_url: str) -> Set[str]:
        """Collect candidate image URLs from a parsed page."""
        candidates: List[str] = []

        # 1. <img> tags
        for img in soup.find_all('img'):
            for attr in ('src', 'data-src', 'data-original'):
                value = img.get(attr)
                if value:
                    candidates.append(value)
            # srcset is "url descriptor, url descriptor, ..."; empty entries
            # (e.g. after a trailing comma) are ignored instead of raising.
            for part in (img.get('srcset') or '').split(','):
                tokens = part.split()
                if tokens:
                    candidates.append(tokens[0])

        # 2. <a> tags that link straight to an image
        for link in soup.find_all('a', href=True):
            href = link.get('href')
            if href and is_valid_image_url(href):
                candidates.append(href)

        # 3. Background images in inline styles
        for element in soup.find_all(style=True):
            for match in re.findall(self._STYLE_URL_PATTERN, element.get('style', '')):
                if is_valid_image_url(urljoin(page_url, match)):
                    candidates.append(match)

        # 4. Social-preview meta tags (declared via "property" or "name")
        for meta in soup.find_all('meta'):
            if (meta.get('property') in self._META_IMAGE_KEYS
                    or meta.get('name') in self._META_IMAGE_KEYS):
                content = meta.get('content')
                if content:
                    candidates.append(content)

        # 5. URLs embedded in JSON inside <script> tags
        for script in soup.find_all('script'):
            if not script.string:
                continue
            # JSON commonly escapes "/" as "\/"; undo that before matching.
            text = script.string.replace('\\/', '/')
            for pattern in self._SCRIPT_URL_PATTERNS:
                for match in re.findall(pattern, text):
                    if is_valid_image_url(match):
                        candidates.append(match)

        found: Set[str] = set()
        for candidate in candidates:
            resolved = self._resolve_image_url(page_url, candidate)
            if resolved:
                found.add(resolved)
        return found

    def crawl_page(self, url: str, depth: int = 0, max_depth: int = 1) -> Set[str]:
        """Fetch a page, record its images and optionally follow its links.

        Args:
            url: Page to crawl.
            depth: Current recursion depth (0 for the start page).
            max_depth: Deepest level that is still crawled.

        Returns:
            Image URLs found on this page and on pages crawled below it.
            Errors are printed and yield whatever was found so far.
        """
        # A fragment never changes the fetched document.
        url = url.split('#', 1)[0]
        if url in self.visited_urls or depth > max_depth:
            return set()

        self.visited_urls.add(url)
        found_images: Set[str] = set()

        print(f"üîç Crawling (depth {depth}): {url[:80]}...")

        try:
            parsed = urlparse(url)
            # Per-request header: the shared session is not mutated.
            referer = f"{parsed.scheme}://{parsed.netloc}/"
            response = self.session.get(
                url, timeout=REQUEST_TIMEOUT, headers={'Referer': referer}
            )
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'lxml')

            found_images = self._extract_image_urls(soup, url)
            for image_url in found_images:
                self._image_referers.setdefault(image_url, url)
            self.image_urls.update(found_images)
            print(f"   ‚úÖ Ditemukan {len(found_images)} gambar")

            # Deep crawl: same-domain links only
            if depth < max_depth:
                domain = parsed.netloc
                for link in soup.find_all('a', href=True):
                    next_url = urljoin(url, link.get('href')).split('#', 1)[0]
                    if urlparse(next_url).netloc != domain:
                        continue
                    # Do not pay the delay for pages already seen, and do
                    # not parse direct image links as if they were HTML.
                    if next_url in self.visited_urls or is_valid_image_url(next_url):
                        continue
                    time.sleep(REQUEST_DELAY)
                    found_images.update(self.crawl_page(next_url, depth + 1, max_depth))

        except Exception as e:
            print(f"   ‚ùå Error: {e}")

        return found_images

    @staticmethod
    def _unique_path(directory: Path, filename: str) -> Path:
        """Return a non-existing path in ``directory``: name, name_1, name_2 ..."""
        candidate = directory / filename
        stem, suffix = candidate.stem, candidate.suffix
        counter = 1
        while candidate.exists():
            candidate = directory / f"{stem}_{counter}{suffix}"
            counter += 1
        return candidate

    def _download_one(self, img_url: str, output_dir: str) -> Tuple[str, str]:
        """Download one image; return (stats key, message).

        The stats key is 'downloaded', 'skipped' or 'errors'.
        """
        try:
            referer = self._image_referers.get(img_url)
            if referer is None:
                parsed = urlparse(img_url)
                referer = f"{parsed.scheme}://{parsed.netloc}/"

            response = self.session.get(
                img_url, timeout=REQUEST_TIMEOUT, headers={'Referer': referer}
            )
            response.raise_for_status()

            # Some "image" links are really HTML viewer pages.
            content_type = response.headers.get('content-type', '').lower()
            if content_type.startswith('text/html'):
                return 'skipped', f"Bukan gambar ({content_type})"

            content = response.content
            content_hash = hashlib.md5(content).hexdigest()
            filename = get_filename_from_url(img_url, response)

            # Worker threads share the hash set and the output folder.
            with self._lock:
                if content_hash in self.downloaded_hashes:
                    return 'skipped', "Duplikat"

                filepath = self._unique_path(Path(output_dir), filename)
                with open(filepath, 'wb') as f:
                    f.write(content)
                self.downloaded_hashes.add(content_hash)

            size_kb = len(content) / 1024
            return 'downloaded', f"{filepath.name} ({size_kb:.1f} KB)"

        except Exception as e:
            return 'errors', str(e)

    def download_image(self, img_url: str, output_dir: str) -> Tuple[bool, str]:
        """Download one image.

        Returns:
            ``(True, "<file> (<size> KB)")`` on success, otherwise
            ``(False, reason)`` for duplicates, non-images and errors.
        """
        outcome, message = self._download_one(img_url, output_dir)
        return outcome == 'downloaded', message

    def download_all(self, output_dir: str, parallel: bool = True) -> dict:
        """Download every URL in ``image_urls``.

        Args:
            output_dir: Destination directory (created if missing).
            parallel: Use a thread pool of MAX_WORKERS threads when more
                than one image is queued; otherwise download one by one
                with REQUEST_DELAY between requests.

        Returns:
            Dict with the counters 'downloaded', 'skipped' (duplicates and
            non-images) and 'errors' (failed requests or writes).
        """
        os.makedirs(output_dir, exist_ok=True)

        stats = {'downloaded': 0, 'skipped': 0, 'errors': 0}
        urls = list(self.image_urls)
        total = len(urls)

        if total == 0:
            print("‚ö†Ô∏è Tidak ada gambar untuk didownload")
            return stats

        print(f"\nüì• Downloading {total} gambar...")

        def record(index: int, outcome: str, message: str) -> None:
            status = "‚úÖ" if outcome == 'downloaded' else "‚¨ú"
            print(f"[{index}/{total}] {status} {message}")
            stats[outcome] += 1

        if parallel and total > 1:
            with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
                futures = [
                    executor.submit(self._download_one, image_url, output_dir)
                    for image_url in urls
                ]
                for i, future in enumerate(as_completed(futures), 1):
                    record(i, *future.result())
        else:
            for i, image_url in enumerate(urls, 1):
                record(i, *self._download_one(image_url, output_dir))
                time.sleep(REQUEST_DELAY)

        return stats


crawler = ManualCrawler()
print("‚úÖ Manual Crawler loaded!")

In [None]:
# ============================================================
# MAIN DOWNLOAD FUNCTION
# ============================================================

def download_images(url: str, output_dir: Optional[str] = None, max_depth: Optional[int] = None):
    """Download the images behind a URL; this is the notebook's entry point.

    gallery-dl is tried first when it is enabled, installed and the site
    is recognised. If it is not used or fails, and FALLBACK_TO_CRAWLER is
    set, the manual crawler is used instead.

    Args:
        url: Page / profile URL to crawl.
        output_dir: Output folder (default: OUTPUT_DIR/<domain>).
        max_depth: Crawl depth for the manual crawler (default: MAX_DEPTH).

    Returns:
        Dict with 'downloaded' (file count) and 'method'
        ('gallery-dl', 'manual' or 'none').
    """
    # Set defaults
    if output_dir is None:
        domain = urlparse(url).netloc
        safe_domain = re.sub(r'[^\w\-.]', '_', domain)
        output_dir = os.path.join(OUTPUT_DIR, safe_domain)

    if max_depth is None:
        max_depth = MAX_DEPTH

    # Detect site
    site = detect_site(url)

    print("="*60)
    print("üñºÔ∏è  IMAGE CRAWLER & DOWNLOADER")
    print("="*60)
    print(f"URL: {url}")
    print(f"Site: {site or 'Unknown'}")
    print(f"Output: {output_dir}")
    print("="*60)

    os.makedirs(output_dir, exist_ok=True)

    success = False
    stats = {'downloaded': 0, 'method': 'none'}

    # Try gallery-dl first
    if USE_GALLERY_DL and site and gallery_dl.is_available():
        print("\nüì¶ Menggunakan gallery-dl...")

        # 'user_agent' has a non-empty default and is not a credential, so
        # it is ignored here; otherwise the warning could never fire for
        # sites that define it.
        creds = CREDENTIALS.get(site, {})
        has_creds = any(value for key, value in creds.items() if key != 'user_agent')

        if not has_creds:
            print(f"‚ö†Ô∏è  Kredensial {site} tidak ditemukan (mungkin resolusi terbatas)")

        success, message, reported_files = gallery_dl.download(url, output_dir, site)
        print(f"\n{message}")

        if success:
            stats['downloaded'] = len(reported_files)
            stats['method'] = 'gallery-dl'

    # Fallback to manual crawler
    if not success and FALLBACK_TO_CRAWLER:
        print("\nüîß Menggunakan manual crawler...")

        crawler.reset()
        crawler.crawl_page(url, max_depth=max_depth)

        dl_stats = crawler.download_all(output_dir)
        stats['downloaded'] = dl_stats['downloaded']
        stats['method'] = 'manual'

    # Summary
    print("\n" + "="*60)
    print("üìä RINGKASAN")
    print("="*60)
    print(f"Metode: {stats['method']}")
    print(f"Gambar didownload: {stats['downloaded']}")
    print(f"Output: {output_dir}")
    print("="*60)

    # List downloaded files. gallery-dl stores files in nested folders
    # (site / user), so the search has to be recursive.
    output_root = Path(output_dir)
    image_files = sorted(
        path for path in output_root.rglob('*')
        if path.is_file() and path.suffix.lower() in IMAGE_EXTENSIONS
    )

    if image_files:
        print(f"\nüìÅ File dalam folder ({len(image_files)} gambar):")
        for f in image_files[:10]:
            size_kb = f.stat().st_size / 1024
            print(f"   ‚Ä¢ {f.relative_to(output_root)} ({size_kb:.1f} KB)")
        if len(image_files) > 10:
            print(f"   ... dan {len(image_files) - 10} file lainnya")

    return stats


print("‚úÖ Main function loaded!")
print("\nüí° Gunakan: download_images('URL_ANDA')")

---

## 🚀 5. Download Gambar

### Cara Penggunaan:

1. Masukkan URL di cell di bawah
2. Jalankan cell
3. Gambar akan tersimpan di folder `downloaded_images/[domain]/`

### Contoh URL yang Didukung:
- Pixiv: `https://www.pixiv.net/en/users/12345`
- Twitter: `https://twitter.com/username`
- Instagram: `https://www.instagram.com/username/`
- DeviantArt: `https://www.deviantart.com/username`
- ArtStation: `https://www.artstation.com/username`
- Imgur: `https://imgur.com/a/albumid`
- Dan website lainnya!

In [None]:
# ============================================================
# ENTER THE URL HERE AND RUN THE CELL!
# ============================================================

URL = "https://www.pixiv.net/en/users/86903979"  # <-- Replace with your URL

# Run the download (output folder and crawl depth use their defaults)
download_images(URL)

### 📥 Download Multiple URLs

Untuk download dari beberapa URL sekaligus:

In [None]:
# ============================================================
# DOWNLOAD MULTIPLE URLs
# ============================================================

URLS = [
    # Add URLs here
    # "https://www.pixiv.net/en/users/12345",
    # "https://www.artstation.com/username",
    # "https://example.com/gallery",
]

# Run the download for every URL, one after another, with a banner per URL
if URLS:
    for i, url in enumerate(URLS, 1):
        print(f"\n{'#'*60}")
        print(f"# URL {i}/{len(URLS)}")
        print(f"{'#'*60}")
        download_images(url)
        print("\n")
else:
    print("‚ö†Ô∏è Tambahkan URL ke list URLS di atas!")

---

## 🖼️ 6. Preview Gambar

Lihat preview gambar yang sudah didownload:

In [None]:
# ============================================================
# PREVIEW GAMBAR YANG DIDOWNLOAD
# ============================================================

def preview_images(folder: str = OUTPUT_DIR, max_images: int = 12):
    """Show inline previews of downloaded images.

    Images are searched recursively below ``folder``. Formats that the
    notebook cannot render inline (.svg, .ico) are counted but not shown.

    Args:
        folder: Folder to scan.
        max_images: Maximum number of previews to display.
    """
    folder = Path(folder)

    if not folder.exists():
        print(f"‚ùå Folder tidak ditemukan: {folder}")
        return

    # One walk with a case-insensitive suffix test: globbing "*.jpg" and
    # "*.JPG" separately lists every file twice on case-insensitive
    # filesystems and misses mixed-case suffixes such as ".Jpg".
    images = sorted(
        path for path in folder.rglob('*')
        if path.is_file() and path.suffix.lower() in IMAGE_EXTENSIONS
    )

    if not images:
        print(f"‚ö†Ô∏è Tidak ada gambar di folder: {folder}")
        return

    # Filter before slicing so skipped formats do not use up preview slots.
    previewable = [p for p in images if p.suffix.lower() not in ('.svg', '.ico')]
    selection = previewable[:max_images]

    print(f"üìÅ Folder: {folder}")
    print(f"üì∑ Total gambar: {len(images)}")
    print(f"üñºÔ∏è Menampilkan {len(selection)} preview:\n")

    for img_path in selection:
        try:
            # File names originate from remote content; escape them before
            # they are rendered as HTML.
            caption = html.escape(img_path.name)
            display(HTML(f"<b>{caption}</b> ({img_path.stat().st_size/1024:.1f} KB)"))
            display(IPImage(filename=str(img_path), width=300))
            print("")
        except Exception as e:
            print(f"‚ö†Ô∏è Tidak bisa menampilkan {img_path.name}: {e}")

    remaining = len(images) - len(selection)
    if remaining > 0:
        print(f"\n... dan {remaining} gambar lainnya")

# Preview gambar
preview_images(OUTPUT_DIR)

---

## 📂 7. Kelola File

Utilitas untuk mengelola file yang didownload:

In [None]:
# ============================================================
# LIST SEMUA FOLDER DAN FILE
# ============================================================

def list_downloads(folder: str = OUTPUT_DIR):
    """Print image counts and sizes per sub-folder of the output folder.

    Each direct sub-folder is scanned recursively, because gallery-dl
    stores files in nested site / user folders. Images lying directly in
    ``folder`` are reported as "(root)".

    Args:
        folder: Output folder to summarise.
    """
    folder = Path(folder)

    if not folder.exists():
        print(f"‚ùå Folder tidak ada: {folder}")
        return

    print(f"üìÅ Output Directory: {folder}\n")

    total_files = 0
    total_size = 0

    for subfolder in sorted(folder.iterdir()):
        if subfolder.is_dir():
            image_files = [
                f for f in subfolder.rglob('*')
                if f.is_file() and f.suffix.lower() in IMAGE_EXTENSIONS
            ]
            folder_size = sum(f.stat().st_size for f in image_files) / (1024*1024)

            print(f"üìÇ {subfolder.name}/")
            print(f"   ‚îî‚îÄ‚îÄ {len(image_files)} gambar ({folder_size:.2f} MB)")

            total_files += len(image_files)
            total_size += folder_size

    # Also check root folder
    root_files = [f for f in folder.glob('*') if f.is_file() and f.suffix.lower() in IMAGE_EXTENSIONS]
    if root_files:
        root_size = sum(f.stat().st_size for f in root_files) / (1024*1024)
        print(f"üìÑ (root): {len(root_files)} gambar ({root_size:.2f} MB)")
        total_files += len(root_files)
        total_size += root_size

    print(f"\n{'='*40}")
    print(f"üìä Total: {total_files} gambar ({total_size:.2f} MB)")


list_downloads()

In [None]:
# ============================================================
# HAPUS FOLDER DOWNLOAD (HATI-HATI!)
# ============================================================

def clear_downloads(folder: str = OUTPUT_DIR, confirm: bool = False):
    """Delete the download folder and everything inside it.

    Nothing is removed unless ``confirm`` is True; without it only a
    warning is printed.

    Args:
        folder: Folder to delete.
        confirm: Must be True for the deletion to happen.
    """
    target = Path(folder)

    if not target.exists():
        print(f"‚ùå Folder tidak ada: {target}")
    elif not confirm:
        # Safety latch: explain what would happen and stop.
        print(f"‚ö†Ô∏è Ini akan MENGHAPUS semua file di: {target}")
        print("‚ö†Ô∏è Set confirm=True untuk melanjutkan")
    else:
        shutil.rmtree(target)
        print(f"‚úÖ Folder {target} telah dihapus")


# Uncomment baris di bawah untuk menghapus (HATI-HATI!)
# clear_downloads(confirm=True)

---

## ℹ️ 8. Bantuan & Tips

### 🔐 Cara Mendapatkan Kredensial:

#### Pixiv (Refresh Token):
```bash
pip install gppt
gppt login
```
Lalu copy refresh token yang muncul.

#### Twitter (Auth Token):
1. Login ke Twitter di browser
2. Buka Developer Tools (F12)
3. Tab Application > Cookies > twitter.com
4. Cari `auth_token`, copy value-nya

#### Instagram (Session ID):
1. Login ke Instagram di browser
2. Buka Developer Tools (F12)
3. Tab Application > Cookies > instagram.com
4. Cari `sessionid`, copy value-nya

### ⚠️ Troubleshooting:

- **gallery-dl tidak ditemukan**: Jalankan cell install di atas
- **Login gagal**: Cek kredensial, coba refresh token/session
- **Rate limited**: Tunggu beberapa menit, naikkan REQUEST_DELAY
- **Gambar resolusi rendah**: Pastikan kredensial terisi

### 📝 Catatan:

- Jangan share notebook yang berisi kredensial
- Hormati copyright dan terms of service
- Gunakan untuk keperluan personal saja

In [None]:
import shutil
from pathlib import Path

def compress_folder_for_manual_download(folder_path: str):
    """Compress a local folder into a ZIP file for manual download.

    The archive is named ``<folder name>.zip`` and is written to the
    current working directory.

    Args:
        folder_path (str): Path of the folder to compress.

    Returns:
        Absolute path of the created archive, or None when the folder is
        missing, is not a directory, or compression failed.
    """
    folder = Path(folder_path)
    if not folder.exists():
        print(f"‚ùå Folder tidak ditemukan: {folder_path}")
        return None
    if not folder.is_dir():
        print(f"‚ùå Path yang diberikan bukan folder: {folder_path}")
        return None

    # Resolve first: for paths such as "." the unresolved name is empty,
    # which would yield an archive called just ".zip".
    resolved = folder.resolve()
    output_zip_name = f"{resolved.name}.zip"
    print(f"Compressing folder '{folder_path}' to '{output_zip_name}'...")
    try:
        archive_path = shutil.make_archive(
            str(Path.cwd() / resolved.name),
            'zip',
            root_dir=str(resolved.parent),
            base_dir=resolved.name,
        )
    except Exception as e:
        print(f"‚ùå Terjadi error saat mengompres folder: {e}")
        return None

    print(f"‚úÖ Folder berhasil dikompres ke: {output_zip_name}")
    # Report the real location instead of assuming Colab's /content/.
    print(f"‚ÑπÔ∏è Anda bisa menemukan file ini di direktori `{Path(archive_path).parent}`. Silakan download secara manual.")
    return archive_path


# --- CARA PENGGUNAAN ---
# Ganti 'path/ke/folder/anda' dengan folder yang ingin Anda kompres.
# Misalnya, untuk mengkompres semua gambar dari Folder:
# folder_to_compress = "downloaded_images/folder"

# Atau untuk mengkompres seluruh folder output utama:
folder_to_compress = OUTPUT_DIR # Menggunakan variabel OUTPUT_DIR yang sudah ada

compress_folder_for_manual_download(folder_to_compress)

In [None]:
# ============================================================
# SUPPORTED SITE INFO
# Prints one table row per SUPPORTED_SITES entry.
# ============================================================

print("üåê Situs yang Didukung:\n")
print(f"{'Situs':<15} {'Domain':<30} {'Perlu Login?'}")
print("="*60)

# Site name -> text for the "login needed?" column of the table.
auth_info = {
    'pixiv': 'Ya (untuk resolusi penuh)',
    'twitter': 'Ya (untuk media)',
    'instagram': 'Ya (untuk stories/highlights)',
    'deviantart': 'Opsional',
    'artstation': 'Tidak',
    'danbooru': 'Opsional',
    'imgur': 'Tidak',
    'reddit': 'Opsional',
}

for site, info in SUPPORTED_SITES.items():
    # Only the first two domains are shown to keep the column narrow.
    domains = ', '.join(info['domains'][:2])
    auth = auth_info.get(site, 'Tidak diketahui')
    print(f"{site.capitalize():<15} {domains:<30} {auth}")