In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
from datetime import datetime, timedelta
import os

In [2]:
# Chrome options untuk headless browsing dan block image loading
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("--headless")  # Run headless
chrome_options.add_argument('--blink-settings=imagesEnabled=false')  # Block image loading
 
# ChromeDriver path
service = Service('C:\\Users\\ASUS\\AppData\\Roaming\\Python\\Python313\\Scripts\\chromedriver-win64\\chromedriver.exe')
driver = webdriver.Chrome(service=service, options=chrome_options)
 
# Generate the dates
def generate_dates(start_date, end_date):
    date_list = []
    current_date = start_date
    while current_date <= end_date:
        date_list.append((current_date.strftime("%d"), current_date.strftime("%m"), current_date.strftime("%Y")))
        current_date += timedelta(days=1)
    return date_list 

In [3]:
# Save URLs Excel (incrementally)
def save_to_excel(scraped_urls, file_name='scraped_urls_2019_to_2024.xlsx'):
    df = pd.DataFrame(scraped_urls, columns=['URL'])
    
    # Jika file tidak exist, create it; otherwise, append 
    if not os.path.exists(file_name):
        df.to_excel(file_name, index=False)  # Create new file
    else:
        existing_df = pd.read_excel(file_name)
        combined_df = pd.concat([existing_df, df]).drop_duplicates().reset_index(drop=True)
        combined_df.to_excel(file_name, index=False)  # Save combined data tanpa duplicates
 
# Load previously saved URLs (kalau ada)
def load_existing_urls(file_name='scraped_urls_2019_to_2024.xlsx'):
    if os.path.exists(file_name):
        return pd.read_excel(file_name)['URL'].tolist()
    return []

In [4]:
# Scrape URLs untuk tanggal spesifik
def scrape_urls_selenium(tanggal, bulan, tahun, max_pages=40):
    base_url = f'https://www.kontan.co.id/search/indeks?kanal=&tanggal={tanggal}&bulan={bulan}&tahun={tahun}&pos=indeks'
    driver.get(base_url)
 
    all_urls = []
    page = 1
 
    while page <= max_pages:
        print(f"Scraping page {page} for {tanggal}-{bulan}-{tahun}")
 
        # Tunggu hasil search to load using WebDriverWait, no sleep
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.XPATH, "//div[contains(@class, 'search_wrap_kanan')]//a[contains(@href, '/news/')]"))
            )
        except Exception as e:
            print(f"Error: {e}. Stopping.")
            break
 
        # Find all the <a> elements that contain '/news/' 
        links = driver.find_elements(By.XPATH, "//div[contains(@class, 'search_wrap_kanan')]//a[contains(@href, '/news/')]")
        
        if not links:
            print(f"No more news articles found on page {page} for {tanggal}-{bulan}-{tahun}. Stopping.")
            break
 
        # Collect URLs 
        urls = [link.get_attribute("href") for link in links]
        all_urls.extend(urls)
        print(f"Scraped {len(urls)} URLs from page {page} for {tanggal}-{bulan}-{tahun}")
 
        # Go to the next page pakai tombol 'Next »' 
        try:
            next_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, "//li[contains(@class, 'button')]//a[contains(text(), 'Next »')]"))
            )
 
            # Scroll page pada tombol next lalu click
            driver.execute_script("arguments[0].scrollIntoView(true);", next_button)
            driver.execute_script("arguments[0].click();", next_button)
            page += 1
        except Exception as e:
            print(f"No 'Next »' button found on page {page}. Stopping. Error: {e}")
            break
 
    return all_urls

In [5]:
# Define start & end dates
start_date = datetime(2019, 1, 1)
end_date = datetime(2024, 8, 31)
 
# Generate the list of dates to scrape
dates_to_scrape = generate_dates(start_date, end_date)
 
# Load scraped URLs sebelumnya jika exist
all_scraped_urls = load_existing_urls()
 
# Scrape URLs dan save incrementally
for day, month, year in dates_to_scrape:
    urls = scrape_urls_selenium(day, month, year)
 
    # Cek jika ada duplicates, only add new URLs
    new_urls = [url for url in urls if url not in all_scraped_urls]
    if new_urls:
        all_scraped_urls.extend(new_urls)
        save_to_excel(new_urls)  # Save incrementally 
 
# Close the driver
driver.quit()


Scraping page 1 for 01-01-2019
Scraped 40 URLs from page 1 for 01-01-2019
Scraping page 2 for 01-01-2019
Scraped 40 URLs from page 2 for 01-01-2019
Scraping page 3 for 01-01-2019
Scraped 40 URLs from page 3 for 01-01-2019
Scraping page 4 for 01-01-2019
Scraped 40 URLs from page 4 for 01-01-2019
Scraping page 5 for 01-01-2019
Scraped 40 URLs from page 5 for 01-01-2019
Scraping page 6 for 01-01-2019
Scraped 16 URLs from page 6 for 01-01-2019
No 'Next »' button found on page 6. Stopping. Error: Message: 
Stacktrace:
	GetHandleVerifier [0x00007FF733005335+78597]
	GetHandleVerifier [0x00007FF733005390+78688]
	(No symbol) [0x00007FF732DB91AA]
	(No symbol) [0x00007FF732E0F149]
	(No symbol) [0x00007FF732E0F3FC]
	(No symbol) [0x00007FF732E62467]
	(No symbol) [0x00007FF732E3712F]
	(No symbol) [0x00007FF732E5F2BB]
	(No symbol) [0x00007FF732E36EC3]
	(No symbol) [0x00007FF732E003F8]
	(No symbol) [0x00007FF732E01163]
	GetHandleVerifier [0x00007FF7332AEEED+2870973]
	GetHandleVerifier [0x00007FF7332A9

KeyboardInterrupt: 