In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
from datetime import datetime
from tqdm import tqdm
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

In [2]:
# Build a requests session that retries automatically
def create_session_with_retries(retries=3, backoff_factor=0.3):
    """Return a ``requests.Session`` whose HTTP and HTTPS adapters retry failed requests.

    Args:
        retries: Used for the ``total``, ``read`` and ``connect`` limits of the
            urllib3 ``Retry`` policy.
        backoff_factor: Forwarded unchanged to ``Retry(backoff_factor=...)``.

    Returns:
        A new session with one shared retrying adapter mounted on both the
        ``http://`` and ``https://`` prefixes. Responses with status 500, 502,
        503 or 504 are listed in ``status_forcelist``, so they are retried too.
    """
    retry_policy = Retry(
        status_forcelist=[500, 502, 503, 504],  # HTTP statuses that trigger a retry
        backoff_factor=backoff_factor,
        connect=retries,
        read=retries,
        total=retries,
    )
    retrying_adapter = HTTPAdapter(max_retries=retry_policy)

    http = requests.Session()
    # The same adapter instance serves both schemes.
    for scheme in ('http://', 'https://'):
        http.mount(scheme, retrying_adapter)
    return http

In [4]:
# Scrape a single article from its URL
def scrape_article(session, url, keywords, timeout=30):
    """Fetch one article page, extract its fields, and keep it only on a keyword hit.

    Args:
        session: Object exposing a requests-style
            ``get(url, headers=..., timeout=...)`` method.
        url: Address of the article page.
        keywords: Iterable of strings. Matching is a case-insensitive substring
            test against the concatenated headline, lead and body.
        timeout: Seconds passed to ``session.get`` so a stalled connection
            cannot block the whole run. Defaults to 30.

    Returns:
        A dict with keys ``URL``, ``Rubric``, ``Date``, ``Headline``,
        ``Author``, ``Lead`` and ``Body`` when at least one keyword occurs in
        the content; otherwise ``None``. Fields whose element is missing are
        ``'N/A'``. ``None`` is also returned, after printing the error, when
        the request fails, the server answers with an HTTP error status, or
        parsing raises.
    """
    # Every selector shares this prefix; build it once instead of repeating it.
    article_root = '#wrapforleftpush > div.wrapping.mar-t-10 > div.container-section > div.bag-kiri'
    content_box = f'{article_root} > div.box-det-desk-2 > div.tmpt-desk-kon'

    try:
        # Browser-like request headers
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0'
        }

        # Send the request through the caller's session with an explicit timeout
        response = session.get(url, headers=headers, timeout=timeout)
        # Treat HTTP error pages (404, 403, ...) as failures instead of parsing them
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        def text_at(selector):
            """Stripped text of the first element matching ``selector``, or 'N/A'."""
            node = soup.select_one(selector)
            return node.get_text(strip=True) if node else 'N/A'

        # Rubric (first breadcrumb link)
        rubric_text = text_at(f'{article_root} > div.breadcumb.fs18 > a:nth-child(1)')

        # Publication date
        date_text = text_at(f'{article_root} > div.fs14.ff-opensans.font-gray')

        # Headline
        headline_text = text_at(f'{article_root} > h1')

        # Reporter and editor names
        author_text = text_at(f'{content_box} > p:nth-child(1)')

        # Lead paragraph
        lead_text = text_at(f'{content_box} > p:nth-child(2)')

        # Body: only these fixed child positions are read; positions that are
        # absent or not <p> elements are skipped.
        # NOTE(review): paragraphs at other positions (4, 5, 8, 12, 16+) are
        # never collected - confirm this is intentional for this page layout.
        body_nodes = (
            soup.select_one(f'{content_box} > p:nth-child({i})')
            for i in [2, 3, 6, 7, 9, 10, 11, 13, 14, 15]
        )
        body_paragraphs = [p.get_text(strip=True) for p in body_nodes if p]
        body_text = " ".join(body_paragraphs) if body_paragraphs else 'N/A'

        # Keyword check (case-insensitive substring match)
        content = f"{headline_text} {lead_text} {body_text}".lower()
        keyword_match = any(keyword.lower() in content for keyword in keywords)

        # Articles without any keyword are discarded
        if not keyword_match:
            return None

        return {
            'URL': url,
            'Rubric': rubric_text,
            'Date': date_text,
            'Headline': headline_text,
            'Author': author_text,  # Reporter and editor
            'Lead': lead_text,
            'Body': body_text
        }
    except Exception as e:
        # Best effort: report the failure and let the caller move on to the next URL
        print(f"Error scraping {url}: {e}")
        return None
 

In [None]:
import os
from tqdm import tqdm

# Load URLs from file
# NOTE(review): both paths below are absolute and point inside a site-packages
# directory; consider moving them to a configurable data directory.
url_file_path = r'C:\Users\ASUS\AppData\Roaming\Python\Python313\site-packages\pandas\io\excel\(1jan19_30jun24) scraped_urls filtered.xlsx'
df_urls = pd.read_excel(url_file_path)

# Define fixed output file path (no timestamp)
output_file_path = r'C:\Users\ASUS\AppData\Roaming\Python\Python313\site-packages\pandas\io\excel\scraped_data_incremental.xlsx'
print(f"Saving to this file: {output_file_path}")

OUTPUT_COLUMNS = ['URL', 'Rubric', 'Date', 'Headline', 'Author', 'Lead', 'Body']

# Number of newly matched articles to collect between two writes of the Excel
# file. Every write rewrites the whole file, so saving after each article makes
# the run slower and slower as the result grows. A hard kernel kill can lose at
# most SAVE_EVERY - 1 articles; they are simply scraped again on the next run.
SAVE_EVERY = 25

# Resume from a previous run: URLs already present in the output are skipped
if os.path.exists(output_file_path):
    df_scraped = pd.read_excel(output_file_path)
    scraped_urls = set(df_scraped['URL'].tolist())  # Avoid duplicates
else:
    df_scraped = pd.DataFrame(columns=OUTPUT_COLUMNS)
    scraped_urls = set()

# Define keywords
keywords = [
    'pendapatan masyarakat', 'penghasilan masyarakat', 'ekspektasi pendapatan', 'gaji', 'upah', 'gaji karyawan', 'upah karyawan',
    'pendapatan riil', 'tabungan masyarakat', 'konsumsi masyarakat', 'lapangan kerja', 'lapangan pekerjaan', 'pengangguran',
    'lowongan kerja', 'PHK', 'Pemutusan Hubungan Kerja', 'kesempatan kerja', 'penyerapan tenaga kerja', 'pengangguran terbuka',
    'pengangguran terselubung', 'pengangguran struktural', 'kegiatan usaha', 'aktivitas bisnis', 'prospek usaha', 'inovasi bisnis',
    'optimisme konsumen', 'UMKM', 'pembelian barang tahan lama', 'daya beli', 'belanja konsumen', 'barang tahan lama',
    'harga barang elektronik', 'harga produk elektronik', 'harga mobil', 'harga produk tahan lama', 'durable goods',
    'pertumbuhan ekonomi', 'PDB', 'ekspansi ekonomi', 'kontraksi ekonomi', 'resesi ekonomi', 'krisis ekonomi', 'pemulihan ekonomi',
    'Pemulihan Ekonomi Nasional', 'economic recovery', 'laju ekonomi', 'inflasi', 'deflasi', 'harga barang komoditas',
    'harga pangan', 'harga energi', 'harga BBM', 'nilai tukar', 'depresiasi mata uang', 'inflasi inti', 'tingkat inflasi',
    'tekanan inflasi', 'Indeks Harga Konsumen', 'harga barang pokok', 'Indeks Harga Produsen', 'permintaan agregat',
    'kebijakan moneter', 'suku bunga Bank Indonesia', 'suku bunga BI', 'BI rate', 'rupiah', 'stimulus moneter', 'suku bunga acuan',
    'suku bunga kredit', 'suku bunga pinjaman', 'suku bunga deposito', 'suku bunga pasar', 'FOMC Rate', 'suku bunga rendah',
    'suku bunga tinggi', 'tingkat bunga pinjaman', 'bunga kredit perbankan', 'yield obligasi', 'likuiditas perbankan', 'pasar obligasi'
]

# Create session with retry
session = create_session_with_retries()

# Matched articles collected since the last save (not yet part of df_scraped)
pending_records = []


def flush_pending_records():
    """Merge pending records into ``df_scraped`` and rewrite the output file.

    Returns True when the Excel file was written, False when writing failed.
    On failure the merged rows stay in ``df_scraped``, so the next call
    writes them again.
    """
    global df_scraped
    if pending_records:
        batch = pd.DataFrame(pending_records, columns=OUTPUT_COLUMNS)
        # Build from the batch alone when nothing was scraped before, so no
        # empty frame is passed to concat.
        df_scraped = batch if df_scraped.empty else pd.concat([df_scraped, batch], ignore_index=True)
        pending_records.clear()
    try:
        df_scraped.to_excel(output_file_path, index=False)
    except Exception as e:
        print(f"❌ Failed to save data to {output_file_path}: {e}")
        return False
    return True


# Skip empty cells and repeated URLs so each address is requested at most once
urls_to_visit = df_urls['url'].dropna().drop_duplicates()

# NOTE(review): scrape_article returns None both for failed requests and for
# articles without a keyword hit, so neither kind is recorded here and both are
# fetched again when the run is resumed.
save_ok = True
try:
    # Start scraping loop
    for url in tqdm(urls_to_visit, desc='Scraping Progress', unit='url'):
        if url in scraped_urls:
            continue  # Skip already scraped

        article_data = scrape_article(session, url, keywords)
        if not article_data:
            continue

        pending_records.append(article_data)
        scraped_urls.add(url)

        # Save periodically instead of after every single article
        if len(pending_records) >= SAVE_EVERY:
            save_ok = flush_pending_records()
finally:
    # Runs on normal completion, on Ctrl+C and on unexpected errors, so
    # collected articles are written out (or a failed save is retried).
    if pending_records or not save_ok:
        save_ok = flush_pending_records()

if save_ok:
    print(f"✅ Scraping completed. Final data saved to: {output_file_path}")
else:
    print(f"⚠️ Scraping completed, but the final save failed: {output_file_path}")

Saving to this file: C:\Users\ASUS\AppData\Roaming\Python\Python313\site-packages\pandas\io\excel\scraped_data_incremental.xlsx


Scraping Progress:   4%|█▊                                               | 12432/347504 [1:47:22<376:57:29,  4.05s/url]

Error scraping https://investasi.kontan.co.id/news/gelar-rupslb-mitra-komunikasi-nusantara-mknt-rombak-susunan-direksi-dan-komisaris: HTTPSConnectionPool(host='investasi.kontan.co.id', port=443): Max retries exceeded with url: /news/gelar-rupslb-mitra-komunikasi-nusantara-mknt-rombak-susunan-direksi-dan-komisaris (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x0000027791ED6990>: Failed to establish a new connection: [WinError 10013] An attempt was made to access a socket in a way forbidden by its access permissions'))


Scraping Progress:  14%|██████▋                                        | 49569/347504 [10:23:45<2450:37:02, 29.61s/url]

Error scraping https://investasi.kontan.co.id/news/kinerja-moncer-sejak-awal-tahun-penghuni-indeks-idxgrowth30-ini-layak-koleksi: HTTPSConnectionPool(host='investasi.kontan.co.id', port=443): Max retries exceeded with url: /news/kinerja-moncer-sejak-awal-tahun-penghuni-indeks-idxgrowth30-ini-layak-koleksi (Caused by ResponseError('too many 504 error responses'))


Scraping Progress:  23%|███████████▎                                    | 81630/347504 [22:10:40<282:53:27,  3.83s/url]

Error scraping https://analisis.kontan.co.id/news/lebih-tegas: HTTPSConnectionPool(host='analisis.kontan.co.id', port=443): Max retries exceeded with url: /news/lebih-tegas (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000002779466A490>: Failed to resolve 'analisis.kontan.co.id' ([Errno 11001] getaddrinfo failed)"))


Scraping Progress:  23%|███████████▎                                    | 81631/347504 [22:10:42<237:59:59,  3.22s/url]

Error scraping https://internasional.kontan.co.id/news/pbb-misil-yang-digunakan-untuk-menyerang-arab-saudi-berasal-dari-iran: HTTPSConnectionPool(host='internasional.kontan.co.id', port=443): Max retries exceeded with url: /news/pbb-misil-yang-digunakan-untuk-menyerang-arab-saudi-berasal-dari-iran (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x00000277961FA490>: Failed to resolve 'internasional.kontan.co.id' ([Errno 11001] getaddrinfo failed)"))


Scraping Progress:  23%|███████████▎                                    | 81632/347504 [22:10:44<206:35:29,  2.80s/url]

Error scraping https://investasi.kontan.co.id/news/ihsg-ditutup-melemah-047-ke-4831-pada-sesi-i-sektor-aneka-industri-ke-zona-hijau: HTTPSConnectionPool(host='investasi.kontan.co.id', port=443): Max retries exceeded with url: /news/ihsg-ditutup-melemah-047-ke-4831-pada-sesi-i-sektor-aneka-industri-ke-zona-hijau (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x0000027797AA1450>: Failed to resolve 'investasi.kontan.co.id' ([Errno 11001] getaddrinfo failed)"))


Scraping Progress:  23%|███████████▎                                    | 81633/347504 [22:10:45<184:34:47,  2.50s/url]

Error scraping https://investasi.kontan.co.id/news/rupiah-ada-di-level-rp-14204-per-dolar-as-melemah-129-pada-siang-hari-ini: HTTPSConnectionPool(host='investasi.kontan.co.id', port=443): Max retries exceeded with url: /news/rupiah-ada-di-level-rp-14204-per-dolar-as-melemah-129-pada-siang-hari-ini (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x0000027797AA16D0>: Failed to resolve 'investasi.kontan.co.id' ([Errno 11001] getaddrinfo failed)"))


Scraping Progress:  23%|███████████▎                                    | 81634/347504 [22:10:47<169:11:34,  2.29s/url]

Error scraping https://industri.kontan.co.id/news/inilah-jajaran-direksi-pertamina-yang-baru-ada-mantan-direksi-mandiri-sekuritas: HTTPSConnectionPool(host='industri.kontan.co.id', port=443): Max retries exceeded with url: /news/inilah-jajaran-direksi-pertamina-yang-baru-ada-mantan-direksi-mandiri-sekuritas (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x0000027797AA2850>: Failed to resolve 'industri.kontan.co.id' ([Errno 11001] getaddrinfo failed)"))


Scraping Progress:  23%|███████████▎                                    | 81635/347504 [22:10:49<158:25:27,  2.15s/url]

Error scraping https://nasional.kontan.co.id/news/agar-pbi-tepat-sasaran-penyempurnaan-dtks-harus-terus-dilakukan: HTTPSConnectionPool(host='nasional.kontan.co.id', port=443): Max retries exceeded with url: /news/agar-pbi-tepat-sasaran-penyempurnaan-dtks-harus-terus-dilakukan (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x0000027797AA20D0>: Failed to resolve 'nasional.kontan.co.id' ([Errno 11001] getaddrinfo failed)"))


Scraping Progress:  23%|███████████▎                                    | 81636/347504 [22:10:51<150:53:06,  2.04s/url]

Error scraping https://industri.kontan.co.id/news/rampingkan-jajaran-direksi-ini-susunan-direksi-baru-pertamina: HTTPSConnectionPool(host='industri.kontan.co.id', port=443): Max retries exceeded with url: /news/rampingkan-jajaran-direksi-ini-susunan-direksi-baru-pertamina (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x0000027797AA3390>: Failed to resolve 'industri.kontan.co.id' ([Errno 11001] getaddrinfo failed)"))


Scraping Progress:  26%|████████████▍                                   | 90385/347504 [26:42:16<119:07:20,  1.67s/url]