# Scrape - Date Range

In [1]:
import pandas as pd
import os
from datetime import datetime
import requests
from bs4 import BeautifulSoup
import re

# Buat direktori untuk menyimpan hasil scraping
os.makedirs('scraped_reports', exist_ok=True)

# Fungsi untuk membersihkan karakter ilegal dari teks
def clean_text(text):
    # Hapus karakter ilegal
    return re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)

# Fungsi untuk mengambil artikel di halaman utama
def scrape_page(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    articles = soup.find_all('article')

    titles, links, dates = [], [], []

    for article in articles:
        link = article.find('a')['href']
        title = article.find('h2').text.strip()

        # Ambil tanggal dari elemen dengan class yang sesuai
        date_tag = article.find('span', class_='date')
        date = date_tag.text.strip() if date_tag else 'No date found'

        titles.append(clean_text(title))  # Bersihkan judul
        links.append(link)
        dates.append(clean_text(date))  # Bersihkan tanggal

    return titles, links, dates

# Fungsi untuk mengambil konten artikel dan memperbarui tanggal
def fetch_news_content(url):
    try:
        response = requests.get(url)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        date_div = soup.find('div', class_='text-cm text-gray')
        date_text = date_div.get_text(strip=True) if date_div else 'Tanggal tidak ditemukan'

        content_div = soup.find('div', class_='detail-text min-w-0')
        paragraphs = content_div.find_all('p') if content_div else []
        news_text = "\n".join(paragraph.get_text(strip=True) for paragraph in paragraphs)

        return clean_text(date_text), clean_text(news_text)  # Bersihkan teks artikel dan tanggal

    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return 'Error', 'Error'

# Simpan seluruh data ke dalam file Excel
def save_to_excel(df):
    df.to_excel('scraped_reports/scraped_articles.xlsx', index=False, engine='openpyxl')
    print(f'Saved all articles in scraped_articles.xlsx with {len(df)} articles')

# Fungsi untuk melakukan scraping berdasarkan tanggal dan jumlah halaman
def scrape_news_by_date(start_date, end_date, max_pages=5):
    all_data = []
    base_url = "https://www.cnbcindonesia.com/market/indeks/5"
    date_range = pd.date_range(start=start_date, end=end_date)

    for target_date in date_range:
        formatted_date = target_date.strftime("%Y/%m/%d")

        # Loop untuk setiap halaman
        for i in range(1, max_pages + 1):
            url = f"{base_url}/{i}?date={formatted_date}&tipe=artikel"
            titles, links, dates = scrape_page(url)

            # Ambil konten dari setiap artikel jika data tersedia
            for title, link, date in zip(titles, links, dates):
                article_date, content = fetch_news_content(link)

                if title and link and article_date and content:
                    all_data.append({
                        'Title': title,
                        'Link': link,
                        'Date': article_date,
                        'Content': content
                    })

    # Simpan semua data dalam satu file Excel
    if all_data:
        df = pd.DataFrame(all_data)
        save_to_excel(df)
        return df

# Mulai proses scraping
start_date = datetime.strptime("2025/01/01", "%Y/%m/%d")
end_date = datetime.strptime("2025/01/31", "%Y/%m/%d")

scrape_news_by_date(start_date, end_date, max_pages=5)


Error fetching https://www.cnbcindonesia.com/market/20250122080826-17-605011/optimis-tinggi-dirut-bbri-sunarso-borong-210000-saham-perusahaan: ('Connection aborted.', TimeoutError(10060, 'A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond', None, 10060, None))
Saved all articles in scraped_articles.xlsx with 855 articles


Unnamed: 0,Title,Link,Date,Content
0,"KRL Diserbu! Jumlah Penumpang Tembus 1,2 Juta ...",https://www.cnbcindonesia.com/market/202501012...,01 January 2025 21:50,"Jakarta, CNBC Indonesia -Penumpang angkutan ke..."
1,"Pengawasan Aset Kripto Bakal Dipindah ke OJK, ...",https://www.cnbcindonesia.com/market/202501012...,01 January 2025 21:37,"Jakarta, CNBC Indonesia-Asosiasi Pedagang Aset..."
2,"Ini Gaji Terkecil dan Batas Usia Pay Later, At...",https://www.cnbcindonesia.com/market/202501011...,01 January 2025 16:30,"Jakarta, CNBC Indonesia- Otoritas Jasa Keuanga..."
3,8 Kripto Ini Bakal Jadi Primadona di Tahun 2025,https://www.cnbcindonesia.com/market/202501011...,01 January 2025 15:45,"Jakarta, CNBC Indonesia -Mata uang kripto (cry..."
4,"Dukung Ekonomi RI, Simak 10 Pencapaian BRI di ...",https://www.cnbcindonesia.com/market/202501011...,01 January 2025 15:18,"Jakarta, CNBC Indonesia- PT Bank Rakyat Indone..."
...,...,...,...,...
850,Bos BRI (BBRI) Kasih Bocoran Laba 2024,https://www.cnbcindonesia.com/market/202501310...,31 January 2025 07:50,"Jakarta, CNBC Indonesia- Direktur Utama PT Ban..."
851,Bos Asuransi Curhat: Bisnisnya Kalah Sama Pinjol,https://www.cnbcindonesia.com/market/202501310...,31 January 2025 07:30,"Jakarta, CNBC Indonesia- Pertumbuhan industri ..."
852,Diam-Diam Asing Banyak Jual Saham Ini,https://www.cnbcindonesia.com/market/202501310...,31 January 2025 07:10,"Jakarta, CNBC Indonesia- Indeks Harga Saham Ga..."
853,Wamen BUMN Beri Kabar Terbaru Pembahasan RUU BUMN,https://www.cnbcindonesia.com/market/202501301...,31 January 2025 06:50,"Jakarta, CNBC Indonesia- Wakil Menteri Badan U..."


# Scrape - Specific Date

In [None]:
def scrape_news_by_specific_dates(dates, max_pages=5):
    all_data = []
    base_url = "https://www.cnbcindonesia.com/news/indeks/3"

    for target_date in dates:
        formatted_date = target_date.strftime("%d/%m/%Y")

        # Loop untuk setiap halaman
        for i in range(1, max_pages + 1):
            url = f"{base_url}?date={formatted_date}&tipe=artikel"
            titles, links, dates = scrape_page(url)

            # Ambil konten dari setiap artikel jika data tersedia
            for title, link, date in zip(titles, links, dates):
                article_date, content = fetch_news_content(link)

                if title and link and article_date and content:
                    all_data.append({
                        'Title': title,
                        'Link': link,
                        'Date': article_date,
                        'Content': content
                    })

    # Simpan semua data dalam satu file Excel
    if all_data:
        df = pd.DataFrame(all_data)
        save_to_excel(df)
        return df

# Tanggal spesifik yang ingin diambil datanya
specific_dates = [
    "01/09/2024"
]
# Konversi tanggal menjadi objek datetime
specific_dates = [datetime.strptime(date, "%d/%m/%Y") for date in specific_dates]

# Mulai proses scraping
scrape_news_by_specific_dates(specific_dates, max_pages=5)


Saved all articles in scraped_articles.xlsx with 50 articles


Unnamed: 0,Title,Link,Date,Content
0,"1 Juli 2025 Berubah, Cek Aturan Iuran & Denda ...",https://www.cnbcindonesia.com/news/20250129132...,29 January 2025 13:45,"Jakarta, CNBC Indonesia -Menteri Kesehatan (Me..."
1,"Siap-Siap Stasiun Karet Ditutup, Ini Alasan da...",https://www.cnbcindonesia.com/news/20250129112...,29 January 2025 13:15,"Jakarta, CNBC Indonesia- Stasiun Karet akan di..."
2,RI Siaga Bencana! BMKG Warning Cuaca Ekstrem M...,https://www.cnbcindonesia.com/news/20250129104...,29 January 2025 12:30,"Jakarta, CNBC Indonesia- Kepala Badan Meteorol..."
3,Pertamina Amankan Pasokan LPG di Jakarta Selam...,https://www.cnbcindonesia.com/news/20250129111...,29 January 2025 11:24,"Jakarta, CNBC Indonesia- PT Pertamina Patra Ni..."
4,"Warga Palestina Lawan Rencana Relokasi Trump, ...",https://www.cnbcindonesia.com/news/20250129093...,29 January 2025 10:45,"Jakarta, CNBC Indonesia- Sejumlah warga Palest..."
5,Banjir! Cek Kondisi Terbaru Lalu Lintas Arah B...,https://www.cnbcindonesia.com/news/20250129095...,29 January 2025 10:06,"Jakarta, CNBC Indonesia- Sejumlah titik di wil..."
6,"Tragedi Festival Keagamaan Terbesar Dunia, Sed...",https://www.cnbcindonesia.com/news/20250129094...,29 January 2025 09:51,"Jakarta, CNBC Indonesia- Perayaan Kumbh Mela, ..."
7,"Gempa Bumi M 5,1 Guncang Kolaka Sultra, Tak Be...",https://www.cnbcindonesia.com/news/20250129085...,29 January 2025 08:55,"Jakarta, CNBC Indonesia- Gempa bumi dengan mag..."
8,"Pesawat Airbus Air Busan Terbakar di Bandara, ...",https://www.cnbcindonesia.com/news/20250129061...,29 January 2025 06:20,"Jakarta, CNBC Indonesia- Sebuah pesawat Airbus..."
9,"Cegah Kelangkaan, Pertamina Pasok 9 Juta Tabun...",https://www.cnbcindonesia.com/news/20250128192...,28 January 2025 21:45,"Jakarta, CNBC Indonesia -PT Pertamina Patra Ni..."
