In [1]:
import csv
import requests
from bs4 import BeautifulSoup
import time

# Fungsi untuk melakukan request dengan header User-Agent
def make_request_with_retry(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()  # Pastikan status code 200
        return response
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
        return None

# Fungsi untuk mengambil semua link artikel dari halaman
def get_article_links(index_soup):
    article_links = set()
    articles = index_soup.find_all('a', class_='flex group items-center gap-4')  # Update sesuai dengan elemen yang ditemukan
    print(f"Found {len(articles)} article links on this page.")
    for article in articles:
        link = article.get('href')
        if link:
            article_links.add(link)

    return list(article_links)

# Fungsi untuk mengambil detail artikel
def get_article_details(article_url):
    article_response = make_request_with_retry(article_url)
    if article_response:
        article_soup = BeautifulSoup(article_response.text, 'lxml')

        # Mengambil judul artikel
        title_tag = article_soup.find('h1', class_='mb-2 text-[28px] leading-9 text-cnn_black')
        title = title_tag.text.strip() if title_tag else 'No Title'

        # Mengambil isi artikel hanya dari <p> dan <strong>
        content_tags = article_soup.find_all(['p', 'strong'])  # Hanya ambil tag <p> dan <strong>
        content = ' '.join(tag.get_text(strip=True) for tag in content_tags)

        if not content:
            content = 'No Content'

        return title, content
    return None, None

# Fungsi untuk mendapatkan URL halaman berikutnya
def get_next_page_url(current_page_num):
    next_page_num = current_page_num + 1
    next_page_url = f'https://www.cnnindonesia.com/indeks/2?page={next_page_num}'
    return next_page_url

# Fungsi utama untuk scraping semua artikel
def scrape_articles():
    base_url = 'https://www.cnnindonesia.com/indeks/2?page=2'  # Halaman pertama
    current_page_num = 1  # Mulai dari halaman pertama
    max_articles = 2000  # Batasi jumlah artikel yang diambil
    article_count = 0  # Hitung jumlah artikel yang diambil

    with open('data_cnn.csv', 'w', newline='', encoding='utf-8') as file:
        wr = csv.writer(file, delimiter=',')
        wr.writerow(['title', 'content', 'url'])

        while article_count < max_articles:
            page_url = get_next_page_url(current_page_num)  # Dapatkan URL halaman berikutnya
            print(f"Scraping {page_url}...")

            # Ambil halaman indeks
            index_response = make_request_with_retry(page_url)
            if not index_response:
                print(f"Failed to fetch {page_url}")
                break

            index_soup = BeautifulSoup(index_response.text, 'lxml')

            # Ambil link artikel dari halaman indeks
            article_links = get_article_links(index_soup)
            print(f"Found {len(article_links)} unique articles.")

            if not article_links:
                print("Tidak ada artikel yang ditemukan di halaman ini.")
                break

            # Scrape artikel
            for article_url in article_links:
                full_article_url = article_url if article_url.startswith('http') else 'https://www.cnnindonesia.com' + article_url
                print(f"Scraping article: {full_article_url}")

                title, content = get_article_details(full_article_url)

                if title and content:
                    wr.writerow([title, content, full_article_url])
                    article_count += 1
                    print(f"[{article_count}] Artikel berhasil diambil: {title}")

                    if article_count >= max_articles:
                        print(f"Reached {max_articles} articles. Stopping scraping.")
                        return  # Hentikan scraping setelah mencapai 2000 artikel
                else:
                    print(f"Artikel {full_article_url} tidak dapat diambil.")

            # Increment page number and continue to next page
            current_page_num += 1
            time.sleep(2)  # Jeda sebelum mengambil halaman berikutnya

# Jalankan proses scraping
scrape_articles()


Scraping https://www.cnnindonesia.com/indeks/2?page=2...
Found 10 article links on this page.
Found 10 unique articles.
Scraping article: https://www.cnnindonesia.com/gaya-hidup/20221226130854-269-892115/muncul-kode-ssss-di-boarding-pass-pesawat-penumpang-harus-apa
[1] Artikel berhasil diambil: No Title
Scraping article: https://www.cnnindonesia.com/nasional/20241209131114-12-1175315/bupati-sidoarjo-nonaktif-ahmad-muhdlor-dituntut-6-tahun-penjara
[2] Artikel berhasil diambil: Bupati Sidoarjo Nonaktif Ahmad Muhdlor Dituntut 6 Tahun Penjara
Scraping article: https://www.cnnindonesia.com/internasional/20241209125149-120-1175304/israel-serang-suriah-pakai-rudal-hantam-gudang-senjata
[3] Artikel berhasil diambil: Israel Serang Suriah Pakai Rudal, Hantam Gudang Senjata
Scraping article: https://www.cnnindonesia.com/nasional/20241209124329-12-1175292/aipda-robig-penembak-gamma-disidang-etik-hari-ini-di-polda
[4] Artikel berhasil diambil: Aipda Robig Penembak Gamma Disidang Etik Hari Ini di Po

KeyboardInterrupt: 