In [3]:
import sys
import os
import pandas as pd
import numpy as np
# Menambahkan folder project ke sys.path
sys.path.append(os.path.abspath(os.path.join('..')))
from src.validation.validate_data import validation_process 
# Menggunakan fungsi
import requests
from bs4 import BeautifulSoup
import time
import random
import logging
import csv  # Mengimpor csv untuk menyimpan data ke file CSV.

# Scrape the data

In [1]:
# Logging setup: scraping messages (INFO and above) are written to a log file
# stored alongside the scraped data.
logging.basicConfig(
    filename='../data_source/scraping_data/scraping.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

# ANSI escape sequences for coloured console output:
# green, red, and "reset to the default colour".
GREEN, RED, RESET = "\033[92m", "\033[91m", "\033[0m"

def _node_text(parent, tag: str, css_class: str) -> str:
    """Return the text of the first ``<tag class=css_class>`` under ``parent``, or '' if absent."""
    node = parent.find(tag, class_=css_class)
    return node.get_text() if node is not None else ''


def _joined_children_text(parent, tag: str, css_class: str, child_tag: str) -> str:
    """Space-join the text of every ``child_tag`` inside the first ``<tag class=css_class>``; '' if absent."""
    container = parent.find(tag, class_=css_class)
    if container is None:
        return ''
    return ' '.join(child.get_text() for child in container.find_all(child_tag))


def _parse_kompas_article(article, link: str) -> dict:
    """Extract one CSV row (a dict keyed by the CSV field names) from a parsed article page.

    Every field falls back to '' when its element is missing, so a partially
    matching page still yields a row.
    """
    # Breadcrumb: index 1 is used as the topic and index 2 as the sub-topic.
    # The list is rebuilt for every article, so a page whose breadcrumb cannot be
    # read never inherits the previous article's topics.
    spans = [item.find('span') for item in article.find_all('li', class_='breadcrumb__item')]
    if any(span is None for span in spans):
        topics = []  # malformed breadcrumb: treat as "no topic information"
    else:
        topics = [span.get_text() for span in spans]
    topik = topics[1] if len(topics) > 1 else ''
    sub_topik = topics[2] if len(topics) > 2 else ''

    # Optional "featured topic" box: both the label and its link come from the same anchor.
    topic_box = article.find('div', class_='topicSubtitle')
    anchor = topic_box.find('a') if topic_box is not None else None
    if anchor is not None:
        topik_pilihan = anchor.get_text()
        topik_pilihan_link = anchor.get('href', '')
    else:
        topik_pilihan = ''
        topik_pilihan_link = ''

    # The publish time is the part after the first ' - ' separator of the time element.
    time_parts = _node_text(article, 'div', 'read__time').split(' - ')
    tanggal_waktu_publish = time_parts[1] if len(time_parts) > 1 else ''

    return {
        'judul': _node_text(article, 'h1', 'read__title'),
        'topik': topik,
        'sub_topik': sub_topik,
        'topik_pilihan': topik_pilihan,
        'tanggal_waktu_publish': tanggal_waktu_publish,
        'redaksi': _joined_children_text(article, 'div', 'credit-title-name', 'h6'),
        # Non-empty only when the page carries the advertorial header element.
        'advetorial': _node_text(article, 'div', 'kcm__header__advertorial'),
        'isi_berita': _joined_children_text(article, 'div', 'read__content', 'p'),
        'link': link,
        'topik_pilihan_link': topik_pilihan_link,
    }


def scrape_kompas(pages: int, csv_filename: str, timeout: float = 30.0) -> None:
    """
    Scrape news articles from the Kompas index and append them to a CSV file.

    For each index page from 1 to ``pages`` the article links are collected, every
    article is downloaded and parsed, and one row per article is appended to
    ``csv_filename``. The header row is written only when the file is empty.
    Failures are best-effort: a failing index page or article is reported on the
    console, logged with its traceback, and skipped. Only ``Exception`` subclasses
    are swallowed, so Ctrl+C still stops the run.

    Args:
        pages (int): Number of index pages to scrape (values below 1 scrape nothing).
        csv_filename (str): Path of the CSV file the rows are appended to.
        timeout (float): Seconds to wait for each HTTP request before giving up.

    Returns:
        None
    """
    # CSV header
    fieldnames = ['judul', 'topik', 'sub_topik', 'topik_pilihan', 'tanggal_waktu_publish', 'redaksi', 'advetorial', 'isi_berita', 'link', 'topik_pilihan_link']

    # Append mode keeps rows from earlier runs; the header is only needed for a new file.
    with open(csv_filename, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)

        # In append mode the position starts at the end of the file, so 0 means "empty".
        if file.tell() == 0:
            writer.writeheader()

        for i in range(1, pages + 1):
            # Step 1: fetch the index page and collect the article links.
            try:
                url = f"https://indeks.kompas.com/?site=all&page={i}"
                response = requests.get(url, timeout=timeout)
                response.raise_for_status()  # do not parse HTTP error pages as an index
                index_page = BeautifulSoup(response.text, 'html.parser')
                hrefs = [anchor.get('href') for anchor in index_page.find_all('a', class_='article-link')]
                links = [href for href in hrefs if href]  # ignore anchors without a target
            except Exception:
                print(f'{RED}ERROR - page {i}{RESET}\n')
                logging.exception('ERROR - page %s', i)
                continue

            # Step 2: fetch, parse and store every article of this index page.
            for j, link in enumerate(links):
                try:
                    article_response = requests.get(link, timeout=timeout)
                    article_response.raise_for_status()
                    article = BeautifulSoup(article_response.text, 'html.parser')
                    writer.writerow(_parse_kompas_article(article, link))
                except Exception:
                    print(f'{RED}ERROR - page {i} link {j + 1}  = {link}{RESET}')
                    logging.exception('ERROR - page %s link %s  = %s', i, j + 1, link)

                # Random pause between article requests, after success and failure alike.
                time.sleep(random.uniform(0.1, 1))
            print()

# Example run: scrape the first 5 index pages into the project's CSV file.
scrape_kompas(
    pages=5,
    csv_filename='../data_source/scraping_data/scraping_kompas.csv',
)









# Read the data 

In [4]:
# Load the CSV produced by the scraping step and display it.
scraping_csv_path = '../data_source/scraping_data/scraping_kompas.csv'
df = pd.read_csv(scraping_csv_path)
df

Unnamed: 0,judul,topik,sub_topik,topik_pilihan,tanggal_waktu_publish,redaksi,advetorial,isi_berita,link,topik_pilihan_link
0,Dosen di Solo Terjerat Kasus Penipuan Jual Bel...,Regional,,,"06/09/2024, 16:46 WIB",Rachmawati,,"KOMPAS.com - H, salah satu dosen di Solo, Jawa...",https://regional.kompas.com/read/2024/09/06/16...,
1,Indonesia-Africa Forum Sepakati Kerja Sama Bis...,Money,Ekbis,,"06/09/2024, 16:44 WIB","Yohana Artha Uly, Sakina Rakhma Diah Setiawan",,"JAKARTA, KOMPAS.com - Pertemuan Indonesia-Afri...",https://money.kompas.com/read/2024/09/06/16445...,
2,Kronologi Suami Bunuh Istri Usai Cekcok di Kon...,News,Megapolitan,,"06/09/2024, 16:43 WIB","Achmad Nasrudin Yahya, Irfan Maullana",,"JAKARTA, KOMPAS.com - Polres Metro Jakarta Sel...",https://megapolitan.kompas.com/read/2024/09/06...,
3,Simak Perbedaan Tiga Tipe Toyota New Fortuner,Otomotif,Mobil,,"06/09/2024, 16:41 WIB","Aprida Mega Nanda, Azwar Ferdian",,"JAKARTA, KOMPAS.com - PT Toyota Astra Motor (T...",https://otomotif.kompas.com/read/2024/09/06/16...,
4,"Penjualan Drop, VW Akan Tutup 2 Pabrik buat Se...",Otomotif,News,,"06/09/2024, 16:41 WIB","Gilang Satria, Azwar Ferdian",,"JAKARTA, KOMPAS.com - Volkswagen, merek mobil ...",https://otomotif.kompas.com/read/2024/09/06/16...,
...,...,...,...,...,...,...,...,...,...,...
395,"Alasan Kenapa Mesin Membutuhkan Coolant, Bukan...",Otomotif,News,,"07/09/2024, 15:22 WIB","Erwin Setiawan, Aditya Maulana",,"KLATEN, KOMPAS.com - Mesin pembakaran dalam (I...",https://otomotif.kompas.com/read/2024/09/07/15...,
396,"Kelola Limbah Plastik, Amandina Raih Pengharga...",Lestari,Swasta,,"07/09/2024, 15:20 WIB",Yohanes Enggar Harususilo,,KOMPAS.com - Laporan Kementerian Lingkungan Hi...,https://lestari.kompas.com/read/2024/09/07/152...,
397,"Puluhan Paus Terdampar di Perairan Alor, Warga...",Regional,,,"07/09/2024, 15:19 WIB","Serafinus Sandi Hayon Jehadu, Gloria Setyvani ...",,"ALOR, KOMPAS.com - Kepala Cabang Dinas Kelauta...",https://regional.kompas.com/read/2024/09/07/15...,
398,Warga Tegal Alur Ngeluh ke Rano Karno karena A...,News,Megapolitan,,"07/09/2024, 15:16 WIB","Baharudin Al Farisi, Dani Prabowo",,"JAKARTA, KOMPAS.com - Warga mengeluhkan soal...",https://megapolitan.kompas.com/read/2024/09/07...,


# Validations

In [5]:
# Label that identifies this dataset in the validation report.
data_name = "scraping_data"

# Report shape, column data types and missing-value counts of the raw frame.
validation_process(df, data_name)

CHECKING SHAPE DATA
Data scraping_data has 400 rows and 10 columns

CHECKING DATA TYPE
Each column in scraping_data has the following data types:

Column `judul` has data type object
Column `topik` has data type object
Column `sub_topik` has data type object
Column `topik_pilihan` has data type object
Column `tanggal_waktu_publish` has data type object
Column `redaksi` has data type object
Column `advetorial` has data type object
Column `isi_berita` has data type object
Column `link` has data type object
Column `topik_pilihan_link` has data type object

CHECKING MISSING DATA
Each column in `scraping_data` has the following number of missing values:

Column `judul` has 0 or 0.0% missing values
Column `topik` has 9 or 2.2% missing values
Column `sub_topik` has 177 or 44.2% missing values
Column `topik_pilihan` has 364 or 91.0% missing values
Column `tanggal_waktu_publish` has 0 or 0.0% missing values
Column `redaksi` has 9 or 2.2% missing values
Column `advetorial` has 381 or 95.2% missi

# Cleaning the data

## Duplicate data

In [7]:
# Remove fully identical rows, keeping the first occurrence of each.
df = df.drop_duplicates(keep='first')

## Handle missing values

In [19]:
def transform_scraping_data(df_scraping: pd.DataFrame) -> pd.DataFrame:
    """
    Fill missing values in the scraped news data and drop the topic-link column.

    The placeholder fills are assigned column by column on ``df_scraping`` itself
    (the frame passed in is modified, as before). The ``topik_pilihan_link``
    column is removed only in the returned frame, so callers must use the return
    value: ``df = transform_scraping_data(df)``.

    Args:
        df_scraping (pd.DataFrame): Scraped data containing at least the columns
            ``topik``, ``sub_topik``, ``topik_pilihan``, ``redaksi``,
            ``advetorial`` and ``topik_pilihan_link``.

    Returns:
        pd.DataFrame: The data with placeholders instead of missing values and
        without the ``topik_pilihan_link`` column.

    Raises:
        KeyError: If one of the expected columns is missing.
    """
    # Placeholder written into each column wherever the value is missing.
    fill_values = {
        # Per the original author's note: rows without a topic are advertisement
        # articles, hence 'Iklan'.
        'topik': 'Iklan',
        'sub_topik': 'Belum ditentukan',
        'topik_pilihan': 'Bukan topik pilihan',
        # A missing editorial credit is treated as anonymous.
        'redaksi': 'Anonim',
        'advetorial': 'Non Advertorial',
    }
    for column, placeholder in fill_values.items():
        df_scraping[column] = df_scraping[column].fillna(placeholder)

    # Return the cleaned frame; previously it was built and then discarded,
    # so the function always returned None.
    return df_scraping.drop(columns='topik_pilihan_link')

Unnamed: 0,judul,topik,sub_topik,topik_pilihan,tanggal_waktu_publish,redaksi,advetorial,isi_berita,link
0,Dosen di Solo Terjerat Kasus Penipuan Jual Bel...,Regional,Belum ditentukan,Bukan topik pilihan,"06/09/2024, 16:46 WIB",Rachmawati,Non Advertorial,"KOMPAS.com - H, salah satu dosen di Solo, Jawa...",https://regional.kompas.com/read/2024/09/06/16...
1,Indonesia-Africa Forum Sepakati Kerja Sama Bis...,Money,Ekbis,Bukan topik pilihan,"06/09/2024, 16:44 WIB","Yohana Artha Uly, Sakina Rakhma Diah Setiawan",Non Advertorial,"JAKARTA, KOMPAS.com - Pertemuan Indonesia-Afri...",https://money.kompas.com/read/2024/09/06/16445...
2,Kronologi Suami Bunuh Istri Usai Cekcok di Kon...,News,Megapolitan,Bukan topik pilihan,"06/09/2024, 16:43 WIB","Achmad Nasrudin Yahya, Irfan Maullana",Non Advertorial,"JAKARTA, KOMPAS.com - Polres Metro Jakarta Sel...",https://megapolitan.kompas.com/read/2024/09/06...
3,Simak Perbedaan Tiga Tipe Toyota New Fortuner,Otomotif,Mobil,Bukan topik pilihan,"06/09/2024, 16:41 WIB","Aprida Mega Nanda, Azwar Ferdian",Non Advertorial,"JAKARTA, KOMPAS.com - PT Toyota Astra Motor (T...",https://otomotif.kompas.com/read/2024/09/06/16...
4,"Penjualan Drop, VW Akan Tutup 2 Pabrik buat Se...",Otomotif,News,Bukan topik pilihan,"06/09/2024, 16:41 WIB","Gilang Satria, Azwar Ferdian",Non Advertorial,"JAKARTA, KOMPAS.com - Volkswagen, merek mobil ...",https://otomotif.kompas.com/read/2024/09/06/16...
...,...,...,...,...,...,...,...,...,...
395,"Alasan Kenapa Mesin Membutuhkan Coolant, Bukan...",Otomotif,News,Bukan topik pilihan,"07/09/2024, 15:22 WIB","Erwin Setiawan, Aditya Maulana",Non Advertorial,"KLATEN, KOMPAS.com - Mesin pembakaran dalam (I...",https://otomotif.kompas.com/read/2024/09/07/15...
396,"Kelola Limbah Plastik, Amandina Raih Pengharga...",Lestari,Swasta,Bukan topik pilihan,"07/09/2024, 15:20 WIB",Yohanes Enggar Harususilo,Non Advertorial,KOMPAS.com - Laporan Kementerian Lingkungan Hi...,https://lestari.kompas.com/read/2024/09/07/152...
397,"Puluhan Paus Terdampar di Perairan Alor, Warga...",Regional,Belum ditentukan,Bukan topik pilihan,"07/09/2024, 15:19 WIB","Serafinus Sandi Hayon Jehadu, Gloria Setyvani ...",Non Advertorial,"ALOR, KOMPAS.com - Kepala Cabang Dinas Kelauta...",https://regional.kompas.com/read/2024/09/07/15...
398,Warga Tegal Alur Ngeluh ke Rano Karno karena A...,News,Megapolitan,Bukan topik pilihan,"07/09/2024, 15:16 WIB","Baharudin Al Farisi, Dani Prabowo",Non Advertorial,"JAKARTA, KOMPAS.com - Warga mengeluhkan soal...",https://megapolitan.kompas.com/read/2024/09/07...


In [20]:
# NOTE(review): no visible cell calls transform_scraping_data(df) or assigns its
# result, yet the report below shows 9 columns and no missing values. Presumably
# `df` was replaced in a cell that is no longer in the notebook; confirm with
# Restart Kernel & Run All.
validation_process(df, data_name)

CHECKING SHAPE DATA
Data scraping_data has 396 rows and 9 columns

CHECKING DATA TYPE
Each column in scraping_data has the following data types:

Column `judul` has data type object
Column `topik` has data type object
Column `sub_topik` has data type object
Column `topik_pilihan` has data type object
Column `tanggal_waktu_publish` has data type object
Column `redaksi` has data type object
Column `advetorial` has data type object
Column `isi_berita` has data type object
Column `link` has data type object

CHECKING MISSING DATA
Each column in `scraping_data` has the following number of missing values:

Column `judul` has 0 or 0.0% missing values
Column `topik` has 0 or 0.0% missing values
Column `sub_topik` has 0 or 0.0% missing values
Column `topik_pilihan` has 0 or 0.0% missing values
Column `tanggal_waktu_publish` has 0 or 0.0% missing values
Column `redaksi` has 0 or 0.0% missing values
Column `advetorial` has 0 or 0.0% missing values
Column `isi_berita` has 0 or 0.0% missing values
