In [1]:
import sys
import os
import pandas as pd
import numpy as np
# Menambahkan folder project ke sys.path
sys.path.append(os.path.abspath(os.path.join('..')))
from src.validation.validate_data import validation_process 
# Menggunakan fungsi
import requests
from bs4 import BeautifulSoup
import time
import random
import logging
import csv  # Mengimpor csv untuk menyimpan data ke file CSV.

# Scrape the data

In [10]:
# Logging setup: scraper messages are written to a log file stored next to the scraped data.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    filename='../data_source/scraping_data/scraping.log',
)

# ANSI escape sequences used to colour console messages
RESET = "\033[0m"   # back to the terminal's default colour
RED = "\033[91m"    # red
GREEN = "\033[92m"  # green

# Seconds to wait for the site to respond before a request is abandoned.
_REQUEST_TIMEOUT_SECONDS = 30

# Column order of the output CSV (spellings kept as-is so existing files stay compatible).
_CSV_FIELDNAMES = [
    'judul', 'topik', 'sub_topik', 'topik_pilihan', 'tanggal_waktu_publish',
    'redaksi', 'advetorial', 'isi_berita', 'link', 'topik_pilihan_link',
]


def _text_or_blank(extract):
    """Return ``extract()``, or '' when the page lacks the element being read."""
    try:
        return extract()
    except (AttributeError, IndexError, TypeError):
        # A missing tag surfaces as ``None.<attr>`` or a too-short list; treat it as "no value".
        return ''


def _fetch_soup(url):
    """Download ``url`` and return the parsed HTML; raises on network errors and HTTP error statuses."""
    response = requests.get(url, timeout=_REQUEST_TIMEOUT_SECONDS)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')


def _polite_pause():
    """Sleep for a short random interval so requests are not fired back-to-back."""
    time.sleep(random.uniform(0.1, 1))


def _parse_article(soup, link):
    """Build one CSV row (a dict keyed by ``_CSV_FIELDNAMES``) from a parsed article page."""
    # Breadcrumb: index 1 is the topic, index 2 the sub-topic. The list is rebuilt for every
    # article so a parsing failure can never leak the previous article's breadcrumb.
    try:
        breadcrumb = [
            item.find('span').get_text()
            for item in soup.find_all('li', class_='breadcrumb__item')
        ]
    except AttributeError:
        breadcrumb = []
    topik = breadcrumb[1] if len(breadcrumb) > 1 else ''
    sub_topik = breadcrumb[2] if len(breadcrumb) > 2 else ''

    # Featured-topic box: both the label and its link come from the same anchor.
    topic_box = soup.find('div', class_='topicSubtitle')
    topic_anchor = topic_box.find('a') if topic_box is not None else None
    if topic_anchor is not None:
        topik_pilihan = topic_anchor.get_text()
        topik_pilihan_link = topic_anchor.get('href')
    else:
        topik_pilihan = ''
        topik_pilihan_link = ''

    return {
        'judul': _text_or_blank(
            lambda: soup.find('h1', class_='read__title').get_text()),
        'topik': topik,
        'sub_topik': sub_topik,
        'topik_pilihan': topik_pilihan,
        # The publish time is the part after ' - ' in the "read__time" block.
        'tanggal_waktu_publish': _text_or_blank(
            lambda: soup.find('div', class_='read__time').get_text().split(' - ')[1]),
        # All credited names, joined with a single space.
        'redaksi': _text_or_blank(
            lambda: ' '.join(
                name.get_text()
                for name in soup.find('div', class_='credit-title-name').find_all('h6'))),
        # Non-empty only when the page carries the advertorial header.
        'advetorial': _text_or_blank(
            lambda: soup.find('div', class_='kcm__header__advertorial').get_text()),
        # Article body: every paragraph inside the content container, joined with spaces.
        'isi_berita': _text_or_blank(
            lambda: ' '.join(
                paragraph.get_text()
                for paragraph in soup.find('div', class_='read__content').find_all('p'))),
        'link': link,
        'topik_pilihan_link': topik_pilihan_link,
    }


def scrape_kompas(pages: int, csv_filename: str) -> None:
    """
    Scrape news articles from the Kompas index and append them to a CSV file.

    Every index page from 1 to ``pages`` is fetched, each article linked from it is
    downloaded and parsed, and one row per article is appended to ``csv_filename``.
    The header row is written only when the file is empty. Scraping is best-effort:
    a page or article that fails (network error, HTTP error status, unexpected
    exception) is reported on the console and in the log, then skipped.
    ``KeyboardInterrupt`` is not swallowed, so a running scrape can be interrupted.

    Args:
        pages (int): Number of index pages to scrape, starting from page 1.
        csv_filename (str): Path of the CSV file the rows are appended to.

    Returns:
        None

    Raises:
        OSError: If ``csv_filename`` cannot be opened for appending.
    """
    with open(csv_filename, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=_CSV_FIELDNAMES)

        # In append mode the position is the current file size, so 0 means "new/empty file".
        if file.tell() == 0:
            writer.writeheader()

        for page in range(1, pages + 1):
            try:
                index_soup = _fetch_soup(f"https://indeks.kompas.com/?site=all&page={page}")
            except Exception as error:
                print(f'{RED}ERROR - page {page}{RESET}\n')
                logging.error('ERROR - page %s (%r)', page, error)
                continue

            # Anchors without an href cannot be fetched, so they are dropped up front.
            hrefs = (anchor.get('href') for anchor in index_soup.find_all('a', class_='article-link'))
            links = [href for href in hrefs if href]

            for position, link in enumerate(links, start=1):
                try:
                    writer.writerow(_parse_article(_fetch_soup(link), link))
                except Exception as error:
                    print(f'{RED}ERROR - page {page} link {position}  = {link}{RESET}')
                    logging.error('ERROR - page %s link %s  = %s (%r)', page, position, link, error)
                # Pause after every article, whether it succeeded or failed.
                _polite_pause()
            print()

# Example run: scrape the first five index pages into the project's CSV file.
scrape_kompas(
    pages=5,
    csv_filename='../data_source/scraping_data/scraping_kompas.csv',
)


[91mERROR - page 1 link 5  = https://www.kompas.com/food/read/2024/09/08/181807975/cara-membuat-buttercream-sendiri-di-rumah-cuma-butuh-5-bahan


# Read the data 

In [5]:
# Label passed to the validation report, followed by the scraped articles loaded from disk.
data_name = 'scraping_data'
df = pd.read_csv('../data_source/scraping_data/scraping_kompas.csv')

# Validations

In [3]:
# Run the project's validation report on the loaded frame; per the captured output it
# prints the shape, the dtype of each column and the missing-value count per column.
# NOTE(review): this cell's execution count (3) is lower than the load cell's (5) --
# re-run the notebook top to bottom to confirm it works on a fresh kernel.
validation_process(df, data_name)

CHECKING SHAPE DATA
Data scraping_data has 583 rows and 10 columns

CHECKING DATA TYPE
Each column in scraping_data has the following data types:

Column `judul` has data type object
Column `topik` has data type object
Column `sub_topik` has data type object
Column `topik_pilihan` has data type object
Column `tanggal_waktu_publish` has data type object
Column `redaksi` has data type object
Column `advetorial` has data type object
Column `isi_berita` has data type object
Column `link` has data type object
Column `topik_pilihan_link` has data type object

CHECKING MISSING DATA
Each column in `scraping_data` has the following number of missing values:

Column `judul` has 0 or 0.0% missing values
Column `topik` has 9 or 1.5% missing values
Column `sub_topik` has 257 or 44.1% missing values
Column `topik_pilihan` has 537 or 92.1% missing values
Column `tanggal_waktu_publish` has 0 or 0.0% missing values
Column `redaksi` has 9 or 1.5% missing values
Column `advetorial` has 560 or 96.1% missi

# Cleaning the data

## Duplicate data

In [7]:
# Remove duplicated rows (pandas' default settings) and rebind `df` to the result.
df = df.drop_duplicates()

## Handle missing values

In [15]:
def transform_scraping_data(df_scraping: pd.DataFrame) -> pd.DataFrame:
    """
    Clean the scraped Kompas articles and return the result as a new DataFrame.

    The input frame is never modified; all work happens on a copy, so the function
    can be called repeatedly on the same raw frame.

    Steps:
        * fill missing values with a descriptive placeholder per column,
        * strip the 'Diperbarui ' prefix and the ' WIB' suffix from
          ``tanggal_waktu_publish`` and parse it as ``%d/%m/%Y, %H:%M``,
        * drop the ``topik_pilihan_link`` column.

    Args:
        df_scraping (pd.DataFrame): Raw scraped data with the scraper's columns.

    Returns:
        pd.DataFrame: Cleaned copy without the ``topik_pilihan_link`` column.

    Raises:
        KeyError: If an expected column is missing.
        ValueError: If a publish timestamp does not match the expected format.
    """
    # Placeholder used for missing values in each column.
    placeholders = {
        # Articles without a topic are advertisements (advertorial pages).
        'topik': 'Iklan',
        'sub_topik': 'Belum ditentukan',
        'topik_pilihan': 'Bukan topik pilihan',
        # A missing byline is treated as anonymous.
        'redaksi': 'Anonim',
        'advetorial': 'Non Advertorial',
    }

    # Work on a copy so the caller's frame is left untouched.
    cleaned = df_scraping.copy()
    for column, placeholder in placeholders.items():
        cleaned[column] = cleaned[column].fillna(placeholder)

    # Literal (non-regex) replacements keep the behaviour independent of the pandas version.
    publish_text = (
        cleaned['tanggal_waktu_publish']
        .str.replace(' WIB', '', regex=False)
        .str.replace('Diperbarui ', '', regex=False)
        .str.strip()
    )
    cleaned['tanggal_waktu_publish'] = pd.to_datetime(publish_text, format='%d/%m/%Y, %H:%M')

    # The featured-topic link is not needed downstream.
    return cleaned.drop(columns='topik_pilihan_link')

In [16]:
# Clean a copy of the raw frame so `df` itself keeps the scraped values.
# (The validation report can be re-run on the cleaned frame if needed.)
df_result = transform_scraping_data(df.copy())

# Spot-check: show the article whose title contains this phrase.
df_result.loc[df_result['judul'].str.contains('Tawarkan 1 Kaveling ke Banyak Pembeli')]

Unnamed: 0,judul,topik,sub_topik,topik_pilihan,tanggal_waktu_publish,redaksi,advetorial,isi_berita,link
0,Dosen di Solo Terjerat Kasus Penipuan Jual Bel...,Regional,Belum ditentukan,Bukan topik pilihan,2024-09-06 16:46:00,Rachmawati,Non Advertorial,"KOMPAS.com - H, salah satu dosen di Solo, Jawa...",https://regional.kompas.com/read/2024/09/06/16...


In [17]:
# List every distinct raw publish timestamp to inspect the formats that need parsing.
for date in pd.unique(df['tanggal_waktu_publish']):
    print(date)

06/09/2024, 16:46 WIB
06/09/2024, 16:44 WIB
06/09/2024, 16:43 WIB
06/09/2024, 16:41 WIB
06/09/2024, 16:40 WIB
06/09/2024, 16:38 WIB
06/09/2024, 16:36 WIB
06/09/2024, 16:35 WIB
06/09/2024, 16:32 WIB
06/09/2024, 16:31 WIB
06/09/2024, 16:30 WIB
06/09/2024, 16:28 WIB
06/09/2024, 16:27 WIB
06/09/2024, 16:26 WIB
06/09/2024, 16:24 WIB
06/09/2024, 16:23 WIB
06/09/2024, 16:22 WIB
06/09/2024, 16:21 WIB
06/09/2024, 16:20 WIB
06/09/2024, 16:18 WIB
06/09/2024, 16:17 WIB
06/09/2024, 16:15 WIB
06/09/2024, 16:13 WIB
06/09/2024, 16:12 WIB
06/09/2024, 16:07 WIB
06/09/2024, 16:06 WIB
06/09/2024, 16:05 WIB
06/09/2024, 16:04 WIB
06/09/2024, 16:03 WIB
06/09/2024, 16:02 WIB
06/09/2024, 16:01 WIB
06/09/2024, 16:00 WIB
06/09/2024, 15:58 WIB
06/09/2024, 15:53 WIB
06/09/2024, 15:48 WIB
06/09/2024, 15:47 WIB
06/09/2024, 15:45 WIB
06/09/2024, 15:42 WIB
06/09/2024, 15:41 WIB
06/09/2024, 15:38 WIB
06/09/2024, 15:37 WIB
06/09/2024, 15:35 WIB
06/09/2024, 15:34 WIB
06/09/2024, 15:32 WIB
06/09/2024, 15:30 WIB
06/09/2024