In [1]:
import logging
from bs4 import BeautifulSoup
import requests
from tqdm.notebook import tqdm
import json

# Emit timestamped log records at INFO level and above so that every fetch and
# parse step of the scrape shows up in the cell output.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

In [2]:
def get_page_content(url, timeout=30):
    """Fetch ``url`` with an HTTP GET and return the raw response body.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. ``requests``
            applies no timeout by default, so without this a single stalled
            connection would block the whole scrape indefinitely.

    Returns:
        The response body as ``bytes``, or ``None`` when the request failed
        (non-2xx status, timeout, connection problem, ...). Failures are
        logged rather than raised so a batch run can keep going.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Turn 4xx/5xx statuses into HTTPError so they are handled below.
        response.raise_for_status()
        logging.info(f'Successfully retrieved content from {url}')
        return response.content
    except requests.exceptions.HTTPError as http_err:
        logging.error(f'HTTP error occurred: {http_err}')
    except Exception as err:
        # Deliberately broad: timeouts, DNS failures, etc. must not abort the
        # surrounding loop; the caller treats None as "page unavailable".
        logging.error(f'An error occurred: {err}')
    return None

In [3]:
def generate_year_urls(base_url, start_year, end_year):
    """Build the yearly archive URLs for a blog.

    Args:
        base_url: Root address of the blog, with or without a trailing slash.
        start_year: First year to include.
        end_year: Last year to include (inclusive).

    Returns:
        A list of ``"<base_url>/<year>/"`` strings, one per year in ascending
        order. The list is empty when ``start_year`` is greater than
        ``end_year``.
    """
    # Drop any trailing slash so the join below never produces "//<year>/".
    root = base_url.rstrip('/')
    year_urls = [f"{root}/{year}/" for year in range(start_year, end_year + 1)]
    logging.info(f'Year-based URLs generated for years {start_year} to {end_year}')
    return year_urls

def collect_articles_from_year_page(url):
    """Return the article links found on one yearly archive page.

    The page is downloaded with ``get_page_content``; every
    ``<h3 class="post-title entry-title">`` heading is inspected and the
    ``href`` of its first link (if any) is collected.

    Args:
        url: Address of the archive page.

    Returns:
        A list of href strings. The list is empty when the page could not be
        fetched, and holds whatever was gathered so far if parsing fails
        part-way (the error is logged, not raised).
    """
    links = []
    try:
        html = get_page_content(url)
        if not html:
            # Nothing was downloaded; there is nothing to parse.
            return links

        page = BeautifulSoup(html, 'html.parser')
        for heading in page.find_all('h3', class_='post-title entry-title'):
            anchor = heading.find('a', href=True)
            href = anchor['href'] if anchor else None
            if href:
                links.append(href)
        logging.info(f'Collected {len(links)} articles from {url}')
    except Exception as e:
        logging.error(f'Error collecting articles from {url}: {e}')
    return links

In [4]:
def parse_article(url):
    """Download one article page and extract its title and body text.

    Args:
        url: Address of the article page.

    Returns:
        A dict with the keys ``'url'``, ``'title'`` and ``'content'``, or
        ``None`` when the page could not be downloaded, has no title heading,
        or parsing fails for any other reason. Failures are logged, never
        raised. ``'content'`` is an empty string when the page has no
        ``post-body entry-content`` container.
    """
    try:
        content = get_page_content(url)
        if not content:
            # Guard explicitly instead of letting BeautifulSoup choke on None
            # and surface as an opaque TypeError in the log.
            logging.error(f'Error parsing article at {url}: no content retrieved')
            return None

        soup = BeautifulSoup(content, 'html.parser')

        title_tag = soup.find('h3', class_='post-title entry-title')
        if title_tag is None:
            # Without this check the failure shows up as
            # "'NoneType' object has no attribute 'text'".
            logging.error(f'Error parsing article at {url}: title heading not found')
            return None
        title = title_tag.text.strip()

        content_container = soup.find('div', class_='post-body entry-content')
        article_content = ''
        if content_container is not None:
            # Join all text nodes with single spaces, trimming each fragment.
            article_content = content_container.get_text(separator=' ', strip=True)

        logging.info(f'Article parsed successfully: {title}')
        return {'url': url, 'title': title, 'content': article_content}
    except Exception as e:
        # Best-effort scraper: one malformed page must not stop the batch.
        logging.error(f'Error parsing article at {url}: {e}')
        return None

In [5]:
def write_to_json(data, filename):
    """Serialise ``data`` to ``filename`` as newline-delimited JSON.

    Each element of ``data`` is encoded with ``json.dumps`` and written on its
    own line. An existing file at ``filename`` is overwritten.

    Args:
        data: Iterable of JSON-serialisable entries.
        filename: Path of the file to write.
    """
    with open(filename, 'w') as out_file:
        # One JSON document per line, each terminated by a newline.
        out_file.writelines(f'{json.dumps(record)}\n' for record in data)
    logging.info(f'Data successfully written to {filename}')

In [6]:
# Scrape configuration: which blog to crawl and which yearly archives to visit.
base_url = 'https://sejarahstpm.blogspot.com'
start_year = 2007
end_year = 2021
year_urls = generate_year_urls(base_url, start_year, end_year)

# Pass 1: gather the article links listed on each yearly archive page.
all_article_links = []
for year_url in tqdm(year_urls, desc='Year pages'):
    articles = collect_articles_from_year_page(year_url)
    all_article_links.extend(articles)

# Pass 2: download and parse every article. parse_article returns None when a
# page cannot be fetched or parsed; drop those so the output file does not
# end up with literal `null` lines.
parsed_articles = (parse_article(url) for url in tqdm(all_article_links, desc='Articles'))
all_data = [article for article in parsed_articles if article is not None]

failed_count = len(all_article_links) - len(all_data)
if failed_count:
    logging.warning(f'{failed_count} of {len(all_article_links)} articles could not be parsed and were skipped')

write_to_json(all_data, 'sejarahstpm-text-extracted.json')

2024-02-17 16:33:43,052 - INFO - Year-based URLs generated for years 2007 to 2021


Year pages:   0%|          | 0/15 [00:00<?, ?it/s]

2024-02-17 16:33:44,268 - INFO - Successfully retrieved content from https://sejarahstpm.blogspot.com/2007/
2024-02-17 16:33:44,328 - INFO - Collected 7 articles from https://sejarahstpm.blogspot.com/2007/
2024-02-17 16:33:46,020 - INFO - Successfully retrieved content from https://sejarahstpm.blogspot.com/2008/
2024-02-17 16:33:46,143 - INFO - Collected 16 articles from https://sejarahstpm.blogspot.com/2008/
2024-02-17 16:33:48,255 - INFO - Successfully retrieved content from https://sejarahstpm.blogspot.com/2009/
2024-02-17 16:33:48,369 - INFO - Collected 16 articles from https://sejarahstpm.blogspot.com/2009/
2024-02-17 16:33:49,723 - INFO - Successfully retrieved content from https://sejarahstpm.blogspot.com/2010/
2024-02-17 16:33:49,834 - INFO - Collected 20 articles from https://sejarahstpm.blogspot.com/2010/
2024-02-17 16:33:50,987 - INFO - Successfully retrieved content from https://sejarahstpm.blogspot.com/2011/
2024-02-17 16:33:51,108 - INFO - Collected 20 articles from https

Articles:   0%|          | 0/106 [00:00<?, ?it/s]

2024-02-17 16:34:04,544 - INFO - Successfully retrieved content from https://sejarahstpm.blogspot.com/2007/12/selamat-menyambut-tahun-2008.html
2024-02-17 16:34:04,619 - INFO - Article parsed successfully: Selamat Menyambut Tahun 2008
2024-02-17 16:34:05,378 - INFO - Successfully retrieved content from https://sejarahstpm.blogspot.com/2007/11/ujian-berpakaian-menutup-aurat.html
2024-02-17 16:34:05,416 - INFO - Article parsed successfully: Ujian Berpakaian Menutup Aurat
2024-02-17 16:34:06,090 - INFO - Successfully retrieved content from https://sejarahstpm.blogspot.com/2007/11/maju-dengan-kebendaan-akhlak-musnah.html
2024-02-17 16:34:06,128 - INFO - Article parsed successfully: Maju dengan kebendaan,  Akhlak musnah
2024-02-17 16:34:06,959 - INFO - Successfully retrieved content from https://sejarahstpm.blogspot.com/2007/11/selamat-menghadapi-peperiksaan-stpm.html
2024-02-17 16:34:06,996 - INFO - Article parsed successfully: SELAMAT MENGHADAPI PEPERIKSAAN STPM 2007
2024-02-17 16:34:08,6