We used 2 different versions of scraping code for this project. The first one we used the one-by-one method to scrape data from article links; hence, this method took such long time to gain data. We used another method which is called 'multithreaded scraping' with max_worker = 5 to scrape data faster. Indeed, the data was scraped a lot faster (took half the time of the initial method); however, the data was missing a lot of features and the number of eliminated records were more than other categories.

### 0. Library

In [1]:
# Danh s√°ch th∆∞ vi·ªán
import os
import csv
import json
import time
import requests
import concurrent.futures
import re 
from tqdm import tqdm
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

# H√†m kh·ªüi t·∫°o Selenium WebDriver
def create_chrome_driver():
    options = Options()
    options.add_argument("--headless=new") 
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    driver = webdriver.Chrome(options=options)
    return driver

### 1. Crawling links

In [2]:
def collect_links_vnexpress_bs(max_pages=12, category="giai-tri"): # Chuy√™n m·ª•c m·∫∑c ƒë·ªãnh l√† giai-tri
    base_url = f"https://vnexpress.net/{category}" 
    article_pattern = re.compile(r"https:\/\/vnexpress\.net\/[a-zA-Z0-9\-]+(\.html)?$")
    all_links = set()

    for page in range(1, max_pages + 1):
        url = base_url if page == 1 else f"{base_url}-p{page}"

        print(f"\nüîç ƒêang thu th·∫≠p URL t·ª´ trang: {url}")
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status() 

            soup = BeautifulSoup(response.content, "html.parser")
            articles = soup.select('article a')

            page_links = set()
            for a_tag in articles:
                href = a_tag.get("href", "")
                if article_pattern.match(href):
                    page_links.add(href)

            all_links.update(page_links)
            print(f"‚úÖ ƒê√£ thu th·∫≠p {len(page_links)} link m·ªõi. T·ªïng: {len(all_links)}")

            if len(page_links) == 0:
                print("üö® Kh√¥ng c√≥ link m·ªõi, d·ª´ng thu th·∫≠p.")
                break

            time.sleep(1)  # Tr√°nh b·ªã ch·∫∑n IP

        except requests.exceptions.RequestException as e:
            print(f"‚ö†Ô∏è L·ªói khi truy c·∫≠p {url}: {e}")
            continue

    return all_links

In [3]:
category = "thoi-su" # Ch·ªçn category mu·ªën crawl link
links = collect_links_vnexpress_bs(max_pages=25, category=category)

file_name = f"{category}_article_links.txt"
file_folder = f"E:/UIT/NƒÉm-2/K√¨-2/Do-an-DS/Data/{category}"  # ƒê∆∞·ªùng d·∫´n workspace tr√™n m√°y local

# T·∫°o th∆∞ m·ª•c n·∫øu ch∆∞a c√≥
os.makedirs(file_folder, exist_ok=True)

# T·∫°o ƒë∆∞·ªùng d·∫´n ƒë·∫ßy ƒë·ªß cho file
file_path = os.path.join(file_folder, file_name)

# Ghi c√°c li√™n k·∫øt v√†o file
with open(file_path, "w", encoding="utf-8") as f:
    for link in links:
        f.write(link + "\n")

print(f"üìÅ ƒê√£ l∆∞u {len(links)} li√™n k·∫øt v√†o file: {file_path}")


üîç ƒêang thu th·∫≠p URL t·ª´ trang: https://vnexpress.net/thoi-su
‚úÖ ƒê√£ thu th·∫≠p 55 link m·ªõi. T·ªïng: 55

üîç ƒêang thu th·∫≠p URL t·ª´ trang: https://vnexpress.net/thoi-su-p2
‚úÖ ƒê√£ thu th·∫≠p 30 link m·ªõi. T·ªïng: 82

üîç ƒêang thu th·∫≠p URL t·ª´ trang: https://vnexpress.net/thoi-su-p3
‚úÖ ƒê√£ thu th·∫≠p 30 link m·ªõi. T·ªïng: 112

üîç ƒêang thu th·∫≠p URL t·ª´ trang: https://vnexpress.net/thoi-su-p4
‚úÖ ƒê√£ thu th·∫≠p 30 link m·ªõi. T·ªïng: 142

üîç ƒêang thu th·∫≠p URL t·ª´ trang: https://vnexpress.net/thoi-su-p5
‚úÖ ƒê√£ thu th·∫≠p 30 link m·ªõi. T·ªïng: 172

üîç ƒêang thu th·∫≠p URL t·ª´ trang: https://vnexpress.net/thoi-su-p6
‚úÖ ƒê√£ thu th·∫≠p 30 link m·ªõi. T·ªïng: 199

üîç ƒêang thu th·∫≠p URL t·ª´ trang: https://vnexpress.net/thoi-su-p7
‚úÖ ƒê√£ thu th·∫≠p 30 link m·ªõi. T·ªïng: 229

üîç ƒêang thu th·∫≠p URL t·ª´ trang: https://vnexpress.net/thoi-su-p8
‚úÖ ƒê√£ thu th·∫≠p 30 link m·ªõi. T·ªïng: 256

üîç ƒêang thu th·∫≠p URL t·ª´ trang: https://vnexpr

### 2. Scraping article data

In [4]:
def process_single_article(link):
    driver = None
    try:
        driver = create_chrome_driver()
        driver.set_page_load_timeout(60)
        driver.set_script_timeout(60)

        try:
            driver.get(link)
            time.sleep(1)
        except TimeoutException:
            print(f"‚ö†Ô∏è Timeout khi load trang: {link}")
            driver.execute_script("window.stop();")

        # L·∫•y HTML c·ªßa trang ƒë·ªÉ x·ª≠ l√Ω v·ªõi BeautifulSoup
        soup = BeautifulSoup(driver.page_source, "html.parser")

        # Ti√™u ƒë·ªÅ (S·ª≠ d·ª•ng BeautifulSoup)
        try:
            title = soup.find("h1").text.strip()
        except Exception as e:
            title = "Kh√¥ng r√µ"

        # S·ªë t·ª´ (S·ª≠ d·ª•ng BeautifulSoup)
        try:
            wordcount_meta = soup.find("meta", attrs={"name": "its_wordcount"})
            word_count = int(wordcount_meta["content"]) if wordcount_meta and wordcount_meta.get("content", "").isdigit() else 0
        except Exception as e:
            word_count = 0

        # Ng√†y ƒëƒÉng (S·ª≠ d·ª•ng BeautifulSoup)
        try:
            publish_date = soup.find(class_="date").text.strip()
        except Exception as e:
            publish_date = "Kh√¥ng r√µ"

        # Tags (S·ª≠ d·ª•ng BeautifulSoup)
        try:
            tags_elements = soup.select("div.tags a")
            tags = ', '.join([tag.text.strip() for tag in tags_elements if tag.text.strip()])
        except Exception as e:
            tags = ""

        # S·ªë ·∫£nh (S·ª≠ d·ª•ng BeautifulSoup)
        try:
            images = soup.select("img[itemprop='contentUrl']")
            image_count = len(images)
        except Exception as e:
            image_count = 0

        # S·ªë video (S·ª≠ d·ª•ng BeautifulSoup)
        try:
            video_divs = soup.find_all("div", class_="box_embed_video")
            video_count = len(video_divs)
        except Exception as e:
            video_count = 0

        # D·ªØ li·ªáu ƒë·ªông: S·ª≠ d·ª•ng Selenium cho b√¨nh lu·∫≠n v√† t∆∞∆°ng t√°c
        # S·ªë b√¨nh lu·∫≠n (S·ª≠ d·ª•ng Selenium)
        try:
            comment_text = driver.find_element(By.CLASS_NAME, 'section-comment').text.strip()
            match = re.search(r"(\d+)", comment_text)
            comments = match.group(1) if match else "0"
        except:
            comments = "0"

        # T·ªïng t∆∞∆°ng t√°c ·ªü b√¨nh lu·∫≠n (S·ª≠ d·ª•ng Selenium)
        try:
            interaction_elements = driver.find_elements(By.CSS_SELECTOR, 'a.number')
            total_interactions = sum(int(i.text) for i in interaction_elements if i.text.isdigit())
        except:
            total_interactions = 0

        return [title, publish_date, word_count, comments, total_interactions, image_count, video_count, tags]

    except Exception as e:
        print(f"‚ùå L·ªói khi x·ª≠ l√Ω b√†i vi·∫øt {link}:\nüëâ {e}")
        return None

    finally:
        if driver:
            driver.quit()

In [5]:
# H√†m t·∫°o t√™n file duy nh·∫•t
def get_unique_filename(folder, base_name, extension):
    count = 1
    filename = f"{base_name}{extension}"
    while os.path.exists(os.path.join(folder, filename)):
        filename = f"{base_name}_{count}{extension}"
        count += 1
    return filename

In [6]:
from concurrent.futures import ThreadPoolExecutor

def scrape_and_save_article_thread(link, chuyenmuc, folder_path, base_name, results, idx, total_links):
    result = process_single_article(link)  # H√†m scrape d·ªØ li·ªáu c·ªßa m·ªôt b√†i vi·∫øt
    if result:
        results.append({
            "Ti√™u ƒë·ªÅ": result[0],
            "Ng√†y ƒëƒÉng": result[1],
            "S·ªë t·ª´": result[2],
            "S·ªë b√¨nh lu·∫≠n": result[3],
            "T·ªïng t∆∞∆°ng t√°c ·ªü b√¨nh lu·∫≠n": result[4],
            "S·ªë ·∫£nh": result[5],
            "S·ªë video": result[6],
            "Tags": result[7],
            "Chuy√™n m·ª•c": chuyenmuc
        })
        print(f"‚úÖ B√†i vi·∫øt {idx + 1}/{total_links} ƒë√£ th√†nh c√¥ng: {link}")
    else:
        print(f"‚ö†Ô∏è B·ªè qua b√†i vi·∫øt {idx + 1}/{total_links} do l·ªói: {link}")

# H√†m scrape v√† l∆∞u d·ªØ li·ªáu b·∫±ng c√°ch s·ª≠ d·ª•ng multi-threading
def scrape_and_save_articles_multithreaded(all_links, folder_path, base_name, chuyenmuc, max_workers=4):
    # T·∫°o ƒë∆∞·ªùng d·∫´n cho file .csv v√† .json
    csv_file_name = get_unique_filename(folder_path, base_name, ".csv")
    json_file_name = get_unique_filename(folder_path, base_name, ".json")

    csv_path = os.path.join(folder_path, csv_file_name)
    json_path = os.path.join(folder_path, json_file_name)

    os.makedirs(folder_path, exist_ok=True)
    
    results = []  # Danh s√°ch ƒë·ªÉ l∆∞u d·ªØ li·ªáu b√†i vi·∫øt
    total_links = len(all_links)  # T·ªïng s·ªë link c·∫ßn x·ª≠ l√Ω

    # Ch·∫°y multi-threading v·ªõi ThreadPoolExecutor
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = []
        for idx, link in enumerate(all_links):
            futures.append(executor.submit(scrape_and_save_article_thread, link, chuyenmuc, folder_path, base_name, results, idx, total_links))
        
        # Ch·ªù t·∫•t c·∫£ c√°c thread ho√†n th√†nh
        for future in futures:
            future.result()  # ƒê·∫£m b·∫£o l√† t·∫•t c·∫£ threads ho√†n th√†nh

    # Ghi k·∫øt qu·∫£ v√†o file CSV v√† JSON sau khi thu th·∫≠p h·∫øt d·ªØ li·ªáu
    if len(results) > 0:
        with open(csv_path, mode='w', encoding='utf-8-sig', newline='') as csv_file:
            writer = csv.writer(csv_file)
            writer.writerow(["Ti√™u ƒë·ªÅ", "Ng√†y ƒëƒÉng", "S·ªë t·ª´", "S·ªë b√¨nh lu·∫≠n", "T·ªïng t∆∞∆°ng t√°c ·ªü b√¨nh lu·∫≠n", "S·ªë ·∫£nh", "S·ªë video", "Tags", "Chuy√™n m·ª•c"])
            for result in results:
                writer.writerow([result["Ti√™u ƒë·ªÅ"], result["Ng√†y ƒëƒÉng"], result["S·ªë t·ª´"], result["S·ªë b√¨nh lu·∫≠n"],
                                 result["T·ªïng t∆∞∆°ng t√°c ·ªü b√¨nh lu·∫≠n"], result["S·ªë ·∫£nh"], result["S·ªë video"], result["Tags"], result["Chuy√™n m·ª•c"]])

        # L∆∞u k·∫øt qu·∫£ v√†o file .json
        with open(json_path, 'w', encoding='utf-8') as json_file:
            json.dump(results, json_file, ensure_ascii=False, indent=4)

        print(f"üìÅ ƒê√£ l∆∞u {len(results)} b√†i vi·∫øt v√†o file: {csv_file_name} v√† {json_file_name}")
    else:
        print(f"‚ùå Kh√¥ng c√≥ b√†i vi·∫øt n√†o ƒë∆∞·ª£c l∆∞u. C√°c file ƒë√£ b·ªã x√≥a.")
        os.remove(csv_path)
        os.remove(json_path)

In [7]:
category = "thoi-su"
file_folder = r"E:\UIT\NƒÉm-2\K√¨-2\Do-an-DS\Data\thoi-su"
base_name = f"{category}_articles_data"

# Chia danh s√°ch link v√† ch·∫°y ƒëa lu·ªìng
link_lists = list(links)
scrape_and_save_articles_multithreaded(link_lists[:], file_folder, base_name, category)

‚úÖ B√†i vi·∫øt 4/613 ƒë√£ th√†nh c√¥ng: https://vnexpress.net/chu-tich-nuoc-dang-huong-gio-to-hung-vuong-4870850.html
‚úÖ B√†i vi·∫øt 1/613 ƒë√£ th√†nh c√¥ng: https://vnexpress.net/nguoi-dong-bhxh-tu-nguyen-du-kien-huong-luong-huu-the-nao-tu-1-7-4868528.html
‚úÖ B√†i vi·∫øt 5/613 ƒë√£ th√†nh c√¥ng: https://vnexpress.net/nguoi-noi-tieng-quang-cao-sai-su-that-co-the-bi-han-che-xuat-hien-4870732.html
‚úÖ B√†i vi·∫øt 2/613 ƒë√£ th√†nh c√¥ng: https://vnexpress.net/be-trai-ngu-quen-tren-cay-hon-100-nguoi-suot-dem-tim-kiem-4869925.html
‚úÖ B√†i vi·∫øt 8/613 ƒë√£ th√†nh c√¥ng: https://vnexpress.net/du-an-tai-dinh-cu-gan-50-ty-dong-bo-hoang-4863397.html
‚úÖ B√†i vi·∫øt 3/613 ƒë√£ th√†nh c√¥ng: https://vnexpress.net/khong-to-chuc-thanh-tra-bo-va-huyen-4867205.html
‚úÖ B√†i vi·∫øt 6/613 ƒë√£ th√†nh c√¥ng: https://vnexpress.net/cam-xe-5-duong-trung-tam-tp-hcm-de-lap-dat-tran-dia-phao-4870041.html
‚úÖ B√†i vi·∫øt 7/613 ƒë√£ th√†nh c√¥ng: https://vnexpress.net/can-chinh-sach-giup-can-bo-tinh-gian-c