In [6]:
import json
import os
import random
from datetime import datetime, timedelta

from selenium import webdriver
from selenium.webdriver.common.by import By
from tqdm import tqdm

# Root URL of the news site; a category slug is appended to it ("<root>/<category>")
# to build the listing page that the crawler opens.
main_page_url = "https://vnexpress.net"

In [7]:
# Shared Chrome configuration reused by every webdriver.Chrome(...) created in this notebook.
# The switches are added in a fixed order: sandbox off, headless, GPU off, /dev/shm usage off.
chrome_options = webdriver.ChromeOptions()
for chrome_flag in ('--no-sandbox', '--headless', '--disable-gpu', '--disable-dev-shm-usage'):
    chrome_options.add_argument(chrome_flag)

In [None]:
def diff_year_month(date_begin, date_end):
    """Return ``[year_offset, month_offset]`` going from ``date_begin`` to ``date_end``.

    The two components are subtracted independently (no borrowing between
    them), so the month offset can be negative even when ``date_end`` is the
    later date: December 2023 -> January 2024 yields ``[1, -11]``.

    Args:
        date_begin: Object exposing ``.year`` and ``.month`` (e.g. ``datetime``).
        date_end: Object exposing ``.year`` and ``.month`` (e.g. ``datetime``).

    Returns:
        A two-element list ``[date_end.year - date_begin.year,
        date_end.month - date_begin.month]``.
    """
    year_offset = date_end.year - date_begin.year
    month_offset = date_end.month - date_begin.month
    return [year_offset, month_offset]

def chooseDate(date_begin, current_date, driver):
    """Open the page's calendar widget, navigate to ``date_begin`` and click that day.

    The number of year/month navigation clicks is derived from the difference
    between ``current_date`` and ``date_begin`` (see ``diff_year_month``), so
    ``current_date`` must be the month/year the calendar shows when it opens.

    Args:
        date_begin: Date to select; its ``.year``, ``.month`` and ``.day`` are used.
        current_date: Date whose month/year the calendar is assumed to display
            right after it is opened.
        driver: Selenium WebDriver currently on a page that contains the
            ``a.text-calendar.view_by_date`` link and a flatpickr calendar.

    Returns:
        None. If no cell for the requested day is found, nothing is clicked.
    """
    date_input = driver.find_element(by = By.CSS_SELECTOR, value = "a.text-calendar.view_by_date")
    date_input.click()

    [diff_year, diff_month] = diff_year_month(date_begin, current_date)

    # (button selector, number of clicks). A positive difference means the
    # calendar is ahead of the target, so we step backwards; a negative one
    # means we step forwards. Years are adjusted before months.
    navigation_steps = (
        ("span.arrowDown", diff_year),
        ("span.arrowUp", -diff_year),
        ("span.flatpickr-prev-month", diff_month),
        ("span.flatpickr-next-month", -diff_month),
    )
    for selector, clicks in navigation_steps:
        if clicks > 0:
            button_click = driver.find_element(by = By.CSS_SELECTOR, value = selector)
            for _ in range(clicks):
                button_click.click()

    # flatpickr fills the grid with trailing days of the previous month and
    # leading days of the next month (classes prevMonthDay / nextMonthDay).
    # Exclude them: otherwise e.g. "28" can match the previous month's 28th,
    # which appears earlier in the grid, and the wrong date gets selected.
    days = driver.find_elements(
        by = By.CSS_SELECTOR,
        value = "span.flatpickr-day:not(.prevMonthDay):not(.nextMonthDay)",
    )
    target_day = str(date_begin.day)
    for day in days:
        if day.text == target_day:
            day.click()
            return

In [9]:
def crawl_link_article_category(category : str, numUrls : int = 10, date_begin = datetime(2023, 1, 1), date_end = datetime(2023, 12, 31), max_urls_per_day : int = 3):
    """Collect article URLs for one category and save them as JSON.

    Opens ``main_page_url/<category>`` in a Chrome session, visits the days
    between ``date_begin`` and ``date_end`` (inclusive) in random order via the
    page's calendar, and takes up to ``max_urls_per_day`` article links from
    each visited day until ``numUrls`` links have been gathered or the days
    run out. The result is written to ``urls_of_articles/<category>.json`` as
    ``{"category": ..., "list_urls": [...]}``.

    Args:
        category: Category slug appended to ``main_page_url``; also the output
            file name.
        numUrls: Target number of URLs. Fewer are saved if the date range does
            not yield enough; a value <= 0 collects nothing.
        date_begin: First day of the sampling window.
        date_end: Last day of the sampling window.
        max_urls_per_day: Upper bound on links taken from a single day.

    Returns:
        None. The output is the JSON file.
    """
    url_to_category = main_page_url + '/' + category

    # Every day in [date_begin, date_end], one day apart, visited in random order.
    num_days = (date_end - date_begin).days + 1
    days_list = [date_begin + timedelta(days = offset) for offset in range(num_days)]
    random.shuffle(days_list)

    list_urls = []
    driver = webdriver.Chrome(options = chrome_options)
    try:
        driver.get(url_to_category)
        # Context manager guarantees the progress bar is closed, even on error.
        with tqdm(total = numUrls, desc = "Crawling Progress", colour = "cyan", unit = "url") as pbar:
            for day in days_list:
                if len(list_urls) >= numUrls:
                    break

                # First call navigates from today's month to the target month and
                # clicks the day. The second call clicks the same day again with
                # no navigation -- presumably the end of a date-range selection;
                # TODO confirm against the site's calendar behaviour.
                chooseDate(day, datetime.now(), driver)
                chooseDate(day, day, driver)

                link_articles = driver.find_elements(by = By.CSS_SELECTOR, value = "article.item-news.item-news-common > h3.title-news > a")
                needs = max(0, min(len(link_articles), numUrls - len(list_urls), max_urls_per_day))
                list_urls.extend(link.get_attribute("href") for link in link_articles[:needs])
                pbar.update(needs)
    finally:
        # Always release the browser process, including when the crawl fails midway.
        driver.quit()

    contentOfjson = {
        "category"  : category,
        "list_urls" : list_urls
    }

    file_path = "urls_of_articles/" + category + ".json"
    # Create the output folder on first use so a long crawl is not lost to a
    # FileNotFoundError at the very end.
    os.makedirs(os.path.dirname(file_path), exist_ok = True)
    with open(file_path, "w", encoding = "utf-8") as json_file:
        json.dump(contentOfjson, json_file, ensure_ascii = False, indent = 4)

In [10]:
# Category slugs to crawl, one listing page each.
categories = [
    "giao-duc",
    "phap-luat",
    "giai-tri",
    "kinh-doanh",
    "cong-nghe",
    "the-thao",
    "the-gioi",
]

# Sampling window: the whole of 2023.
dt_begin = datetime(2023, 1, 1)
dt_end = datetime(2023, 12, 31)

# Crawl the categories one after another and report progress after each.
for i, category in enumerate(categories):
    crawl_link_article_category(
        category = category,
        numUrls = 200,
        date_begin = dt_begin,
        date_end = dt_end,
    )
    print(f"Complete {i + 1}/{len(categories)} categories : {category}")

Crawling Progress: 100%|[36m██████████[0m| 200/200 [08:32<00:00,  2.56s/url]


Complete 1/7 categories : giao-duc


Crawling Progress: 100%|[36m██████████[0m| 200/200 [08:35<00:00,  2.58s/url]


Complete 2/7 categories : phap-luat


Crawling Progress: 100%|[36m██████████[0m| 200/200 [06:38<00:00,  1.99s/url]


Complete 3/7 categories : giai-tri


Crawling Progress: 100%|[36m██████████[0m| 200/200 [06:46<00:00,  2.03s/url]


Complete 4/7 categories : kinh-doanh


Crawling Progress: 100%|[36m██████████[0m| 200/200 [06:58<00:00,  2.09s/url]


Complete 5/7 categories : cong-nghe


Crawling Progress: 100%|[36m██████████[0m| 200/200 [07:04<00:00,  2.12s/url]


Complete 6/7 categories : the-thao


Crawling Progress: 100%|[36m██████████[0m| 200/200 [06:38<00:00,  1.99s/url]

Complete 7/7 categories : the-gioi



