In [9]:
import json
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from tqdm import tqdm
import os
import time

# Base URL of the VnExpress site. Not referenced by any other cell visible in
# this notebook; the article URLs are read from per-category JSON files instead.
main_page_url = "https://vnexpress.net"

In [None]:
# Chrome configuration shared by every driver this notebook creates.
chrome_options = webdriver.ChromeOptions()
for flag in ('--no-sandbox', '--headless', '--disable-gpu', '--disable-dev-shm-usage'):
    chrome_options.add_argument(flag)

# 'eager': per the Selenium docs, get() returns once the DOM is ready instead of
# waiting for every sub-resource (images, stylesheets) to finish loading.
chrome_options.page_load_strategy = 'eager'

In [None]:
def _texts_of(driver, by, value):
    """Return the ``.text`` of every element matching the locator, in the order found."""
    return [element.text for element in driver.find_elements(by = by, value = value)]


def _with_caption(img, src):
    """Pair ``src`` with the image's ``alt`` text, using a placeholder when ``alt`` is empty."""
    return (src, img.get_attribute("alt") or "No caption available")


def crawling_article(url : str):
    """Scrape a single article page and return its text and image metadata.

    A new Chrome driver (configured by the module-level ``chrome_options``) is
    started for the call and is always shut down before returning or raising.

    Parameters
    ----------
    url : str
        Address of the article page to load.

    Returns
    -------
    dict
        ``"url"``      : the ``url`` argument, unchanged.
        ``"title"``    : text of the first ``.title-detail`` element, or ``""``
                         when the page has none.
        ``"content"``  : date, title, description and body-paragraph texts
                         joined with newlines, in that order.
        ``"metadata"`` : list of ``(src, alt)`` tuples, article pictures first,
                         then the images found inside embedded-video boxes.

    Raises
    ------
    Whatever Selenium raises while starting the browser, loading the page or
    reading elements; the driver is still quit in those cases.
    """
    driver = webdriver.Chrome(options = chrome_options)
    # try/finally: without it any exception below would leave a headless
    # Chrome process running for every failed article.
    try:
        driver.get(url)

        dates = _texts_of(driver, By.CLASS_NAME, "date")
        titles = _texts_of(driver, By.CLASS_NAME, "title-detail")
        descriptions = _texts_of(driver, By.CSS_SELECTOR, "p.description")
        paragraphs = _texts_of(driver, By.CSS_SELECTOR, "article.fck_detail p.Normal")
        content = '\n'.join(dates + titles + descriptions + paragraphs)

        metadata = []
        pictures = driver.find_elements(By.CSS_SELECTOR, value = "article.fck_detail div.fig-picture img.lazy")
        for img in pictures:
            # Use "src" only once the image reports itself as loaded; otherwise
            # read "data-src" (presumably where the lazy loader keeps the real
            # URL until then -- confirm against the page markup).
            loaded = img.get_attribute("data-ll-status") == "loaded"
            src = img.get_attribute("src" if loaded else "data-src")
            metadata.append(_with_caption(img, src))

        video_images = driver.find_elements(By.CSS_SELECTOR, value = "article.fck_detail div.box_embed_video_parent.embed_video_new img")
        for img in video_images:
            metadata.append(_with_caption(img, img.get_attribute("src")))

        return {
            "url" : url,
            # Pages without a .title-detail element used to raise IndexError here.
            "title" : titles[0] if titles else "",
            "content" : content,
            "metadata" : metadata
        }
    finally:
        driver.quit()

In [None]:
def read_urls_from_category(category : str):
    """Load the article URLs saved for one category.

    Reads ``urls_of_articles/<category>.json`` (UTF-8, relative to the current
    working directory) and returns the value stored under its ``"list_urls"`` key.
    """
    source_path = f"urls_of_articles/{category}.json"
    with open(source_path, "r", encoding = "utf-8") as handle:
        payload = json.load(handle)
    return payload['list_urls']

def progress(category : str, delay_seconds : float = 2):
    """Crawl every saved URL of one category and store each article as JSON.

    The URLs come from ``read_urls_from_category(category)``. Article number
    ``i`` (0-based position in that list) is written to
    ``VnExpress/<category>/<i>.json`` as UTF-8, indented, non-ASCII-preserving
    JSON. A tqdm bar reports progress.

    Parameters
    ----------
    category : str
        Category name; selects both the URL list and the output sub-directory.
    delay_seconds : float, default 2
        Pause after each article, to avoid hitting the site back-to-back.

    Raises
    ------
    Any exception from ``crawling_article`` or from writing the file
    propagates and stops the crawl at that article.
    """
    urls = read_urls_from_category(category)

    output_dir = "VnExpress/" + category
    # open(..., "w") does not create parent directories, so on a fresh run the
    # first write would fail with FileNotFoundError without this.
    os.makedirs(output_dir, exist_ok = True)

    # Used as a context manager so the bar is closed even if a crawl raises.
    with tqdm(total = len(urls), desc = "Crawling Progress", colour = "cyan", unit = "articles") as progress_bar:
        for (i, url) in enumerate(urls):
            data = crawling_article(url)
            file_path = output_dir + f"/{i}.json"
            with open(file_path, "w", encoding="utf-8") as json_file:
                json.dump(data, json_file, ensure_ascii=False, indent=4)
            progress_bar.update(1)
            time.sleep(delay_seconds)
        

In [None]:
# Categories to crawl; each name must have a matching URL list on disk.
categories = ["the-gioi"]

total = len(categories)
for (done, category) in enumerate(categories, start = 1):
    progress(category)
    print(f"Complete {done}/{total} categories : {category}")


In [None]:
# for cat in category:
#     directory_path = "VnExpress/" + cat 
#     if (os.path.exists(directory_path)):    continue
#     os.mkdir(directory_path)

In [None]:
# url = ""
# driver = webdriver.Chrome(options = chrome_options)
# driver.get(url)

# content = []
# date = driver.find_elements(by = By.CLASS_NAME, value = "date")
# for row_date in date:
#     content.append(row_date.text)

# title = driver.find_elements(by = By.CLASS_NAME, value = "title-detail")
# for row_title in title:
#     content.append(row_title.text)
    
# description = driver.find_elements(by = By.CSS_SELECTOR, value = "p.description")
# for row_description in description:
#     content.append(row_description.text)

# main_content = driver.find_elements(by = By.CSS_SELECTOR, value = "article.fck_detail p.Normal")
# for row_main in main_content:
#     content.append(row_main.text)
# content = '\n'.join(content)

# metadata = []
# images = driver.find_elements(By.CSS_SELECTOR, value = "article.fck_detail div.fig-picture img.lazy")
# for img in images:
#     src = img.get_attribute("src") if (img.get_attribute("data-ll-status") == "loaded") else img.get_attribute("data-src")
#     alt = img.get_attribute("alt")
#     if not alt:
#         alt = "No caption available"
#     metadata.append(tuple([src, alt]))
    
# img_of_video = driver.find_elements(By.CSS_SELECTOR, value = "article.fck_detail div.box_embed_video_parent.embed_video_new img")
# for img in img_of_video:
#     src = img.get_attribute("src")
#     alt = img.get_attribute("alt")
#     if not alt:
#         alt = "No caption available"
#     metadata.append(tuple([src, alt]))

# data = {
#     "url" : url,
#     "title" : title[0].text if not title else "",
#     "content" : content,
#     "metadata" : metadata
# }

# with open("0.json", "w", encoding="utf-8") as json_file:  
#     json.dump(data, json_file, ensure_ascii=False, indent=4)
    
# driver.quit()

In [None]:
# driver = webdriver.Chrome(options = chrome_options)
# driver.get(url)

# html = driver.page_source
# with open("page-source.html", "w", encoding = "utf-8") as file:
#     file.write(html)
    
# driver.quit()