In [None]:
import json
import os
import time

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

In [None]:
def _extract_images(soup, selector):
    """Return a list of ``(src, alt)`` tuples for every element matched by ``selector``."""
    records = []
    for img in soup.select(selector):
        src = img.get("src", "")
        if not src.startswith("https"):
            # ``src`` is not an https URL (presumably a lazy-load placeholder),
            # so fall back to the ``data-src`` attribute.
            src = img.get("data-src", "")
        alt = img.get("alt", "") or "No caption available"
        records.append((src, alt))
    return records


def crawling_article(url: str, timeout: float = 30):
    """Download one article page and extract its text and image metadata.

    Parameters
    ----------
    url : str
        Address of the article page to fetch.
    timeout : float, optional
        Seconds passed to ``requests.get`` as its ``timeout``. Without it a
        stalled connection would block the crawl indefinitely.

    Returns
    -------
    dict
        ``"url"``      -- the ``url`` argument, unchanged.
        ``"title"``    -- stripped text of the first ``h1.title-detail``, or ``""``.
        ``"content"``  -- newline-joined text of, in order: ``span.date``,
                          ``h1.title-detail``, ``p.description``, then every
                          ``<p>`` and every table row that is a direct child
                          of ``article.fck_detail``.
        ``"metadata"`` -- list of ``(src, alt)`` tuples for article images
                          followed by video poster images.

    Raises
    ------
    requests.exceptions.RequestException
        Propagated from ``requests.get`` (connection failure, timeout, ...).

    Notes
    -----
    The HTTP status code is not checked, so an error page is parsed like any
    other page and typically yields empty fields.
    """
    news = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(news.content, "html.parser")

    title = soup.select("h1.title-detail")

    content = [node.text for node in soup.select("span.date")]
    content.extend(node.text for node in title)
    content.extend(node.text for node in soup.select("p.description"))

    for element in soup.select("article.fck_detail > *"):
        if element.name == "p":
            content.append(element.text)
        elif element.name == "table":
            for row in element.find_all("tr"):
                # Only look at the cells themselves: iterating ``row.children``
                # also yields the whitespace text nodes between cells, which
                # polluted each row with stray/doubled spaces.
                cells = row.find_all(["td", "th"], recursive=False)
                content.append(" ".join(cell.text.strip() for cell in cells))

    metadata = _extract_images(
        soup, "article.fck_detail div.fig-picture img.lazy"
    )
    metadata.extend(
        _extract_images(
            soup,
            "article.fck_detail div.box_embed_video_parent.embed_video_new img",
        )
    )

    return {
        "url": url,
        "title": title[0].text.strip() if title else "",
        "content": "\n".join(content),
        "metadata": metadata,
    }

In [None]:
def read_urls_from_category(category : str):
    """Load the article URLs stored for one category.

    Reads ``urls_of_articles/<category>.json`` (UTF-8, relative to the current
    working directory) and returns the value held under its ``"list_urls"``
    key.

    Raises whatever ``open`` / JSON decoding raise when the file is missing or
    malformed, and ``KeyError`` when ``"list_urls"`` is absent.
    """
    listing_path = "".join(("urls_of_articles/", category, ".json"))
    with open(listing_path, mode="r", encoding="utf-8") as listing_file:
        payload = json.loads(listing_file.read())
    return payload["list_urls"]

def progress(category : str, start_index = None, delay : float = 2):
    """Crawl every article of ``category`` and save each one as a JSON file.

    URLs come from ``read_urls_from_category(category)``. The article at
    position ``i`` of that list is fetched with ``crawling_article`` and
    written to ``VnExpress_Crawled_by_requests_bs4/<category>/<i>.json``
    (UTF-8, non-ASCII characters kept, 4-space indent).

    Parameters
    ----------
    category : str
        Category slug; selects both the URL list and the output sub-directory.
    start_index : int, optional
        Index of the first article to crawl; earlier ones are skipped, which
        allows resuming an interrupted run. When omitted, the historical
        behaviour is kept: ``"cong-nghe"`` resumes at 133 and every other
        category starts at 0.
    delay : float, optional
        Seconds to sleep after each saved article (default 2).
    """
    urls = read_urls_from_category(category)
    numUrls = len(urls)

    if start_index is None:
        # Legacy resume point that used to be hard-coded inside the loop
        # (articles 0..132 of "cong-nghe" were skipped).
        start_index = 133 if category == "cong-nghe" else 0
    start_index = max(0, start_index)

    # Create the target folder up front; opening a file for writing inside a
    # missing directory raises FileNotFoundError.
    output_dir = "VnExpress_Crawled_by_requests_bs4/" + category
    os.makedirs(output_dir, exist_ok=True)

    # ``initial`` counts the skipped articles so a resumed run still reaches
    # 100%; the ``with`` block closes the bar even if a request fails.
    with tqdm(
        total = numUrls,
        initial = min(start_index, numUrls),
        desc = "Crawling Progress",
        colour = "cyan",
        unit = "articles",
    ) as progress_bar:
        for (i, url) in enumerate(urls[start_index:], start = start_index):
            data = crawling_article(url)
            file_save = output_dir + f"/{i}.json"
            with open(file_save, "w", encoding="utf-8") as json_file:
                json.dump(data, json_file, ensure_ascii=False, indent=4)
            progress_bar.update(1)
            time.sleep(delay)

In [None]:
url_of_main_page = "https://vnexpress.net"

# Category slugs to crawl, in order; each one is handed to progress().
categories = [
    "cong-nghe",
    "the-thao",
    "the-gioi",
]
total_categories = len(categories)

for (i, category) in enumerate(categories):
    progress(category)
    finished = i + 1
    # Report how many categories are done once the current one completes.
    print(f"Complete {finished}/{total_categories} categories : {category}")

In [None]:
# url = "https://vnexpress.net/diem-uu-tien-ielts-cua-dai-hoc-cong-nghe-thong-tin-dai-hoc-quoc-gia-tp-hcm-4848706.html"

# response = requests.get(url)

# # with open("page-source.html", "w", encoding = "utf-8") as file:
# #     file.write(response.text)

# soup = BeautifulSoup(response.content, "html.parser")

# content = []

# article = soup.select("article.fck_detail > *")
# for element in article:
#     if (element.name == "p"):
#         pass
#     elif (element.name == "table"):
#         rows = element.find_all("tr")
#         for row in rows:
#             content_in_row = []
#             for col in row.children:
#                 content_in_row.append(col.text.strip())
#             content_in_row = " ".join(content_in_row)    
#             content.append(content_in_row)
            
# for row in content:
#     print(row)

In [None]:
# def test_crawl(url : str):
#     news = requests.get(url)
#     soup = BeautifulSoup(news.content, "html.parser")
    
#     content = []
#     date = soup.select("span.date")
#     for row_date in date:
#         content.append(row_date.text)
    
#     title = soup.select("h1.title-detail")
#     for row_title in title:
#         content.append(row_title.text)
    
#     description = soup.select("p.description")
#     for row_description in description:
#         content.append(row_description.text)
        
#     body = soup.select("article.fck_detail > *")
#     for element in body:
#         if (element.name == "p"):
#             content.append(element.text)
#         elif (element.name == "table"):
#             rows = element.find_all("tr")
#             for row in rows:
#                 content_in_row = []
#                 for col in row.children:
#                     content_in_row.append(col.text.strip())
#                 content_in_row = " ".join(content_in_row)    
#                 content.append(content_in_row)
#     content = "\n".join(content)
    
#     metadata = []
#     images = soup.select("article.fck_detail div.fig-picture img.lazy")
#     for img in images:
#         src = img.get("src", "") if (img.get("src", "").startswith("https")) else img.get("data-src", "")
#         alt = img.get("alt", "")
#         if not alt:
#             alt = "No caption available"
#         metadata.append(tuple([src, alt]))
        
#     images_of_video = soup.select("article.fck_detail div.box_embed_video_parent.embed_video_new img")
#     for img in images_of_video:
#         src = img.get("src", "") if (img.get("src", "").startswith("https")) else img.get("data-src", "")
#         alt = img.get("alt", "")
#         if not alt:
#             alt = "No caption available"
#         metadata.append(tuple([src, alt]))
    
#     data = {
#         "url" : url,
#         "title" : title[0].text.strip() if (title) else "",
#         "content" : content,
#         "metadata" : metadata,
#     }
    
#     return data

# url = "https://vnexpress.net/diem-uu-tien-ielts-cua-dai-hoc-cong-nghe-thong-tin-dai-hoc-quoc-gia-tp-hcm-4848706.html"
# article = test_crawl(url)

# with open("0.json", "w", encoding="utf-8") as json_file:  
#     json.dump(article, json_file, ensure_ascii=False, indent=4)
