In [29]:
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from bs4 import BeautifulSoup
import argparse

# Root of the local download directory (name-based reading — confirm where it is actually consumed).
output_folder = './downloaded_websites'
# Language codes substituted into https://news.un.org/{lang}/sitemap.xml by get_all_lang_url.
lang_list = ['zh', 'en', 'fr', 'es', 'ru', 'ar']
# File that save_error_url appends failed URLs to, one per line.
error_url_save_path = "./error_url.txt"

def save_error_url(url):
    """Append a failed URL to the error log, one URL per line.

    Args:
        url: The URL whose download failed.
    """
    # Append mode creates the file when it is missing, so no existence check is
    # needed (and the function no longer relies on `os`, which this notebook
    # only imports in a later cell). The explicit encoding keeps non-ASCII URLs
    # from failing on platforms whose default codec is not UTF-8.
    with open(error_url_save_path, 'a', encoding='utf-8') as f:
        f.write(url + '\n')

In [30]:
def get_html(url, retries=3, backoff_factor=0.5, timeout=30):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address to fetch.
        retries: Total retry count handed to urllib3's ``Retry``.
        backoff_factor: Back-off factor handed to ``Retry``.
        timeout: Seconds to wait on the server before giving up. Without a
            timeout a stalled connection would block the calling worker
            indefinitely.

    Returns:
        The response body as text, or ``None`` when the request failed. On
        failure the URL is also recorded through ``save_error_url``.
    """
    retry = Retry(total=retries, backoff_factor=backoff_factor)
    adapter = HTTPAdapter(max_retries=retry)

    # The context manager closes the session (and its pooled connections) once
    # the request is done instead of leaving one open session per call.
    with requests.Session() as session:
        session.mount('http://', adapter)
        session.mount('https://', adapter)

        try:
            response = session.get(url, timeout=timeout)
            response.raise_for_status()
        except Exception as e:
            # Deliberately broad: the crawl is best-effort, so any failure is
            # logged and recorded rather than aborting the whole run.
            print(f"链接异常: {url} --- {e}")
            save_error_url(url)
            return None

        return response.text

In [31]:
def extract_urls_from_sitemap(sitemap_url):
    """Return the text of every ``<loc>`` element found at ``sitemap_url``.

    Args:
        sitemap_url: URL of the sitemap document to download.

    Returns:
        A list of URL strings; empty when the download failed or returned no
        content.
    """
    # NOTE(review): this function is defined in two consecutive cells; the
    # later definition is the one in effect after a top-to-bottom run.
    html_text = get_html(sitemap_url)
    if not html_text:
        return []

    # Name the parser explicitly so the result does not depend on which
    # optional parser packages happen to be installed.
    soup = BeautifulSoup(html_text, "html.parser")

    urls = []
    for loc in soup.find_all('loc'):
        # Pretty-printed sitemaps may pad <loc> with whitespace or newlines,
        # which would otherwise end up inside the URL.
        text = loc.get_text().strip()
        if text:
            urls.append(text)
    return urls

In [32]:
def extract_urls_from_sitemap(sitemap_url):
    """Return the text of every ``<loc>`` element found at ``sitemap_url``.

    Args:
        sitemap_url: URL of the sitemap document to download.

    Returns:
        A list of URL strings; empty when the download failed or returned no
        content.
    """
    # NOTE(review): this repeats the definition from the previous cell; being
    # the later one, it is the definition in effect after a top-to-bottom run.
    html_text = get_html(sitemap_url)
    if not html_text:
        return []

    # Name the parser explicitly so the result does not depend on which
    # optional parser packages happen to be installed.
    soup = BeautifulSoup(html_text, "html.parser")

    urls = []
    for loc in soup.find_all('loc'):
        # Pretty-printed sitemaps may pad <loc> with whitespace or newlines,
        # which would otherwise end up inside the URL.
        text = loc.get_text().strip()
        if text:
            urls.append(text)
    return urls

In [33]:
import uuid

def generate_preupload_dataste_dict(url):
    """Download one page and wrap it as a single-entry ``{url: row}`` mapping.

    The row carries the keys ``uuid`` (freshly generated), ``url``, ``title``
    and ``html_content`` (the serialized ``<body>`` element, or ``""`` when the
    page has none).

    Args:
        url: Page to download.

    Returns:
        ``{url: row}`` on success, or an empty dict when the download failed.
    """
    html_text = get_html(url)
    if not html_text:
        # Return an empty mapping instead of falling through to an implicit
        # None: the result is fed to dict.update(), which raises TypeError on
        # None.
        return {}

    # Name the parser explicitly so the result does not depend on which
    # optional parser packages happen to be installed.
    soup = BeautifulSoup(html_text, "html.parser")

    # Prefer the element carrying the title field class; fall back to <title>.
    title_tag = soup.select(".field--name-title") or soup.select("head > title")
    title = title_tag[0].get_text() if title_tag else ""

    body_tags = soup.select("body")
    html_content = str(body_tags[0]) if body_tags else ""

    return {
        url: {
            "uuid": str(uuid.uuid4()),
            "url": url,
            "title": title,
            "html_content": html_content,
        }
    }

In [48]:
def get_all_lang_url():
    """Collect, per language, the URLs listed in the news.un.org sitemaps.

    Returns:
        A dict mapping each code in ``lang_list`` to a list of URLs: the
        entries of that language's top-level sitemap, followed by the entries
        of every document those entries point to.
    """
    lang_with_urls = {}

    print("Fetching all sitemap_urls...")
    for lang in lang_list:
        sitemap_urls = extract_urls_from_sitemap(f"https://news.un.org/{lang}/sitemap.xml")
        # Key on the loop variable so a repeated language code accumulates its
        # URLs instead of overwriting the earlier list.
        lang_with_urls.setdefault(lang, []).extend(sitemap_urls)

    print("Fetching all urls...")
    for lang, top_level_urls in lang_with_urls.items():
        nested_urls = []
        for loc in top_level_urls:
            nested_urls += extract_urls_from_sitemap(loc)

        # Extend only after the inner loop so the list being iterated is not
        # grown mid-iteration. The top-level entries are kept on purpose: if
        # the top-level sitemap lists pages directly, they are the result.
        top_level_urls.extend(nested_urls)

    return lang_with_urls

In [40]:
# URL -> row cache consulted by generate_dataset_row. start_generate_rest_html
# rebinds it (via `global`) before submitting work to its thread pool.
downloaded_html_dict = None

def generate_dataset_row(url):
    """Return the ``{url: row}`` entry for ``url``, reusing a cached row if any.

    Args:
        url: Page whose row is wanted.

    Returns:
        ``{url: cached_row}`` when ``downloaded_html_dict`` already holds a row
        for ``url``; otherwise whatever ``generate_preupload_dataste_dict``
        returns for it.
    """
    # The module-level cache starts out as None; treat that as "nothing cached"
    # instead of failing with AttributeError on None.get().
    cache = downloaded_html_dict or {}

    cached_row = cache.get(url)
    if cached_row:
        return {url: cached_row}

    return generate_preupload_dataste_dict(url)

In [41]:
import datasets
from datasets import DatasetDict, Dataset

# Empty column-oriented template with the four row fields.
# NOTE(review): nothing in the visible cells reads this name — confirm before removing.
dataset_lang_template = {"uuid":[], "url":[],"title":[], "html_content":[]}

def transform_dataset_to_dict(dataset_single_lang):
    """Index the rows of one language split by their ``url`` field.

    Args:
        dataset_single_lang: Iterable of row mappings, each with a ``'url'`` key.

    Returns:
        A dict mapping each row's URL to the row itself; when a URL occurs more
        than once, the last row wins.
    """
    return {row['url']: row for row in dataset_single_lang}
    
    
def transform_dict_to_dataset(dataset_single_lang_dict):
    """Turn a URL -> row mapping into a column-oriented ``Dataset``.

    Args:
        dataset_single_lang_dict: Mapping from URL to a row holding the keys
            ``uuid``, ``url``, ``title`` and ``html_content``.

    Returns:
        The result of ``Dataset.from_dict`` on the four columns, with rows in
        the mapping's iteration order.
    """
    column_names = ("uuid", "url", "title", "html_content")

    columns = {
        name: [dataset_single_lang_dict[url][name] for url in dataset_single_lang_dict]
        for name in column_names
    }

    return Dataset.from_dict(columns)


def update_online_dataset(dataset_dict, lang=None):
    """Push one language's rows to the Hub as the split named ``lang``.

    Args:
        dataset_dict: URL -> row mapping, as accepted by
            ``transform_dict_to_dataset``.
        lang: Name of the split to upload.

    Raises:
        ValueError: If ``lang`` is not given.
    """
    # Local import: at this point in the notebook `os` has not been imported yet.
    import os

    if not lang:
        raise ValueError("lang (the split name to upload) is required")

    # Never embed the access token in the notebook. Read it from the
    # environment; when it is unset, None is passed and the library's own
    # credential lookup applies (see the push_to_hub documentation).
    token = os.environ.get("HF_TOKEN")

    # transform_dict_to_dataset already returns a Dataset, so it is pushed
    # directly rather than being wrapped in Dataset.from_dict again.
    dataset = transform_dict_to_dataset(dataset_dict)
    dataset.push_to_hub("ranWang/un_corpus_for_sitemap", split=lang, token=token)

In [51]:
from concurrent.futures import ThreadPoolExecutor

def start_generate_rest_html(lang, dataset, urls):
    """Build the dataset for one language, downloading only uncached pages.

    Args:
        lang: Language code of the split being processed (not used in the body).
        dataset: Rows already available for this language; they are indexed by
            URL and installed as the module-level ``downloaded_html_dict`` cache.
        urls: URLs that should make up the resulting dataset.

    Returns:
        The value of ``transform_dict_to_dataset`` for the rows gathered from
        ``urls``; URLs whose download failed are left out.
    """
    global downloaded_html_dict

    # Must be set before any task is submitted: the workers read this global.
    downloaded_html_dict = transform_dataset_to_dict(dataset)

    pre_upload_rows = {}

    with ThreadPoolExecutor(max_workers=8) as executor:
        futures = [executor.submit(generate_dataset_row, url) for url in urls]

        with tqdm(total=len(futures)) as pbar:
            # Consume in submission order so rows keep the order of `urls`.
            for future in futures:
                row = future.result()
                # A failed download yields no row (None or an empty mapping);
                # dict.update(None) would raise TypeError and abort the run.
                if row:
                    pre_upload_rows.update(row)
                pbar.update(1)

    return transform_dict_to_dataset(pre_upload_rows)

In [44]:
import multiprocessing
from time import time
import os
from tqdm import tqdm
    
    
def main():
    """Refresh every language split of the Hub dataset from the sitemaps.

    Loads the existing dataset (or starts from empty placeholders when it is
    not found), gathers the per-language URL lists, rebuilds each split and
    pushes the dataset back to the Hub after each split is rebuilt.
    """
    repo_id = "ranWang/un_corpus_for_sitemap"

    # Never embed the access token in the notebook. Read it from the
    # environment; when it is unset, None is passed and the library's own
    # credential lookup applies (see the push_to_hub documentation).
    token = os.environ.get("HF_TOKEN")

    try:
        dataset = datasets.load_dataset(repo_id)
    except FileNotFoundError:
        dataset = DatasetDict({lang: [] for lang in lang_list})

    lang_with_urls = get_all_lang_url()

    for lang in dataset:
        dataset[lang] = start_generate_rest_html(lang,
                                                 dataset[lang],
                                                 lang_with_urls[lang])

        # In the fallback case the not-yet-processed splits are still plain
        # lists, which cannot be uploaded; push only once every split is a
        # real Dataset. With a dataset loaded from the Hub this holds on every
        # iteration, so progress is still saved after each language.
        if all(isinstance(split, Dataset) for split in dataset.values()):
            dataset.push_to_hub(repo_id, token=token)
    

In [50]:
# Run the whole pipeline defined by main().
main()

Downloading readme:   0%|          | 0.00/751 [00:00<?, ?B/s]

Using custom data configuration ranWang--un_corpus_for_sitemap-4859af16cd95e76a


Downloading and preparing dataset None/None to /home/ran/.cache/huggingface/datasets/ranWang___parquet/ranWang--un_corpus_for_sitemap-4859af16cd95e76a/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data files:   0%|          | 0/6 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/1.01M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.15M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.33M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.09M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/978k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/6 [00:00<?, ?it/s]

Generating es split:   0%|          | 0/100 [00:00<?, ? examples/s]

Generating zh split:   0%|          | 0/100 [00:00<?, ? examples/s]

Generating ar split:   0%|          | 0/100 [00:00<?, ? examples/s]

Generating ru split:   0%|          | 0/100 [00:00<?, ? examples/s]

Generating fr split:   0%|          | 0/100 [00:00<?, ? examples/s]

Generating en split:   0%|          | 0/100 [00:00<?, ? examples/s]

Dataset parquet downloaded and prepared to /home/ran/.cache/huggingface/datasets/ranWang___parquet/ranWang--un_corpus_for_sitemap-4859af16cd95e76a/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


  0%|          | 0/6 [00:00<?, ?it/s]

Fetching all sitemap_urls...
Fetching all urls...


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 150874.24it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 241051.95it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 210769.05it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 224294.33it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 239674.51it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 162569.92it/s]
Pushing split es to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

ConnectionError: TLS/SSL connection has been closed (EOF) (_ssl.c:997)