In [14]:
import time
import requests
import re
import hashlib
from urllib import parse
from urllib.request import build_opener, install_opener, urlretrieve
from datasets import Dataset
from datasets import load_dataset

In [4]:
# Language codes appended to root_url to form the initial crawl seeds.
lang_list = [
    'zh', 'en', 'fr',
    'es', 'ru', 'ar',
]
# Site root that the language codes above are appended to.
root_url = 'https://www.un.org/'
# NOTE(review): declared `global` in get_html() but never read or written in the visible cells.
url_status = dict()
# Captures the href of an <a> tag when it is either root-relative ("/...")
# or an absolute http(s) URL on a *.un.org host.
link_pattern = r'<a href="((?:https?://[^/]*?\.un\.org)?/[^"]*?)"'
# Lower-case file extensions (without the dot) that is_media() treats as media files.
media_format_list = [
    'avi', 'wmv', 'mpeg', 'mp4', 'mov', 'mkv', 'flv', 'f4v',
    'm4v', 'rmvb', 'rm', '3gp', 'dat', 'ts', 'mts', 'vob',
    'bmp', 'jpg', 'png', 'tiff', 'gif', 'pcx', 'tga', 'exif',
    'fpx', 'svg', 'psd', 'cdr', 'pcd', 'dxf', 'ufo', 'eps',
    'ai', 'raw', 'wmf', 'mp3', 'aiff', 'aac',
]

In [5]:
# A URL containing any of these substrings is skipped entirely.
excluded_url_pattern_list = ['search?', 'download?', 'subscribe?', 'system/403?', 'sustainabledevelopment.un.org/',
                             'https://www.un.org/unispal/documents/?']
# (regex, replacement) pairs applied to a URL in this order by clean_url().
url_clean_pattern_list = [
    (r'^http://\s*', 'https://'),      # upgrade the scheme, dropping whitespace right after it
    (r'\r?\n', ''),                    # remove embedded line breaks
    (r'\s*#.*', ''),                   # drop the fragment and any whitespace before it
    (r'(?<=\.pdf)&.*', ''),            # drop everything from an '&' that directly follows '.pdf'
    (r'(?<=asp)\?.*', ''),             # drop the query string that directly follows 'asp'
    # The dots of '..' must be escaped: unescaped, '/../' matched ANY two-character
    # segment, so 'https://www.un.org/en/about' lost its '/en/' language prefix.
    (r'\.un\.org/\.\./', '.un.org/'),  # '/../' directly after the host has no parent to climb to
    (r'/[^./]*?/\.\./', '/'),          # collapse '<dir>/../' into '/'
    (r'\s*[|/]$', ''),                 # strip one trailing '/' or '|' and whitespace before it
    (r'https://(.*?\.un\.org)//\1/', r'https://\1/'),  # 'https://host//host/' -> 'https://host/'
    (' ', '%20'),                      # percent-encode remaining spaces
]


In [6]:
def escape(c):
    """Replace characters that are not allowed in Windows file names.

    Each of ``? : | / * " \\ < >`` is replaced by a readable ``_NAME_`` token;
    every other character is kept as is.

    :param c: text to escape.
    :return: the escaped text.
    """
    # None of the tokens contains a character that is itself replaced, so a
    # single-pass translation gives the same result as chained replacements.
    substitutions = str.maketrans({
        '?': '_QMARK_',
        ':': '_COLON_',
        '|': '_PIPE_',
        '/': '_SLASH_',
        '*': '_STAR_',
        '"': '_QT_',
        '\\': '_BS_',
        '<': '_LT_',
        '>': '_GT_',
    })
    return c.translate(substitutions)



In [7]:
def get_html(url):
    """Download ``url`` and return its body decoded as UTF-8 text.

    :param url: absolute URL to fetch.
    :return: the decoded body, or None when the request fails, the server
        answers with an HTTP error status, or the body is not valid UTF-8.
        Every failure is printed with a timestamp.
    """
    header = {
        'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/96.0.4664.110 Safari/537.36 "
    }
    try:
        resp = requests.get(url, headers=header, timeout=30)
        # A 4xx/5xx answer is an error page, not the document that was asked
        # for; treat it as a failed fetch instead of returning it as content.
        resp.raise_for_status()
        # Strict decoding is intentional: a body that is not valid UTF-8
        # raises here and is reported as a failure.
        return resp.content.decode('utf8')
    except Exception as e:
        # Best-effort crawl: log the failure and let the caller move on.
        print("%s: %s" % (time.strftime('%Y-%m-%d %H:%M:%S'), str(e)))
        return None



In [8]:
def is_media(url):
    """Tell whether ``url`` points at a media file, judged by its file extension.

    The extension is taken from the last path segment after the query string
    and fragment have been removed, so ``photo.jpg?itok=abc`` is recognised
    and ``page.html?img=photo.jpg`` is not. The comparison is case-insensitive.

    :param url: URL to inspect.
    :return: True when the extension is listed in ``media_format_list``.
    """
    # Everything from the first '?' or '#' is query/fragment, not part of the file name.
    path = re.split(r'[?#]', url, maxsplit=1)[0]
    last_segment = path.rsplit('/', 1)[-1]
    _, dot, extension = last_segment.rpartition('.')
    if not dot:
        # No dot in the last segment: there is no extension to compare.
        return False
    return extension.lower() in media_format_list

def has_excluded_url_pattern(url):
    """Tell whether ``url`` contains one of the excluded substrings.

    :param url: URL to check.
    :return: True as soon as an entry of ``excluded_url_pattern_list`` occurs
        in ``url`` as a plain substring, False when none does.
    """
    for fragment in excluded_url_pattern_list:
        if fragment in url:
            return True
    return False

def clean_url(url):
    """Normalise ``url`` by applying every clean-up rule in turn.

    :param url: URL to normalise.
    :return: the URL after each ``(regex, replacement)`` pair of
        ``url_clean_pattern_list`` has been substituted, in list order.
    """
    cleaned = url
    for rule in url_clean_pattern_list:
        regex, replacement = rule
        cleaned = re.compile(regex).sub(replacement, cleaned)
    return cleaned


In [9]:
def parse_urls(curr_url, html):
    """Extract the links worth crawling from a page.

    :param curr_url: URL of the page ``html`` was downloaded from; used to
        turn root-relative links into absolute ones.
    :param html: page source to scan with ``link_pattern``.
    :return: cleaned absolute URLs in order of first appearance, without
        duplicates, excluded URLs or media files.
    """
    # Cut curr_url right after ".un.org" so only scheme and host remain.
    site_root = re.sub(r'(?<=\.un\.org)/.*', '', curr_url)
    # A dict keeps insertion order and gives de-duplication for free.
    unique_links = {}
    for raw_link in re.findall(link_pattern, html):
        link = raw_link.strip()
        # Turn a root-relative path into an absolute URL.
        if link[0] == '/':
            link = site_root + link
        # Skip URLs containing an excluded substring, and media files.
        if has_excluded_url_pattern(link) or is_media(link):
            continue
        # Normalise before de-duplicating.
        unique_links.setdefault(clean_url(link), None)
    return list(unique_links)



In [20]:
def get_md5(content):
    """Return the hexadecimal MD5 digest of ``content`` encoded as UTF-8.

    :param content: text to hash.
    :return: 32-character lower-case hex digest.
    """
    encoded = content.encode('utf8')
    return hashlib.md5(encoded).hexdigest()

def make_content_item(index, url, status, content):
    """Build one row for the content dataset.

    :param index: value stored under ``id``.
    :param url: value stored under ``url``.
    :param status: value stored under ``status``.
    :param content: page text; when it is None or empty the ``hash`` is None.
    :return: dict with keys ``id``, ``url``, ``status``, ``content``,
        ``hash`` and ``is_duplicate`` (always 0 here).
    """
    digest = get_md5(content) if content else None
    return {
        'id': index,
        'url': url,
        'status': status,
        'content': content,
        'hash': digest,
        'is_duplicate': 0,
    }

In [None]:
# Resume from the seed list stored on the Hub; if it cannot be loaded, start
# over from one root page per language.
try:
    url_seed = load_dataset('hayesyang/un_corpus_seed', split='train')
except Exception as e:
    # `except Exception` rather than a bare `except`, so that a keyboard /
    # kernel interrupt is not mistaken for "no dataset yet". The reason is
    # printed because a later push would replace the Hub copy with this seed list.
    print('Could not load hayesyang/un_corpus_seed, starting from the root pages: %s' % e)
    initial_seed = {'id': list(range(len(lang_list))), 'url': [root_url + lang for lang in lang_list]}
    url_seed = Dataset.from_dict(initial_seed)

In [11]:
# Resume from the page contents stored on the Hub; if they cannot be loaded,
# start with an empty dataset that has the same columns.
try:
    url_content = load_dataset('hayesyang/un_corpus_content', split='train')
except Exception as e:
    # `except Exception` rather than a bare `except`, so that a keyboard /
    # kernel interrupt is not mistaken for "no dataset yet". The reason is
    # printed because a later push would replace the Hub copy with this empty set.
    print('Could not load hayesyang/un_corpus_content, starting with an empty dataset: %s' % e)
    url_content = Dataset.from_dict({'id':[], 'url':[], 'status': [], 'content': [], 'hash': [], 'is_duplicate': []})

Found cached dataset parquet (/home/user/.cache/huggingface/datasets/hayesyang___parquet/hayesyang--un_corpus_content-8a63ccf80c52bb19/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


In [24]:
def push_to_hub(seed_set, content_set):
    """Upload both datasets to their Hugging Face Hub repositories.

    The access token is read from the ``HF_TOKEN`` environment variable so that
    no credential lives in the notebook. When the variable is unset, ``None``
    is passed as the token.

    NOTE(review): the token that used to be hardcoded here must be considered
    leaked and should be revoked.

    :param seed_set: dataset pushed to ``hayesyang/un_corpus_seed``.
    :param content_set: dataset pushed to ``hayesyang/un_corpus_content``.
    """
    import os  # local import: `os` is not among the notebook's top-level imports

    token = os.environ.get('HF_TOKEN')
    seed_set.push_to_hub('hayesyang/un_corpus_seed', token=token)
    content_set.push_to_hub('hayesyang/un_corpus_content', token=token)

In [25]:
def process(seed_set, content_set):
    """Run one crawl pass over the seeds that have no content row yet.

    Seeds are processed from index ``len(content_set)`` up to the number of
    seeds present when the pass starts. Exactly one content row is stored per
    processed seed (status 1 with the page text, or status -1 when the fetch
    failed), so the two datasets stay index-aligned and the next pass resumes
    at the right place. Links found on fetched pages that are not yet seeds
    are appended to the seed list for a later pass.

    :param seed_set: dataset with ``id`` and ``url`` columns.
    :param content_set: dataset with ``id``, ``url``, ``status``, ``content``,
        ``hash`` and ``is_duplicate`` columns.
    :return: ``(has_new, seed_set, content_set)`` where ``has_new`` tells
        whether any new seed URL was added, and the datasets are the updated
        ones (``add_item`` returns a new dataset, so callers must use these
        return values rather than the objects they passed in).
    """
    start_index = len(content_set)
    end_index = len(seed_set)
    has_new = False
    processed = 0

    # Materialise the lookup columns once instead of on every membership test.
    known_urls = set(seed_set['url'])
    known_hashes = set(content_set['hash'])

    for i in range(start_index, end_index):
        url = seed_set[i]['url']
        html = get_html(url)
        if html:
            new_content = make_content_item(i, url, 1, html)
            if new_content['hash'] in known_hashes:
                new_content['is_duplicate'] = 1
            else:
                known_hashes.add(new_content['hash'])
            # add_item() does not modify the dataset in place; keep its result.
            content_set = content_set.add_item(new_content)

            for new_url in parse_urls(url, html):
                if re.sub('/$', '', new_url) in known_urls:
                    continue
                seed_set = seed_set.add_item({'id': len(seed_set), 'url': new_url})
                known_urls.add(new_url)
                has_new = True
        else:
            # Record the failure too; otherwise len(content_set) falls behind
            # the seed index and the next pass re-fetches the wrong range.
            content_set = content_set.add_item(make_content_item(i, url, -1, None))

        processed += 1
        # Checkpoint every 200 processed seeds. Counting after the increment
        # avoids pushing on every iteration while the counter sits at 0.
        if processed % 200 == 0:
            push_to_hub(seed_set, content_set)

    push_to_hub(seed_set, content_set)
    return has_new, seed_set, content_set


In [None]:
# First crawl pass: process() walks the seeds from index len(url_content) onward
# and returns a "new URLs found" flag together with the seed and content datasets,
# which are rebound to the notebook-level names here.
has_new_url, url_seed, url_content = process(url_seed, url_content)

2023-03-18 02:33:34: 'utf-8' codec can't decode byte 0xe2 in position 10: invalid continuation byte


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]
Creating parquet from Arrow format: 100%|██████████| 68/68 [00:00<00:00, 2731.92ba/s]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:00<00:00,  2.84it/s]
Deleting unused files from dataset repository: 100%|██████████| 1/1 [00:00<00:00,  4.33it/s]
Updating downloaded metadata with the new split.
Pushing dataset shards to the dataset hub:   0%|          | 0/3 [00:00<?, ?it/s]
Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s][A
Creating parquet from Arrow format:  20%|██        | 1/5 [00:00<00:02,  1.52ba/s][A
Creating parquet from Arrow format:  40%|████      | 2/5 [00:01<00:01,  1.60ba/s][A
Creating parquet from Arrow format:  60%|██████    | 3/5 [00:01<00:01,  1.55ba/s][A
Creating parquet from Arrow format:  80%|████████  | 4/5 [00:02<00:00,  1.40ba/s][A
Creating parquet from Arrow format: 100%|██████████| 5/5 [00:03<00:00,  1.60ba/s][A

Upload 1 LFS files

In [None]:
# Keep running crawl passes for as long as the previous pass reported new seed URLs.
while has_new_url:
    has_new_url, url_seed, url_content = process(url_seed, url_content)
    