In [2]:
import time
import requests
import re
import hashlib
from urllib import parse
from urllib.request import build_opener, install_opener, urlretrieve
from datasets import Dataset
from datasets import load_dataset
import os.path
import sys

In [39]:
# Language codes appended to root_url to build the initial seed URLs.
lang_list = ['zh', 'en', 'fr', 'es', 'ru', 'ar']
# Site root that the initial seed URLs are built from.
root_url = 'https://www.un.org/'
# Every URL discovered so far (both pending and already processed).
url_seed = set()
# One record (dict) per processed URL; see make_content_item for the fields.
url_result = []
# Captures href targets that are site-relative ("/...") or absolute links on a *.un.org host.
link_pattern = r'<a href="((?:https?://[^/]*?\.un\.org)?/[^"]*?)"'
# File extensions that is_media() treats as media; such URLs are skipped by parse_urls().
media_format_list = ['avi', 'wmv', 'mpeg', 'mp4', 'mov', 'mkv', 'flv', 'f4v', 'm4v', 'rmvb', 'rm', '3gp', 'dat', 'ts',
                     'mts', 'vob', 'bmp', 'jpg', 'png', 'tiff', 'gif', 'pcx', 'tga', 'exif', 'fpx', 'svg', 'psd', 'cdr',
                     'pcd', 'dxf', 'ufo', 'eps', 'ai', 'raw', 'wmf', 'mp3', 'aiff', 'aac']
# Directory under which downloaded pages are written.
base_dir = 'hayesyang/un_corpus/'
# Directory holding the url_seed.txt / url_result.txt checkpoint files.
log_dir = 'hayesyang/'
# Number of URLs processed in this kernel session; process_url checkpoints every 500.
counter = 0

In [4]:
# URLs containing any of the strings in this list are not processed.
excluded_url_pattern_list = ['search?', 'download?', 'subscribe?', 'system/403?', 'sustainabledevelopment.un.org/',
                             'https://www.un.org/unispal/documents/?', '.pdf', '.doc', '.xls', '.ppt']
# (regex, replacement) pairs; clean_url() applies them to a URL one after another, in list order.
url_clean_pattern_list = [(r'^http://\s*', 'https://'), (r'\r?\n', ''), (r'\s*#.*', ''), (r'(?<=\.pdf)&.*', ''),
                          (r'(?<=asp)\?.*', ''), (r'\.un\.org/../', '.un.org/'), (r'/[^./]*?/../', '/'),
                          (r'\s*[|/?]$', ''), (r'https://(.*?\.un\.org)//\1/', r'https://\1/'), (' ', '%20'),
                          (r'(.*ldcportal/content/[^?\n]*)\?.*', r'\1'), (r'\t', ''), (r'\?(?!page=).*', '')]


In [36]:
def initialize_url_seed():
    """Restore the crawl state from the checkpoint files in ``log_dir``.

    * ``url_seed`` is read from ``url_seed.txt`` (one URL per line). When the
      file does not exist, it is initialised to ``root_url + lang`` for every
      language in ``lang_list``.
    * ``url_result`` is read from ``url_result.txt`` (tab-separated: url,
      status, path, hash, duplicate flag). When the file does not exist,
      ``url_result`` is left untouched.

    Records use the same keys as ``make_content_item`` (``'is_duplicate'``),
    and the literal ``'None'`` that ``save_url_result`` writes for missing
    path/hash values is turned back into ``None``.
    """
    global url_seed, url_result

    # makedirs tolerates an already existing directory and creates parents.
    os.makedirs(log_dir, exist_ok=True)

    seed_file = log_dir + 'url_seed.txt'
    if os.path.exists(seed_file):
        with open(seed_file, 'r', encoding='utf8') as f:
            # Skip blank lines so an empty string never becomes a seed URL.
            url_seed = set(line.strip() for line in f if line.strip())
    else:
        url_seed = set(root_url + lang for lang in lang_list)

    result_file = log_dir + 'url_result.txt'
    if os.path.exists(result_file):
        records = []
        with open(result_file, 'r', encoding='utf8') as f:
            for line in f:
                if not line.strip():
                    # A blank line carries no record; indexing it would fail.
                    continue
                fields = [field.strip() for field in line.split('\t')]
                records.append({'url': fields[0],
                                'status': int(fields[1]),
                                'path': None if fields[2] == 'None' else fields[2],
                                'hash': None if fields[3] == 'None' else fields[3],
                                'is_duplicate': int(fields[4])})
        url_result = records

In [37]:
def save_url_result():
    """Write the crawl state to the checkpoint files in ``log_dir``.

    ``url_seed.txt`` receives one URL per line; ``url_result.txt`` receives one
    tab-separated line per record, with the record's values converted via
    ``str`` in the record's own key order.
    """
    global url_seed, url_result, log_dir

    # Build the text first so the file is only opened once the content exists.
    seed_text = '\n'.join(url_seed)
    with open(log_dir + 'url_seed.txt', 'w', encoding='utf8') as seed_file:
        seed_file.write(seed_text)

    rows = []
    for record in url_result:
        rows.append('\t'.join(str(value) for value in record.values()))
    result_text = '\n'.join(rows)
    with open(log_dir + 'url_result.txt', 'w', encoding='utf8') as result_file:
        result_file.write(result_text)


In [29]:
def get_paths(url):
    """Map a URL to a local directory under ``base_dir`` and create it.

    The scheme and one trailing slash are removed, the rest is URL-unquoted and
    split on ``/``. The first component must end with ``.un.org``.

    Args:
        url: The URL of the page that is going to be stored.

    Returns:
        ``(rel_path, paths)`` where ``rel_path`` is the directory (ending in
        ``/``) the page should be written to and ``paths`` is the list of URL
        components (host first, file name last). ``(None, None)`` is returned
        when the host is not a ``.un.org`` host or the directory cannot be
        created (the error is printed).
    """
    global base_dir

    rel_name = re.sub('/$', '', re.sub(r'^https?://', '', url))
    rel_name = parse.unquote(rel_name)
    paths = rel_name.split('/')

    if paths[0][-7:] != '.un.org':
        return None, None

    # A component holding a query string or an embedded URL swallows everything
    # after it, so the remainder becomes part of the file name instead of
    # producing further directory levels.
    for i in range(1, len(paths) - 1):
        if paths[i].find('?') > 0 or paths[i].find('https:') >= 0:
            paths[i] = '/'.join(paths[i:])
            paths = paths[:i + 1]
            break

    if len(paths) > 1:
        # Every component except the last (the file name) is a directory level.
        rel_path = base_dir + ''.join(part + '/' for part in paths[:-1])
    else:
        rel_path = base_dir + paths[0] + '/'

    try:
        # makedirs creates missing parents (including base_dir itself) and,
        # with exist_ok=True, does not fail when another worker thread created
        # the same directory between a check and the creation.
        os.makedirs(rel_path, exist_ok=True)
    except Exception as e:
        print("%s: %s" % (time.strftime('%Y-%m-%d %H:%M:%S'), str(e)))
        return None, None

    return rel_path, paths


In [30]:
def get_short_name(path):
    """Return the first unused ``NNNN.html`` file name inside ``path``.

    Names are tried in ascending order starting at ``0001.html``; the first one
    for which ``path + '/' + name`` does not exist is returned.
    """
    number = 1
    candidate = '%04d.html' % number
    while os.path.exists('%s/%s' % (path, candidate)):
        number += 1
        candidate = '%04d.html' % number
    return candidate

In [31]:
def save_local(url, content):
    """Write ``content`` to the local file that corresponds to ``url``.

    The target directory comes from ``get_paths``. The file name is
    ``root.html`` for a bare host, otherwise the last URL component with line
    breaks removed; it is passed through ``escape`` and gets ``.html`` appended
    when it contains no dot. If writing fails with a "name too long" error, a
    numbered name from ``get_short_name`` is used instead.

    Returns:
        ``(1, directory)`` on success, where ``directory`` is the target
        directory with ``base_dir`` removed; ``(-1, None)`` on any failure
        (write errors are printed).
    """
    global base_dir

    rel_path, paths = get_paths(url)
    if not rel_path:
        return -1, None

    f_name = 'root.html' if len(paths) == 1 else re.sub(r'\r?\n', '', paths[-1])
    f_name = escape(f_name)
    if '.' not in f_name:
        f_name += '.html'

    def _write(name):
        # Store the page text under the given file name in the target directory.
        with open(rel_path + name, 'w', encoding='utf8') as f:
            f.write(content)

    try:
        _write(f_name)
        return 1, rel_path.replace(base_dir, '')
    except Exception as e:
        if 'name too long' not in str(e):
            print("%s: %s" % (time.strftime('%Y-%m-%d %H:%M:%S'), str(e)))
            return -1, None

    # The original name was rejected as too long: retry with a short numbered name.
    f_name = get_short_name(rel_path)
    try:
        _write(f_name)
        return 1, rel_path.replace(base_dir, '')
    except Exception as e2:
        print("%s: %s" % (time.strftime('%Y-%m-%d %H:%M:%S'), str(e2)))
    return -1, None


In [9]:
def escape(c):
    """Replace characters that Windows does not allow in file names with readable tokens."""
    # Windows file names cannot contain these characters, so they are escaped.
    substitutions = (
        ('?', '_QMARK_'),
        (':', '_COLON_'),
        ('|', '_PIPE_'),
        ('/', '_SLASH_'),
        ('*', '_STAR_'),
        ('"', '_QT_'),
        ('\\', '_BS_'),
        ('<', '_LT_'),
        ('>', '_GT_'),
    )
    escaped = c
    for char, token in substitutions:
        escaped = escaped.replace(char, token)
    return escaped



In [32]:
def get_html(url):
    """Download ``url`` and return the response body decoded as UTF-8.

    Returns:
        The decoded text on success. When an exception is raised, it is printed
        and the function returns ``'ERR_STATUS -2'`` if the message contains
        "codec can't decode", otherwise ``None``.
    """
    request_headers = {
        'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/96.0.4664.110 Safari/537.36 "
    }
    try:
        return requests.get(url, headers=request_headers, timeout=10).content.decode('utf8')
    except Exception as err:
        message = str(err)
        print("%s: %s got exception - %s" % (time.strftime('%Y-%m-%d %H:%M:%S'), url, message))
        # A decoding failure is reported through a sentinel string; callers
        # recognise it by its 'ERR_STATUS' prefix.
        return 'ERR_STATUS -2' if 'codec can\'t decode' in message else None


In [11]:
def is_media(url):
    """Return True when the text after the last dot of ``url`` is a known media extension.

    The comparison is case-insensitive and uses ``media_format_list``.
    """
    if url.find('.') <= 0:
        return False
    extension = re.sub(r'^.*\.', '', url).lower()
    return extension in media_format_list

def has_excluded_url_pattern(url):
    """Return True when ``url`` contains any substring from ``excluded_url_pattern_list``."""
    for excluded_pattern in excluded_url_pattern_list:
        if excluded_pattern in url:
            return True
    return False

def clean_url(url):
    """Apply every (regex, replacement) rule of ``url_clean_pattern_list`` to ``url``, in order."""
    cleaned = url
    for rule in url_clean_pattern_list:
        regex, replacement = rule
        cleaned = re.sub(regex, replacement, cleaned)
    return cleaned


In [12]:
def parse_urls(curr_url, html):
    """Extract the links worth crawling from a page.

    Args:
        curr_url: URL of the page ``html`` was downloaded from; its scheme and
            host are prepended to site-relative links.
        html: The page source.

    Returns:
        The cleaned link URLs matched by ``link_pattern``, in order of first
        appearance and without duplicates. Links that contain an excluded
        substring or that point at a media file are left out.
    """
    # Everything after the ".un.org" host is dropped to obtain the site root.
    base_url = re.sub(r'(?<=\.un\.org)/.*', '', curr_url)

    urls = []
    seen = set()
    for href in re.findall(link_pattern, html):
        href = href.strip()
        # Turn a relative path into an absolute URL.
        if href[0] == '/':
            href = base_url + href
        # Skip URLs that contain an excluded string or that are media files.
        if has_excluded_url_pattern(href) or is_media(href):
            continue
        # Clean the URL, then keep only its first occurrence.
        href = clean_url(href)
        if href not in seen:
            seen.add(href)
            urls.append(href)

    return urls



In [13]:
def get_md5(content):
    """Return the hexadecimal MD5 digest of ``content`` encoded as UTF-8."""
    return hashlib.md5(content.encode('utf8')).hexdigest()

def make_content_item(url, status, path, content):
    """Build the result record for one processed URL.

    Args:
        url: The processed URL.
        status: Status code to store for the URL.
        path: Value to store under ``'path'`` (may be None).
        content: Page text; when truthy its MD5 digest is stored under
            ``'hash'``, otherwise the hash is None.

    Returns:
        A dict with the keys ``url``, ``status``, ``path``, ``hash`` and
        ``is_duplicate`` (initially 0), in that order.
    """
    digest = get_md5(content) if content else None
    return {'url': url, 'status': status, 'path': path, 'hash': digest, 'is_duplicate': 0}

In [14]:
def process_url(url, hash_set):
    """Download one URL, store it locally, record the result and collect its links.

    Args:
        url: The URL to process.
        hash_set: Set of content hashes seen so far; it is updated in place
            and used to flag pages whose content was already stored.

    Returns:
        True when at least one link that was not yet in ``url_seed`` was
        discovered on the page, False otherwise.

    Side effects:
        Appends a record to ``url_result``, may add URLs to ``url_seed``,
        increments ``counter`` and writes a checkpoint every 500 URLs.
    """
    global counter, url_seed, url_result
    has_new = False

    if re.search(r'\.(?:pdf|docx?|pptx?|xlsx?)(?:[^A-Za-z]|$)', url):
        # Office/PDF documents are recorded but never downloaded.
        new_result = make_content_item(url, -1, None, None)
    else:
        html = get_html(url)
        if not html:
            new_result = make_content_item(url, -1, None, None)
        elif html.startswith('ERR_STATUS'):
            # get_html reports some failures as 'ERR_STATUS <code>'. The prefix
            # has to be replaced with an empty string: str.replace needs both
            # the old and the new substring.
            status = int(html.replace('ERR_STATUS ', ''))
            new_result = make_content_item(url, status, None, None)
        else:
            status, rel_path = save_local(url, html)
            new_result = make_content_item(url, status, rel_path, html)
            if new_result['hash'] in hash_set:
                new_result['is_duplicate'] = 1
            else:
                hash_set.add(new_result['hash'])

            for new_url in parse_urls(url, html):
                # Membership is tested without a trailing slash; the URL itself
                # is stored as returned by parse_urls.
                if re.sub('/$', '', new_url) in url_seed:
                    continue
                url_seed.add(new_url)
                has_new = True

    url_result.append(new_result)
    counter += 1
    if counter % 500 == 0:
        print("%s: seed %i, content %i" % (time.strftime('%Y-%m-%d %H:%M:%S'), len(url_seed), len(url_result)))
        save_url_result()

    return has_new

In [15]:
import concurrent.futures

def run_threads(num_threads):
    """Process every seed URL that has no result record yet, using a thread pool.

    Args:
        num_threads: Maximum number of worker threads.

    Returns:
        True when at least one processed page contributed a URL that was not
        yet in ``url_seed``, False otherwise.

    Raises:
        Whatever exception a ``process_url`` call raised; the crawl state is
        written to disk before the exception propagates.
    """
    global url_seed, url_result

    hash_set = set(item['hash'] for item in url_result)
    # A set makes the "already processed?" test O(1) per URL instead of a scan
    # over every stored record.
    processed_urls = set(item['url'] for item in url_result)
    to_process_url = [url for url in url_seed if url not in processed_urls]

    # Leaving the with-block waits for all submitted tasks to finish.
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
        futures = [executor.submit(process_url, url, hash_set) for url in to_process_url]

    try:
        # result() re-raises an exception raised inside the worker.
        results = [future.result() for future in futures]
    finally:
        # Persist what has been collected even if a worker failed.
        save_url_result()

    return any(results)


In [44]:
# Load url_seed / url_result from the checkpoint files, or start from the per-language root URLs.
initialize_url_seed()

In [46]:
# First crawl pass over the current seed set with 6 worker threads; prints the elapsed seconds.
start = time.perf_counter()
has_new_url = run_threads(num_threads=6)
print(time.perf_counter() - start)

In [None]:
# Repeat the crawl while the previous pass discovered new URLs, for at most 4 further passes.
# Relies on has_new_url having been set by an earlier run_threads call.
num_iter = 1
while has_new_url and num_iter < 5:
    start = time.perf_counter()
    has_new_url = run_threads(num_threads=6)
    print('iter %d costs %f' % (num_iter, time.perf_counter() - start))
    num_iter += 1