In [1]:
%%time 
# Jupyter notebook cell magic: reports the running time of the whole cell

import concurrent.futures, time
from threading import Semaphore
# Semaphore with an initial value of 1; worker() uses it as a mutex around
# the shared queue and the in-flight counter below.
my_Semaphore = Semaphore(1)

# Seed work items: 'url_0' through 'url_10'.
url_queue = ['url_%d' % index for index in range(11)]
# Number of workers that have taken a URL and not yet finished processing it.
working_parser_num = 0

def worker(num):
    """Simulate one pool thread repeatedly pulling URLs from the shared queue.

    The loop ends only when the queue is empty AND no other worker is still
    processing a URL (a busy worker may still push new URLs onto the queue).

    Args:
        num: Index of this worker within the thread pool (informational only;
            it is not used by the loop).

    Returns:
        Always 1, once the worker has decided that all work is finished.

    Raises:
        Whatever ``parser`` raises. The in-flight counter is still decremented
        in that case, so the remaining workers can terminate.
    """
    global working_parser_num
    while True:
        url = None
        # The emptiness check, the pop and the counter update must happen as
        # one atomic step, otherwise another thread could pop in between.
        with my_Semaphore:
            if url_queue:
                # Count ourselves as busy before popping, so the queue is
                # never observed as empty while this URL is unaccounted for.
                working_parser_num += 1
                url = url_queue.pop()
            # Snapshot taken under the lock so the exit decision is based on
            # a consistent view of (queue, counter).
            all_idle = working_parser_num == 0

        if url is not None:  # an empty-string URL is still a claimed item
            try:
                parser(url)
                print(url + ' processed.\n', end='')
            finally:
                # "-=" is a read-modify-write, so it needs the lock too; the
                # finally guarantees the counter drops even if parser() fails.
                with my_Semaphore:
                    working_parser_num -= 1
        elif all_idle:
            break
        else:
            # Other workers are still busy and may enqueue more URLs; sleep
            # instead of spinning on the check.
            time.sleep(3)
    return 1

def parser(url, delay=2):
    """Simulate parsing a URL (and enqueueing newly found URLs) with a delay.

    This stand-in only sleeps; it neither reads ``url`` nor touches the queue.

    Args:
        url: The URL that would be parsed.
        delay: Seconds to sleep to imitate the parsing work. Defaults to 2,
            the value that was previously hard-coded.

    Returns:
        None.
    """
    time.sleep(delay)

THREAD_NUM = 5
# Adapted from the official ThreadPoolExecutor example (the original note
# also points to "P38").
with concurrent.futures.ThreadPoolExecutor(max_workers=THREAD_NUM) as executor:
    # Start one long-running worker per pool thread and remember which
    # worker id each future belongs to.
    future_to_tid = {}
    for tid in range(THREAD_NUM):
        future_to_tid[executor.submit(worker, tid)] = tid

    # Report each worker as soon as it finishes, in completion order.
    for future in concurrent.futures.as_completed(future_to_tid):
        tid = future_to_tid[future]
        try:
            data = future.result()
        except Exception as exc:
            print('%d generated an exception: %s' % (tid, exc))
            continue
        print('thread %d end. last task return %d.' % (tid, data))

url_6 processed.
url_10 processed.
url_8 processed.
url_7 processed.
url_9 processed.
url_5 processed.
url_4 processed.
url_3 processed.
url_2 processed.
url_1 processed.
url_0 processed.
thread 0 end. last task return 1.
thread 2 end. last task return 1.
thread 4 end. last task return 1.
thread 3 end. last task return 1.
thread 1 end. last task return 1.
Wall time: 7.06 s
