In [None]:
"""Exercise: thread files
• Get a list of files (from the current directory or from all the files in the “slides” repository).
• Process each file:
• 1. get size of file
• 2. count how many times each character appears in the file.
• The script should accept the number of threads to use
"""


In [None]:
import os
from collections import Counter
import concurrent.futures
import threading
import time


# Paths of the regular files found directly inside "for_test".
# Sub-directories are skipped because count_str() can only open() files.
files = [
    f'for_test/{name}'
    for name in os.listdir("for_test")
    if os.path.isfile(f'for_test/{name}')
]

def count_str(file_path: str):
    """Print the name, per-character counts and size of one text file.

    Args:
        file_path: '/'-separated path of the file to analyse. The part after
            the last '/' is used as the display name.
    """
    with open(file_path, 'r') as handle:
        char_counts = Counter(handle.read())
        byte_size = os.stat(file_path).st_size
        short_name = file_path.rsplit('/', 1)[-1]
        print('name file is ' + short_name)
        print(f"{char_counts} times in " + short_name)
        print(f'size of file is {byte_size}')


def op1(max_workers=None):
    """Run count_str() on every path in ``files`` using a thread pool.

    Args:
        max_workers: Number of worker threads to use. ``None`` keeps the
            ThreadPoolExecutor default.
    """
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as ex:
        pending = {ex.submit(count_str, path): path for path in files}
        # An exception raised in a worker is only visible through its future,
        # so inspect every future and report failures instead of losing them.
        for future in concurrent.futures.as_completed(pending):
            error = future.exception()
            if error is not None:
                print(f'failed to process {pending[future]}: {error}')

def op2():
    """Run count_str() for each path in ``files`` on its own thread.

    Returns only after every thread has finished.
    """
    workers = [
        threading.Thread(target=count_str, args=(path,))
        for path in files
    ]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()

def without_threads():
    """Baseline run: call count_str() on each path in ``files``, one after another."""
    for path in files:
        count_str(path)
    
def main():
    """Time the three processing strategies and print how long each took.

    Runs op1(), op2() and without_threads() in that order, then prints the
    elapsed wall-clock seconds of each run rounded to three decimals.
    """
    runs = (
        ('with op 1', op1),
        ('with op 2', op2),
        ('without threads', without_threads),
    )
    timings = []
    for label, runner in runs:
        started = time.perf_counter()
        runner()
        # Elapsed time is end minus start, measured per run so that one
        # strategy's timestamps can never leak into another's result.
        timings.append((label, time.perf_counter() - started))
    # Report only after all runs so the summary follows the per-file output.
    for label, elapsed in timings:
        print(f'{label} its take time {round(elapsed, 3)}\n')


# Run the timing comparison only when this code is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    main()



In [None]:
"""Exercise: thread URL requests.
In the following script we fetch the URLs listed in a file (urls.txt):
 Given a file with a list of URLs, collect the title of each site.
 
1 https://google.com/
2 https://youtube.com/
3 https://facebook.com/
4 https://baidu.com/
5 https://twitter.com/
6 https://instagram.com/
7 https://wikipedia.com/
8 https://www.amazon.com/
9 https://yahoo.com/
10 https://yandex.ru/
11 https://vk.com/
12 https://live.com/
13 https://naver.com/
14 https://yahoo.co.jp/
15 https://google.com.br/
16 https://netflix.com/
17 https://reddit.com/
18 https://ok.ru/
19 https://mail.ru/
20 https://ebay.com/
21 https://linkedin.com/
22 https://qq.com/
23 https://pinterest.com/
24 https://bing.com/
25 https://whatsapp.com/
26 https://office.com/
27 https://amazon.de/
28 https://aliexpress.com/
29 https://amazon.co.jp/
30 https://msn.com/
31 https://google.de/
32 https://paypal.com/
33 https://rakuten.co.jp/
34 https://amazon.co.uk/
35 https://daum.net/
36 https://google.co.jp/
37 https://taobao.com/
38 https://bilbili.com/
39 https://imdb.com/
40 https://booking.com/
41 https://roblox.com/
42 https://9apps.com/
43 https://globo.com/
44 https://duckduckgo.com/
45 https://www.nttdocomo.co.jp/

It takes about 1.5-2 sec / URL from home. 
(It depends on a lot of factors including your network connection.)

Create a version of the above script that can use K threads
"""
import time
import requests
from bs4 import BeautifulSoup


def get_urls(limit):
    """Read 'urls.txt' and return at most ``limit`` of its lines.

    Every line of the file is one entry (trailing newline removed). The
    complete list is printed before it is cut down to ``limit`` entries.

    Args:
        limit: Maximum number of entries to return.

    Returns:
        List of the first ``limit`` lines of 'urls.txt'.
    """
    with open('urls.txt') as fh:
        all_urls = [line.rstrip("\n") for line in fh]
    print(all_urls)
    return all_urls[:limit] if len(all_urls) > limit else all_urls


def get_title(url, timeout=10):
    """Download ``url`` and extract the text of its HTML <title> tag.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        ``(title, None)`` when the page was fetched and has a <title> tag,
        otherwise ``(None, message)`` where ``message`` describes the failure.
    """
    try:
        # Without a timeout a stalled server would block this call forever.
        resp = requests.get(url, timeout=timeout)
    except Exception as err:
        return None, f"Error: {err} for {url}"
    if resp.status_code != 200:
        return None, f"Incorrect status_code {resp.status_code} for {url}"
    soup = BeautifulSoup(resp.content, 'html.parser')
    if soup.title is None:
        # A page without a <title> tag would otherwise raise AttributeError.
        return None, f"No title found for {url}"
    return soup.title.string, None


def get_titles(urls):
    """Fetch the title of each URL one after another.

    For every URL the error message is printed when get_title() reports one,
    otherwise the title is printed.

    Args:
        urls: Iterable of addresses to fetch.

    Returns:
        List of ``{"url", "title", "err"}`` dicts, one per URL, in input order.
    """
    collected = []
    for address in urls:
        page_title, failure = get_title(address)
        print(failure if failure else page_title)
        collected.append(dict(url=address, title=page_title, err=failure))
    return collected


def main():
    """Fetch the titles of up to 40 URLs sequentially and print the elapsed time."""
    page_limit = 40
    urls = get_urls(page_limit)
    began = time.time()
    get_titles(urls)
    elapsed = time.time() - began
    print(f"Elapsed time: {elapsed} for {len(urls)} pages.")


# Start the sequential title fetch only when this code is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    main()


With multiprocessing


In [None]:
import multiprocessing as mp
import time

import requests


def get_urls(limit):
    """Return at most ``limit`` lines of 'urls.txt', trailing newlines removed.

    Args:
        limit: Maximum number of entries to return.

    Returns:
        List of the first ``limit`` lines of the file.
    """
    with open('urls.txt') as fh:
        all_urls = [line.rstrip("\n") for line in fh]
    return all_urls[:limit] if len(all_urls) > limit else all_urls


def get_title(url, timeout=10):
    """Download ``url`` and return the first bytes of its body.

    The HTML title is not parsed here; the first 10 bytes of the response
    body are returned in the title position instead.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        ``(url, first_bytes, None)`` on success, otherwise
        ``(url, None, message)`` where ``message`` describes the failure.
    """
    try:
        # Without a timeout a stalled server would block this worker forever.
        resp = requests.get(url, timeout=timeout)
    except Exception as err:
        return url, None, f"Error: {err} for {url}"
    if resp.status_code != 200:
        return url, None, f"Incorrect status_code {resp.status_code} for {url}"
    return url, resp.content[:10], None


def get_titles(urls):
    """Fetch every URL in a pool of worker processes.

    For each result the error message is printed when there is one,
    otherwise the fetched value is printed.

    Args:
        urls: List of addresses handed to get_title() in the workers.

    Returns:
        List of ``{'url', 'title', 'arr'}`` dicts, one per URL in input order.
        ``arr`` holds the error message, or None on success.
    """
    # Leaving the pool open would leak its worker processes; the context
    # manager tears the pool down once map() has returned every result.
    with mp.Pool() as pool:
        results = pool.map(get_title, urls)
    titles = []
    for url, title, arr in results:
        if arr:
            print(arr)
        else:
            print(title)
        # Failed URLs are recorded too, so the 'arr' field is meaningful and
        # the result has exactly one entry per requested URL.
        titles.append({'url': url, 'title': title, 'arr': arr})
    return titles


def main():
    """Fetch up to 40 URLs with the process pool, then print the timing and the results."""
    page_limit = 40
    urls = get_urls(page_limit)
    began = time.time()
    titles = get_titles(urls)
    elapsed = time.time() - began
    print(f"Elapsed time: {elapsed} for {len(urls)} pages.")
    print(titles)


# The guard keeps main() from running when this code is imported rather than
# executed directly.
if __name__ == '__main__':
    main()


With multiple threads


In [None]:
import time
import requests
from bs4 import BeautifulSoup
import threading


def get_urls(limit):
    """Load 'urls.txt' and keep no more than ``limit`` entries.

    Args:
        limit: Maximum number of entries to return.

    Returns:
        List of the file's lines (trailing newline removed), cut to the
        first ``limit`` entries when the file holds more.
    """
    with open('urls.txt') as fh:
        all_urls = [line.rstrip("\n") for line in fh]
    return all_urls[:limit] if len(all_urls) > limit else all_urls


def get_title(url, timeout=10):
    """Download ``url`` and extract the text of its HTML <title> tag.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        ``(url, title, None)`` when the page was fetched and has a <title>
        tag, otherwise ``(url, None, message)`` where ``message`` describes
        the failure.
    """
    try:
        # Without a timeout a stalled server would block this thread forever.
        resp = requests.get(url, timeout=timeout)
    except Exception as err:
        return url, None, f"Error: {err} for {url}"
    if resp.status_code != 200:
        return url, None, f"Incorrect status_code {resp.status_code} for {url}"
    soup = BeautifulSoup(resp.content, 'html.parser')
    if soup.title is None:
        # A page without a <title> tag would otherwise raise AttributeError.
        return url, None, f"No title found for {url}"
    return url, soup.title.string, None


def get_titles(urls):
    """Fetch the title of every URL concurrently, one thread per URL.

    Each outcome is printed once all threads have finished.

    Args:
        urls: Iterable of addresses to fetch with get_title().

    Returns:
        List of ``{'url', 'title', 'arr'}`` dicts, one per URL, in the same
        order as ``urls``. ``arr`` holds the error message, or None.
    """
    urls = list(urls)
    # One pre-allocated slot per URL keeps the output in input order no
    # matter which thread finishes first.
    results = [None] * len(urls)

    def fetch(index, url):
        # The URL is passed in as an argument: a closure over the loop
        # variable could already see a later URL when the thread runs.
        try:
            results[index] = get_title(url)
        except Exception as err:
            # Fill the slot even on an unexpected failure so every URL
            # yields a record.
            results[index] = (url, None, f"Error: {err} for {url}")

    threads = []
    for index, url in enumerate(urls):
        thread = threading.Thread(target=fetch, args=(index, url))
        threads.append(thread)
        thread.start()
    for thread in threads:
        thread.join()

    titles = []
    for url, title, arr in results:
        if arr:
            print(f"url is {url}, arr is {arr}")
        else:
            print(f"url is {url}, title is {title}")
        titles.append({'url': url, 'title': title, 'arr': arr})
    return titles


def main():
    """Fetch up to 40 URLs with one thread each, then print the URLs, the timing and the results."""
    page_limit = 40
    urls = get_urls(page_limit)
    print(urls)
    began = time.time()
    titles = get_titles(urls)
    elapsed = time.time() - began
    print(f"Elapsed time: {elapsed} for {len(urls)} pages.")
    print(titles)


# Start the threaded title fetch only when this code is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    main()


In [None]:
"""Exercise: thread queue
Write an application that handles a queue of jobs in N=5 threads.
Each job contains a number between 0-5.
Each thread takes the next element from the queue and sleeps for the given number
of seconds (as an imitation of actual work it should be doing). When finished it checks
for another job. If there are no more jobs in the queue, the thread can close itself.

If that’s done, change the code so that each thread will generate a random
number between 0-5 (for sleep-time) and in 33% of the cases it will add it to the central queue
as a new job.

Another extension to this exercise is to change the code to limit the number of jobs each thread
can execute in its lifetime. When the thread has finished that many jobs it will quit and the
main thread will create a new worker thread.
"""


In [17]:
import threading
import random
import sys
import time

# Number of worker threads that ex3() starts.
thread_count = 5
# Not referenced by the code in this cell.
counter = 0
# Twenty jobs, each a ('main', seconds) pair with seconds drawn from 0-4.
queue = [('main', random.randrange(5)) for _ in range(20)]

def performing_task():
    """Worker loop: take jobs off the shared ``queue`` until it is empty.

    Before each job the current queue length is printed. A job is a tuple
    whose second item is the number of seconds to sleep, standing in for
    real work.
    """
    while queue:
        print(len(queue))
        try:
            task = queue.pop()
        except IndexError:
            # Another worker took the last job between the emptiness check
            # and the pop; nothing is left to do.
            break
        time.sleep(task[1])

def ex3():
    """Start ``thread_count`` named threads running performing_task() and wait for all of them."""
    workers = [
        threading.Thread(target=performing_task, name=f'number thread {slot}')
        for slot in range(thread_count)
    ]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()

    
# Let the worker threads drain the queue, then print what is left in it.
ex3()
print(queue)



20
19
18
17
16
15
14
13
12
11
10
9
8
7
6
5
4
3
2
1
[]
