## Python multithreading basics

For I/O-bound work such as reading files or networking (API calls, ...)

In [None]:
from concurrent.futures import ThreadPoolExecutor
import threading
def worker(task):
    """Print the executing thread's name and ident, then announce the task.

    Args:
        task: Label of the task; interpolated into the "running" message.
    """
    report = (
        f"{threading.current_thread().name = }",
        threading.get_ident(),
        f"Task {task} running",
    )
    # One print call per entry, so the output stays three separate lines.
    for entry in report:
        print(entry)

# Create a thread pool with 2 workers
with ThreadPoolExecutor(max_workers=2) as executor:
    # Queue one `worker` call per task id; the pool runs them on its threads
    # and leaving the `with` block waits for the submitted work to finish.
    for task_id in (1, 2):
        executor.submit(worker, task_id)
    

In [None]:
import threading
import time
import os

# Thread count and thread list as seen before any thread is started below.
print(
    f"{threading.active_count() = }",
    f"{threading.enumerate() = }",
    sep="\n",
)

def square(num):
    """Print diagnostics about the calling thread, then the square of *num*.

    The diagnostics are the thread name, ident, native id, the process id,
    the number of active threads and the ``__dict__`` of a freshly created
    ``threading.local()``. After printing, the function sleeps for one
    second.

    Args:
        num: Value multiplied by itself for the final "Square:" line.
    """

    def _lines():
        # A generator keeps each value evaluated right before it is printed.
        yield f"{threading.current_thread().name = }"
        yield f"{threading.get_ident() = }"
        yield f"{threading.get_native_id() = }"
        yield f"{os.getpid() = }"
        yield f"{threading.active_count() = }"
        yield f"{threading.local().__dict__ = }"
        yield f"Square: {num*num}"

    for line in _lines():
        print(line)
    time.sleep(1)

def cube(num):
    """Print diagnostics about the calling thread, then the cube of *num*.

    The diagnostics are the thread name, ident, native id, the number of
    active threads and the process id. After printing, the function sleeps
    for one second.

    Args:
        num: Value multiplied by itself twice for the final "Cube:" line.
    """

    def _lines():
        # A generator keeps each value evaluated right before it is printed.
        yield f"{threading.current_thread().name = }"
        yield f"{threading.get_ident() = }"
        yield f"{threading.get_native_id() = }"
        yield f"{threading.active_count() = }"
        yield f"{os.getpid() = }"
        yield f"Cube: {num*num*num}"

    for line in _lines():
        print(line)
    time.sleep(1)

# One thread per function, both called with the same argument.
t1 = threading.Thread(target=square, args=(4,))
t2 = threading.Thread(target=cube, args=(4,))

# Start both before joining either, so the two calls overlap.
for thread in (t1, t2):
    thread.start()
for thread in (t1, t2):
    thread.join()

print(f"{threading.active_count() = }")
print("Done!")

In [None]:
# for _ in locals().items():
#     print(f"{_ = }")

## Sample: can be used as a template for crawling data

In [None]:
# Sample: can be used as a template for crawling data
import threading
import time

def crawl(link, delay=3):
    """Simulate fetching *link* by blocking for *delay* seconds.

    Prints a "started" message, sleeps, then prints an "ended" message.
    No request is actually made; the sleep stands in for blocking I/O.

    Args:
        link: Identifier of the page; only used in the two messages.
        delay: Seconds to sleep between the messages.
    """
    start_msg = f"crawl started for {link}"
    end_msg = f"crawl ended for {link}"
    print(start_msg)
    time.sleep(delay)  # Blocking I/O (simulating a network request)
    print(end_msg)

links = [
    "https://python.org",
    "https://docs.python.org",
    "https://peps.python.org",
]

# Build (but do not yet start) one thread per link.
# `args` passes positional arguments and `kwargs` keyword arguments.
threads = [
    threading.Thread(target=crawl, args=(link,), kwargs={"delay": 2})
    for link in links
]

# Start each thread
for t in threads:
    t.start()

# Wait for all threads to finish
for t in threads:
    t.join()

## Can be used to check whether a page is active

In [None]:

# Can be used to check whether a page is active
import concurrent.futures
import urllib.request

# Pages to probe. NOTE(review): the last host looks intentionally
# unresolvable, to exercise the error path — confirm.
URLS = [
    "http://www.foxnews.com/",
    "http://www.cnn.com/",
    "http://europe.wsj.com/",
    "http://www.bbc.co.uk/",
    "http://nonexistent-subdomain.python.org/",
]

# Retrieve a single page and return its contents
def load_url(url, timeout):
    """Open *url* and return everything read from the response.

    Args:
        url: Address handed to ``urllib.request.urlopen``.
        timeout: Forwarded as ``urlopen``'s ``timeout`` argument.

    Returns:
        The result of ``read()`` on the opened response.

    Exceptions raised by ``urlopen`` or ``read`` are not caught here.
    """
    response = urllib.request.urlopen(url, timeout=timeout)
    # The response is closed on leaving the block, even if read() fails.
    with response:
        body = response.read()
    return body

# We can use a with statement to ensure threads are cleaned up promptly
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    # Start the load operations and remember which URL each future belongs to
    future_to_url = {}
    for url in URLS:
        future_to_url[executor.submit(load_url, url, 60)] = url
    print(f"{future_to_url = }")
    separator = f"{'':=^50}"  # a line of 50 '=' characters
    print(separator)
    # Futures are handled as they finish, not in submission order.
    for future in concurrent.futures.as_completed(future_to_url):
        print(f"{future = }")
        url = future_to_url[future]
        print(f"{url = }")
        try:
            body = future.result()
        except Exception as exc:
            # result() re-raises whatever load_url raised in its thread.
            print(f"{url = } generated an exception: {exc = }")
        else:
            print(f"{url = } page is {len(body)} bytes")
        print(separator)

### Multiple threads can be used to read multiple files

In [None]:
# Multiple threads can be used to read multiple files

import threading
import concurrent.futures
import traceback
import os

# Pool size: CPU count + 4, capped at 32.
# os.cpu_count() returns None when the count cannot be determined, so fall
# back to 1 rather than raising TypeError on `None + 4`.
default_workers: int = min(32, (os.cpu_count() or 1) + 4)

# Log files to read.
FILE_PATHS: list = [
    "/home/user/prj1mrdp/logs/2025-11-15.log",
    "/home/user/prj1mrdp/logs/2025-11-16.log",
    "/home/user/prj1mrdp/logs/tracking_error.log",
]

def open_file(file_path: str = "") -> list:
    """Read a text file and return its lines.

    Args:
        file_path: Path of the file to open in text mode.

    Returns:
        The list produced by ``readlines()`` (line endings kept), or an
        empty list if reading fails after the file was opened. In that case
        the formatted traceback is printed.

    Raises:
        OSError: If the file cannot be opened. ``open`` runs outside the
            ``try`` block, so e.g. ``FileNotFoundError`` reaches the caller.
    """
    with open(file_path, "r") as file:
        try:
            return file.readlines()
        except Exception as e:
            # TracebackException.format() is a generator of text chunks;
            # join them so the message shows the traceback itself rather
            # than "<generator object ...>".
            tb_str: str = "".join(
                traceback.TracebackException.from_exception(e).format()
            )
            print(f"[{open_file.__name__}] Error: {tb_str = }")
            return []

with concurrent.futures.ThreadPoolExecutor(max_workers=default_workers) as executor:
    # Map each submitted read back to the path it was given.
    read_file = {}
    for file_path in FILE_PATHS:
        read_file[executor.submit(open_file, file_path)] = file_path
    print(f"{read_file = }")
    # Handle reads as they complete, not in submission order.
    for file_readed in concurrent.futures.as_completed(read_file):
        print(f"{file_readed = }")
        file_path = read_file[file_readed]
        print(f"{file_path = }")
        try:
            data = file_readed.result()
        except Exception as e:
            # result() re-raises whatever open_file raised in its thread.
            print(f"{e = }")
            continue
        print(f"{file_path = }, {len(data) = }")

## How does Python decide the default number of workers for a thread pool?

In [None]:
# How does Python decide the default number of workers for a thread pool?
import os
print(f"{os.cpu_count() = }")
# Documented ThreadPoolExecutor default: min(32, cpu_count + 4).
# os.cpu_count() may return None when the count is undetermined, so fall
# back to 1 instead of failing on `None + 4`. (Since Python 3.13 the
# default is based on os.process_cpu_count() instead.)
default_workers = min(32, (os.cpu_count() or 1) + 4)
default_workers

In [None]:
import os
# Id of the current process. As a bare final expression its value is
# presumably what the notebook shows as the cell output.
os.getpid()

In [None]:
# # Source - https://stackoverflow.com/a
# # Posted by JimJty, modified by community. See post 'Timeline' for change history
# # Retrieved 2025-11-19, License - CC BY-SA 4.0

# try:
#     # For Python 3
#     import queue
#     from urllib.request import urlopen
# except:
#     # For Python 2 
#     import Queue as queue
#     from urllib2 import urlopen

# import threading

# worker_data = ['http://google.com', 'http://yahoo.com', 'http://bing.com']

# # Load up a queue with your data. This will handle locking
# q = queue.Queue()
# for url in worker_data:
#     q.put(url)

# # Define a worker function
# def worker(url_queue):
#     queue_full = True
#     while queue_full:
#         try:
#             # Get your data off the queue, and do some work
#             url = url_queue.get(False)
#             data = urlopen(url).read()
#             print(len(data))

#         except queue.Empty:
#             queue_full = False

# # Create as many threads as you want
# thread_count = 5
# for i in range(thread_count):
#     t = threading.Thread(target=worker, args = (q,))
#     t.start()


In [None]:
# Check whether each URL is active, using a thread pool and requests
from concurrent.futures import ThreadPoolExecutor, as_completed
import requests

# Targets for the availability check.
urls = [
    "http://google.com",
    "http://yahoo.com",
    "http://bing.com",
]

def fetch(url, timeout=5):
    """GET *url* and report either the body size or the failure.

    Never raises for ``Exception`` subclasses: any such error from the
    request, the status check or reading the content is returned instead.

    Args:
        url: Address to request.
        timeout: Forwarded to ``requests.get``.

    Returns:
        ``(url, size)`` with ``size = len(response.content)`` on success,
        otherwise ``(url, exc)`` carrying the caught exception.
    """
    try:
        # Either create a session per thread or use requests.get (stateless)
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        outcome = len(response.content)
    except Exception as error:
        outcome = error
    return url, outcome

with ThreadPoolExecutor(max_workers=5) as ex:
    # Key each future by the URL it was submitted for.
    futures = {}
    for target in urls:
        futures[ex.submit(fetch, target)] = target
    # fetch returns its errors instead of raising, so result() is called
    # here without a try/except.
    for fut in as_completed(futures):
        url, result = fut.result()
        print(url, result)
