# one- vs multi-threading
In some cases, instead of writing a single-threaded Python program, you can run tasks in parallel and speed it up.  
Below is a very simple program that demonstrates a basic multithreading app. Let's assume we have some simple task (def doTask()) and we have to launch it N times in a row (amountOftasks). Each task sleeps for a random time, which represents some work being done or waiting for a response from somewhere. Let's first launch the tasks in a single thread, one by one, and then launch them via the threading library and compare the time difference!

In [1]:
import threading
import multiprocessing
import time
import random

# Number of simulated jobs launched by each experiment in this notebook.
amountOftasks = 5

def doTask(n):
    """Simulate one job by sleeping for a random 1.0-10.0 seconds.

    Args:
        n: Label of the job; it is only used in the progress message.

    The module-level ``startedAt`` timestamp is read when the job finishes
    to report the time elapsed since the current experiment began.
    """
    # randint() requires real integers: float bounds such as 1e4 raise
    # TypeError on Python 3.12+.
    sleeptime = round(random.randint(10_000, 100_000) / 10_000, 1)
    time.sleep(sleeptime)
    # strftime() yields the clock time directly. Splitting asctime() on
    # spaces returns the day number instead of the time whenever the day of
    # the month is a single (space-padded) digit.
    print("Job {0} done after {1} seconds at {2}, spent so far {3} seconds.".format(
        n, sleeptime, time.strftime("%H:%M:%S"), round(time.time() - startedAt, 2)))

# Baseline: run the jobs one after another in the main thread, so the total
# time is the sum of all individual sleeps.
startedAt = time.time()
# strftime() is used for the clock time: asctime().split(" ")[3] returns the
# day number instead of the time on days 1-9 of a month.
print("Start {0} tasks one-by-one in a row at {1}.".format(amountOftasks, time.strftime("%H:%M:%S")))
for i in range(amountOftasks):
    doTask(i)

# Same jobs again, one thread per job. The threads are not joined here, so
# this loop returns immediately while the jobs keep running in the background.
startedAt = time.time()
print("Start {0} tasks in multithreading at {1}.".format(amountOftasks, time.strftime("%H:%M:%S")))
for i in range(amountOftasks):
    threading.Thread(target=doTask, args=(i,)).start()


Start 5 tasks one-by-one in a row at 01:44:48.
Job 0 done after 7.5 seconds at 01:44:55, spent so far 7.51 seconds.
Job 1 done after 1.6 seconds at 01:44:57, spent so far 9.11 seconds.
Job 2 done after 9.4 seconds at 01:45:06, spent so far 18.51 seconds.
Job 3 done after 6.0 seconds at 01:45:12, spent so far 24.52 seconds.
Job 4 done after 1.4 seconds at 01:45:14, spent so far 25.92 seconds.
Start 5 tasks in multithreading at 01:45:14.


# multithreading is much faster
Yes, in some cases you can significantly speed up your program by running tasks in parallel. Let's rewrite it with more control over the threads:

In [2]:
# Take the reference timestamp (and announce the start) before launching, so
# every elapsed time reported below is measured from the real start.
startedAt = time.time()
print("Start {0} tasks in multithreading at {1}.".format(amountOftasks, time.strftime("%H:%M:%S")))

# Keep a handle on every thread so that the main thread can wait for it later.
threads = []
for i in range(amountOftasks):
    t = threading.Thread(target=doTask, args=(i,))
    threads.append(t)
    t.start()

# .join() blocks the calling (main) thread until the joined thread has finished.
for one_thread in threads:
    # The main thread cannot print while it waits inside .join(); the worker
    # threads keep printing their own messages in the meantime.
    print(".join() thread {0} after {1} seconds.".format(str(one_thread), round(time.time() - startedAt, 1)))
    one_thread.join()

print("All tasks are done at {0} seconds.".format(round(time.time() - startedAt, 1)))

Start 5 tasks in multithreading at 01:45:14.
.join() thread <Thread(Thread-9, started 139640744941120)> after 0.0 seconds.
Job 3 done after 1.2 seconds at 01:45:15, spent so far 1.15 seconds.
Job 4 done after 1.5 seconds at 01:45:15, spent so far 1.45 seconds.
Job 2 done after 3.2 seconds at 01:45:17, spent so far 3.15 seconds.
Job 4 done after 4.4 seconds at 01:45:18, spent so far 4.4 seconds.
Job 1 done after 6.2 seconds at 01:45:20, spent so far 6.15 seconds.
Job 0 done after 7.8 seconds at 01:45:21, spent so far 7.8 seconds.
.join() thread <Thread(Thread-10, started 139640258426432)> after 7.8 seconds.
Job 1 done after 8.0 seconds at 01:45:22, spent so far 8.0 seconds.
.join() thread <Thread(Thread-11, started 139640250033728)> after 8.0 seconds.
Job 3 done after 8.2 seconds at 01:45:22, spent so far 8.21 seconds.
Job 0 done after 8.9 seconds at 01:45:22, spent so far 8.85 seconds.
Job 2 done after 9.0 seconds at 01:45:23, spent so far 9.01 seconds.
.join() thread <Thread(Thread-12

# multiprocessing or multithreading!
Let's do the same with multiprocessing instead of multithreading. By the way, both libraries have a very similar syntax.

In [3]:

# A single reference timestamp, taken before the children are created, so the
# parent and the child processes measure elapsed time from the same moment.
# NOTE(review): the children must be able to see doTask and startedAt; this
# holds with the "fork" start method - confirm the behaviour under "spawn".
startedAt = time.time()
print("Start {0} tasks in multiprocessing at {1}.".format(amountOftasks, time.strftime("%H:%M:%S")))

processes = []
for i in range(amountOftasks):
    t = multiprocessing.Process(target=doTask, args=(i,))
    processes.append(t)
    t.start()

# Wait for every child in launch order; .join() blocks until that child exits.
for one_process in processes:
    print(".join() Process {0} after {1} seconds.".format(str(one_process), round(time.time() - startedAt, 1)))
    one_process.join()

print("All tasks are done in {0} seconds.".format(round(time.time() - startedAt, 1)))

Start 5 tasks in multiprocessing at 01:45:23.
.join() Process <Process name='Process-1' pid=50742 parent=50547 started> after 0.0 seconds.
Job 1 done after 4.0 seconds at 01:45:27, spent so far 4.01 seconds.
Job 0 done after 8.7 seconds at 01:45:31, spent so far 8.71 seconds.
Job 4 done after 8.8 seconds at 01:45:31, spent so far 8.82 seconds.
.join() Process <Process name='Process-2' pid=50743 parent=50547 stopped exitcode=0> after 8.7 seconds.
.join() Process <Process name='Process-3' pid=50744 parent=50547 started> after 8.7 seconds.
Job 3 done after 9.5 seconds at 01:45:32, spent so far 9.52 seconds.
Job 2 done after 9.7 seconds at 01:45:32, spent so far 9.72 seconds.
.join() Process <Process name='Process-4' pid=50745 parent=50547 stopped exitcode=0> after 9.7 seconds.
.join() Process <Process name='Process-5' pid=50746 parent=50547 stopped exitcode=0> after 9.7 seconds.
All tasks are done in 9.7 seconds.


# interprocess communications 1
Any form of data exchange between processes is called interprocess communication. It can be done through memory shared by multiple processes or by passing data between them. Let's first look at memory sharing between threads:

In [4]:
# A list shared by all worker threads; every job appends one entry to it.
mylist = []
empty_report = "Shared list is now empty! len: {0}; content: {1}"
print(empty_report.format(len(mylist), str(mylist)))
def doTask(n):
    """Sleep for a random 1.0-10.0 seconds, then record this thread in ``mylist``.

    Args:
        n: Label of the job; it is only used in the progress message.

    Appends the identifier of the executing thread to the module-level
    ``mylist`` and reads the module-level ``startedAt`` timestamp to report
    the elapsed time.
    """
    # randint() requires real integers: float bounds such as 1e4 raise
    # TypeError on Python 3.12+.
    sleeptime = round(random.randint(10_000, 100_000) / 10_000, 1)
    time.sleep(sleeptime)
    # Several threads mutate the same list here. A lone append() is atomic in
    # CPython, but any multi-step update of shared state would need a
    # threading.Lock to avoid a race condition.
    mylist.append(threading.get_ident())
    # strftime() is used because asctime().split(" ")[3] returns the day
    # number instead of the time on days 1-9 of a month.
    print("Job (add thread inent to shared list) {0} done after {1} seconds at {2}, spent so far {3} seconds.".format(
        n, sleeptime, time.strftime("%H:%M:%S"), round(time.time() - startedAt, 2)))

# Take the reference timestamp before launching, so every elapsed time
# reported below is measured from the real start of the experiment.
startedAt = time.time()
print("Start {0} tasks in multithreading at {1}.".format(amountOftasks, time.strftime("%H:%M:%S")))

threads = []
for i in range(amountOftasks):
    t = threading.Thread(target=doTask, args=(i,))
    threads.append(t)
    t.start()

# Wait for all workers; only after the last join is mylist guaranteed complete.
for one_thread in threads:
    print(".join() thread {0} after {1} seconds.".format(str(one_thread), round(time.time() - startedAt, 1)))
    one_thread.join()

print("All tasks are done in {0} seconds.".format(round(time.time() - startedAt, 1)))
# Threads share the interpreter's memory, so every append made by a worker is
# visible here in the main thread.
print("Shared list is not empty enymore! len: {0}; content: {1}".format(len(mylist), str(mylist)))

Shared list is now empty! len: 0; content: []
Start 5 tasks in multithreading at 01:45:32.
.join() thread <Thread(Thread-14, started 139640250033728)> after 0.0 seconds.
Job (add thread inent to shared list) 1 done after 4.1 seconds at 01:45:36, spent so far 4.1 seconds.
Job (add thread inent to shared list) 2 done after 4.3 seconds at 01:45:37, spent so far 4.3 seconds.
Job (add thread inent to shared list) 0 done after 6.5 seconds at 01:45:39, spent so far 6.5 seconds.
.join() thread <Thread(Thread-15, stopped 139640786904640)> after 6.5 seconds.
.join() thread <Thread(Thread-16, stopped 139640241641024)> after 6.5 seconds.
.join() thread <Thread(Thread-17, started 139640258426432)> after 6.5 seconds.
Job (add thread inent to shared list) 3 done after 9.0 seconds at 01:45:41, spent so far 9.01 seconds.
.join() thread <Thread(Thread-18, started 139640778511936)> after 9.0 seconds.
Job (add thread inent to shared list) 4 done after 9.7 seconds at 01:45:42, spent so far 9.71 seconds.
Al

# interprocess communications 2
If we do the same with multiprocessing, the result won't be the same, because processes don't share memory: the global variable is copied into the memory of each process before it is changed, and the change is not written back to the parent after the process finishes its job.

In [5]:

# Reset the list in the parent process. Each child works on its own copy.
mylist = []
print("Shared list is now empty! len: {0}; content: {1}".format(len(mylist), str(mylist)))

# A single reference timestamp, taken before the children are created, so the
# parent and the child processes measure elapsed time from the same moment.
startedAt = time.time()
print("Start {0} tasks in multiprocessing at {1}.".format(amountOftasks, time.strftime("%H:%M:%S")))

processes = []
for i in range(amountOftasks):
    t = multiprocessing.Process(target=doTask, args=(i,))
    processes.append(t)
    t.start()

for one_process in processes:
    print(".join() Process {0} after {1} seconds.".format(str(one_process), round(time.time() - startedAt, 1)))
    one_process.join()

print("All tasks are done in {0} seconds.".format(round(time.time() - startedAt, 1)))
# The appends happened in the children's memory, not in the parent's, so the
# parent's list is unchanged.
print("Shared list is still empty, because we use multiprocessing! len: {0}; content: {1}".format(len(mylist), str(mylist)))


Shared list is now empty! len: 0; content: []
Start 5 tasks in multiprocessing at 01:45:42.
.join() Process <Process name='Process-6' pid=50778 parent=50547 started> after 0.0 seconds.
Job (add thread inent to shared list) 2 done after 3.9 seconds at 01:45:46, spent so far 3.91 seconds.
Job (add thread inent to shared list) 3 done after 5.0 seconds at 01:45:47, spent so far 5.02 seconds.
Job (add thread inent to shared list) 0 done after 5.4 seconds at 01:45:48, spent so far 5.41 seconds.
.join() Process <Process name='Process-7' pid=50779 parent=50547 started> after 5.4 seconds.
Job (add thread inent to shared list) 4 done after 6.0 seconds at 01:45:48, spent so far 6.02 seconds.
Job (add thread inent to shared list) 1 done after 6.7 seconds at 01:45:49, spent so far 6.71 seconds.
.join() Process <Process name='Process-8' pid=50780 parent=50547 stopped exitcode=0> after 6.7 seconds.
.join() Process <Process name='Process-9' pid=50781 parent=50547 stopped exitcode=0> after 6.7 seconds.

# interprocess communications 3
How can processes communicate if we don't have any shared memory? We can use a FIFO queue (multiprocessing.Queue, which is built on top of a pipe) between the processes for this purpose:

In [9]:
# Queue through which the worker processes send their results to the parent.
q = multiprocessing.Queue()

def doTask(n):
    """Sleep for a random 1.0-10.0 seconds, then send this process's pid via ``q``.

    Args:
        n: Label of the job; it is only used in the progress message.

    Puts ``os.getpid()`` on the module-level queue ``q`` and reads the
    module-level ``startedAt`` timestamp to report the elapsed time.
    """
    import os  # no visible cell imports os, so bring it into scope here

    # randint() requires real integers: float bounds such as 1e4 raise
    # TypeError on Python 3.12+.
    sleeptime = round(random.randint(10_000, 100_000) / 10_000, 1)
    time.sleep(sleeptime)
    # The queue carries the value to the parent; appending to a global list
    # would only change this process's private copy.
    q.put(os.getpid())
    # strftime() is used because asctime().split(" ")[3] returns the day
    # number instead of the time on days 1-9 of a month.
    print("Job (add thread inent to shared list) {0} done after {1} seconds at {2}, spent so far {3} seconds.".format(
        n, sleeptime, time.strftime("%H:%M:%S"), round(time.time() - startedAt, 2)))

mylist = []
print("Shared list is now empty! len: {0}; content: {1}".format(len(mylist), str(mylist)))

# A single reference timestamp, taken before the children are created, so the
# parent and the child processes measure elapsed time from the same moment.
startedAt = time.time()
print("Start {0} tasks in multiprocessing at {1}.".format(amountOftasks, time.strftime("%H:%M:%S")))

processes = []
for i in range(amountOftasks):
    t = multiprocessing.Process(target=doTask, args=(i,))
    processes.append(t)
    t.start()

# Joining before draining is fine for these tiny payloads. With large items,
# drain the queue first: a child cannot exit until its queued data has been
# flushed, so join() could otherwise block forever.
for one_process in processes:
    print(".join() Process {0} after {1} seconds.".format(str(one_process), round(time.time() - startedAt, 1)))
    one_process.join()

# Queue.empty() is documented as unreliable, so read a known number of items
# instead: every child that exited cleanly has put exactly one value.
delivered = sum(1 for one_process in processes if one_process.exitcode == 0)
for _ in range(delivered):
    mylist.append(q.get())

print("All tasks are done in {0} seconds.".format(round(time.time() - startedAt, 1)))
print("Shared list is not empty now! len: {0}; content: {1}".format(len(mylist), str(mylist)))


Shared list is now empty! len: 0; content: []
Start 5 tasks in multiprocessing at 01:46:23.
.join() Process <Process name='Process-16' pid=50844 parent=50547 started> after 0.0 seconds.
Job (add thread inent to shared list) 3 done after 1.2 seconds at 01:46:24, spent so far 1.22 seconds.
Job (add thread inent to shared list) 0 done after 3.0 seconds at 01:46:26, spent so far 3.01 seconds.
.join() Process <Process name='Process-17' pid=50845 parent=50547 started> after 3.0 seconds.
Job (add thread inent to shared list) 2 done after 4.2 seconds at 01:46:27, spent so far 4.22 seconds.
Job (add thread inent to shared list) 4 done after 7.1 seconds at 01:46:30, spent so far 7.13 seconds.
Job (add thread inent to shared list) 1 done after 8.6 seconds at 01:46:31, spent so far 8.62 seconds.
.join() Process <Process name='Process-18' pid=50846 parent=50547 stopped exitcode=0> after 8.6 seconds.
.join() Process <Process name='Process-19' pid=50847 parent=50547 stopped exitcode=0> after 8.6 seco

# daemon and non-daemon threads 1:
Difference between daemon and non-daemon threads:
* a daemon thread can be killed when the main thread ends, before the daemon thread has finished its work
* a non-daemon thread makes the program wait until it has finished before the main thread can exit

In [10]:
def doTask(n):
    """Announce the job, sleep for a random 1.0-10.0 seconds, then report.

    Args:
        n: Label of the job; it is only used in the progress messages.

    The module-level ``startedAt`` timestamp is read when the job finishes
    to report the elapsed time.
    """
    # strftime() is used because asctime().split(" ")[3] returns the day
    # number instead of the time on days 1-9 of a month.
    print("Job {0} started at {1}.".format(n, time.strftime("%H:%M:%S")))
    # randint() requires real integers: float bounds such as 1e4 raise
    # TypeError on Python 3.12+.
    sleeptime = round(random.randint(10_000, 100_000) / 10_000, 1)
    time.sleep(sleeptime)
    print("Job {0} done after {1} seconds at {2}, spent so far {3} seconds.".format(
        n, sleeptime, time.strftime("%H:%M:%S"), round(time.time() - startedAt, 2)))

# One regular thread and one daemon thread running the same job.
t = threading.Thread(name='non-daemon', target=doTask, args=('non-daemon',))
# The daemon flag is passed to the constructor: Thread.setDaemon() is
# deprecated since Python 3.10.
d = threading.Thread(name='daemon', target=doTask, args=('daemon',), daemon=True)

startedAt = time.time()
# Neither thread is joined, so execution continues past this point while
# both jobs are still sleeping.
d.start()
t.start()



Job daemon started at 01:46:31.
Job non-daemon started at 01:46:31.


# daemon and non-daemon threads 2:
Use .join() to wait until the daemon thread has finished.

In [11]:
# One regular thread and one daemon thread running the same job.
t = threading.Thread(name='non-daemon', target=doTask, args=('non-daemon',))
# The daemon flag is passed to the constructor: Thread.setDaemon() is
# deprecated since Python 3.10.
d = threading.Thread(name='daemon', target=doTask, args=('daemon',), daemon=True)

d.start()
t.start()

# Joining explicitly makes the main thread wait for both threads, including
# the daemon one, which would otherwise not be waited for at exit.
d.join()
t.join()


Job daemon started at 01:46:31.
Job non-daemon started at 01:46:31.
Job non-daemon done after 3.8 seconds at 01:46:35, spent so far 3.86 seconds.
Job daemon done after 4.8 seconds at 01:46:36, spent so far 4.8 seconds.
Job daemon done after 6.5 seconds at 01:46:38, spent so far 6.56 seconds.
Job non-daemon done after 7.3 seconds at 01:46:39, spent so far 7.31 seconds.


# example
See the Python multiprocessing example provided below; you can use it as a starting point for building an app.

In [None]:
# -*- coding: utf-8 -*-
import io 
import tldextract
from urllib.parse import urlparse
import sys
from multiprocessing.pool import ThreadPool
import time
import random

# argv[2]: path of the reference file that check_if_string_in_file() searches.
top = sys.argv[2]
# argv[1]: input file. Read all lines up front and close the handle right
# away instead of leaving it open for the lifetime of the script.
with io.open(sys.argv[1], 'r', encoding='utf-8') as ourfile:
    ourfile2 = ourfile.readlines()
# Output files. Opening in 'w' mode truncates any existing file.
# NOTE(review): neither handle is closed in the visible code; Y is only
# referenced from commented-out code and N is not referenced at all.
Y = open("Yes.txt", 'w', encoding='utf-8')
N = open("No.txt", 'w', encoding='utf-8')

def check_if_string_in_file(file_name, word):
    """Return True if *word* equals any normalised line of *file_name*.

    Each line is normalised before the comparison: a leading ``http://`` or
    ``https://`` scheme is removed, a host whose subdomain starts with
    ``www`` is reduced to ``domain.suffix``, and trailing newlines are
    stripped. The scan stops at the first match.

    Args:
        file_name: Path of the UTF-8 text file to scan.
        word: String to look for.

    Returns:
        True when a normalised line equals *word*, otherwise False.
    """
    def _normalise(entry):
        # Drop the scheme prefix so that only host (and path) remain.
        if entry.startswith(("http://", "https://")):
            parts = urlparse(entry)
            prefix = "%s://" % parts.scheme
            entry = parts.geturl().replace(prefix, '', 1)
        extracted = tldextract.extract(entry)
        if extracted.subdomain.startswith("www"):
            entry = "{}.{}".format(extracted.domain, extracted.suffix)
        return entry.rstrip("\n")

    with io.open(file_name, "r", encoding='utf-8') as handle:
        # any() short-circuits, so reading stops at the first matching line.
        return any(word == _normalise(raw) for raw in handle)

def ourfilef(line):
    """Return the host part of *line* if it is listed in the ``top`` file.

    The host is everything before the first ``/`` with trailing newlines
    removed. It is looked up first as-is (or as ``domain.suffix`` when its
    subdomain starts with ``www``) and then, if that fails, as
    ``domain.suffix``.

    Args:
        line: One raw input line; it is echoed to stdout before processing.

    Returns:
        The host string when one of the lookups succeeds, otherwise None.
    """
    print(line)
    host = line.split('/')[0].rstrip("\n")
    parts = tldextract.extract(host)
    registered = "{}.{}".format(parts.domain, parts.suffix)

    first_candidate = registered if parts.subdomain.startswith("www") else host
    if check_if_string_in_file(top, first_candidate):
        return host
    # Every lookup re-reads the whole reference file, so skip the second one
    # when it would test exactly the same string as the first.
    if first_candidate != registered and check_if_string_in_file(top, registered):
        return host
    return None

# One million identical dummy items used to exercise the thread pool.
# Built by list repetition: the original range(1e6) raises TypeError because
# range() only accepts integers.
testlist = ["hui"] * 1_000_000

def test(line):
    if line=="hui":
        return "hui"
    else: 
        return "psda"

# Use the pool as a context manager so that its worker threads are shut down
# when the work is finished instead of being left running.
with ThreadPool(4) as pool:
    # imap_unordered() is lazy and yields results in completion order, so the
    # results must be consumed while the pool is still alive.
    results = pool.imap_unordered(test, testlist)
    # To process the real input instead of the dummy list, use:
    #     results = pool.imap_unordered(ourfilef, ourfile2)
    #     for line in results:
    #         if line is None:
    #             print(line)
    #         else:
    #             Y.write(line+'\n')
    for i in results:
        print(i)


