In [6]:
import pandas as pd
import numpy as np
import multiprocess
import time

In [7]:
# create a dataframe
num_rows = 1000000
num_columns = 5
data = {
    f'column_{i}': np.random.randint(0, 100, size=num_rows)
    for i in range(num_columns)
}
df = pd.DataFrame(data)
df

Unnamed: 0,column_0,column_1,column_2,column_3,column_4
0,65,35,98,74,18
1,30,15,90,54,49
2,53,26,99,68,79
3,82,31,9,22,31
4,88,16,56,47,61
...,...,...,...,...,...
999995,14,83,71,43,1
999996,58,77,96,36,47
999997,4,89,38,19,71
999998,85,75,79,82,45


In [8]:
# number of chunks is same as number of cores to distribute uniformly
num_chunks = multiprocess.cpu_count()
num_chunks

11

In [9]:
# based on #cores estimating chunk size
chunk_size = len(df)//num_chunks
chunk_size

90909

In [124]:
for i in range(0, df.shape[0], chunk_size):
    print(i)

0
90909
181818
272727
363636
454545
545454
636363
727272
818181
909090
999999


In [14]:
# making list of 11 (#cores) chunks
chunks = []
rows = df.shape[0]

for i in range(0, rows, chunk_size):
    chunks.append(df[i:i+chunk_size])
    
chunks

[       column_0  column_1  column_2  column_3  column_4
 0            65        35        98        74        18
 1            30        15        90        54        49
 2            53        26        99        68        79
 3            82        31         9        22        31
 4            88        16        56        47        61
 ...         ...       ...       ...       ...       ...
 90904        97        65        44        42        52
 90905        67        82        90        65        48
 90906        86        54        91        84        17
 90907         2        54        17        82         9
 90908        23        19        79        13        31
 
 [90909 rows x 5 columns],
         column_0  column_1  column_2  column_3  column_4
 90909          9        28        41        91        83
 90910         85        29        74        17        53
 90911         88        21        37        64        12
 90912         48        94        73        19        

In [17]:
len(chunks)

12

12th chunk is because of over limit of chunk_size

In [18]:
chunks[11]

Unnamed: 0,column_0,column_1,column_2,column_3,column_4
999999,74,76,99,58,24


In [19]:
chunks[10]

Unnamed: 0,column_0,column_1,column_2,column_3,column_4
909090,27,20,90,13,96
909091,60,31,58,66,77
909092,29,59,4,36,83
909093,35,97,38,26,49
909094,68,6,21,76,31
...,...,...,...,...,...
999994,38,35,18,35,19
999995,14,83,71,43,1
999996,58,77,96,36,47
999997,4,89,38,19,71


In [113]:
# function to process the data
def process_data(chunk):
    return chunk.apply(lambda row: row.sum(), axis=1)

In [134]:
start_time = time.time()

# Create a Pool with 10 cores
with multiprocess.Pool(processes=num_chunks) as pool:
        # each input set is assigned to a single core for processing by the process_data function
        results = pool.map(process_data, chunks)

end_time = time.time()
print(end_time - start_time)
    

0.7402498722076416


In [20]:
# without parallel processing
result_for = []
start_time = time.time()

for i, row in df.iterrows():
    result_for.append(row.sum())

end_time = time.time()
print(end_time - start_time)

10.108469009399414
