### Parallel apply is roughly 4–5x faster than serial apply for these functions

In [1]:
import numpy as np
import pandas as pd
import string
import random
from collections import Counter
from pprint import pprint

from parallel_functions import (
    stats_return_df,
    stats_return_series,
    stats_return_tuple_floats,
    stats_return_tuple_mixed1,
    stats_return_tuple_mixed2
)

from parallel_apply import apply_grouped_parallel_with_args

In [2]:
# Build a reproducible synthetic dataset: N rows, each with a random
# uppercase-letter group label and a random float in [0, 1).
N = 10000
random.seed(42)  # fixed seed so the data, and every output below, is repeatable

# All labels are drawn first, then all values; both consume the same seeded
# generator, so this order determines the resulting data.
group = [random.choice(string.ascii_uppercase) for _row in range(N)]
value = [random.random() for _row in range(N)]

In [3]:
# Assemble the two generated lists into a two-column frame and preview it.
data = pd.DataFrame(dict(group=group, value=value))
data.head()

Unnamed: 0,group,value
0,U,0.241823
1,D,0.570235
2,A,0.099106
3,X,0.578134
4,I,0.013984


In [4]:
# Group once by the letter label; every serial and parallel run below reuses this object.
grouped_data = data.groupby(by='group')

## Expected Values

In [5]:
# Reference ("golden") per-group statistics computed with pandas' built-in
# aggregations; the apply-based results are checked against this frame.
# Selecting the column first and passing an ordered list of aggregation names
# fixes the column order (mean, sum, std) directly, instead of aggregating with
# an unordered set of numpy callables and re-ordering the columns afterwards.
golden = grouped_data['value'].agg(['mean', 'sum', 'std'])
golden

Unnamed: 0_level_0,mean,sum,std
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,0.479596,159.225857,0.285855
B,0.48412,184.933803,0.284004
C,0.508132,203.252864,0.292171
D,0.496822,188.792211,0.287715
E,0.515183,211.740245,0.293946
F,0.492011,182.536244,0.293433
G,0.49916,170.712728,0.295912
H,0.505211,192.485573,0.281962
I,0.477857,172.984186,0.289502
J,0.516721,201.521083,0.283455


## Verify Speed

### Function returning DataFrame

In [6]:
# Serial baseline: plain groupby-apply, timed for comparison with the parallel run.
%time val1 = grouped_data.apply(stats_return_df)

Wall time: 26.1 s


In [7]:
# Same function and same groups, run through the parallel apply helper.
%time val2 = apply_grouped_parallel_with_args(grouped_data, stats_return_df)

Wall time: 6.01 s


In [8]:
# The serial result carries an extra index level (level 1); drop it so the frame
# can be compared with the parallel result. The flattened frame is bound to a new
# name instead of modifying val1 in place, so this cell can be re-run without
# failing on an already-dropped level.
val1_flat = val1.reset_index(level=1, drop=True)

# The parallel result must match both the serial result and the golden frame
# (index/column names are not compared).
pd.testing.assert_frame_equal(val2, val1_flat, check_names=False)
pd.testing.assert_frame_equal(val2, golden, check_names=False)

### Function returning Series

In [9]:
# Serial baseline for the Series-returning variant.
%time val1 = grouped_data.apply(stats_return_series)

Wall time: 26.1 s


In [10]:
# Parallel run of the Series-returning variant.
%time val2 = apply_grouped_parallel_with_args(grouped_data, stats_return_series)

Wall time: 5.65 s


In [11]:
# The parallel result must agree with the serial result and with the golden
# frame; index/column names are not compared.
for expected in (val1, golden):
    pd.testing.assert_frame_equal(val2, expected, check_names=False)

### Function returning tuple (all floats)

In [12]:
# Serial baseline for the variant that returns a tuple of floats.
%time val1 = grouped_data.apply(stats_return_tuple_floats)

Wall time: 26 s


In [13]:
# Parallel run of the variant that returns a tuple of floats.
%time val2 = apply_grouped_parallel_with_args(grouped_data, stats_return_tuple_floats)

Wall time: 6.47 s


In [14]:
# Both results are compared as Series here; names are not compared.
pd.testing.assert_series_equal(val2, val1, check_names=False)

In [15]:
# Inspect the format of the returned value: a Series of tuples, where each
# tuple holds the values returned for that group.
val2

A    (0.47959595616390266, 159.22585744641583, 0.28...
B    (0.48411990308159797, 184.9338029771704, 0.284...
C    (0.508132160065169, 203.2528640260677, 0.29217...
D    (0.4968216089461155, 188.79221139952386, 0.287...
E    (0.5151830775386749, 211.74024486839556, 0.293...
F    (0.4920114383827426, 182.53624363999742, 0.293...
G    (0.4991600229925111, 170.7127278634388, 0.2959...
H    (0.5052114775334176, 192.48557294023203, 0.281...
I    (0.4778568684617443, 172.98418638315152, 0.289...
J    (0.5167207252617775, 201.5210828520932, 0.2834...
K    (0.5040948174719622, 194.58059954417743, 0.298...
L    (0.5141089980394153, 192.79087426478083, 0.293...
M    (0.49337629937618793, 188.4697463617038, 0.282...
N    (0.5165398750192324, 186.47089488194297, 0.286...
O    (0.49760398585672916, 199.53919832854854, 0.28...
P    (0.4818950050112198, 187.93905195437577, 0.301...
Q    (0.5236513367408374, 188.51448122670132, 0.283...
R    (0.49250673156922564, 204.39029360122856, 0.29...
S    (0.49

### Function returning tuple (mixed values)

#### Without unpacking

In [16]:
# Serial baseline for the variant that returns a tuple of mixed value types.
%time val1 = grouped_data.apply(stats_return_tuple_mixed1)

Wall time: 26.1 s


In [17]:
# Parallel run of the mixed-tuple variant; no unpacking is requested here.
%time val2 = apply_grouped_parallel_with_args(grouped_data, stats_return_tuple_mixed1)

Wall time: 5.7 s


In [18]:
# Parallel result: one packed tuple per group.
val2

A    ([mean], [159.22585744641583], 0.2858545295891...
B    ([mean], [184.9338029771704], 0.2840041552895567)
C    ([mean], [203.2528640260677], 0.2921714892805938)
D    ([mean], [188.79221139952386], 0.2877150581733...
E    ([mean], [211.74024486839556], 0.2939455900663...
F    ([mean], [182.53624363999742], 0.2934328384317...
G    ([mean], [170.7127278634388], 0.2959122715482919)
H    ([mean], [192.48557294023203], 0.2819615640612...
I    ([mean], [172.98418638315152], 0.289501519793003)
J    ([mean], [201.5210828520932], 0.2834551090710788)
K    ([mean], [194.58059954417743], 0.2988105829626...
L    ([mean], [192.79087426478083], 0.2935148808146...
M    ([mean], [188.4697463617038], 0.2821656602782814)
N    ([mean], [186.47089488194297], 0.2862064034936...
O    ([mean], [199.53919832854854], 0.282687362934946)
P    ([mean], [187.93905195437577], 0.3011649150842...
Q    ([mean], [188.51448122670132], 0.2835641712497...
R    ([mean], [204.39029360122856], 0.2908397417594...
S    ([mea

In [19]:
# Serial result, shown for side-by-side comparison with the parallel output.
val1

group
A    ([mean], [159.22585744641583], 0.2858545295891...
B    ([mean], [184.9338029771704], 0.2840041552895567)
C    ([mean], [203.2528640260677], 0.2921714892805938)
D    ([mean], [188.79221139952386], 0.2877150581733...
E    ([mean], [211.74024486839556], 0.2939455900663...
F    ([mean], [182.53624363999742], 0.2934328384317...
G    ([mean], [170.7127278634388], 0.2959122715482919)
H    ([mean], [192.48557294023203], 0.2819615640612...
I    ([mean], [172.98418638315152], 0.289501519793003)
J    ([mean], [201.5210828520932], 0.2834551090710788)
K    ([mean], [194.58059954417743], 0.2988105829626...
L    ([mean], [192.79087426478083], 0.2935148808146...
M    ([mean], [188.4697463617038], 0.2821656602782814)
N    ([mean], [186.47089488194297], 0.2862064034936...
O    ([mean], [199.53919832854854], 0.282687362934946)
P    ([mean], [187.93905195437577], 0.3011649150842...
Q    ([mean], [188.51448122670132], 0.2835641712497...
R    ([mean], [204.39029360122856], 0.2908397417594...
S   

#### With unpacking (single row in dataframe for each group)

In [20]:
# Same function, now with unpacked=True; the cells that follow inspect the
# returned container and its first element.
%time val2 = apply_grouped_parallel_with_args(grouped_data, stats_return_tuple_mixed1, unpacked=True)

Wall time: 5.64 s


In [21]:
# Container type and number of items in the unpacked result.
type(val2), len(val2)

(list, 3)

In [22]:
# First item of the unpacked result.
val2[0] 

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,median
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,0,0.479596,0.449843
B,0,0.48412,0.492872
C,0,0.508132,0.49838
D,0,0.496822,0.491868
E,0,0.515183,0.51088
F,0,0.492011,0.497677
G,0,0.49916,0.524564
H,0,0.505211,0.52755
I,0,0.477857,0.443739
J,0,0.516721,0.530277


In [23]:
# val2[1]

In [24]:
# val2[2]

#### With unpacking (multiple rows in dataframe for each group)

In [25]:
# Unpacked run with the second mixed-tuple variant.
%time val2 = apply_grouped_parallel_with_args(grouped_data, stats_return_tuple_mixed2, unpacked=True)

Wall time: 5.71 s


In [26]:
# Container type and number of items in the unpacked result.
type(val2), len(val2)

(list, 3)

In [27]:
# First item of the unpacked result.
val2[0]

Unnamed: 0_level_0,Unnamed: 1_level_0,stats
group,Unnamed: 1_level_1,Unnamed: 2_level_1
A,mean,0.479596
A,median,0.449843
B,mean,0.48412
B,median,0.492872
C,mean,0.508132
C,median,0.49838
D,mean,0.496822
D,median,0.491868
E,mean,0.515183
E,median,0.51088


### With Additional Arguments passed to the apply function

In [28]:
# An extra keyword argument (mean_offset=2) is supplied alongside unpacked=True.
# NOTE(review): presumably forwarded unchanged to stats_return_tuple_mixed1 — confirm in parallel_apply.
%time val2 = apply_grouped_parallel_with_args(grouped_data, stats_return_tuple_mixed1, unpacked=True, mean_offset=2)

Wall time: 5.94 s


In [29]:
# First item of the unpacked result from the mean_offset=2 run.
val2[0]

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,median
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,0,2.479596,0.449843
B,0,2.48412,0.492872
C,0,2.508132,0.49838
D,0,2.496822,0.491868
E,0,2.515183,0.51088
F,0,2.492011,0.497677
G,0,2.49916,0.524564
H,0,2.505211,0.52755
I,0,2.477857,0.443739
J,0,2.516721,0.530277
