Skip to content

Commit 0cb980e

Browse files
authored
Update II Data engineering toolbox.py
1 parent 799727d commit 0cb980e

File tree

1 file changed

+12
-1
lines changed

1 file changed

+12
-1
lines changed

Introduction to Data Engineering/II Data engineering toolbox.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@
7373

7474
#---
7575
#From task to subtasks
76-
"""You will be using the multiprocessor.Pool API which allows you to distribute your workload over several processes. """
76+
"""The multiprocessing.Pool API allows you to distribute your workload over several processes."""
7777
# to apply a function over multiple cores
7878
@print_timing
7979
def parallel_apply(apply_func, groups, nb_cores):
@@ -90,3 +90,14 @@ def parallel_apply(apply_func, groups, nb_cores):
9090
# Parallel apply using 4 cores
9191
parallel_apply(take_mean_age, athlete_events.groupby('Year'), 4)
9292

93+
#---
94+
# Using a dask DataFrame created from a pandas DataFrame
95+
"""Another way to parallelize an apply over several groups is to use the dask framework and its abstraction of the pandas DataFrame."""
96+
# import dask.dataframe
97+
import dask.dataframe as dd
98+
99+
# Set the number of partitions
100+
athlete_events_dask = dd.from_pandas(athlete_events, npartitions=4)
101+
102+
# Calculate the mean Age per Year; dask is lazy, so .compute() is what actually runs the calculation
103+
print(athlete_events_dask.groupby('Year').Age.mean().compute())

0 commit comments

Comments
 (0)