Update II Data engineering toolbox.py

ortizfram · web-flow · commit d360b38fff11 · 2022-10-11T11:26:47.000-03:00
diff --git a/Introduction to Data Engineering/II Data engineering toolbox.py b/Introduction to Data Engineering/II Data engineering toolbox.py
@@ -116,3 +116,24 @@ def parallel_apply(apply_func, groups, nb_cores):
       Built from the need to use structured queries for parallel processing
       Initially used Hadoop MapReduce                                                        """                                                                                                                                                                                                                                                         
       
+#---
+#A PySpark groupby
+"""The methods you're going to use in this exercise are:
+            .printSchema(): helps print the schema of a Spark DataFrame.
+            .groupBy(): grouping statement for an aggregation.
+            .mean(): take the mean over each group.
+            .show(): show the results."""
+# Print the type of athlete_events_spark
+print(type(athlete_events_spark))
+
+# Print the schema of athlete_events_spark
+print(athlete_events_spark.printSchema())
+
+# Group by the Year, and find the mean Age
+print(athlete_events_spark.groupBy('Year').mean('Age'))
+
+# Group by the Year, and find the mean Age
+print(athlete_events_spark.groupBy('Year').mean('Age').show())
+
+#---
+#