|
17 | 17 | |
18 | 18 | . create connection : <SparkContext> class <sc> #creating an instance of the |
19 | 19 | . attributes : <SparkConf() constructor> |
| 20 | + |
| 21 | + | create SparkSession | |
| 22 | + |
| 23 | + # Import SparkSession |
| 24 | + > from pyspark.sql import SparkSession |
| 25 | +
|
| 26 | + # create the SparkSession via its builder |
| 27 | + > my_spark = SparkSession.builder.getOrCreate() |
| 28 | +
|
| 29 | + # print the tables in the catalog (use the session created above) |
| 30 | + > print(my_spark.catalog.listTables()) |
| 31 | +
|
| 32 | +| SparkSession attributes | |
| 33 | +
|
| 34 | + - catalog: extract and view table data |
| 35 | + . listTables() -> returns the tables in the catalog as a list |
| 36 | + > my_spark.catalog.listTables() |
| 37 | + |
| 38 | +| SparkSession methods | |
| 39 | +
|
| 40 | + # always <SparkSessionName>. |
| 41 | + - .show() -> print |
| 42 | + - .sql() -> run a query ( <takes> queried 'string' <returns> DataFrame results ) |
| 43 | + - .toPandas() -> returns corresponding 'pandas' DataFrame |
| 44 | +
|
20 | 45 | """ |
21 | 46 | #| |
22 | 47 | #| |
|
52 | 77 | #| |
53 | 78 | #| |
54 | 79 | ### Creating a SparkSession |
| 80 | +# Import SparkSession from pyspark.sql |
| 81 | +from pyspark.sql import SparkSession |
| 82 | + |
| 83 | +# Create my_spark |
| 84 | +my_spark = SparkSession.builder.getOrCreate() |
55 | 85 |
|
| 86 | +# Print my_spark |
| 87 | +print(my_spark) |
| 88 | +#| |
| 89 | +#| |
| 90 | +### Viewing tables |
| 91 | +# Print the tables in the catalog |
| 92 | +print(spark.catalog.listTables()) |
| 93 | +#| |
| 94 | +#| |
| 95 | +### Are you query-ious? |
| 96 | +# Don't change this query |
| 97 | +query = "FROM flights SELECT * LIMIT 10" |
| 98 | + |
| 99 | +# Get the first 10 rows of flights |
| 100 | +flights10 = spark.sql(query) |
| 101 | + |
| 102 | +# Show the results |
| 103 | +flights10.show() |
| 104 | +#| |
| 105 | +#| |
| 106 | +### Pandafy a Spark DataFrame |
| 107 | +# Don't change this query |
| 108 | +query = "SELECT origin, dest, COUNT(*) as N FROM flights GROUP BY origin, dest" |
| 109 | + |
| 110 | +# Run the query |
| 111 | +flight_counts = spark.sql(query) |
| 112 | + |
| 113 | +# Convert the results to a pandas DataFrame |
| 114 | +pd_counts = flight_counts.toPandas() |
| 115 | + |
| 116 | +# Print the head of pd_counts |
| 117 | +print(pd_counts.head()) |
| 118 | +#| |
| 119 | +#| |
| 120 | +### Put some Spark in your data |
0 commit comments