Commit a8e8425: Update I Getting to know PySpark.py
1 parent 18b1d24


Introduction to PySpark/I Getting to know PySpark.py

Lines changed: 65 additions & 0 deletions
@@ -17,6 +17,31 @@
. create connection : <SparkContext> class <sc> # creating an instance of the SparkContext class
. attributes : <SparkConf() constructor>
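
# A minimal sketch (not from the course code) of how the two pieces above fit together:
# SparkConf holds the connection attributes and SparkContext opens the connection.
# The app name and master URL below are placeholder values.
> from pyspark import SparkConf, SparkContext
> conf = SparkConf().setAppName("example-app").setMaster("local[*]")
> sc = SparkContext(conf=conf)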

| create SparkSession |

# Import SparkSession
> from pyspark.sql import SparkSession

# Create (or reuse) a SparkSession via the builder
> my_spark = SparkSession.builder.getOrCreate()

# Print the tables registered in the catalog (the exercises use a session named <spark>)
> print(spark.catalog.listTables())

| SparkSession attributes |

- catalog: extract and view table data
. listTables() -> returns the tables in the cluster as a list
> spark.catalog.listTables()

| SparkSession methods |

# always <SparkSessionName>.
- .show() -> print the first rows of a DataFrame
- .sql() -> run a query ( <takes> a query 'string' <returns> the results as a DataFrame )
- .toPandas() -> returns the corresponding 'pandas' DataFrame
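
# A short sketch tying the three methods together (assumes a table called 'flights'
# is registered in the catalog, as in the exercises below):
> flights10 = spark.sql("SELECT * FROM flights LIMIT 10")   # query string -> DataFrame
> flights10.show()                                          # print the first rows
> pd_df = flights10.toPandas()                              # convert to a pandas DataFrame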
"""
#|
#|
@@ -52,4 +77,44 @@
#|
#|
### Creating a SparkSession
# Import SparkSession from pyspark.sql
from pyspark.sql import SparkSession

# Create my_spark
my_spark = SparkSession.builder.getOrCreate()

# Print my_spark
print(my_spark)
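
# Aside (not part of the exercise): the builder also accepts options such as appName().
# The name "flights-intro" is a placeholder; note that getOrCreate() returns the
# already-running session if one exists, so the option may not take effect here.
named_spark = SparkSession.builder.appName("flights-intro").getOrCreate()
print(named_spark)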
#|
#|
### Viewing tables
# Print the tables in the catalog
print(spark.catalog.listTables())
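
# Aside (assumes the same `spark` session as above): listTables() returns Table entries
# with metadata, so pulling out just the names is a common follow-up.
table_names = [table.name for table in spark.catalog.listTables()]
print(table_names)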
#|
#|
### Are you query-ious?
# Don't change this query
query = "FROM flights SELECT * LIMIT 10"

# Get the first 10 rows of flights
flights10 = spark.sql(query)

# Show the results
flights10.show()
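
# Aside (not part of the exercise): the FROM-first query above should behave the same
# as the conventional SELECT-first ordering shown here.
flights10_alt = spark.sql("SELECT * FROM flights LIMIT 10")
flights10_alt.show()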
#|
#|
### Pandafy a Spark DataFrame
# Don't change this query
query = "SELECT origin, dest, COUNT(*) as N FROM flights GROUP BY origin, dest"

# Run the query
flight_counts = spark.sql(query)

# Convert the results to a pandas DataFrame
pd_counts = flight_counts.toPandas()

# Print the head of pd_counts
print(pd_counts.head())
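
# Aside (assumption, not from the exercise): once the counts are in pandas, the usual
# pandas API applies, e.g. sorting the busiest routes to the top.
print(pd_counts.sort_values("N", ascending=False).head())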
#|
#|
### Put some Spark in your data
