|
17 | 17 | |
18 | 18 | . create connection : <SparkContext> class <sc> #creating an instance of the |
19 | 19 | . attributes : <SparkConf() constructor> |
| 20 | + |
| 21 | + | create SparkSession | |
| 22 | + |
| 23 | + # Import SparkSession |
| 24 | + > from pyspark.sql import SparkSession |
| 25 | +
|
| 26 | + # create the SparkSession via its builder |
| 27 | + > my_spark = SparkSession.builder.getOrCreate() |
| 28 | +
|
| 29 | + # print the tables in the catalog (use the session created above) |
| 30 | + > print(my_spark.catalog.listTables()) |
| 31 | +
|
| 32 | +| SparkSession attributes | |
| 33 | +
|
| 34 | + - catalog: extract and view table data |
| 35 | + . listTables() -> returns the tables in the catalog as a list |
| 36 | + > my_spark.catalog.listTables() |
| 37 | + |
| 38 | +| SparkSession methods | |
| 39 | +
|
| 40 | + # always <SparkSessionName>. |
| 41 | + - .show() -> print |
| 42 | + - .sql() -> run a query ( <takes> queried 'string' <returns> DataFrame results ) |
| 43 | + - .toPandas() -> returns corresponding 'pandas' DataFrame |
| 44 | +
|
20 | 45 | """ |
21 | 46 | #| |
22 | 47 | #| |
|
52 | 77 | #| |
53 | 78 | #| |
54 | 79 | ### Creating a SparkSession |
| 80 | +# Import SparkSession from pyspark.sql |
| 81 | +from pyspark.sql import SparkSession |
| 82 | + |
| 83 | +# Create my_spark |
| 84 | +my_spark = SparkSession.builder.getOrCreate() |
55 | 85 |
|
| 86 | +# Print my_spark |
| 87 | +print(my_spark) |
| 88 | +#| |
| 89 | +#| |
| 90 | +### Viewing tables |
| 91 | +# Print the tables in the catalog |
| 92 | +print(spark.catalog.listTables()) |
| 93 | +#| |
| 94 | +#| |
| 95 | +### Are you query-ious? |
| 96 | +# Don't change this query |
| 97 | +query = "FROM flights SELECT * LIMIT 10" |
| 98 | + |
| 99 | +# Get the first 10 rows of flights |
| 100 | +flights10 = spark.sql(query) |
| 101 | + |
| 102 | +# Show the results |
| 103 | +flights10.show() |
| 104 | +#| |
| 105 | +#| |
| 106 | +### Pandafy a Spark DataFrame |
| 107 | +# Don't change this query |
| 108 | +query = "SELECT origin, dest, COUNT(*) as N FROM flights GROUP BY origin, dest" |
| 109 | + |
| 110 | +# Run the query |
| 111 | +flight_counts = spark.sql(query) |
| 112 | + |
| 113 | +# Convert the results to a pandas DataFrame |
| 114 | +pd_counts = flight_counts.toPandas() |
| 115 | + |
| 116 | +# Print the head of pd_counts |
| 117 | +print(pd_counts.head()) |
| 118 | +#| |
| 119 | +#| |
| 120 | +### Put some Spark in your data |
0 commit comments