# Init SparkSession

In [1]:
from datetime import datetime
from pyspark import SparkContext, HiveContext
from pyspark.sql import SparkSession, SQLContext

In [2]:
# Build (or reuse) a SparkSession against the standalone master.
# The application name embeds the current timestamp.
spark = (
    SparkSession.builder
    .appName("pyspark-dataframe-demo-{}".format(datetime.today()))
    .master("spark://spark-master:7077")
    .getOrCreate()
)

# SQLContext's first positional parameter is a SparkContext, not a SparkSession.
# Pass the session's own context and bind the session explicitly so the
# SQLContext wraps this exact session instead of looking one up itself.
# NOTE(review): SQLContext is a legacy entry point; `spark` covers the same API.
sqlContext = SQLContext(spark.sparkContext, sparkSession=spark)

# Display the effective Spark configuration as a list of (key, value) pairs.
spark.sparkContext.getConf().getAll()




[('spark.repl.local.jars',
  'file:///usr/local/spark-3.3.1-bin-hadoop3/jars/delta-core_2.12-2.2.0.jar,file:///usr/local/spark-3.3.1-bin-hadoop3/jars/hadoop-aws-3.3.2.jar,file:///usr/local/spark-3.3.1-bin-hadoop3/jars/delta-storage-2.2.0.jar,file:///usr/local/spark-3.3.1-bin-hadoop3/jars/aws-java-sdk-1.12.367.jar,file:///usr/local/spark-3.3.1-bin-hadoop3/jars/s3-2.18.41.jar,file:///usr/local/spark-3.3.1-bin-hadoop3/jars/aws-java-sdk-bundle-1.11.1026.jar'),
 ('spark.hadoop.fs.s3a.connection.ssl.enabled', 'false'),
 ('spark.driver.extraJavaOptions',
  '-XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.conc

# Create DataFrame

## By loading a dataset

In [6]:
# Read the plants CSV from the s3a warehouse location; the first row supplies
# the column names.
df_plants = (
    spark.read
    .format("csv")
    .option("header", True)
    .load("s3a://warehouse/plants.csv")
)
df_plants.show()

+------------------+-------------+
|        Plant Name|Rooting Depth|
+------------------+-------------+
|         Artichoke|            D|
|           Arugula|            S|
|         Asparagus|            D|
|       Beans, bush|            M|
|Beans, lima (bush)|            D|
|       Beans, pole|            M|
|             Beets|            M|
|          Broccoli|            S|
|   Brussel sprouts|            S|
|           Cabbage|            S|
|           Carrots|            M|
|       Cauliflower|            S|
|            Celery|            S|
|             Chard|            M|
|           Edamame|            M|
|              Corn|            S|
|          Cucumber|            M|
|          Eggplant|            M|
|            Endive|            S|
|            Garlic|            S|
+------------------+-------------+
only showing top 20 rows



In [7]:
# Save df_plants as the table "plants": Parquet format, snappy compression,
# stored at the given s3a path, replacing any existing data (mode "overwrite").
df_plants.write.saveAsTable(
    "plants",
    format="parquet",
    mode="overwrite",
    path="s3a://warehouse/plants.parquet",
    compression="snappy",
)

# Spark SQL

In [8]:
# List the databases (namespaces) known to this session.
databases_df = spark.sql("show databases")
databases_df.show()

+---------+
|namespace|
+---------+
|  default|
+---------+



In [9]:
# List the tables in the current database.
tables_df = spark.sql("show tables")
tables_df.show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
|  default|   plants|      false|
+---------+---------+-----------+



In [10]:
# Query the saved table through Spark SQL and preview the result.
plants_sql_df = spark.sql("SELECT * FROM plants")
plants_sql_df.show()

+------------------+-------------+
|        Plant Name|Rooting Depth|
+------------------+-------------+
|         Artichoke|            D|
|           Arugula|            S|
|         Asparagus|            D|
|       Beans, bush|            M|
|Beans, lima (bush)|            D|
|       Beans, pole|            M|
|             Beets|            M|
|          Broccoli|            S|
|   Brussel sprouts|            S|
|           Cabbage|            S|
|           Carrots|            M|
|       Cauliflower|            S|
|            Celery|            S|
|             Chard|            M|
|           Edamame|            M|
|              Corn|            S|
|          Cucumber|            M|
|          Eggplant|            M|
|            Endive|            S|
|            Garlic|            S|
+------------------+-------------+
only showing top 20 rows



In [11]:
# Count the rows of the plants table via SQL.
record_count_df = spark.sql("SELECT COUNT(*) AS number_of_records FROM plants")
record_count_df.show()

+-----------------+
|number_of_records|
+-----------------+
|               21|
+-----------------+

