# Spark parquet

In [14]:
from pyspark.sql import SparkSession

In [15]:
sql_query="CREATE EXTERNAL TABLE world_temprature( \
    day_month_year DATE, \
    temperature DOUBLE \
    ) \
    PARTITIONED BY ( \
    continent STRING, \
     country STRING,\
     city STRING\
   )\
   STORED AS PARQUET \
   LOCATION 'file:/C:/Users/malam/development/data/spark/parquet' \
   tblproperties (\"parquet.compress\"=\"SNAPPY\")"

In [23]:
spark=SparkSession.builder.appName("spark-parquet").master("local[*]").enableHiveSupport().getOrCreate()

In [24]:
data = [
    ('2021-02-07',97.25,'asia','india','chennai'),
    ('2021-03-07',98.33,'asia','india','mumbai'),
    ('2021-04-08',99.45,'asia','india','patna'),
    ('2021-09-07',96.35,'asia','india','delhi')
]

In [25]:
df=spark.createDataFrame(data,["day_month_year","temperature","continent","country","city"])

In [26]:
df.show()

+--------------+-----------+---------+-------+-------+
|day_month_year|temperature|continent|country|   city|
+--------------+-----------+---------+-------+-------+
|    2021-02-07|      97.25|     asia|  india|chennai|
|    2021-03-07|      98.33|     asia|  india| mumbai|
|    2021-04-08|      99.45|     asia|  india|  patna|
|    2021-09-07|      96.35|     asia|  india|  delhi|
+--------------+-----------+---------+-------+-------+



# Creating partition table

In [27]:
output_path="file:/C:/Users/malam/development/data/spark/parquet/world_temprature_by_date"
from pyspark.sql.functions import year,month

df.withColumn("year", year(df["day_month_year"]))\
  .withColumn("month", month(df["day_month_year"]))\
  .write\
  .option("parquet.compress","SNAPPY")\
  .mode("overwrite")\
  .partitionBy("year", "month")\
  .parquet(output_path)

# Reading Parquet

In [28]:
df = spark.read.parquet(output_path)

In [29]:
df.show()

+--------------+-----------+---------+-------+-------+----+-----+
|day_month_year|temperature|continent|country|   city|year|month|
+--------------+-----------+---------+-------+-------+----+-----+
|    2021-02-07|      97.25|     asia|  india|chennai|2021|    2|
|    2021-03-07|      98.33|     asia|  india| mumbai|2021|    3|
|    2021-04-08|      99.45|     asia|  india|  patna|2021|    4|
|    2021-09-07|      96.35|     asia|  india|  delhi|2021|    9|
+--------------+-----------+---------+-------+-------+----+-----+



In [30]:
spark.sql("select * from world_temprature_by_date")

AnalysisException: Table or view not found: world_temprature_by_date; line 1 pos 14;
'Project [*]
+- 'UnresolvedRelation [world_temprature_by_date], [], false


In [None]:
spark.stop()