## SparkSession

In [1]:
import os

import pyspark.sql.functions as pf
from pyspark.sql import SparkSession

# Build (or reuse an already running) SparkSession for this notebook and
# set spark.sql.shuffle.partitions to 50 for it.
spark = (
    SparkSession.builder
    .appName('Apache Spark test')
    .config("spark.sql.shuffle.partitions", "50")
    .getOrCreate()
)

In [2]:
# Echo the session object so the notebook renders its representation.
spark

## DataFrame

In [3]:
# Location of the input Parquet files. The path can be overridden with the
# OLAP_PARQUET_GLOB environment variable so the notebook is not tied to one
# machine; the default is the original local path, so behaviour is unchanged
# when the variable is not set.
olap_parquet_glob = os.environ.get(
    "OLAP_PARQUET_GLOB", "/Users/sg0218817/Desktop/olap/*.parquet"
)
df = spark.read.parquet(olap_parquet_glob)

In [None]:
# Action: count the rows of the loaded DataFrame.
df.count()

In [None]:
# Inspect a single row. first() fetches only one row, whereas collect()[0]
# would materialise the entire DataFrame on the driver just to index it.
# (On an empty DataFrame this yields None instead of raising IndexError.)
df.first()

In [None]:
# Number of partitions of the RDD underlying `df`.
df.rdd.getNumPartitions()

In [None]:
# Per-legCabinId totals of the two reservation-holding counters for cabin "Y".
#
# The aggregates are aliased directly instead of renaming the columns that
# GroupedData.sum() auto-generates: the generated names (e.g.
# "sum(reservationHoldings.resHoldTotal AS `resHoldTotal`)") differ between
# Spark versions, and withColumnRenamed() silently does nothing when the
# name does not match, which would leave the columns un-renamed.
fdf = (
    df
    .filter('cabinCode = "Y"')
    .groupBy('legCabinId')
    .agg(
        pf.sum('reservationHoldings.resHoldTotal').alias('resHoldTotal'),
        pf.sum('reservationHoldings.resHoldGroup').alias('resHoldGroup'),
    )
    # Mark the result for caching; it is reused by the cells that follow.
    .cache()
)


In [None]:
# Print the execution plan Spark built for `fdf`.
fdf.explain()

In [None]:
# Action: count the aggregated rows. `fdf` was marked with .cache(), so this
# is presumably the step that actually populates the cache — TODO confirm.
fdf.count()

In [None]:
# Number of partitions of the RDD underlying the aggregated `fdf`.
fdf.rdd.getNumPartitions()

In [None]:
# Print up to 10 rows of `fdf`; the second argument (truncate=False) keeps
# long cell values from being shortened.
fdf.show(10, False)

## Partitioning and Predicate Pushdown

In [4]:
import pyspark.sql.functions as pf

# Add year / month / day columns derived from departureDate, then write the
# data as Parquet partitioned by year and month. departureDate_day is NOT a
# partition column, so it is stored as an ordinary column inside the files.
(
    df.sort('departureDate')
    .withColumn('departureDate_year', pf.year(pf.col('departureDate')))
    .withColumn('departureDate_month', pf.month(pf.col('departureDate')))
    .withColumn('departureDate_day', pf.dayofmonth(pf.col('departureDate')))
    .coalesce(10)
    .write
    .format('parquet')
    .mode('overwrite')  # replaces any existing 'plik.parquet' output
    .partitionBy('departureDate_year', 'departureDate_month')
    .save('plik.parquet')
)

In [5]:
# Preview the two columns from the partitioned dataset without any filter.
# take(10) bounds the number of rows pulled to the driver; an unbounded
# collect() would load the whole table into driver memory and into the
# notebook output.
(
    spark.read.load("plik.parquet")
    .select('legClassId', 'departureDate')
    .take(10)
)

[Row(legClassId='SVO.SU.2550[2019-05-06].AMS.Y.Y', departureDate=datetime.datetime(2019, 5, 6, 11, 10)),
 Row(legClassId='SVO.SU.36[2019-05-12].LED.Y.M', departureDate=datetime.datetime(2019, 5, 12, 3, 30)),
 Row(legClassId='SVO.SU.2550[2019-05-06].AMS.Y.B', departureDate=datetime.datetime(2019, 5, 6, 11, 10)),
 Row(legClassId='SVO.SU.36[2019-05-12].LED.Y.U', departureDate=datetime.datetime(2019, 5, 12, 3, 30)),
 Row(legClassId='SVO.SU.2550[2019-05-06].AMS.Y.M', departureDate=datetime.datetime(2019, 5, 6, 11, 10)),
 Row(legClassId='SVO.SU.36[2019-05-12].LED.Y.P', departureDate=datetime.datetime(2019, 5, 12, 3, 30)),
 Row(legClassId='SVO.SU.2550[2019-05-06].AMS.Y.U', departureDate=datetime.datetime(2019, 5, 6, 11, 10)),
 Row(legClassId='SVO.SU.36[2019-05-12].LED.Y.K', departureDate=datetime.datetime(2019, 5, 12, 3, 30)),
 Row(legClassId='SVO.SU.2550[2019-05-06].AMS.Y.K', departureDate=datetime.datetime(2019, 5, 6, 11, 10)),
 Row(legClassId='SVO.SU.36[2019-05-12].LED.Y.H', departureDate=

In [6]:
# Physical plan of the unfiltered read, as the baseline for the filtered
# variants below (the saved output shows an empty PartitionFilters list).
(
    spark.read.load("plik.parquet")
    .select('legClassId', 'departureDate')
    .explain()
)

== Physical Plan ==
*(1) Project [legClassId#719, departureDate#728]
+- *(1) FileScan parquet [legClassId#719,departureDate#728,departureDate_year#741,departureDate_month#742] Batched: true, Format: Parquet, Location: InMemoryFileIndex[file:/Users/sg0218817/Private/IT/others/spark/notebooks/plik.parquet], PartitionCount: 13, PartitionFilters: [], PushedFilters: [], ReadSchema: struct<legClassId:string,departureDate:timestamp>


In [7]:
# Preview rows restricted to the departureDate_year = 2019 partitions.
# The filter references a partition column that is not in the select list.
# take(10) bounds the number of rows pulled to the driver; an unbounded
# collect() would load every matching row into driver memory and into the
# notebook output.
(
    spark.read.load("plik.parquet")
    .select('legClassId', 'departureDate')
    .filter('departureDate_year = 2019')
    .take(10)
)

[Row(legClassId='SVO.SU.2550[2019-05-06].AMS.Y.Y', departureDate=datetime.datetime(2019, 5, 6, 11, 10)),
 Row(legClassId='SVO.SU.36[2019-05-12].LED.Y.M', departureDate=datetime.datetime(2019, 5, 12, 3, 30)),
 Row(legClassId='SVO.SU.2550[2019-05-06].AMS.Y.B', departureDate=datetime.datetime(2019, 5, 6, 11, 10)),
 Row(legClassId='SVO.SU.36[2019-05-12].LED.Y.U', departureDate=datetime.datetime(2019, 5, 12, 3, 30)),
 Row(legClassId='SVO.SU.2550[2019-05-06].AMS.Y.M', departureDate=datetime.datetime(2019, 5, 6, 11, 10)),
 Row(legClassId='SVO.SU.36[2019-05-12].LED.Y.P', departureDate=datetime.datetime(2019, 5, 12, 3, 30)),
 Row(legClassId='SVO.SU.2550[2019-05-06].AMS.Y.U', departureDate=datetime.datetime(2019, 5, 6, 11, 10)),
 Row(legClassId='SVO.SU.36[2019-05-12].LED.Y.K', departureDate=datetime.datetime(2019, 5, 12, 3, 30)),
 Row(legClassId='SVO.SU.2550[2019-05-06].AMS.Y.K', departureDate=datetime.datetime(2019, 5, 6, 11, 10)),
 Row(legClassId='SVO.SU.36[2019-05-12].LED.Y.H', departureDate=

In [8]:
# Physical plan for the year filter. departureDate_year is a partition
# column (see partitionBy in the write cell), and the saved plan lists the
# predicate under PartitionFilters rather than PushedFilters.
(
    spark.read.load("plik.parquet")
    .select('legClassId', 'departureDate')
    .filter('departureDate_year = 2019')
    .explain()
)

== Physical Plan ==
*(1) Project [legClassId#855, departureDate#864]
+- *(1) FileScan parquet [legClassId#855,departureDate#864,departureDate_year#877,departureDate_month#878] Batched: true, Format: Parquet, Location: InMemoryFileIndex[file:/Users/sg0218817/Private/IT/others/spark/notebooks/plik.parquet], PartitionCount: 9, PartitionFilters: [isnotnull(departureDate_year#877), (departureDate_year#877 = 2019)], PushedFilters: [], ReadSchema: struct<legClassId:string,departureDate:timestamp>


In [9]:
# Preview rows restricted to the year = 2019 / month = 5 partition.
# take(10) bounds the number of rows pulled to the driver; an unbounded
# collect() would load every matching row into driver memory and into the
# notebook output.
(
    spark.read.load("plik.parquet")
    .select('legClassId', 'departureDate')
    .filter('departureDate_year = 2019 and departureDate_month = 5')
    .take(10)
)

[Row(legClassId='SVO.SU.2550[2019-05-06].AMS.Y.Y', departureDate=datetime.datetime(2019, 5, 6, 11, 10)),
 Row(legClassId='SVO.SU.36[2019-05-12].LED.Y.M', departureDate=datetime.datetime(2019, 5, 12, 3, 30)),
 Row(legClassId='SVO.SU.2550[2019-05-06].AMS.Y.B', departureDate=datetime.datetime(2019, 5, 6, 11, 10)),
 Row(legClassId='SVO.SU.36[2019-05-12].LED.Y.U', departureDate=datetime.datetime(2019, 5, 12, 3, 30)),
 Row(legClassId='SVO.SU.2550[2019-05-06].AMS.Y.M', departureDate=datetime.datetime(2019, 5, 6, 11, 10)),
 Row(legClassId='SVO.SU.36[2019-05-12].LED.Y.P', departureDate=datetime.datetime(2019, 5, 12, 3, 30)),
 Row(legClassId='SVO.SU.2550[2019-05-06].AMS.Y.U', departureDate=datetime.datetime(2019, 5, 6, 11, 10)),
 Row(legClassId='SVO.SU.36[2019-05-12].LED.Y.K', departureDate=datetime.datetime(2019, 5, 12, 3, 30)),
 Row(legClassId='SVO.SU.2550[2019-05-06].AMS.Y.K', departureDate=datetime.datetime(2019, 5, 6, 11, 10)),
 Row(legClassId='SVO.SU.36[2019-05-12].LED.Y.H', departureDate=

In [10]:
# Physical plan for the year + month filter. Both columns are partition
# columns, and the saved plan reports PartitionCount: 1 for this query.
(
    spark.read.load("plik.parquet")
    .select('legClassId', 'departureDate')
    .filter('departureDate_year = 2019 and departureDate_month = 5')
    .explain()
)

== Physical Plan ==
*(1) Project [legClassId#991, departureDate#1000]
+- *(1) FileScan parquet [legClassId#991,departureDate#1000,departureDate_year#1013,departureDate_month#1014] Batched: true, Format: Parquet, Location: InMemoryFileIndex[file:/Users/sg0218817/Private/IT/others/spark/notebooks/plik.parquet], PartitionCount: 1, PartitionFilters: [isnotnull(departureDate_year#1013), isnotnull(departureDate_month#1014), (departureDate_year#101..., PushedFilters: [], ReadSchema: struct<legClassId:string,departureDate:timestamp>


In [11]:
# Preview rows for a single day. departureDate_day is a regular (non
# partition) column, so this predicate is evaluated on the file contents.
# take(10) bounds the number of rows pulled to the driver; an unbounded
# collect() would load every matching row into driver memory and into the
# notebook output.
# NOTE(review): this cell filters on day 1 while the explain() cell that
# follows it uses day 15 — confirm which value is intended.
(
    spark.read.load("plik.parquet")
    .select('legClassId', 'departureDate')
    .filter('departureDate_year = 2019 and departureDate_month = 5 and departureDate_day = 1')
    .take(10)
)

[Row(legClassId='SVO.SU.1834[2019-04-30].MSQ.Y.Y', departureDate=datetime.datetime(2019, 5, 1, 0, 0)),
 Row(legClassId='SVO.SU.1834[2019-04-30].MSQ.Y.B', departureDate=datetime.datetime(2019, 5, 1, 0, 0)),
 Row(legClassId='SVO.SU.1834[2019-04-30].MSQ.Y.M', departureDate=datetime.datetime(2019, 5, 1, 0, 0)),
 Row(legClassId='SVO.SU.1834[2019-04-30].MSQ.Y.U', departureDate=datetime.datetime(2019, 5, 1, 0, 0)),
 Row(legClassId='SVO.SU.1834[2019-04-30].MSQ.Y.P', departureDate=datetime.datetime(2019, 5, 1, 0, 0)),
 Row(legClassId='SVO.SU.1834[2019-04-30].MSQ.Y.K', departureDate=datetime.datetime(2019, 5, 1, 0, 0)),
 Row(legClassId='SVO.SU.1834[2019-04-30].MSQ.Y.H', departureDate=datetime.datetime(2019, 5, 1, 0, 0)),
 Row(legClassId='SVO.SU.1834[2019-04-30].MSQ.Y.L', departureDate=datetime.datetime(2019, 5, 1, 0, 0)),
 Row(legClassId='SVO.SU.1834[2019-04-30].MSQ.Y.X', departureDate=datetime.datetime(2019, 5, 1, 0, 0)),
 Row(legClassId='SVO.SU.1834[2019-04-30].MSQ.Y.Q', departureDate=datetime

In [12]:
# Physical plan for the year + month + day filter. The day column is not a
# partition column; in the saved plan its predicate shows up under
# PushedFilters and in a separate Filter node, while year/month remain
# PartitionFilters.
(
    spark.read.load("plik.parquet")
    .select('legClassId', 'departureDate')
    .filter('departureDate_year = 2019 and departureDate_month = 5 and departureDate_day = 15')
    .explain()
)

== Physical Plan ==
*(1) Project [legClassId#1127, departureDate#1136]
+- *(1) Filter (isnotnull(departureDate_day#1148) && (departureDate_day#1148 = 15))
   +- *(1) FileScan parquet [legClassId#1127,departureDate#1136,departureDate_day#1148,departureDate_year#1149,departureDate_month#1150] Batched: true, Format: Parquet, Location: InMemoryFileIndex[file:/Users/sg0218817/Private/IT/others/spark/notebooks/plik.parquet], PartitionCount: 1, PartitionFilters: [isnotnull(departureDate_year#1149), isnotnull(departureDate_month#1150), (departureDate_year#114..., PushedFilters: [IsNotNull(departureDate_day), EqualTo(departureDate_day,15)], ReadSchema: struct<legClassId:string,departureDate:timestamp,departureDate_day:int>


In [13]:
# Preview rows for one day and one origin airport ("BCN"). Both
# departureDate_day and origin are regular (non partition) columns.
# take(10) bounds the number of rows pulled to the driver; an unbounded
# collect() would load every matching row into driver memory and into the
# notebook output.
(
    spark.read.load("plik.parquet")
    .select('legClassId', 'departureDate')
    .filter('departureDate_year = 2019 and departureDate_month = 5 and departureDate_day = 15 and origin = "BCN"')
    .take(10)
)

[Row(legClassId='BCN.SU.2639[2019-05-15].SVO.Y.Y', departureDate=datetime.datetime(2019, 5, 15, 13, 50)),
 Row(legClassId='BCN.SU.2639[2019-05-15].SVO.Y.B', departureDate=datetime.datetime(2019, 5, 15, 13, 50)),
 Row(legClassId='BCN.SU.2639[2019-05-15].SVO.Y.M', departureDate=datetime.datetime(2019, 5, 15, 13, 50)),
 Row(legClassId='BCN.SU.2639[2019-05-15].SVO.Y.U', departureDate=datetime.datetime(2019, 5, 15, 13, 50)),
 Row(legClassId='BCN.SU.2639[2019-05-15].SVO.Y.K', departureDate=datetime.datetime(2019, 5, 15, 13, 50)),
 Row(legClassId='BCN.SU.2639[2019-05-15].SVO.Y.H', departureDate=datetime.datetime(2019, 5, 15, 13, 50)),
 Row(legClassId='BCN.SU.2639[2019-05-15].SVO.Y.L', departureDate=datetime.datetime(2019, 5, 15, 13, 50)),
 Row(legClassId='BCN.SU.2639[2019-05-15].SVO.Y.X', departureDate=datetime.datetime(2019, 5, 15, 13, 50)),
 Row(legClassId='BCN.SU.2639[2019-05-15].SVO.Y.Q', departureDate=datetime.datetime(2019, 5, 15, 13, 50)),
 Row(legClassId='BCN.SU.2639[2019-05-15].SVO.Y

In [14]:
# Physical plan for the year + month + day + origin filter. In the saved
# plan the day and origin predicates appear under PushedFilters, while the
# year/month predicates remain PartitionFilters.
(
    spark.read.load("plik.parquet")
    .select('legClassId', 'departureDate')
    .filter('departureDate_year = 2019 and departureDate_month = 5 and departureDate_day = 15 and origin = "BCN"')
    .explain()
)

== Physical Plan ==
*(1) Project [legClassId#1263, departureDate#1272]
+- *(1) Filter (((isnotnull(departureDate_day#1284) && isnotnull(origin#1273)) && (departureDate_day#1284 = 15)) && (origin#1273 = BCN))
   +- *(1) FileScan parquet [legClassId#1263,departureDate#1272,origin#1273,departureDate_day#1284,departureDate_year#1285,departureDate_month#1286] Batched: true, Format: Parquet, Location: InMemoryFileIndex[file:/Users/sg0218817/Private/IT/others/spark/notebooks/plik.parquet], PartitionCount: 1, PartitionFilters: [isnotnull(departureDate_year#1285), isnotnull(departureDate_month#1286), (departureDate_year#128..., PushedFilters: [IsNotNull(departureDate_day), IsNotNull(origin), EqualTo(departureDate_day,15), EqualTo(origin,B..., ReadSchema: struct<legClassId:string,departureDate:timestamp,origin:string,departureDate_day:int>
