In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import desc

# Local URL kept for reference while the session is running
# (presumably the Spark web UI -- confirm the port for your setup):
# http://localhost:4040
# Reuse the already-active SparkSession if there is one, otherwise create it.
spark = SparkSession.builder.getOrCreate()
# Use 5 shuffle partitions; the Exchange steps in the explain() output of the
# cells below are partitioned into 5 accordingly.
spark.conf.set('spark.sql.shuffle.partitions', '5')
# Bare last expression: the notebook displays the session object.
spark


In [2]:
# Read the 2015 flight summary CSV. The first line of the file is treated as
# the header row and column types are inferred from the data.
df = (
    spark.read.format('csv')
    .option('header', 'true')
    .option('inferSchema', 'true')
    .load('../data/flight-data/2015-summary.csv')
)

# Preview the rows, then print the resulting schema.
df.show()
df.printSchema()


+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|       United States|            Romania|   15|
|       United States|            Croatia|    1|
|       United States|            Ireland|  344|
|               Egypt|      United States|   15|
|       United States|              India|   62|
|       United States|          Singapore|    1|
|       United States|            Grenada|   62|
|          Costa Rica|      United States|  588|
|             Senegal|      United States|   40|
|             Moldova|      United States|    1|
|       United States|       Sint Maarten|  325|
|       United States|   Marshall Islands|   39|
|              Guyana|      United States|   64|
|               Malta|      United States|    1|
|            Anguilla|      United States|   41|
|             Bolivia|      United States|   30|
|       United States|           Paraguay|    6|
|             Algeri

In [3]:
# Print the physical plan for: keep rows with count > 10, reorder the
# columns, and sort by count in descending order.
(
    df.filter('count > 10')
    .select('ORIGIN_COUNTRY_NAME', 'DEST_COUNTRY_NAME', 'count')
    .sort('count', ascending=False)
    .explain()
)



== Physical Plan ==
*(2) Sort [count#12 DESC NULLS LAST], true, 0
+- Exchange rangepartitioning(count#12 DESC NULLS LAST, 5)
   +- *(1) Project [ORIGIN_COUNTRY_NAME#11, DEST_COUNTRY_NAME#10, count#12]
      +- *(1) Filter (isnotnull(count#12) && (count#12 > 10))
         +- *(1) FileScan csv [DEST_COUNTRY_NAME#10,ORIGIN_COUNTRY_NAME#11,count#12] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Users/sg0218817/Private/IT/others/spark/src/main/data/flight-data/2015-su..., PartitionFilters: [], PushedFilters: [IsNotNull(count), GreaterThan(count,10)], ReadSchema: struct<DEST_COUNTRY_NAME:string,ORIGIN_COUNTRY_NAME:string,count:int>


In [4]:
# DataFrame API version of the query: routes with more than 10 flights,
# largest count first.
df1 = (
    df.filter('count > 10')
    .select('ORIGIN_COUNTRY_NAME', 'DEST_COUNTRY_NAME', 'count')
    .sort('count', ascending=False)
)

# Show the result rows, then the physical plan that produced them.
df1.show()
df1.explain()


+-------------------+------------------+------+
|ORIGIN_COUNTRY_NAME| DEST_COUNTRY_NAME| count|
+-------------------+------------------+------+
|      United States|     United States|370002|
|             Canada|     United States|  8483|
|      United States|            Canada|  8399|
|             Mexico|     United States|  7187|
|      United States|            Mexico|  7140|
|      United States|    United Kingdom|  2025|
|     United Kingdom|     United States|  1970|
|      United States|             Japan|  1548|
|              Japan|     United States|  1496|
|      United States|           Germany|  1468|
| Dominican Republic|     United States|  1420|
|      United States|Dominican Republic|  1353|
|            Germany|     United States|  1336|
|      United States|       South Korea|  1048|
|        The Bahamas|     United States|   986|
|      United States|       The Bahamas|   955|
|             France|     United States|   952|
|      United States|            France|

In [5]:
# Expose df to Spark SQL under the name 'flight_data'.
df.createOrReplaceTempView('flight_data')

# SQL version of the query: routes with more than 10 flights, largest count
# first. Adjacent string literals are concatenated into a single statement,
# so each fragment keeps its trailing space.
df2 = spark.sql(
    'SELECT ORIGIN_COUNTRY_NAME, DEST_COUNTRY_NAME, count '
    'FROM flight_data '
    'WHERE count > 10 '
    'ORDER BY count DESC'
)

# Show the result rows, then the physical plan that produced them.
df2.show()
df2.explain()


+-------------------+------------------+------+
|ORIGIN_COUNTRY_NAME| DEST_COUNTRY_NAME| count|
+-------------------+------------------+------+
|      United States|     United States|370002|
|             Canada|     United States|  8483|
|      United States|            Canada|  8399|
|             Mexico|     United States|  7187|
|      United States|            Mexico|  7140|
|      United States|    United Kingdom|  2025|
|     United Kingdom|     United States|  1970|
|      United States|             Japan|  1548|
|              Japan|     United States|  1496|
|      United States|           Germany|  1468|
| Dominican Republic|     United States|  1420|
|      United States|Dominican Republic|  1353|
|            Germany|     United States|  1336|
|      United States|       South Korea|  1048|
|        The Bahamas|     United States|   986|
|      United States|       The Bahamas|   955|
|             France|     United States|   952|
|      United States|            France|

In [6]:
# DataFrame API version: total flight count per destination country,
# keeping only the 5 largest totals.
df3 = (
    df.groupBy('DEST_COUNTRY_NAME')
    .sum('count')
    # The aggregate column comes back as 'sum(count)'; give it a usable name
    # before sorting on it.
    .withColumnRenamed('sum(count)', 'destination_total')
    .sort(desc('destination_total'))
    .limit(5)
)

# Show the result rows, then the physical plan that produced them.
df3.show()
df3.explain()


+-----------------+-----------------+
|DEST_COUNTRY_NAME|destination_total|
+-----------------+-----------------+
|    United States|           411352|
|           Canada|             8399|
|           Mexico|             7140|
|   United Kingdom|             2025|
|            Japan|             1548|
+-----------------+-----------------+

== Physical Plan ==
TakeOrderedAndProject(limit=5, orderBy=[destination_total#71L DESC NULLS LAST], output=[DEST_COUNTRY_NAME#10,destination_total#71L])
+- *(2) HashAggregate(keys=[DEST_COUNTRY_NAME#10], functions=[sum(cast(count#12 as bigint))])
   +- Exchange hashpartitioning(DEST_COUNTRY_NAME#10, 5)
      +- *(1) HashAggregate(keys=[DEST_COUNTRY_NAME#10], functions=[partial_sum(cast(count#12 as bigint))])
         +- *(1) FileScan csv [DEST_COUNTRY_NAME#10,count#12] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Users/sg0218817/Private/IT/others/spark/src/main/data/flight-data/2015-su..., PartitionFilters: [], PushedFilters: [], R

In [7]:
# SQL version: total flight count per destination country, keeping only the
# 5 largest totals. Queries the 'flight_data' temp view, which must already
# be registered. Adjacent string literals are concatenated into a single
# statement, so each fragment keeps its trailing space.
df4 = spark.sql(
    'SELECT DEST_COUNTRY_NAME, SUM(count) AS destination_total '
    'FROM flight_data '
    'GROUP BY DEST_COUNTRY_NAME '
    'ORDER BY destination_total DESC '
    'LIMIT 5'
)

# Show the result rows, then the physical plan that produced them.
df4.show()
df4.explain()


+-----------------+-----------------+
|DEST_COUNTRY_NAME|destination_total|
+-----------------+-----------------+
|    United States|           411352|
|           Canada|             8399|
|           Mexico|             7140|
|   United Kingdom|             2025|
|            Japan|             1548|
+-----------------+-----------------+

== Physical Plan ==
TakeOrderedAndProject(limit=5, orderBy=[destination_total#85L DESC NULLS LAST], output=[DEST_COUNTRY_NAME#10,destination_total#85L])
+- *(2) HashAggregate(keys=[DEST_COUNTRY_NAME#10], functions=[sum(cast(count#12 as bigint))])
   +- Exchange hashpartitioning(DEST_COUNTRY_NAME#10, 5)
      +- *(1) HashAggregate(keys=[DEST_COUNTRY_NAME#10], functions=[partial_sum(cast(count#12 as bigint))])
         +- *(1) FileScan csv [DEST_COUNTRY_NAME#10,count#12] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Users/sg0218817/Private/IT/others/spark/src/main/data/flight-data/2015-su..., PartitionFilters: [], PushedFilters: [], R