In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [2]:
# Obtain the Spark session for this notebook under the application name
# 'flight'; getOrCreate() hands back an already-active session when one exists.
spark = (
    SparkSession.builder
    .appName('flight')
    .getOrCreate()
)

In [3]:
# Read the 2015 flight summary CSV into a DataFrame. The header row supplies
# the column names and the column types are inferred from the data. The reader
# options are passed as csv() keyword arguments with the same string values.
flightData2015 = spark.read.csv(
    "2015-summary.csv",
    header="true",
    inferSchema="true",
)

In [4]:
# Peek at the first three rows; head(n) delegates to take(n) and returns a
# list of Row objects.
flightData2015.head(3)

[Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Romania', count=15),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Croatia', count=1),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Ireland', count=344)]

In [5]:
# Print the physical plan Spark builds for ordering the rows by `count`
# (orderBy is an alias of sort).
flightData2015.orderBy("count").explain()

== Physical Plan ==
*Sort [count#2 ASC NULLS FIRST], true, 0
+- Exchange rangepartitioning(count#2 ASC NULLS FIRST, 200)
   +- *FileScan csv [DEST_COUNTRY_NAME#0,ORIGIN_COUNTRY_NAME#1,count#2] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/home/mack/spark-2.1.0-bin-hadoop2.7/python/2015-summary.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string,ORIGIN_COUNTRY_NAME:string,count:int>


In [6]:
# Use 5 shuffle partitions for the queries that follow; the sort plan printed
# before this cell range-partitioned into 200.
# NOTE(review): this is a runtime setting on the `spark` session object, so it
# presumably has to be re-applied whenever the session is recreated - confirm.
spark.conf.set("spark.sql.shuffle.partitions", "5")

In [7]:
# Two rows with the smallest `count`.
# NOTE(review): several rows tie at count=1 and the sort has no tie-breaker, so
# which tied rows come back (and in what order) is not fixed - the recorded
# output here and that of the following take(3) cell disagree on the first row.
flightData2015.sort("count").take(2)

[Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Singapore', count=1),
 Row(DEST_COUNTRY_NAME='Moldova', ORIGIN_COUNTRY_NAME='United States', count=1)]

In [8]:
# Three rows with the smallest `count`.
# NOTE(review): rows tied at count=1 are returned in no guaranteed order; the
# recorded output is not a prefix-extension of the preceding take(2) result.
flightData2015.sort("count").take(3)

[Row(DEST_COUNTRY_NAME='Moldova', ORIGIN_COUNTRY_NAME='United States', count=1),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Croatia', count=1),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Singapore', count=1)]

In [9]:
# Register the DataFrame as the temporary view `flight_data_2015` so the
# spark.sql(...) queries below can refer to it by name (replaces any existing
# temp view with that name).
flightData2015.createOrReplaceTempView("flight_data_2015")

In [10]:
# SQL version of the aggregation: number of rows per destination country,
# queried from the `flight_data_2015` temp view. The result is kept as a
# DataFrame so its plan can be compared with the DataFrame-API version.
sqlWay = spark.sql("""
SELECT DEST_COUNTRY_NAME, count(1)
FROM flight_data_2015
GROUP BY DEST_COUNTRY_NAME
""")

In [11]:
# DataFrame-API version of the same aggregation: row count per destination
# country.
dataFrameWay = (
    flightData2015
    .groupBy("DEST_COUNTRY_NAME")
    .count()
)

In [12]:
# Print the physical plan for the SQL version of the per-destination count.
sqlWay.explain()

== Physical Plan ==
*HashAggregate(keys=[DEST_COUNTRY_NAME#0], functions=[count(1)])
+- Exchange hashpartitioning(DEST_COUNTRY_NAME#0, 5)
   +- *HashAggregate(keys=[DEST_COUNTRY_NAME#0], functions=[partial_count(1)])
      +- *FileScan csv [DEST_COUNTRY_NAME#0] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/home/mack/spark-2.1.0-bin-hadoop2.7/python/2015-summary.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string>


In [13]:
# Print the physical plan for the DataFrame-API version; the recorded output
# matches the plan printed for the SQL version.
dataFrameWay.explain()

== Physical Plan ==
*HashAggregate(keys=[DEST_COUNTRY_NAME#0], functions=[count(1)])
+- Exchange hashpartitioning(DEST_COUNTRY_NAME#0, 5)
   +- *HashAggregate(keys=[DEST_COUNTRY_NAME#0], functions=[partial_count(1)])
      +- *FileScan csv [DEST_COUNTRY_NAME#0] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/home/mack/spark-2.1.0-bin-hadoop2.7/python/2015-summary.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string>


In [14]:
# Largest single `count` value in the view, via SQL; take(1) returns the
# result as a one-element list of Row.
spark.sql("SELECT max(count) FROM flight_data_2015").take(1)

[Row(max(count)=370002)]

In [15]:
# NOTE: this rebinds the name `max` in the notebook namespace to Spark's column
# aggregate, hiding Python's built-in max() for every cell executed after it.
from pyspark.sql.functions import max

In [16]:
# Largest single `count` value, DataFrame-API version of the SQL max query.
# The aggregate is referenced through the `F` functions namespace so the cell
# does not depend on the bare name `max` having been rebound by an earlier
# import: with Python's built-in max, max("count") evaluates to the string 't'
# and the select fails on an unknown column. The result column is still named
# `max(count)`.
flightData2015.select(F.max("count")).take(1)

[Row(max(count)=370002)]

In [17]:
# SQL version: the five destination countries with the largest summed `count`,
# highest total first. The sum is exposed as the column `destination_total`.
maxSql = spark.sql("""
SELECT DEST_COUNTRY_NAME, sum(count) as destination_total
FROM flight_data_2015
GROUP BY DEST_COUNTRY_NAME
ORDER BY sum(count) DESC
LIMIT 5
""")

In [18]:
# Run the top-five query and print its rows as a text table.
maxSql.show()

+-----------------+-----------------+
|DEST_COUNTRY_NAME|destination_total|
+-----------------+-----------------+
|    United States|           411352|
|           Canada|             8399|
|           Mexico|             7140|
|   United Kingdom|             2025|
|            Japan|             1548|
+-----------------+-----------------+



In [4]:
# DataFrame-API version of the top-five destinations query; `desc` builds the
# descending sort order used by the cells below.
from pyspark.sql.functions import desc

In [5]:
# Top five destination countries by total `count`, DataFrame-API counterpart of
# the `maxSql` query.
#
# The sum is aliased where it is computed rather than renamed afterwards from
# the auto-generated column name "sum(count)": withColumnRenamed is a silent
# no-op when the old name does not match, which would only surface later as an
# unresolved-column error in the sort.
# The chain is wrapped in parentheses instead of using backslash continuations.
(
    flightData2015
    .groupBy("DEST_COUNTRY_NAME")
    .agg(F.sum("count").alias("destination_total"))
    .sort(desc("destination_total"))
    .limit(5)
    .show()
)

+-----------------+-----------------+
|DEST_COUNTRY_NAME|destination_total|
+-----------------+-----------------+
|    United States|           411352|
|           Canada|             8399|
|           Mexico|             7140|
|   United Kingdom|             2025|
|            Japan|             1548|
+-----------------+-----------------+



In [6]:
# Physical plan for the top-five destinations query (same pipeline as the
# .show() cell, ending in .explain() instead).
#
# The sum is aliased at aggregation time rather than renaming the
# auto-generated "sum(count)" column, because withColumnRenamed silently does
# nothing when the old name does not match.
#
# NOTE(review): the recorded plan for this cell hash-partitions into 200, not
# the 5 set via spark.sql.shuffle.partitions earlier in the notebook, and the
# cell counters restart at In [4] - presumably the kernel was restarted and the
# config cell was not re-run. Confirm the notebook survives Restart & Run All.
(
    flightData2015
    .groupBy("DEST_COUNTRY_NAME")
    .agg(F.sum("count").alias("destination_total"))
    .sort(desc("destination_total"))
    .limit(5)
    .explain()
)

== Physical Plan ==
TakeOrderedAndProject(limit=5, orderBy=[destination_total#39L DESC NULLS LAST], output=[DEST_COUNTRY_NAME#0,destination_total#39L])
+- *HashAggregate(keys=[DEST_COUNTRY_NAME#0], functions=[sum(cast(count#2 as bigint))])
   +- Exchange hashpartitioning(DEST_COUNTRY_NAME#0, 200)
      +- *HashAggregate(keys=[DEST_COUNTRY_NAME#0], functions=[partial_sum(cast(count#2 as bigint))])
         +- *FileScan csv [DEST_COUNTRY_NAME#0,count#2] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/home/mack/spark-2.1.0-bin-hadoop2.7/python/2015-summary.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string,count:int>
