# 1. Import Required Libraries and Initialize SparkSession
Import `SparkSession` from `pyspark.sql` and create a Spark session.

In [None]:
from pyspark.sql import SparkSession
# Reuse the active session if one exists; otherwise start one named for this chapter.
spark = (
    SparkSession.builder
    .appName("Chapter 2 Example")
    .getOrCreate()
)

# 2. Create a DataFrame with a Range of Numbers
Use `spark.range(1000)` to build a DataFrame of numbers from 0 to 999 and show its schema.

In [None]:
# range(1000) yields one row per integer; toDF names that single column "number".
myRange = (
    spark.range(1000)
    .toDF("number")
)

# Inspect the column type, then preview the first five rows.
myRange.printSchema()
myRange.show(n=5)

# 3. Filter DataFrame for Even Numbers
Select only even numbers using a `where` clause.

In [None]:
# Keep only the rows whose "number" is divisible by two (SQL expression string).
divisBy2 = myRange.where(
    "number % 2 = 0"
)
divisBy2.show(n=5)

# 4. Read CSV Data into DataFrame
Load the 2015 flight-summary CSV with inferred schema and header.

In [None]:
# Python does not continue a statement onto a line that begins with ".", so the
# reader chain is wrapped in parentheses to make it one expression.
flightData2015 = (
    spark
    .read
    .option("inferSchema", "true")  # let Spark infer column types from the data
    .option("header", "true")       # treat the first line as column names
    .csv("/data/flight-data/csv/2015-summary.csv")
)
flightData2015.printSchema()

# 5. Create Temporary SQL View
Register the DataFrame as a temp view for SQL queries.

In [None]:
# Expose the DataFrame to spark.sql() under the name "flight_data_2015";
# re-running this cell replaces any existing view with the same name.
flightData2015.createOrReplaceTempView("flight_data_2015")

# 6. Group and Count Using SQL and DataFrame APIs
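Count the number of rows per destination country with both SQL and DataFrame methods. (This counts rows, not flights — the flight totals live in the `count` column, which is summed in section 9.)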

In [None]:
# The same row count per destination country, expressed twice.
# Both definitions are lazy; nothing runs until show() is called.

# DataFrame API version (result column is named "count").
dataFrameWay = (
    flightData2015
    .groupBy("DEST_COUNTRY_NAME")
    .count()
)

# SQL version against the temp view (result column is aliased "cnt").
sqlWay = spark.sql("""
SELECT DEST_COUNTRY_NAME, count(1) AS cnt
FROM flight_data_2015
GROUP BY DEST_COUNTRY_NAME
""")

sqlWay.show(n=5)
dataFrameWay.show(n=5)

# 7. Explain Query Execution Plans
Use `explain()` to inspect physical plans.

In [None]:
# Print each query's plan under its own label so the two can be compared.
for _label, _frame in (
    ("SQL plan:", sqlWay),
    ("DataFrame plan:", dataFrameWay),
):
    print(_label)
    _frame.explain()

# 8. Find Maximum Value in a Column
Use `pyspark.sql.functions.max` to get the max of the `count` column.

In [None]:
# Import the functions module under an alias rather than importing `max` by
# name: a bare `max` import would shadow Python's builtin max() for every
# later cell in this notebook.
from pyspark.sql import functions as F

# Single-row result holding the largest value in the `count` column.
flightData2015.select(F.max("count")).show()

# 9. Aggregate and Sort Data Using SQL
Sum counts by destination, sort descending, limit to top 5.

In [None]:
# Top five destination countries by summed `count`, queried through the
# "flight_data_2015" temp view.
_topDestinationsQuery = """
SELECT DEST_COUNTRY_NAME, sum(count) AS destination_total
FROM flight_data_2015
GROUP BY DEST_COUNTRY_NAME
ORDER BY destination_total DESC
LIMIT 5
"""
maxSql = spark.sql(_topDestinationsQuery)
maxSql.show()

# 10. Aggregate and Sort Data Using DataFrame API
Perform the same aggregation in the DataFrame API.

In [None]:
from pyspark.sql.functions import desc
# Python does not continue a statement onto a line that begins with ".", so the
# chain is wrapped in parentheses to make it one expression.
(
    flightData2015
    .groupBy("DEST_COUNTRY_NAME")
    .sum("count")
    # sum("count") names its output column "sum(count)"; rename it for readability.
    .withColumnRenamed("sum(count)", "destination_total")
    .sort(desc("destination_total"))
    .limit(5)
    .show()
)

# 11. Explain Aggregation Query Execution Plan
Inspect the physical plan for the DataFrame aggregation+sort query.

In [None]:
# Same aggregation as the previous cell, but printing the plan instead of the
# rows. The chain is wrapped in parentheses because Python does not continue a
# statement onto a line that begins with ".".
# `desc` comes from the `pyspark.sql.functions` import in the previous cell.
(
    flightData2015
    .groupBy("DEST_COUNTRY_NAME")
    .sum("count")
    .withColumnRenamed("sum(count)", "destination_total")
    .sort(desc("destination_total"))
    .limit(5)
    .explain()
)