## Range

In [0]:
# Build a single-column DataFrame from spark.range(1000) and name the column "number".
df = (
    spark.range(1000)
    .toDF("number")
)

In [0]:
# Render the generated "number" DataFrame as a notebook table.
display(df)

number
0
1
2
3
4
5
6
7
8
9


In [0]:
# List the files uploaded under /FileStore/tables/.
# display() renders the FileInfo entries (path, name, size, modificationTime)
# as a table; print() would dump them as a single hard-to-read list repr.
display(dbutils.fs.ls('/FileStore/tables/'))

Out[2]: [FileInfo(path='dbfs:/FileStore/tables/01_dimensions_toRaw.csv', name='01_dimensions_toRaw.csv', size=214, modificationTime=1740544140000),
 FileInfo(path='dbfs:/FileStore/tables/2015_summary.csv', name='2015_summary.csv', size=7337, modificationTime=1738511442000),
 FileInfo(path='dbfs:/FileStore/tables/2015_summary.json', name='2015_summary.json', size=21624, modificationTime=1742188850000),
 FileInfo(path='dbfs:/FileStore/tables/64814.json', name='64814.json', size=174759, modificationTime=1740311442000),
 FileInfo(path='dbfs:/FileStore/tables/Airline_Dataset_Enhanced_Sample.csv', name='Airline_Dataset_Enhanced_Sample.csv', size=883511, modificationTime=1742062645000),
 FileInfo(path='dbfs:/FileStore/tables/BigMart_Sales.csv', name='BigMart_Sales.csv', size=869537, modificationTime=1736957997000),
 FileInfo(path='dbfs:/FileStore/tables/drivers.json', name='drivers.json', size=180812, modificationTime=1736959171000),
 FileInfo(path='dbfs:/FileStore/tables/matchinfo/', name='m

In [0]:
# Load the 2015 flight summary CSV: the first row supplies the column names
# and Spark infers each column's type from the data.
df_flightData = spark.read.csv(
    "/FileStore/tables/2015_summary.csv",
    header=True,
    inferSchema=True,
)

In [0]:
# Print the column names, types and nullability of the loaded flight data.
df_flightData.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: integer (nullable = true)



In [0]:
# Register the DataFrame as the temp view `flight_data_2015` so the
# spark.sql(...) queries below can reference it by name.
df_flightData.createOrReplaceTempView('flight_data_2015')

In [0]:
# SQL version of the aggregation: number of rows per destination country.
sqlWay = spark.sql(
    """SELECT DEST_COUNTRY_NAME, COUNT(1) as countRows
                FROM flight_data_2015
            GROUP BY DEST_COUNTRY_NAME
"""
)

In [0]:
# Show the per-destination row counts produced by the SQL query.
display(sqlWay)

DEST_COUNTRY_NAME,countRows
Anguilla,1
Russia,1
Paraguay,1
Senegal,1
Sweden,1
Kiribati,1
Guyana,1
Philippines,1
Djibouti,1
Malaysia,1


In [0]:
# DataFrame-API version of the same aggregation: rows per destination country.
dfWay = (
    df_flightData
    .groupBy("DEST_COUNTRY_NAME")
    .count()
)

In [0]:
# Show the per-destination row counts produced by the DataFrame API.
display(dfWay)

DEST_COUNTRY_NAME,count
Anguilla,1
Russia,1
Paraguay,1
Senegal,1
Sweden,1
Kiribati,1
Guyana,1
Philippines,1
Djibouti,1
Malaysia,1


In [0]:
# Print the physical plan of the DataFrame-API aggregation, for comparison
# with the plan of the SQL version (sqlWay).
dfWay.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=true
+- == Final Plan ==
   *(2) HashAggregate(keys=[DEST_COUNTRY_NAME#111], functions=[finalmerge_count(merge count#141L) AS count(1)#133L])
   +- AQEShuffleRead coalesced
      +- ShuffleQueryStage 0, Statistics(sizeInBytes=4.6 KiB, rowCount=132, isRuntime=true)
         +- Exchange hashpartitioning(DEST_COUNTRY_NAME#111, 200), ENSURE_REQUIREMENTS, [plan_id=195]
            +- *(1) HashAggregate(keys=[DEST_COUNTRY_NAME#111], functions=[partial_count(1) AS count#141L])
               +- FileScan csv [DEST_COUNTRY_NAME#111] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[dbfs:/FileStore/tables/2015_summary.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string>
+- == Initial Plan ==
   HashAggregate(keys=[DEST_COUNTRY_NAME#111], functions=[finalmerge_count(merge count#141L) AS count(1)#133L])
   +- Exchange hashpartitioning(DEST_COUNTRY_NAME#111, 200), ENSURE_REQUIRE

In [0]:
# Print the physical plan of the SQL aggregation, for comparison with the
# plan of the DataFrame-API version (dfWay).
sqlWay.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=true
+- == Final Plan ==
   *(2) HashAggregate(keys=[DEST_COUNTRY_NAME#111], functions=[finalmerge_count(merge count#124L) AS count(1)#118L])
   +- AQEShuffleRead coalesced
      +- ShuffleQueryStage 0, Statistics(sizeInBytes=4.6 KiB, rowCount=132, isRuntime=true)
         +- Exchange hashpartitioning(DEST_COUNTRY_NAME#111, 200), ENSURE_REQUIREMENTS, [plan_id=230]
            +- *(1) HashAggregate(keys=[DEST_COUNTRY_NAME#111], functions=[partial_count(1) AS count#124L])
               +- FileScan csv [DEST_COUNTRY_NAME#111] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(1 paths)[dbfs:/FileStore/tables/2015_summary.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<DEST_COUNTRY_NAME:string>
+- == Initial Plan ==
   HashAggregate(keys=[DEST_COUNTRY_NAME#111], functions=[finalmerge_count(merge count#124L) AS count(1)#118L])
   +- Exchange hashpartitioning(DEST_COUNTRY_NAME#111, 200), ENSURE_REQUIRE

In [0]:
# Import the functions module under an alias instead of `import max`:
# a bare `max` import would shadow Python's built-in max() for every
# later cell in the notebook.
from pyspark.sql import functions as F

# Largest value in the `count` column of the flight data.
df_flightData.select(F.max('count')).display()

max(count)
370002


In [0]:
# SQL version: total `count` per destination country, keeping the five
# destinations with the highest totals.
maxSQL = spark.sql(
    """ SELECT DEST_COUNTRY_NAME, sum(count) as destination_total
                            FROM flight_data_2015
                        GROUP BY DEST_COUNTRY_NAME
                            ORDER BY destination_total DESC
                        LIMIT 5"""
)

In [0]:
# Show the top five destinations by total count from the SQL query.
display(maxSQL)

DEST_COUNTRY_NAME,destination_total
United States,411352
Canada,8399
Mexico,7140
United Kingdom,2025
Japan,1548


In [0]:
df_flightData = df_flightData.groupBy('DEST_COUNTRY_NAME').sum('count')

In [0]:
display(df_flightData)

DEST_COUNTRY_NAME,sum(count)
Anguilla,41
Russia,176
Paraguay,60
Senegal,40
Sweden,118
Kiribati,26
Guyana,64
Philippines,134
Djibouti,1
Malaysia,2


In [0]:
from pyspark.sql.functions import desc

# DataFrame-API version of the top-five query: sum `count` per destination,
# rename the aggregate column, sort descending, and keep the first five rows.
# NOTE(review): .sum('count') requires df_flightData to still expose the raw
# `count` column at this point.
maxDF = (
    df_flightData
    .groupBy('DEST_COUNTRY_NAME')
    .sum('count')
    .withColumnRenamed("sum(count)", 'destination_total')
    .orderBy(desc('destination_total'))
    .limit(5)
    .select('DEST_COUNTRY_NAME', 'destination_total')
)