### Specify the Azure storage account, container, and blob path that hold the data.

In [0]:
# Azure Blob Storage coordinates of the dataset used by the rest of the notebook.
blob_account_name   = "azureopendatastorage"   # storage account
blob_container_name = "holidaydatacontainer"   # container inside the account
blob_relative_path  = "Processed"              # path within the container
# SAS token handed to the Spark configuration in the next cell; left empty here.
blob_sas_token      = r""

### Construct remote blob path to access the parquet files remotely.

In [0]:
# Build the wasbs:// URI that addresses the blob path:
#   wasbs://<container>@<account>.blob.core.windows.net/<relative path>
wasbs_path = 'wasbs://%s@%s.blob.core.windows.net/%s' % (
    blob_container_name,
    blob_account_name,
    blob_relative_path,
)

# Hand the SAS token to Spark under the per-container/per-account config key
# so that Spark can read from the blob remotely.
sas_config_key = 'fs.azure.sas.%s.%s.blob.core.windows.net' % (
    blob_container_name,
    blob_account_name,
)
spark.conf.set(sas_config_key, blob_sas_token)

print('Remote blob path: ' + wasbs_path)

### Read the parquet files and register the data as a temporary view.

In [0]:
# Point Spark at the remote parquet files. As the original author notes,
# this read does not load any data yet at this point.
df = spark.read.parquet(wasbs_path)

# Make the DataFrame queryable from Spark SQL under the name PublicHoliday
# (an existing view with the same name is replaced).
print('Register the DataFrame as a SQL temporary view: PublicHoliday')
df.createOrReplaceTempView('PublicHoliday')

### SQL Operations on the Temporary View.

Display total number of rows

In [0]:
# Count every row in the PublicHoliday view.
sqlQuery = (
    'SELECT COUNT(*) as Count '
    'FROM PublicHoliday'
)
display(spark.sql(sqlQuery))

Display top 10 rows

In [0]:
# Fetch 10 rows from the view; there is no ORDER BY, so which 10 rows
# come back is up to Spark.
sqlQuery = (
    'SELECT * '
    'FROM PublicHoliday '
    'LIMIT 10'
)
print('Displaying top 10 rows: ')
display(spark.sql(sqlQuery))

Display records by specifying column names

In [0]:
# Project an explicit list of columns instead of SELECT *.
sqlQuery = (
    'SELECT countryOrRegion,countryRegionCode,holidayName,'
    'normalizeHolidayName,date,isPaidTimeOff '
    'FROM PublicHoliday'
)
display(spark.sql(sqlQuery))

Display the list of unique Country and region code from the data

In [0]:
# Unique (region code, country/region) pairs, sorted by region code.
sqlQuery = (
    'SELECT distinct countryRegionCode, countryOrRegion '
    'FROM PublicHoliday '
    'order by countryRegionCode asc '
)
display(spark.sql(sqlQuery))

Display the distinct values of the isPaidTimeOff column

In [0]:
# List the distinct values that occur in the isPaidTimeOff column.
sqlQuery = (
    'SELECT distinct isPaidTimeOff '
    'FROM PublicHoliday'
)
display(spark.sql(sqlQuery))

Display the holiday count per country and year

In [0]:

# Number of holidays per country/region and calendar year.
# COUNT(HolidayName) counts rows whose holiday name is not NULL.
# The GROUP BY repeats the YEAR(date) expression instead of the bare word
# YEAR: the bare form only works because Spark falls back to resolving
# select-list aliases in GROUP BY (spark.sql.groupByAliases), and it reads
# like a keyword rather than a column reference.
sqlQuery = (
    'SELECT countryOrRegion AS Country, YEAR(date) as Year, COUNT(HolidayName) AS HolidayCount '
    'FROM PublicHoliday '
    'GROUP BY countryOrRegion, YEAR(date) '
    'order by countryOrRegion, Year'
)
display(spark.sql(sqlQuery))

Display the list of Country, Holiday count by year from the data for the year 2025

In [0]:
# Number of holidays per country/region, restricted to the year 2025.
# COUNT(HolidayName) counts rows whose holiday name is not NULL.
# The GROUP BY repeats the YEAR(date) expression instead of the bare word
# YEAR: the bare form only works because Spark falls back to resolving
# select-list aliases in GROUP BY (spark.sql.groupByAliases), and it reads
# like a keyword rather than a column reference.
sqlQuery = (
    'SELECT countryOrRegion AS Country, YEAR(date) as Year, COUNT(HolidayName) AS HolidayCount '
    'FROM PublicHoliday '
    'WHERE YEAR(date) = 2025 '
    'GROUP BY countryOrRegion, YEAR(date) '
    'order by countryOrRegion, Year'
)
display(spark.sql(sqlQuery))

Display the list of Country, Holiday count by year from the data for the year 2025 and filter by PaidTimeOff

In [0]:
# Number of paid-time-off holidays per country/region in 2025.
# COUNT(HolidayName) counts rows whose holiday name is not NULL.
# The GROUP BY repeats the YEAR(date) expression instead of the bare word
# YEAR: the bare form only works because Spark falls back to resolving
# select-list aliases in GROUP BY (spark.sql.groupByAliases), and it reads
# like a keyword rather than a column reference.
display(spark.sql(
    'SELECT countryOrRegion AS Country, YEAR(date) as Year, COUNT(HolidayName) AS HolidayCount '
    'FROM PublicHoliday '
    'WHERE YEAR(date) = 2025 AND isPaidTimeOff = true '
    'GROUP BY countryOrRegion, YEAR(date) '
    'order by countryOrRegion, Year'
))

Display the Country, Country Code, Holiday Name, Date and PaidTimeOff flag for holidays in India in the year 2025

In [0]:
# Holidays in India during 2025; the date column is cast to DATE for display.
sqlQuery = (
    "SELECT countryOrRegion AS Country,countryRegionCode AS CountryCode,"
    "holidayName AS HolidayName,CAST(date AS DATE) AS HolidayDate,isPaidTimeOff "
    "FROM PublicHoliday "
    "WHERE YEAR(date) = 2025 AND countryOrRegion = 'India'"
)
display(spark.sql(sqlQuery))

Display the Country, Country Code, Holiday Name, Date and PaidTimeOff flag for holidays in India in the year 2025, keeping only paid-time-off holidays

In [0]:
# Holidays in India during 2025 that are flagged as paid time off;
# the date column is cast to DATE for display.
sqlQuery = (
    "SELECT countryOrRegion AS Country,countryRegionCode AS CountryCode,"
    "holidayName AS HolidayName,CAST(date AS DATE) AS HolidayDate,isPaidTimeOff "
    "FROM PublicHoliday "
    "WHERE YEAR(date) = 2025 AND countryOrRegion = 'India' and isPaidTimeOff = true"
)
display(spark.sql(sqlQuery))

Display the Country, Holiday Name, Date and Day of the week for holidays in India in the year 2025

In [0]:
# Holidays in India during 2025 together with the weekday they fall on.
# date_format(..., 'EEEE') renders the day-of-week name of the DATE value.
sqlQuery = (
    "SELECT countryOrRegion AS Country,holidayName AS HolidayName,"
    "CAST(date AS DATE) AS HolidayDate,"
    "date_format((CAST(date AS DATE)),'EEEE') AS DayOfWeek "
    "FROM PublicHoliday "
    "WHERE YEAR(date) = 2025 AND countryOrRegion = 'India'"
)
display(spark.sql(sqlQuery))