### PySpark Schema Definition for Hotel Data
This schema defines the structure of the hotel dataset, specifying data types and whether fields are nullable.

In [7]:
from pyspark.sql.types import *
# Schema applied when reading the raw hotel CSV. Every field is declared
# nullable (third argument True).
#
# "Date" is declared as DateType. The previous DataType() is the abstract base
# class of all PySpark types, not a concrete column type, so it cannot describe
# a real column. DateType also matches the Date column of hotelBook_silver.
# NOTE(review): the CSV reader parses DateType fields using its `dateFormat`
# option; if the file does not store dates in the reader's default pattern,
# set .option("dateFormat", ...) on the reader — confirm against the file.
hotelSchema = StructType([
    StructField("Date", DateType(), True),
    StructField("Month", IntegerType(), True),
    StructField("Weekday", IntegerType(), True),
    StructField("Season", StringType(), True),
    StructField("Holiday", IntegerType(), True),
    StructField("Marketing_Spend", IntegerType(), True),
    StructField("Revenue", IntegerType(), True),
    StructField("Room_Revenue", IntegerType(), True),
    StructField("Occupancy_Rate", FloatType(), True),
    StructField("ADR", FloatType(), True),
    StructField("RevPAR", FloatType(), True),
    StructField("Available_Rooms", IntegerType(), True),
    StructField("Reserved_Rooms", IntegerType(), True),
    StructField("Booking_Channel", StringType(), True),
    StructField("Guest_Type", StringType(), True),
    StructField("Market_Segment", FloatType(), True),
    StructField("Guest_Country", StringType(), True),
    StructField("Complaints", IntegerType(), True),
    StructField("Compliment", IntegerType(), True),
    StructField("Bookings", IntegerType(), True),
    StructField("No_Shows", IntegerType(), True),
    StructField("Cancellations", IntegerType(), True),
    StructField("Checkouts", IntegerType(), True),
    StructField("New_Bookings", IntegerType(), True),
    StructField("Checkins", IntegerType(), True),
    StructField("Average_Review_Score", FloatType(), True),
    StructField("Revenue_Managed_Guests", IntegerType(), True),
    StructField("RevPAR_Managed_Guests", FloatType(), True),
    StructField("Occupancy_Managed_Guests", IntegerType(), True),
    StructField("RevPAR_All", FloatType(), True),
    StructField("Occupancy_All", FloatType(), True),
    StructField("Room_Revenue_All", IntegerType(), True),
    StructField("Total_Revenue", IntegerType(), True),
    StructField("Operating_Expenses", IntegerType(), True),
    StructField("Fixed_Costs", IntegerType(), True),
    StructField("Variable_Costs", IntegerType(), True),
    StructField("Total_Costs", IntegerType(), True),
    StructField("Profit", IntegerType(), True)
])

StatementMeta(, 43a11794-6a32-48fd-9692-c63d56fa45f9, 9, Finished, Available, Finished)

### Loading Hotel Booking Data from Bronze Folder
This section loads the hotel booking data from a CSV file located in the Bronze folder, applying the defined schema for structured data processing.

In [2]:
# Read the bookings CSV from the Bronze folder. The first line is treated as a
# header, and column types come from hotelSchema instead of being inferred.
bronze_reader = (
    spark.read
    .format('csv')
    .option("header", "true")
    .schema(hotelSchema)
)
df = bronze_reader.load('Files/Bronze/hotelBook.csv')

# Preview the first five rows.
display(df.head(5))

StatementMeta(, 43a11794-6a32-48fd-9692-c63d56fa45f9, 4, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 8db2d22c-75b3-452c-b2fb-a7d558c29a08)

### Data Quality Check and Preprocessing
This section performs a series of data quality checks and preprocessing steps:
- Counts the total number of rows, columns, and elements in the dataset.
- Replaces specified placeholder values with `None` to standardize missing data representation.
- Counts and reports the number of null values in each column to identify data gaps.


In [3]:
# Basic shape of the dataset: rows, columns, and rows x columns.
row_count = df.count()          # runs a Spark job over the DataFrame
column_count = len(df.columns)  # read from the schema, no job needed
total_elements = row_count * column_count

print(f"Number of rows: {row_count}")
print(f"Number of columns: {column_count}")
print(f"Total number of elements: {total_elements}")

StatementMeta(, 14e4c907-a69c-4975-8144-8f89855dedd9, 5, Finished, Available, Finished)

Number of rows: 109
Number of columns: 38
Total number of elements: 4142


In [4]:
# Strings that are treated as "missing" and turned into nulls.
missing_markers = [" ", "?", "-", "_", "##"]

# Replace every value equal to one of the markers with None.
df = df.replace(missing_markers, None)

StatementMeta(, 14e4c907-a69c-4975-8144-8f89855dedd9, 6, Finished, Available, Finished)

In [5]:
from pyspark.sql.functions import col
from pyspark.sql import functions as F

# Count the null values of every column in one aggregation instead of running
# a separate filter().count() job per column. count() ignores nulls, and
# when() without otherwise() yields null for rows where the condition is
# false, so each expression counts exactly the rows where that column is null.
null_flags = [
    F.count(F.when(F.col(name).isNull(), 1)).alias(name)
    for name in df.columns
]

# Maps column name -> number of nulls, in column order.
# A DataFrame with no columns has nothing to aggregate.
null_counts = df.agg(*null_flags).first().asDict() if null_flags else {}

# Print the null counts for each column. The dictionary is only read here;
# the earlier version also wrote to it inside this loop, which overwrote every
# entry with the count of the last column.
for column, null_count in null_counts.items():
    print(f"Column {column} has {null_count} null values")

StatementMeta(, 14e4c907-a69c-4975-8144-8f89855dedd9, 7, Finished, Available, Finished)

Column Date has 0 null values
Column Month has 0 null values
Column Weekday has 0 null values
Column Season has 0 null values
Column Holiday has 0 null values
Column Marketing_Spend has 0 null values
Column Revenue has 0 null values
Column Room_Revenue has 0 null values
Column Occupancy_Rate has 0 null values
Column ADR has 0 null values
Column RevPAR has 0 null values
Column Available_Rooms has 0 null values
Column Reserved_Rooms has 0 null values
Column Booking_Channel has 0 null values
Column Guest_Type has 0 null values
Column Market_Segment has 0 null values
Column Guest_Country has 0 null values
Column Complaints has 0 null values
Column Compliment has 0 null values
Column Bookings has 0 null values
Column No_Shows has 0 null values
Column Cancellations has 0 null values
Column Checkouts has 0 null values
Column New_Bookings has 0 null values
Column Checkins has 0 null values
Column Average_Review_Score has 0 null values
Column Revenue_Managed_Guests has 0 null values
Column RevP

### Date Handling and Conversion Between PySpark and Pandas DataFrames
This section handles date-related operations and conversion between PySpark and Pandas:
- Converts the PySpark DataFrame to a Pandas DataFrame for date format manipulation.
- Converts the 'Date' column to a datetime format in Pandas and checks for any parsing issues.
- Reformats the 'Date' column in the PySpark DataFrame as `MMM-yyyy` text using the `date_format` function, which turns it into a string column.
- Converts the modified Pandas DataFrame back to a PySpark DataFrame for further processing.


In [8]:
from pyspark.sql.functions import col, dayofmonth, month, year, date_format

# Date handling is done in Spark. The disabled pandas variant of this step
# (toPandas() followed by pd.to_datetime with format '%d/%m/%Y') was kept only
# as a string literal and never executed, so it has been removed.

# Render "Date" as month-year text using the pattern "MMM-yyyy".
# NOTE(review): date_format returns a string column, while the
# hotelBook_silver table declares Date as DateType and the merge matches on
# Date — confirm the formatted text is really what should be stored there.
df = df.withColumn("Date", date_format(col("Date"), "MMM-yyyy"))
display(df.head(10))

StatementMeta(, 43a11794-6a32-48fd-9692-c63d56fa45f9, 10, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 12d75ec8-c34a-4e5e-a726-0b93a7b45f8c)

In [7]:
# `pdf` is not assigned by any executable statement in this notebook, so
# spark.createDataFrame(pdf) raised NameError on a fresh kernel. `df` is
# already a Spark DataFrame at this point, so no conversion is needed; this
# cell only previews the current state.
display(df.head(5))

StatementMeta(, 14e4c907-a69c-4975-8144-8f89855dedd9, 9, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 07e6f750-daa0-4a5c-94a0-9faff1f93dc5)

### Data Transformation: Converting Numeric Values to Descriptive Labels
This section transforms numerical data into more descriptive labels for better readability:
- Converts the `Month` column from integers to month names (e.g., 1 to "January").
- Transforms the `Weekday` column from numeric values to day names (e.g., 1 to "Monday").
- Converts binary values in the `Holiday` column to "Yes" or "No" labels to indicate whether a day is a holiday (1 becomes "Yes"; every other value, including missing ones, becomes "No").


In [8]:
from pyspark.sql.functions import when, col

# Month names in calendar order; position 0 corresponds to month number 1.
month_names = [
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December",
]

# Compare against the integer form of the Month column.
month_number = col("Month").cast("int")

# Build the when(...).when(...) chain one month at a time. There is no
# otherwise() branch, so any value outside 1-12 (or null) becomes null.
month_label = when(month_number == 1, month_names[0])
for number, name in enumerate(month_names[1:], start=2):
    month_label = month_label.when(month_number == number, name)

df = df.withColumn("Month", month_label)

display(df.head(5))

StatementMeta(, 14e4c907-a69c-4975-8144-8f89855dedd9, 10, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 4ec997bf-31e2-4085-9380-5dff4433894a)

In [9]:
# Day names in order; position 0 corresponds to weekday number 1 (Monday).
weekday_names = [
    "Monday", "Tuesday", "Wednesday", "Thursday",
    "Friday", "Saturday", "Sunday",
]

weekday_number = col("Weekday")

# Build the when(...) chain one day at a time.
weekday_label = when(weekday_number == 1, weekday_names[0])
for number, name in enumerate(weekday_names[1:], start=2):
    weekday_label = weekday_label.when(weekday_number == number, name)

# Anything that is not 1-7 (including null) is labelled "Unknown".
df = df.withColumn("Weekday", weekday_label.otherwise("Unknown"))
display(df.head(5))

StatementMeta(, 14e4c907-a69c-4975-8144-8f89855dedd9, 11, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 1f68d964-31d2-4fd6-9ec4-8c7429098d66)

In [10]:
# Turn the 0/1 holiday flag into "No"/"Yes" labels.
# The flag is compared in its string form, as before.
holiday_flag = col("Holiday").cast("string")

holiday_label = (
    when(holiday_flag == 1, "Yes")
    .when(holiday_flag == 0, "No")
    # Every remaining value, including null, is also labelled "No".
    .otherwise("No")
)

df = df.withColumn("Holiday", holiday_label)
display(df.head(5))

StatementMeta(, 14e4c907-a69c-4975-8144-8f89855dedd9, 12, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 94a34713-3486-4de9-a3ec-01b79d7b0cfe)

### Delta Table Creation and Upsert Operation
This section performs the following operations on a Delta table:
- Creates a Delta table named `hotelBook_silver` with a defined schema if it does not already exist.
- Performs an upsert operation using Delta Lake's merge functionality:
  - **Update** existing records that match on `Date`, `Month`, and `Weekday` (the update clause is currently empty, so matched rows are left unchanged).
  - **Insert** new records from the DataFrame if they do not already exist in the table.


In [11]:
from pyspark.sql.types import *
from delta.tables import *

# Columns of the hotelBook_silver Delta table as (name, type) pairs, listed in
# the order they are added to the table.
silver_columns = [
    ("Date", DateType()),
    ("Month", StringType()),
    ("Weekday", StringType()),
    ("Season", StringType()),
    ("Holiday", StringType()),
    ("Marketing_Spend", FloatType()),
    ("Revenue", FloatType()),
    ("Room_Revenue", FloatType()),
    ("Occupancy_Rate", FloatType()),
    ("ADR", FloatType()),
    ("RevPAR", FloatType()),
    ("Available_Rooms", IntegerType()),
    ("Reserved_Rooms", IntegerType()),
    ("Booking_Channel", StringType()),
    ("Guest_Type", StringType()),
    ("Market_Segment", FloatType()),
    ("Guest_Country", StringType()),
    ("Complaints", IntegerType()),
    ("Compliment", IntegerType()),
    ("Bookings", IntegerType()),
    ("No_Shows", IntegerType()),
    ("Cancellations", IntegerType()),
    ("Checkouts", IntegerType()),
    ("New_Bookings", IntegerType()),
    ("Checkins", IntegerType()),
    ("Average_Review_Score", FloatType()),
    ("Revenue_Managed_Guests", FloatType()),
    ("RevPAR_Managed_Guests", FloatType()),
    ("Occupancy_Managed_Guests", FloatType()),
    ("RevPAR_All", FloatType()),
    ("Occupancy_All", FloatType()),
    ("Room_Revenue_All", FloatType()),
    ("Total_Revenue", FloatType()),
    ("Operating_Expenses", FloatType()),
    ("Fixed_Costs", FloatType()),
    ("Variable_Costs", FloatType()),
    ("Total_Costs", FloatType()),
    ("Profit", FloatType()),
]

# Create the table only if it does not exist yet, adding the columns in order.
silver_builder = DeltaTable.createIfNotExists(spark).tableName("hotelBook_silver")
for column_name, column_type in silver_columns:
    silver_builder = silver_builder.addColumn(column_name, column_type)

# Left as the last expression so the resulting DeltaTable is shown as output.
silver_builder.execute()


StatementMeta(, 14e4c907-a69c-4975-8144-8f89855dedd9, 13, Finished, Available, Finished)

<delta.tables.DeltaTable at 0x7f26f3e9bdc0>

In [12]:
# Upsert the transformed DataFrame into the hotelBook_silver Delta table.
# A source row matches a table row when Date, Month and Weekday are all equal.
# Matched rows are left as they are (the update set is empty); unmatched
# source rows are inserted.

from delta.tables import *

# Look the table up by the same name it was created with instead of a
# hard-coded abfss:// path, which is tied to one workspace and lakehouse and
# breaks when the notebook is attached elsewhere.
deltaTable = DeltaTable.forName(spark, "hotelBook_silver")

dfUpdates = df

# Each source column is inserted into the table column of the same name:
# {"Date": "updates.Date", "Month": "updates.Month", ...}.
insert_values = {name: f"updates.{name}" for name in dfUpdates.columns}

(
    deltaTable.alias('silver')
    .merge(
        dfUpdates.alias('updates'),
        'silver.Date = updates.Date and silver.Month = updates.Month and silver.Weekday = updates.Weekday'
    )
    # Placeholder: no column assignments, so matched rows are not changed.
    .whenMatchedUpdate(set={})
    .whenNotMatchedInsert(values=insert_values)
    .execute()
)

StatementMeta(, 14e4c907-a69c-4975-8144-8f89855dedd9, 14, Finished, Available, Finished)