<a href="https://colab.research.google.com/github/nickname8888/pyspark-prac/blob/main/Hotel_booking_basic_eda.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as f

In [2]:
spark = SparkSession.builder.appName("HotelEDA").getOrCreate()

df = spark.read.csv("hotel_bookings.csv", header=True, inferSchema=True)

In [27]:
# Display Schema
df.printSchema()

# Show Sample Data
df.show(5)

root
 |-- hotel: string (nullable = true)
 |-- is_canceled: integer (nullable = true)
 |-- lead_time: integer (nullable = true)
 |-- arrival_date_year: integer (nullable = true)
 |-- arrival_date_month: string (nullable = true)
 |-- arrival_date_week_number: integer (nullable = true)
 |-- arrival_date_day_of_month: integer (nullable = true)
 |-- stays_in_weekend_nights: integer (nullable = true)
 |-- stays_in_week_nights: integer (nullable = true)
 |-- adults: integer (nullable = true)
 |-- children: string (nullable = true)
 |-- babies: integer (nullable = true)
 |-- meal: string (nullable = true)
 |-- country: string (nullable = true)
 |-- market_segment: string (nullable = true)
 |-- distribution_channel: string (nullable = true)
 |-- is_repeated_guest: integer (nullable = true)
 |-- previous_cancellations: integer (nullable = true)
 |-- previous_bookings_not_canceled: integer (nullable = true)
 |-- reserved_room_type: string (nullable = true)
 |-- assigned_room_type: string (nullab

In [7]:
# get missing values

from pyspark.sql.functions import col

missing_values = {c: df.filter(col(c).isNull()).count() for c in df.columns}
print(missing_values)

{'hotel': 0, 'is_canceled': 0, 'lead_time': 0, 'arrival_date_year': 0, 'arrival_date_month': 0, 'arrival_date_week_number': 0, 'arrival_date_day_of_month': 0, 'stays_in_weekend_nights': 0, 'stays_in_week_nights': 0, 'adults': 0, 'children': 0, 'babies': 0, 'meal': 0, 'country': 0, 'market_segment': 0, 'distribution_channel': 0, 'is_repeated_guest': 0, 'previous_cancellations': 0, 'previous_bookings_not_canceled': 0, 'reserved_room_type': 0, 'assigned_room_type': 0, 'booking_changes': 0, 'deposit_type': 0, 'agent': 0, 'company': 0, 'days_in_waiting_list': 0, 'customer_type': 0, 'adr': 0, 'required_car_parking_spaces': 0, 'total_of_special_requests': 0, 'reservation_status': 0, 'reservation_status_date': 0}


In [8]:
missing_values = {k: v for k,v in missing_values.items() if v > 0}
print(missing_values)

{}


In [11]:
# corrected way to get missing values as null values are written separately as string NULL

missing_values = {c: df.filter((col(c).isNull()) | (col(c) == "") | (col(c) == "NA") | (col(c) == "NULL")).count() for c in df.columns}

# Show only columns with missing values
missing_values = {k: v for k, v in missing_values.items() if v > 0}

print(missing_values)


{'children': 4, 'country': 488, 'agent': 16340, 'company': 112593}


In [12]:
 df_cleaned = df.fillna({
    "children": 0,  # Replace NULLs with 0 (since children count should be numeric)
    "country": "Unknown",  # Replace missing country with 'Unknown'
    "agent": "Unknown",  # Replace missing agent ID with 'Unknown'
    "company": "Unknown"  # Replace missing company ID with 'Unknown'
})

In [13]:
df_cleaned.show(5)

+------------+-----------+---------+-----------------+------------------+------------------------+-------------------------+-----------------------+--------------------+------+--------+------+----+-------+--------------+--------------------+-----------------+----------------------+------------------------------+------------------+------------------+---------------+------------+-----+-------+--------------------+-------------+----+---------------------------+-------------------------+------------------+-----------------------+
|       hotel|is_canceled|lead_time|arrival_date_year|arrival_date_month|arrival_date_week_number|arrival_date_day_of_month|stays_in_weekend_nights|stays_in_week_nights|adults|children|babies|meal|country|market_segment|distribution_channel|is_repeated_guest|previous_cancellations|previous_bookings_not_canceled|reserved_room_type|assigned_room_type|booking_changes|deposit_type|agent|company|days_in_waiting_list|customer_type| adr|required_car_parking_spaces|total_

In [22]:
# calculate mean average daily rate

mean_adr = df_cleaned.select(f.mean(col("adr"))).collect()[0][0]
print("mean average daily rate is:",mean_adr)

mean average daily rate is: 101.83112153446218


## **Aggregations and other functions**

In [23]:
# Average Lead Time per Hotel Type

df_cleaned.groupBy("hotel").agg(f.avg("lead_time").alias("Avg_Lead_Time")).show()

+------------+------------------+
|       hotel|     Avg_Lead_Time|
+------------+------------------+
|  City Hotel|109.73572419009201|
|Resort Hotel| 92.67568647029456|
+------------+------------------+



In [24]:
# Total Cancellations per Hotel Type

df_cleaned.groupBy("hotel").agg(f.count(f.when(col("is_canceled") == 1, True)).alias("Total_Cancellations")).show()

+------------+-------------------+
|       hotel|Total_Cancellations|
+------------+-------------------+
|  City Hotel|              33102|
|Resort Hotel|              11122|
+------------+-------------------+



In [25]:
df_cleaned.select(f.max("adr").alias("Max_ADR"), f.min("adr").alias("Min_ADR")).show()

+-------+-------+
|Max_ADR|Min_ADR|
+-------+-------+
| 5400.0|  -6.38|
+-------+-------+



In [26]:
df_bookings = df_cleaned.select("reservation_status", "hotel", "agent", "adr")
df_agents = df_cleaned.select("agent", "company")

df_joined = df_bookings.join(df_agents, on="agent", how="left")
df_joined.show(5)

+-----+------------------+------------+---+-------+
|agent|reservation_status|       hotel|adr|company|
+-----+------------------+------------+---+-------+
| NULL|         Check-Out|Resort Hotel|0.0|    485|
| NULL|         Check-Out|Resort Hotel|0.0|   NULL|
| NULL|         Check-Out|Resort Hotel|0.0|   NULL|
| NULL|         Check-Out|Resort Hotel|0.0|   NULL|
| NULL|         Check-Out|Resort Hotel|0.0|     72|
+-----+------------------+------------+---+-------+
only showing top 5 rows



In [29]:
df_cleaned.filter(col("children") >= 3).show(5)

+------------+-----------+---------+-----------------+------------------+------------------------+-------------------------+-----------------------+--------------------+------+--------+------+----+-------+--------------+--------------------+-----------------+----------------------+------------------------------+------------------+------------------+---------------+------------+-----+-------+--------------------+-------------+------+---------------------------+-------------------------+------------------+-----------------------+
|       hotel|is_canceled|lead_time|arrival_date_year|arrival_date_month|arrival_date_week_number|arrival_date_day_of_month|stays_in_weekend_nights|stays_in_week_nights|adults|children|babies|meal|country|market_segment|distribution_channel|is_repeated_guest|previous_cancellations|previous_bookings_not_canceled|reserved_room_type|assigned_room_type|booking_changes|deposit_type|agent|company|days_in_waiting_list|customer_type|   adr|required_car_parking_spaces|to

In [30]:
# Total Nights Stayed per Hotel Type

df_cleaned.withColumn("total_nights", col("stays_in_weekend_nights") + col("stays_in_week_nights"))\
  .groupBy("hotel")\
  .agg(f.sum("total_nights").alias("Total_Nights_Stayed"))\
  .show()

+------------+-------------------+
|       hotel|Total_Nights_Stayed|
+------------+-------------------+
|  City Hotel|             236256|
|Resort Hotel|             173001|
+------------+-------------------+



In [31]:
# Percentage of Cancellations

total_bookings = df_cleaned.count()
canceled_bookings = df_cleaned.filter(col("is_canceled") == 1).count()

cancellation_rate = (canceled_bookings / total_bookings) * 100

print("The cancellation rate is:", cancellation_rate, "%")

The cancellation rate is: 37.041628277075134 %
