In [6]:
import psycopg2
import pandas as pd
import os
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

DB_HOST = os.getenv('DB_HOST')
DB_PORT = os.getenv('DB_PORT')
DB_NAME = os.getenv('DB_NAME')
DB_USER = os.getenv('DB_USER')
DB_PASSWORD = os.getenv('DB_PASSWORD')

# Connect to database
conn = psycopg2.connect(
    host=DB_HOST,
    port=DB_PORT,
    database=DB_NAME,
    user=DB_USER,
    password=DB_PASSWORD
)

# Helper function to run queries
def run_query(sql):
    return pd.read_sql(sql, conn)

print(f"Successfully connected to Hotel Operations database!")
print(f"Tables: branch, request, service")

Successfully connected to Hotel Operations database!
Tables: branch, request, service


# Practical Exam: Hotel Operations

LuxurStay Hotels is a major, international chain of hotels. They offer hotels for both business and leisure travellers in major cities across the world. The chain prides themselves on the level of customer service that they offer. 

However, the management has been receiving complaints about slow room service in some hotel branches. As these complaints are impacting the customer satisfaction rates, it has become a serious issue. Recent data shows that customer satisfaction has dropped from the 4.5 rating that they expect. 

You are working with the Head of Operations to identify possible causes and hotel branches with the worst problems. 

## Data

The following schema diagram shows the tables available. You have only been provided with data where customers provided a feedback rating.

![hotel_operations](hotel_operations.png)

# Task 1

Before you can start any analysis, you need to confirm that the data is accurate and reflects what you expect to see. 

It is known that there are some issues with the `branch` table, and the data team have provided the following data description. 

Write a query to return data matching this description, including identifying and cleaning all invalid values. You must match all column names and description criteria. Your output should be a DataFrame named 'clean_branch_data'.

| Column Name | Criteria                                                |
|-------------|---------------------------------------------------------|
|id | Nominal. The unique identifier of the hotel. </br>Missing values are not possible due to the database structure.|
| location | Nominal. The location of the particular hotel. One of four possible values, 'EMEA', 'NA', 'LATAM' and 'APAC'. </br>Missing values should be replaced with “Unknown”. |
| total_rooms | Discrete. The total number of rooms in the hotel. Must be a positive integer between 1 and 400. </br>Missing values should be replaced with the default number of rooms, 100. |
| staff_count | Discrete. The number of staff employeed in the hotel service department. </br>Missing values should be replaced with the total_rooms multiplied by 1.5. |
| opening_date | Discrete. The year in which the hotel opened. This can be any value between 2000 and 2023. </br>Missing values should be replaced with 2023. |
| target_guests | Nominal. The primary type of guest that is expected to use the hotel. Can be one of 'Leisure' or 'Business'. </br>Missing values should be replaced with 'Leisure'. |

In [7]:
# Task 1: Clean branch data according to specification
query = """
SELECT
    id,

    -- Normalize location (trim, uppercase for matching, output in proper case)
    CASE 
        WHEN UPPER(TRIM(location)) IN ('EMEA','NA','LATAM','APAC') 
            THEN UPPER(TRIM(location))
        ELSE 'Unknown'
    END AS location,

    -- Total rooms: ensure valid range or default to 100
    CASE 
        WHEN total_rooms BETWEEN 1 AND 400 THEN CAST(total_rooms AS INT)
        ELSE 100
    END AS total_rooms,

    -- Staff count: use value or default formula
    CASE 
        WHEN staff_count IS NOT NULL THEN CAST(staff_count AS INT)
        ELSE CAST(ROUND(
            (CASE 
                WHEN total_rooms BETWEEN 1 AND 400 THEN total_rooms
                ELSE 100
            END) * 1.5
        ) AS INT)
    END AS staff_count,

    -- Opening date: check numeric and valid range, else 2023
    CASE 
        WHEN opening_date ~ '^[0-9]+$'
             AND CAST(opening_date AS INT) BETWEEN 2000 AND 2023
            THEN CAST(opening_date AS INT)
        ELSE 2023
    END AS opening_date,

    -- Normalize target_guests
    CASE 
        WHEN UPPER(TRIM(target_guests)) = 'LEISURE' THEN 'Leisure'
        WHEN UPPER(TRIM(target_guests)) LIKE 'B%' THEN 'Business'
        ELSE 'Leisure'
    END AS target_guests

FROM branch;
"""

clean_branch_data = run_query(query)
print(f"Cleaned branch data: {len(clean_branch_data)} hotels")
display(clean_branch_data.head(10))

Cleaned branch data: 100 hotels


  return pd.read_sql(sql, conn)


Unnamed: 0,id,location,total_rooms,staff_count,opening_date,target_guests
0,1,LATAM,168,178,2017,Business
1,2,APAC,154,82,2010,Leisure
2,3,APAC,212,467,2003,Leisure
3,4,APAC,230,387,2023,Business
4,5,APAC,292,293,2002,Business
5,6,Unknown,260,590,2022,Leisure
6,7,EMEA,259,442,2018,Business
7,8,Unknown,259,285,2023,Business
8,9,Unknown,157,274,2001,Business
9,10,EMEA,205,138,2013,Leisure


# Task 2

The Head of Operations wants to know whether there is a difference in time taken to respond to a customer request in each hotel. They already know that different services take different lengths of time. 

Calculate the average and maximum duration for each branch and service. 
- Your output should be a DataFrame named 'average_time_service'
- It should include the columns `service_id`, `branch_id`, `avg_time_taken` and `max_time_taken`
- Values should be rounded to two decimal places where appropriate. 

In [3]:
# Task 2: Average and maximum duration for each branch and service
query = """
SELECT
    service_id,
    branch_id,
    ROUND(AVG(time_taken)::numeric, 2) AS avg_time_taken,
    MAX(time_taken) AS max_time_taken
	
FROM request
GROUP BY service_id, branch_id;
"""

average_time_service = run_query(query)
print(f"Branch-service time analysis: {len(average_time_service)} combinations")
display(average_time_service.head(10))

Branch-service time analysis: 385 combinations


  return pd.read_sql(sql, conn)


Unnamed: 0,service_id,branch_id,avg_time_taken,max_time_taken
0,2,46,13.09,16
1,4,99,9.13,13
2,1,8,2.56,10
3,2,13,13.53,17
4,1,46,2.08,4
5,3,15,6.73,7
6,2,35,13.17,16
7,1,1,2.44,12
8,3,13,6.8,8
9,1,57,2.29,5


# Task 3

The management team want to target improvements in `Meal` and `Laundry` service in Europe (`EMEA`) and Latin America (`LATAM`). 

Write a query to return the `description` of the service, the `id` and `location` of the branch, the id of the request as `request_id` and the `rating` for the services and locations of interest to the management team. 

Your output should be a DataFrame named 'target_hotels'.

Use the original branch table, not the output of task 1. 

In [4]:
# Task 3: Target hotels for Meal and Laundry service in EMEA and LATAM
query = """
SELECT
    s.description,
    b.id AS id,
    b.location,
    r.id AS request_id,
    r.rating
	
FROM request r
JOIN service s
  ON r.service_id = s.id
JOIN branch b
  ON r.branch_id = b.id
	
WHERE s.description IN ('Meal', 'Laundry')
  AND b.location IN ('EMEA', 'LATAM');
"""

target_hotels = run_query(query)
print(f"Target hotels for improvement: {len(target_hotels)} requests")
display(target_hotels.head(10))

Target hotels for improvement: 5047 requests


  return pd.read_sql(sql, conn)


Unnamed: 0,description,id,location,request_id,rating
0,Laundry,63,EMEA,3,4
1,Laundry,69,LATAM,6,5
2,Meal,44,EMEA,18,4
3,Laundry,57,LATAM,19,3
4,Meal,1,LATAM,21,4
5,Meal,26,LATAM,26,5
6,Laundry,34,EMEA,27,4
7,Laundry,60,LATAM,35,4
8,Meal,21,EMEA,37,4
9,Meal,1,LATAM,38,4


# Task 4

So that you can take a more detailed look at the lowest performing hotels, you want to get service and branch information where the average rating for the branch and service combination is lower than 4.5 - the target set by management.  

- Your output should be a DataFrame named 'average_rating'
- It should return the `service_id` and `branch_id`, and the average rating (`avg_rating`)
- Values should be rounded to 2 decimal places where appropriate.

In [8]:
# Task 4: Service-branch combinations with average rating below 4.5 target
query = """
SELECT
    service_id,
    branch_id,
    ROUND(AVG(rating)::numeric, 2) AS avg_rating
	
FROM request
GROUP BY service_id, branch_id
HAVING AVG(rating) < 4.5;
"""

average_rating = run_query(query)
print(f"Underperforming service-branch combinations: {len(average_rating)}")
display(average_rating)

Underperforming service-branch combinations: 215


  return pd.read_sql(sql, conn)


Unnamed: 0,service_id,branch_id,avg_rating
0,2,46,3.78
1,4,99,3.83
2,1,8,3.64
3,1,46,3.81
4,3,15,4.00
...,...,...,...
210,3,8,3.38
211,1,64,3.59
212,4,93,3.72
213,4,88,3.60
