# 1. Build Master Database

This notebook:
1. Reads raw CSV/Parquet from `data/raw/`
2. Cleans and filters
3. Engineers core feature flags
4. Writes out `data/processed/master.parquet`
5. Quick sanity checks


In [21]:
## 0. Import Libraries

import pandas as pd
from ydata_profiling import ProfileReport
from datetime import datetime

pd.set_option('display.max_columns', 100)  
pd.set_option('display.width', None)       
pd.set_option('display.max_rows', 1000)

## 1. Extract
Read raw file into one DataFrame.


In [22]:
## 1. Read the CSV into a DataFrame with explicit dtypes
dtype_map = {
    'Context ID': str,
    'Booking ID': str,
    'Session ID': str,
    'Search Days Ahead': 'Int64',
    'Search Charge': 'float',
    'Search Charge Type': 'category',
    'Venue ID': str,
    'Venue Name': 'category',
    'Party Size': 'Int64',
    'Was Search Available': 'boolean',
    'Reservation Days Ahead': 'Int64',
    'Reservation Charge': 'float',
    'Reservation Charge Type': 'category',
    'Year': 'Int64',
    'Month': 'Int64',
    'Reservation Cost ($)': 'float',
    'Packages Cost ($)': 'float',
    'Add Ons Cost ($)': 'float',
    'Promo Code Discount ($)': 'float',
    'Total Cost ($)': 'float',
    'Deposit Amount': 'float',
}
df = pd.read_csv(
    '../data/raw/Clays_data.csv',
    dtype=dtype_map,
    encoding='latin1',
    low_memory=False
)

# 2. Safety copy
df.to_parquet('../data/raw/full_raw.parquet', index=False)

# 3. Standardize date/time columns
df['Search At'] = pd.to_datetime(df['Search At'], dayfirst=True, errors='coerce')
df['Search Date'] = pd.to_datetime(df['Search Date'], dayfirst=True, errors='coerce')
df['Reservation Date'] = pd.to_datetime(df['Reservation Date'], dayfirst=True, errors='coerce')
df['Reservation Datetime'] = pd.to_datetime(df['Reservation Datetime'], dayfirst=True, errors='coerce')

# 4. Drop rows without Context ID (must be a bug)
df = df.dropna(subset=['Context ID'])

# 5. Save cleaned DataFrame
df.to_parquet('../data/processed/full_cleaned.parquet', index=False)

df.head()

  df['Search Date'] = pd.to_datetime(df['Search Date'], dayfirst=True, errors='coerce')
  df['Reservation Date'] = pd.to_datetime(df['Reservation Date'], dayfirst=True, errors='coerce')


Unnamed: 0,Context ID,Booking ID,Session ID,Search At,Search Date,Search Time,Search Time Iso,Search Days Ahead,Search Charge,Search Charge Type,Venue ID,Venue Name,Party Size,Source Path,Occasion ID,Was Search Available,Reservation Date,Reservation Time,Reservation Time Iso,Reservation Datetime,Reservation Days Ahead,Reservation Charge,Reservation Charge Type,Game Area,Was Package Required,Packages Available,Packages,Add Ons Available,Add Ons,Time Extension Available,Time Extension Selected,Personal Info Completed,Personal Info Completed At,Marketing Opt In,Reservation Notes,Promo Code,Promo Code Applied,Billing Info Completed,Customer ID,Booking Status,Reservation ID,Reservation Reference Code,Reservation Tags,Reservation Cost ($),Packages Cost ($),Add Ons Cost ($),Promo Code Discount ($),Total Cost ($),Deposit Amount,Year,Month
0,202406010624Q11YGA,202406010624Q11YGA,202406010624Q11YGA,2024-01-06 06:24:00,2024-07-13,63900000000000,17:45:00,42,14.0,person,ahNzfnNldmVucm9vbXMtc2VjdXJlchwLEg9uaWdodGxvb3...,Clays Canary Wharf,6,/,,False,2024-07-13,63900000000000.0,17:45:00,NaT,42,14.0,person,peg,True,"[""""]","[""BR""]","[""""]","[""""]",False,False,False,,False,,,False,True,202408121524FJ7OL0,reservation_success,ahNzfnNldmVucm9vbXMtc2VjdXJlciULEhtuaWdodGxvb3...,45A3LK4B,"[""""]",84.0,72.0,0.0,0.0,156.0,156.0,2024,6
1,202406010714KXIEZJ,202406010714KXIEZJ,202406010714KXIEZJ,2024-01-06 07:14:00,2024-06-01,41400000000000,11:30:00,0,10.0,person,ahNzfnNldmVucm9vbXMtc2VjdXJlchwLEg9uaWdodGxvb3...,"Clays, The City",2,/,,False,2024-06-01,41400000000000.0,11:30:00,NaT,0,10.0,person,peg,False,"[""""]","[""""]","[""""]","[""""]",False,False,False,,False,,,False,True,202408121524TPE9S4,reservation_success,ahNzfnNldmVucm9vbXMtc2VjdXJlciULEhtuaWdodGxvb3...,64UKL232,"[""dietary#vegetarian""]",20.0,0.0,0.0,0.0,20.0,20.0,2024,6
2,202406010726X2ZGX5,202406010726X2ZGX5,202406010726X2ZGX5,2024-01-06 07:26:00,2024-06-01,42300000000000,11:45:00,0,10.0,person,ahNzfnNldmVucm9vbXMtc2VjdXJlchwLEg9uaWdodGxvb3...,"Clays, The City",2,/,,False,2024-06-01,42300000000000.0,11:45:00,NaT,0,10.0,person,peg,False,"[""""]","[""""]","[""""]","[""""]",False,False,False,,False,,,False,True,202407251452J0VHMF,reservation_success,ahNzfnNldmVucm9vbXMtc2VjdXJlciULEhtuaWdodGxvb3...,64UL3834,"[""""]",20.0,0.0,0.0,0.0,20.0,20.0,2024,6
3,202406010755ACH0QY,202406010755ACH0QY,202406010755ACH0QY,2024-01-06 07:55:00,2024-06-05,70200000000000,19:30:00,4,14.0,person,ahNzfnNldmVucm9vbXMtc2VjdXJlchwLEg9uaWdodGxvb3...,Clays Canary Wharf,11,/,,False,2024-06-05,70200000000000.0,19:30:00,NaT,4,14.0,person,peg,False,"[""""]","[""""]","[""""]","[""""]",False,False,False,,True,,,False,True,202408121524R0J02T,reservation_success,ahNzfnNldmVucm9vbXMtc2VjdXJlciULEhtuaWdodGxvb3...,45A998P8,"[""""]",154.0,0.0,0.0,0.0,154.0,154.0,2024,6
4,202406010815SQJ15X,202406010815SQJ15X,202406010815SQJ15X,2024-01-06 08:15:00,2024-06-02,54900000000000,15:15:00,1,12.0,person,ahNzfnNldmVucm9vbXMtc2VjdXJlchwLEg9uaWdodGxvb3...,"Clays, The City",2,/,,False,2024-06-02,54900000000000.0,15:15:00,NaT,1,12.0,person,peg,False,"[""""]","[""BR""]","[""""]","[""""]",False,False,False,,False,,,False,True,202408121524OKHO5A,reservation_success,ahNzfnNldmVucm9vbXMtc2VjdXJlciULEhtuaWdodGxvb3...,64UP3B4A,"[""""]",24.0,24.0,0.0,0.0,48.0,48.0,2024,6


## 2. Raw Cleaning & Filtering
Business-rule filters:
- Drop `party_size > 20`
- Remove negative money & days ahead
- Cap `Search Days Ahead` at 99th percentile or 180 days


In [23]:
## 2. Raw Cleaning & Filtering

df = pd.read_parquet("../data/processed/full_cleaned.parquet")

# 2a. Drop red columns
red_columns = [
    'Reservation Time', 'Reservation Datetime', 'Reservation Charge Type', 'Game Area',
    'Marketing Opt In', 'Reservation Notes', 'Reservation ID', 'Reservation Tag',
    'Year', 'Month'
]
df = df.drop(columns=[col for col in red_columns if col in df.columns])
print(f"After dropping red columns: {df.shape[1]} columns remaining")

# 2b. Filter party size & days ahead
df = df[(df['Party Size']>0)&(df['Party Size']<=20)]
df = df[df['Search Days Ahead']>=0]
cap_days = min(180, df['Search Days Ahead'].quantile(0.99))
df['Search Days Ahead'] = df['Search Days Ahead'].clip(upper=cap_days)

# 2c. Persist filtered data
df.to_parquet("../data/processed/mid_step_filtered.parquet", index=False)
print(f"Rows after filtering: {len(df)}")


After dropping red columns: 42 columns remaining
Rows after filtering: 431484


In [24]:
# df = pd.read_parquet("../data/processed/full_cleaned.parquet")

# print(f"Loaded data with {df.shape[0]} rows and {df.shape[1]} columns")
# df.head(3)

# ## 1. Drop red columns
# red_columns = [
#     'Reservation Time', 'Reservation Datetime', 'Reservation Charge Type', 'Game Area',
#     'Marketing Opt In', 'Reservation Notes', 'Reservation ID', 'Reservation Tag',
#     'Year', 'Month'
# ]

# # Only drop columns that exist in the dataframe
# df = df.drop(columns=[col for col in red_columns if col in df.columns], errors='ignore')
# print(f"After dropping red columns: {df.shape[1]} columns remaining")

# ## 2. Split "Search At" (format: 2024-06-01T06:24:00) into "Search At Date" and "Search At Time"
# if 'Search At' in df.columns:
#     # Convert to datetime
#     df['Search At'] = pd.to_datetime(df['Search At'])
    
#     # Extract date and time
#     df['Search At Date'] = df['Search At'].dt.date
#     df['Search At Time'] = df['Search At'].dt.strftime('%H:%M:%S')
    
#     # Drop original column
#     df = df.drop(columns=['Search At'])
#     print("Split 'Search At' into date and time columns")

# ## 3. Change column names "Search Date/Time" to "Search At Date/Time"
# column_renames = {
#     'Search Date': 'Search For Date',
#     'Search Time Iso': 'Search For Time'
# }
# df = df.rename(columns=column_renames)
# print("Renamed Search Date/Time columns")

# ## 4. Change column name "Search Charge" to "Search Price Per Person"
# if 'Search Charge' in df.columns:
#     df = df.rename(columns={'Search Charge': 'Search Price Per Person'})
#     print("Renamed 'Search Charge' to 'Search Price Per Person'")

# ## 5. Delete rows with negative "Search Days Ahead"
# if 'Search Days Ahead' in df.columns:
#     before_count = len(df)
#     df = df[df['Search Days Ahead'] >= 0]
#     after_count = len(df)
#     print(f"Removed {before_count - after_count} rows with negative Search Days Ahead")


# ## 6. Clean milliseconds from date (just hour,minute,second)
# time_columns = [col for col in df.columns if 'Time' in col and 'Days' not in col]
# for col in time_columns:
#     if col in df.columns and pd.api.types.is_datetime64_any_dtype(df[col]):
#         df[col] = df[col].dt.strftime('%H:%M:%S')
#         print(f"Cleaned milliseconds from {col}")





# ## 9. Change from $ to £ in column names


# ## Display the transformed dataframe
# df.head()


# # Drop invalid party sizes: <= 0 or > 20
# mask_party = (df['Party Size'] > 0) & (df['Party Size'] <= 20)

# # Remove negative days ahead and cap at 99th percentile (or 180 days)
# # First drop negative values
# df = df[ df['Search Days Ahead'] >= 0 ]

# # Compute 99th percentile
# pct_99 = df['Search Days Ahead'].quantile(0.99)
# # Use guardrail of 180 days or computed pct, whichever is smaller
# cap_days = min(180, pct_99)
# #    Cap values
# df['Search Days Ahead'] = df['Search Days Ahead'].clip(upper=cap_days)

# # Drop rows outside party-size mask
# df = df[ mask_party ]


# # Reset index
# df = df.reset_index(drop=True)

# # Write out filtered dataset
# df.to_parquet("../data/processed/mid_step_filtered.parquet", index=False)

# print(f"Business-rule filtering complete. Rows now: {len(df)}")

df.head()

Unnamed: 0,Context ID,Booking ID,Session ID,Search At,Search Date,Search Time,Search Time Iso,Search Days Ahead,Search Charge,Search Charge Type,Venue ID,Venue Name,Party Size,Source Path,Occasion ID,Was Search Available,Reservation Date,Reservation Time Iso,Reservation Days Ahead,Reservation Charge,Was Package Required,Packages Available,Packages,Add Ons Available,Add Ons,Time Extension Available,Time Extension Selected,Personal Info Completed,Personal Info Completed At,Promo Code,Promo Code Applied,Billing Info Completed,Customer ID,Booking Status,Reservation Reference Code,Reservation Tags,Reservation Cost ($),Packages Cost ($),Add Ons Cost ($),Promo Code Discount ($),Total Cost ($),Deposit Amount
0,202406010624Q11YGA,202406010624Q11YGA,202406010624Q11YGA,2024-01-06 06:24:00,2024-07-13,63900000000000,17:45:00,42,14.0,person,ahNzfnNldmVucm9vbXMtc2VjdXJlchwLEg9uaWdodGxvb3...,Clays Canary Wharf,6,/,,False,2024-07-13,17:45:00,42,14.0,True,"[""""]","[""BR""]","[""""]","[""""]",False,False,False,,,False,True,202408121524FJ7OL0,reservation_success,45A3LK4B,"[""""]",84.0,72.0,0.0,0.0,156.0,156.0
1,202406010714KXIEZJ,202406010714KXIEZJ,202406010714KXIEZJ,2024-01-06 07:14:00,2024-06-01,41400000000000,11:30:00,0,10.0,person,ahNzfnNldmVucm9vbXMtc2VjdXJlchwLEg9uaWdodGxvb3...,"Clays, The City",2,/,,False,2024-06-01,11:30:00,0,10.0,False,"[""""]","[""""]","[""""]","[""""]",False,False,False,,,False,True,202408121524TPE9S4,reservation_success,64UKL232,"[""dietary#vegetarian""]",20.0,0.0,0.0,0.0,20.0,20.0
2,202406010726X2ZGX5,202406010726X2ZGX5,202406010726X2ZGX5,2024-01-06 07:26:00,2024-06-01,42300000000000,11:45:00,0,10.0,person,ahNzfnNldmVucm9vbXMtc2VjdXJlchwLEg9uaWdodGxvb3...,"Clays, The City",2,/,,False,2024-06-01,11:45:00,0,10.0,False,"[""""]","[""""]","[""""]","[""""]",False,False,False,,,False,True,202407251452J0VHMF,reservation_success,64UL3834,"[""""]",20.0,0.0,0.0,0.0,20.0,20.0
3,202406010755ACH0QY,202406010755ACH0QY,202406010755ACH0QY,2024-01-06 07:55:00,2024-06-05,70200000000000,19:30:00,4,14.0,person,ahNzfnNldmVucm9vbXMtc2VjdXJlchwLEg9uaWdodGxvb3...,Clays Canary Wharf,11,/,,False,2024-06-05,19:30:00,4,14.0,False,"[""""]","[""""]","[""""]","[""""]",False,False,False,,,False,True,202408121524R0J02T,reservation_success,45A998P8,"[""""]",154.0,0.0,0.0,0.0,154.0,154.0
4,202406010815SQJ15X,202406010815SQJ15X,202406010815SQJ15X,2024-01-06 08:15:00,2024-06-02,54900000000000,15:15:00,1,12.0,person,ahNzfnNldmVucm9vbXMtc2VjdXJlchwLEg9uaWdodGxvb3...,"Clays, The City",2,/,,False,2024-06-02,15:15:00,1,12.0,False,"[""""]","[""BR""]","[""""]","[""""]",False,False,False,,,False,True,202408121524OKHO5A,reservation_success,64UP3B4A,"[""""]",24.0,24.0,0.0,0.0,48.0,48.0


## 3. Slot-Level Identifiers & Time Features

In [25]:
## 3. Slot‐Level Identifiers & Time Features

# 3a. Load filtered data
df = pd.read_parquet("../data/processed/mid_step_filtered.parquet")


## 3b. Convert boolean columns to 0/1
boolean_columns = [
    'Was Search Available',
    'Time Extension Available',
    'Time Extension Selected',
    'Personal Info Completed',
    'Personal Info Completed At',
    'Promo Code',
    'Promo Code Applied',
    'Billing Info Completed'
]

for col in boolean_columns:
    if col in df.columns:
        df[col] = df[col].apply(
            lambda x: 1 if pd.notnull(x) and str(x).lower() in ['yes', 'true', 't', 'y', '1'] else 0
        )
        print(f"Converted {col} to binary (0/1)")
        
        
# 3c. Canonical timestamps
df['search_datetime']      = pd.to_datetime(df['Search At'], errors='coerce')
df['reservation_datetime'] = pd.to_datetime(df['Reservation Date'], errors='coerce')


# 3d. Change column names "Search Date/Time" to "Search At Date/Time"
df['search_date_for'] = pd.to_datetime(df['Search Date'], dayfirst=True, errors='coerce').dt.date
df['search_hour_for'] = pd.to_datetime(df['Search Time Iso'],   dayfirst=True, errors='coerce').dt.hour

assert df['search_date_for'].notnull().all(), "Some booking-dates missing!"
print("✔ ‘at’ & ‘for’ columns created")


# 3e. Replace dollars with pounds
df.columns = [col.replace('($)', '(£)') for col in df.columns]
print("Changed $ to £ in column names")


# 3f. Create column "Process Time Completion" = "Personal Info Completed At" - "Search At Time"
if all(col in df.columns for col in ['Personal Info Completed At', 'Search At Time']):
    # Parse the timestamps
    completed_at = pd.to_datetime(df['Personal Info Completed At'], errors='coerce')
    
    # Get today's date as reference for times
    ref_date = datetime.now().date()
    
    # Convert times to datetime using reference date
    search_time_str = df['Search At Time'].astype(str)
    search_time = pd.to_datetime(ref_date.strftime('%Y-%m-%d') + ' ' + search_time_str, errors='coerce')
    
    # Calculate difference in minutes
    df['Process Time Completion'] = (completed_at - search_time).dt.total_seconds() / 60
    print("Created Process Time Completion column")
    
    
    
# 3g. Derive day/hour/lead_time
df['search_date_at'] = df['search_datetime'].dt.date
df['search_hour_at'] = df['search_datetime'].dt.hour
df['lead_time']   = (df['reservation_datetime'] - df['search_datetime']).dt.days
df['day_of_week'] = df['search_datetime'].dt.dayofweek
df['is_weekend']  = df['day_of_week'].isin([5,6]).astype(int)
df['month']       = df['search_datetime'].dt.month

# assert (df['lead_time']<0).all(), "Negative lead_time found!"
print("✔ Time features added")


Converted Was Search Available to binary (0/1)
Converted Time Extension Available to binary (0/1)
Converted Time Extension Selected to binary (0/1)
Converted Personal Info Completed to binary (0/1)
Converted Personal Info Completed At to binary (0/1)
Converted Promo Code to binary (0/1)
Converted Promo Code Applied to binary (0/1)
Converted Billing Info Completed to binary (0/1)
✔ ‘at’ & ‘for’ columns created
Changed $ to £ in column names
✔ Time features added


  df['search_hour_for'] = pd.to_datetime(df['Search Time Iso'],   dayfirst=True, errors='coerce').dt.hour


## 4. Booking Indicator & Rates

In [26]:
## 4. Booking Indicator & Rates

df['was_booked'] = df['Booking ID'].notnull().astype(int)

slot_agg = (
  df.groupby(['Venue ID','search_date_for','search_hour_for'], as_index=False)
    .agg(
      n_searches=('was_booked','size'),
      n_bookings=('was_booked','sum')
    )
)
slot_agg['booking_rate'] = slot_agg['n_bookings'] / slot_agg['n_searches']

print(f"✔ Aggregated into {len(slot_agg)} slots")


✔ Aggregated into 12725 slots


## 5. Price & Availability Features

In [27]:
## 5. Price & Availability Features

# rename raw columns
df = df.rename(columns={
  'Search Charge':'price_shown',
  'Was Search Available':'avail_flag'
})

price_avail = (
  df.groupby(['Venue ID','search_date_for','search_hour_for'], as_index=False)
    .agg(
      avg_price=('price_shown','mean'),
      pct_avail=('avail_flag','mean')
    )
)

print("✔ Price & availability computed")


✔ Price & availability computed


## 6. Capacity & Price Bounds

In [29]:
## 6. Capacity & Price Bounds

# 6a. Manual capacity lookup per venue
venue_caps = {
    "City": 12,           # 12 pegs
    "Canary Wharf": 8,    #  8 pegs
    "Birmingham": 4       #  4 pegs; arcade seats can be handled separately later
}

# expand to every hour (assuming same capacity all day)
venue_cap = pd.DataFrame([
    {"Venue ID": vid, "search_hour_for": h, "capacity": cap}
    for vid, cap in venue_caps.items()
    for h in range(24)
])

# 6b. Fallback price bounds from historical data
# here we just take the global min/max shown price per venue
price_bounds = (
    df
    .groupby("Venue ID")["price_shown"]
    .agg(min_price="min", max_price="max")
    .reset_index()
)

# 6c. Merge into slot_agg
slot_level = (
    slot_agg
      .merge(price_avail,   on=["Venue ID","search_date_for","search_hour_for"], how="left")
      .merge(venue_cap,     on=["Venue ID","search_hour_for"],             how="left")
      .merge(price_bounds,  on="Venue ID",                             how="left")
)

# Sanity checks
# assert slot_level["capacity"].notnull().all(), "Some capacities missing!"
print("✔ Capacity & fallback price bounds merged")


✔ Capacity & fallback price bounds merged


## 7. Holiday & Event Flags

Maybe do it later when Tom replies to the email

In [30]:
# # 7. Holiday & Special-Event Flags
# # We can pull from a calendar API or maintain a static CSV

# holidays = pd.read_csv('data/static/holidays.csv')  # columns: date, is_holiday
# events   = pd.read_csv('data/static/events.csv')    # columns: date, event_flag

# slot_level = (
#     slot_level
#     .merge(holidays, left_on='search_date', right_on='date', how='left')
#     .merge(events,   left_on='search_date', right_on='date', how='left')
# )
# slot_level['is_holiday']  = slot_level['is_holiday'].fillna(0).astype(int)
# slot_level['event_flag']  = slot_level['event_flag'].fillna(0).astype(int)
# slot_level = slot_level.drop(columns=['date_x','date_y'])
# print("✔ Holiday & event flags added.")


## 8. Write out Master & Slot-Level Artifacts

In [33]:
# 8. Write out artifacts for downstream
# 8a. Master = every raw search + all new columns
df.to_parquet('../data/processed/master.parquet',       index=False)

# 8b. Slot-level summary for modeling
slot_level.to_parquet('../data/processed/slot_level.parquet', index=False)

print("✅ master.parquet and slot_level.parquet written!")


✅ master.parquet and slot_level.parquet written!


## 9. Quick QA
Ensure everything looks good.


In [34]:
# 9. Load
df2 = pd.read_parquet('../data/processed/master.parquet')
print(df2.info())
df2.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 431484 entries, 0 to 431483
Data columns (total 53 columns):
 #   Column                      Non-Null Count   Dtype         
---  ------                      --------------   -----         
 0   Context ID                  431484 non-null  object        
 1   Booking ID                  236202 non-null  object        
 2   Session ID                  431484 non-null  object        
 3   Search At                   21137 non-null   datetime64[ns]
 4   Search Date                 431484 non-null  datetime64[ns]
 5   Search Time                 431484 non-null  int64         
 6   Search Time Iso             431484 non-null  object        
 7   Search Days Ahead           431484 non-null  Int64         
 8   price_shown                 431484 non-null  float64       
 9   Search Charge Type          431484 non-null  category      
 10  Venue ID                    431484 non-null  object        
 11  Venue Name                  431484 non-

Unnamed: 0,Context ID,Booking ID,Session ID,Search At,Search Date,Search Time,Search Time Iso,Search Days Ahead,price_shown,Search Charge Type,Venue ID,Venue Name,Party Size,Source Path,Occasion ID,avail_flag,Reservation Date,Reservation Time Iso,Reservation Days Ahead,Reservation Charge,Was Package Required,Packages Available,Packages,Add Ons Available,Add Ons,Time Extension Available,Time Extension Selected,Personal Info Completed,Personal Info Completed At,Promo Code,Promo Code Applied,Billing Info Completed,Customer ID,Booking Status,Reservation Reference Code,Reservation Tags,Reservation Cost (£),Packages Cost (£),Add Ons Cost (£),Promo Code Discount (£),Total Cost (£),Deposit Amount,search_datetime,reservation_datetime,search_date_for,search_hour_for,search_date_at,search_hour_at,lead_time,day_of_week,is_weekend,month,was_booked
0,202406010624Q11YGA,202406010624Q11YGA,202406010624Q11YGA,2024-01-06 06:24:00,2024-07-13,63900000000000,17:45:00,42,14.0,person,ahNzfnNldmVucm9vbXMtc2VjdXJlchwLEg9uaWdodGxvb3...,Clays Canary Wharf,6,/,,0,2024-07-13,17:45:00,42,14.0,True,"[""""]","[""BR""]","[""""]","[""""]",0,0,0,0,0,0,1,202408121524FJ7OL0,reservation_success,45A3LK4B,"[""""]",84.0,72.0,0.0,0.0,156.0,156.0,2024-01-06 06:24:00,2024-07-13,2024-07-13,17,2024-01-06,6.0,188.0,5.0,1,1.0,1
1,202406010714KXIEZJ,202406010714KXIEZJ,202406010714KXIEZJ,2024-01-06 07:14:00,2024-06-01,41400000000000,11:30:00,0,10.0,person,ahNzfnNldmVucm9vbXMtc2VjdXJlchwLEg9uaWdodGxvb3...,"Clays, The City",2,/,,0,2024-06-01,11:30:00,0,10.0,False,"[""""]","[""""]","[""""]","[""""]",0,0,0,0,0,0,1,202408121524TPE9S4,reservation_success,64UKL232,"[""dietary#vegetarian""]",20.0,0.0,0.0,0.0,20.0,20.0,2024-01-06 07:14:00,2024-06-01,2024-06-01,11,2024-01-06,7.0,146.0,5.0,1,1.0,1
2,202406010726X2ZGX5,202406010726X2ZGX5,202406010726X2ZGX5,2024-01-06 07:26:00,2024-06-01,42300000000000,11:45:00,0,10.0,person,ahNzfnNldmVucm9vbXMtc2VjdXJlchwLEg9uaWdodGxvb3...,"Clays, The City",2,/,,0,2024-06-01,11:45:00,0,10.0,False,"[""""]","[""""]","[""""]","[""""]",0,0,0,0,0,0,1,202407251452J0VHMF,reservation_success,64UL3834,"[""""]",20.0,0.0,0.0,0.0,20.0,20.0,2024-01-06 07:26:00,2024-06-01,2024-06-01,11,2024-01-06,7.0,146.0,5.0,1,1.0,1
3,202406010755ACH0QY,202406010755ACH0QY,202406010755ACH0QY,2024-01-06 07:55:00,2024-06-05,70200000000000,19:30:00,4,14.0,person,ahNzfnNldmVucm9vbXMtc2VjdXJlchwLEg9uaWdodGxvb3...,Clays Canary Wharf,11,/,,0,2024-06-05,19:30:00,4,14.0,False,"[""""]","[""""]","[""""]","[""""]",0,0,0,0,0,0,1,202408121524R0J02T,reservation_success,45A998P8,"[""""]",154.0,0.0,0.0,0.0,154.0,154.0,2024-01-06 07:55:00,2024-06-05,2024-06-05,19,2024-01-06,7.0,150.0,5.0,1,1.0,1
4,202406010815SQJ15X,202406010815SQJ15X,202406010815SQJ15X,2024-01-06 08:15:00,2024-06-02,54900000000000,15:15:00,1,12.0,person,ahNzfnNldmVucm9vbXMtc2VjdXJlchwLEg9uaWdodGxvb3...,"Clays, The City",2,/,,0,2024-06-02,15:15:00,1,12.0,False,"[""""]","[""BR""]","[""""]","[""""]",0,0,0,0,0,0,1,202408121524OKHO5A,reservation_success,64UP3B4A,"[""""]",24.0,24.0,0.0,0.0,48.0,48.0,2024-01-06 08:15:00,2024-06-02,2024-06-02,15,2024-01-06,8.0,147.0,5.0,1,1.0,1
5,202406010829VIO2RJ,202406010829VIO2RJ,202406010829VIO2RJ,2024-01-06 08:29:00,2024-07-29,61200000000000,17:00:00,58,10.0,person,ahNzfnNldmVucm9vbXMtc2VjdXJlchwLEg9uaWdodGxvb3...,"Clays, The City",2,/,,0,2024-07-29,17:00:00,58,10.0,False,"[""""]","[""BR""]","[""""]","[""time_extension""]",0,1,0,0,0,0,1,202408121524TPWZDV,payment_inititalised,,"[""""]",20.0,24.0,6.0,0.0,50.0,50.0,2024-01-06 08:29:00,2024-07-29,2024-07-29,17,2024-01-06,8.0,204.0,5.0,1,1.0,1
6,202406010833Z6G44X,202406010833Z6G44X,202406010833Z6G44X,2024-01-06 08:33:00,2024-07-29,61200000000000,17:00:00,58,10.0,person,ahNzfnNldmVucm9vbXMtc2VjdXJlchwLEg9uaWdodGxvb3...,"Clays, The City",2,/,,0,2024-07-29,17:00:00,58,10.0,False,"[""""]","[""BR""]","[""""]","[""time_extension""]",0,1,0,0,0,0,1,202408121524TPWZDV,reservation_success,64UR6HNW,"[""""]",20.0,24.0,6.0,0.0,50.0,50.0,2024-01-06 08:33:00,2024-07-29,2024-07-29,17,2024-01-06,8.0,204.0,5.0,1,1.0,1
7,202406010952QPUHPJ,202406010952QPUHPJ,202406010952QPUHPJ,2024-01-06 09:52:00,2024-06-01,79200000000000,22:00:00,0,10.0,person,ahNzfnNldmVucm9vbXMtc2VjdXJlchwLEg9uaWdodGxvb3...,Clays Canary Wharf,2,/,,0,2024-06-01,22:00:00,0,10.0,False,"[""""]","[""BR""]","[""""]","[""time_extension""]",0,1,0,0,0,0,1,202408121524PTAFR7,reservation_success,45AGWF4P,"[""""]",20.0,24.0,6.0,0.0,50.0,50.0,2024-01-06 09:52:00,2024-06-01,2024-06-01,22,2024-01-06,9.0,146.0,5.0,1,1.0,1
8,202406011013AQL7NO,202406011013AQL7NO,202406011013AQL7NO,2024-01-06 10:13:00,2024-06-02,66600000000000,18:30:00,1,12.0,person,ahNzfnNldmVucm9vbXMtc2VjdXJlchwLEg9uaWdodGxvb3...,"Clays, The City",4,/,,0,2024-06-02,18:30:00,1,12.0,False,"[""""]","[""""]","[""""]","[""""]",0,0,0,0,0,0,1,202408121524YAFG8E,payment_inititalised,,"[""""]",48.0,0.0,0.0,0.0,48.0,48.0,2024-01-06 10:13:00,2024-06-02,2024-06-02,18,2024-01-06,10.0,147.0,5.0,1,1.0,1
9,202406011013NLYIDG,202406011013NLYIDG,202406011013NLYIDG,2024-01-06 10:13:00,2024-06-01,78300000000000,21:45:00,0,10.0,person,ahNzfnNldmVucm9vbXMtc2VjdXJlchwLEg9uaWdodGxvb3...,"Clays, The City",3,/,,0,2024-06-01,21:45:00,0,10.0,False,"[""""]","[""NP""]","[""""]","[""""]",0,0,0,0,0,0,1,202407251442T58AJ2,payment_inititalised,,"[""allergen#allergy""]",30.0,90.0,0.0,0.0,120.0,120.0,2024-01-06 10:13:00,2024-06-01,2024-06-01,21,2024-01-06,10.0,146.0,5.0,1,1.0,1


## 10. Data Quality Report
Double check how the data is looking like before moving forward.


In [35]:
# 10. Export HTML summary as data_quality.html

def generate_data_quality_report(
    input_path: str = "../data/processed/master.parquet",
    output_path: str = "../outputs/data_quality.html"
):
    # 1. Load your feature-augmented data
    df = pd.read_parquet(input_path)
    
    # 2. Create a profiling report
    profile = ProfileReport(
        df,
        title="Clays Data Quality Report",
        explorative=True,
        minimal=False  # set True for a slimmer report
    )
    
    # 3. Export to HTML
    profile.to_file(output_path)
    print(f"✅ Data-quality report written to {output_path}")

if __name__ == "__main__":
    generate_data_quality_report()

100%|██████████| 53/53 [00:15<00:00,  3.48it/s]7<00:00,  5.04it/s, Describe variable: was_booked]                
  discretized_df.loc[:, column] = self._discretize_column(
  discretized_df.loc[:, column] = self._discretize_column(
  discretized_df.loc[:, column] = self._discretize_column(
Summarize dataset: 100%|██████████| 352/352 [01:18<00:00,  4.49it/s, Completed]                                               
Generate report structure: 100%|██████████| 1/1 [00:11<00:00, 11.36s/it]
Render HTML: 100%|██████████| 1/1 [00:05<00:00,  5.15s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 14.48it/s]


✅ Data-quality report written to ../outputs/data_quality.html
