# 1. Build Master Database

This notebook:
1. Reads raw CSV/Parquet from `data/raw/`
2. Cleans and filters
3. Engineers core feature flags
4. Writes out `data/processed/master.parquet`
5. Quick sanity checks


In [51]:
# 0. Import Libraries

import pandas as pd
from ydata_profiling import ProfileReport
from Data_Quality import analyze_data_quality
from datetime import datetime


pd.set_option('display.max_columns', 100)  
pd.set_option('display.width', None)       
pd.set_option('display.max_rows', 1000)

#analyze_data_quality("/Users/arvindpuri/Documents/Clays_LondonLAB_Project/data/raw/Clays_data.csv")

## 1. Extract
Read raw file into one DataFrame.


In [54]:
# 1. Read the CSV into a DataFrame with explicit dtypes
dtype_map = {
    'Context ID': str,
    'Booking ID': str,
    'Session ID': str,
    'Search Days Ahead': 'Int64',
    'Search Charge': 'float',
    'Search Charge Type': 'category',
    'Venue ID': str,
    'Venue Name': 'category',
    'Party Size': 'Int64',
    'Was Search Available': 'boolean',
    'Reservation Days Ahead': 'Int64',
    'Reservation Charge': 'float',
    'Reservation Charge Type': 'category',
    'Year': 'Int64',
    'Month': 'Int64',
    'Reservation Cost ($)': 'float',
    'Packages Cost ($)': 'float',
    'Add Ons Cost ($)': 'float',
    'Promo Code Discount ($)': 'float',
    'Total Cost ($)': 'float',
    'Deposit Amount': 'float',
}
df = pd.read_csv(
    '/Users/arvindpuri/Documents/Clays_LondonLAB_Project/data/raw/Clays_data.csv',
    dtype=dtype_map,
    encoding='latin1',
    low_memory=False
)

# 2. Safety copy
#df.to_parquet('../data/raw/full_raw.parquet', index=False)

# 3. Standardize date/time columns
df['Search At'] = pd.to_datetime(df['Search At'], dayfirst=True, errors='coerce')
df['Search Date'] = pd.to_datetime(df['Search Date'], dayfirst=True, errors='coerce')
df['Reservation Date'] = pd.to_datetime(df['Reservation Date'], dayfirst=True, errors='coerce')
df['Reservation Datetime'] = pd.to_datetime(df['Reservation Datetime'], dayfirst=True, errors='coerce')

# 4. Drop rows without Context ID (must be a bug)
df = df.dropna(subset=['Context ID'])

# 5. Save cleaned DataFrame
df.to_parquet('/Users/arvindpuri/Documents/Clays_LondonLAB_Project/data/processed/full_cleaned.parquet', index=False)

df.head()

  df['Search Date'] = pd.to_datetime(df['Search Date'], dayfirst=True, errors='coerce')
  df['Reservation Date'] = pd.to_datetime(df['Reservation Date'], dayfirst=True, errors='coerce')


Unnamed: 0,Context ID,Booking ID,Session ID,Search At,Search Date,Search Time,Search Time Iso,Search Days Ahead,Search Charge,Search Charge Type,Venue ID,Venue Name,Party Size,Source Path,Occasion ID,Was Search Available,Reservation Date,Reservation Time,Reservation Time Iso,Reservation Datetime,Reservation Days Ahead,Reservation Charge,Reservation Charge Type,Game Area,Was Package Required,Packages Available,Packages,Add Ons Available,Add Ons,Time Extension Available,Time Extension Selected,Personal Info Completed,Personal Info Completed At,Marketing Opt In,Reservation Notes,Promo Code,Promo Code Applied,Billing Info Completed,Customer ID,Booking Status,Reservation ID,Reservation Reference Code,Reservation Tags,Reservation Cost ($),Packages Cost ($),Add Ons Cost ($),Promo Code Discount ($),Total Cost ($),Deposit Amount,Year,Month
0,202406010624Q11YGA,202406010624Q11YGA,202406010624Q11YGA,2024-01-06 06:24:00,2024-07-13,63900000000000,17:45:00,42,14.0,person,ahNzfnNldmVucm9vbXMtc2VjdXJlchwLEg9uaWdodGxvb3...,Clays Canary Wharf,6,/,,False,2024-07-13,63900000000000.0,17:45:00,NaT,42,14.0,person,peg,True,"[""""]","[""BR""]","[""""]","[""""]",False,False,False,,False,,,False,True,202408121524FJ7OL0,reservation_success,ahNzfnNldmVucm9vbXMtc2VjdXJlciULEhtuaWdodGxvb3...,45A3LK4B,"[""""]",84.0,72.0,0.0,0.0,156.0,156.0,2024,6
1,202406010714KXIEZJ,202406010714KXIEZJ,202406010714KXIEZJ,2024-01-06 07:14:00,2024-06-01,41400000000000,11:30:00,0,10.0,person,ahNzfnNldmVucm9vbXMtc2VjdXJlchwLEg9uaWdodGxvb3...,"Clays, The City",2,/,,False,2024-06-01,41400000000000.0,11:30:00,NaT,0,10.0,person,peg,False,"[""""]","[""""]","[""""]","[""""]",False,False,False,,False,,,False,True,202408121524TPE9S4,reservation_success,ahNzfnNldmVucm9vbXMtc2VjdXJlciULEhtuaWdodGxvb3...,64UKL232,"[""dietary#vegetarian""]",20.0,0.0,0.0,0.0,20.0,20.0,2024,6
2,202406010726X2ZGX5,202406010726X2ZGX5,202406010726X2ZGX5,2024-01-06 07:26:00,2024-06-01,42300000000000,11:45:00,0,10.0,person,ahNzfnNldmVucm9vbXMtc2VjdXJlchwLEg9uaWdodGxvb3...,"Clays, The City",2,/,,False,2024-06-01,42300000000000.0,11:45:00,NaT,0,10.0,person,peg,False,"[""""]","[""""]","[""""]","[""""]",False,False,False,,False,,,False,True,202407251452J0VHMF,reservation_success,ahNzfnNldmVucm9vbXMtc2VjdXJlciULEhtuaWdodGxvb3...,64UL3834,"[""""]",20.0,0.0,0.0,0.0,20.0,20.0,2024,6
3,202406010755ACH0QY,202406010755ACH0QY,202406010755ACH0QY,2024-01-06 07:55:00,2024-06-05,70200000000000,19:30:00,4,14.0,person,ahNzfnNldmVucm9vbXMtc2VjdXJlchwLEg9uaWdodGxvb3...,Clays Canary Wharf,11,/,,False,2024-06-05,70200000000000.0,19:30:00,NaT,4,14.0,person,peg,False,"[""""]","[""""]","[""""]","[""""]",False,False,False,,True,,,False,True,202408121524R0J02T,reservation_success,ahNzfnNldmVucm9vbXMtc2VjdXJlciULEhtuaWdodGxvb3...,45A998P8,"[""""]",154.0,0.0,0.0,0.0,154.0,154.0,2024,6
4,202406010815SQJ15X,202406010815SQJ15X,202406010815SQJ15X,2024-01-06 08:15:00,2024-06-02,54900000000000,15:15:00,1,12.0,person,ahNzfnNldmVucm9vbXMtc2VjdXJlchwLEg9uaWdodGxvb3...,"Clays, The City",2,/,,False,2024-06-02,54900000000000.0,15:15:00,NaT,1,12.0,person,peg,False,"[""""]","[""BR""]","[""""]","[""""]",False,False,False,,False,,,False,True,202408121524OKHO5A,reservation_success,ahNzfnNldmVucm9vbXMtc2VjdXJlciULEhtuaWdodGxvb3...,64UP3B4A,"[""""]",24.0,24.0,0.0,0.0,48.0,48.0,2024,6


## 2. Clean
# Booking Data Transformation Script

## Purpose
This script processes booking data to standardize formats, clean invalid values, and prepare the data for analysis.

## What This Script Does

### Data Loading
- Loads 431,484 rows with 44 columns from the booking dataset

### Column Transformations
- Removes specified "red" columns (Reservation Time, Reservation Datetime, etc.)
- Renames "Search Date/Time" columns to "Search At Date/Time" for consistency
- Changes dollar symbols ($) to pound symbols (£) in column names

### Data Type Conversions
- Converts the following columns to binary format (0/1):
  - Was Search Available
  - Time Extension Available
  - Time Extension Selected
  - Personal Info Completed
  - Personal Info Completed At
  - Promo Code
  - Promo Code Applied
  - Billing Info Completed

### Data Cleaning
- Removes rows with negative "Search Days Ahead" values
- Creates a new "Process Time Completion" column to track processing times

### Results
- After all transformations, the dataset contains 431,484 rows of clean, standardized data

## Usage
Run this script against your booking data CSV/Excel file to prepare it for analysis.


In [58]:
# 2. Clean
df = pd.read_parquet("/Users/arvindpuri/Documents/Clays_LondonLAB_Project/data/processed/full_cleaned.parquet")


print(f"Loaded data with {df.shape[0]} rows and {df.shape[1]} columns")
df.head(2)

## 1. Drop red columns
red_columns = [
    'Reservation Time', 'Reservation Datetime', 'Reservation Charge Type', 'Game Area',
    'Marketing Opt In', 'Reservation Notes', 'Reservation ID', 'Reservation Tag',
    'Year', 'Month'
]

# Only drop columns that exist in the dataframe
df = df.drop(columns=[col for col in red_columns if col in df.columns], errors='ignore')
print(f"After dropping red columns: {df.shape[1]} columns remaining")

## 2. Split "Search At" (format: 2024-06-01T06:24:00) into "Search At Date" and "Search At Time"
if 'Search At' in df.columns:
    # Convert to datetime
    df['Search At'] = pd.to_datetime(df['Search At'])
    
    # Extract date and time
    df['Search At Date'] = df['Search At'].dt.date
    df['Search At Time'] = df['Search At'].dt.strftime('%H:%M:%S')
    
    # Drop original column
    df = df.drop(columns=['Search At'])
    print("Split 'Search At' into date and time columns")

## 3. Change column names "Search Date/Time" to "Search At Date/Time"
column_renames = {
    'Search Date': 'Search For Date',
    'Search Time Iso': 'Search For Time'
}
df = df.rename(columns=column_renames)
print("Renamed Search Date/Time columns")

## 4. Change column name "Search Charge" to "Search Price Per Person"
if 'Search Charge' in df.columns:
    df = df.rename(columns={'Search Charge': 'Search Price Per Person'})
    print("Renamed 'Search Charge' to 'Search Price Per Person'")

## 5. Delete rows with negative "Search Days Ahead"
if 'Search Days Ahead' in df.columns:
    before_count = len(df)
    df = df[df['Search Days Ahead'] >= 0]
    after_count = len(df)
    print(f"Removed {before_count - after_count} rows with negative Search Days Ahead")


## 6. Clean milliseconds from date (just hour,minute,second)
time_columns = [col for col in df.columns if 'Time' in col and 'Days' not in col]
for col in time_columns:
    if col in df.columns and pd.api.types.is_datetime64_any_dtype(df[col]):
        df[col] = df[col].dt.strftime('%H:%M:%S')
        print(f"Cleaned milliseconds from {col}")

## 7. Convert boolean columns to 0/1
boolean_columns = [
    'Was Search Available',
    'Time Extension Available',
    'Time Extension Selected',
    'Personal Info Completed',
    'Personal Info Completed At',
    'Promo Code',
    'Promo Code Applied',
    'Billing Info Completed'
]

for col in boolean_columns:
    if col in df.columns:
        df[col] = df[col].apply(
            lambda x: 1 if pd.notnull(x) and str(x).lower() in ['yes', 'true', 't', 'y', '1'] else 0
        )
        print(f"Converted {col} to binary (0/1)")

## 8. Create column "Process Time Completion" = "Personal Info Completed At" - "Search At Time"
if all(col in df.columns for col in ['Personal Info Completed At', 'Search At Time']):
    # Parse the timestamps
    completed_at = pd.to_datetime(df['Personal Info Completed At'], errors='coerce')
    
    # Get today's date as reference for times
    ref_date = datetime.now().date()
    
    # Convert times to datetime using reference date
    search_time_str = df['Search At Time'].astype(str)
    search_time = pd.to_datetime(ref_date.strftime('%Y-%m-%d') + ' ' + search_time_str, errors='coerce')
    
    # Calculate difference in minutes
    df['Process Time Completion'] = (completed_at - search_time).dt.total_seconds() / 60
    print("Created Process Time Completion column")

## 9. Change from $ to £ in column names
df.columns = [col.replace('($)', '(£)') for col in df.columns]
print("Changed $ to £ in column names")

## Display the transformed dataframe
df.head()


# Drop invalid party sizes: <= 0 or > 20
mask_party = (df['Party Size'] > 0) & (df['Party Size'] <= 20)

# Remove negative days ahead and cap at 99th percentile (or 180 days)
# First drop negative values
df = df[ df['Search Days Ahead'] >= 0 ]

# Compute 99th percentile
pct_99 = df['Search Days Ahead'].quantile(0.99)
# Use guardrail of 180 days or computed pct, whichever is smaller
cap_days = min(180, pct_99)
#    Cap values
df['Search Days Ahead'] = df['Search Days Ahead'].clip(upper=cap_days)

# Drop rows outside party-size mask
df = df[ mask_party ]



# Reset index
df = df.reset_index(drop=True)

# Write out filtered dataset
df.to_parquet("/Users/arvindpuri/Documents/Clays_LondonLAB_Project/data/processed/mid_step_filtered.parquet", index=False)

print(f"Business-rule filtering complete. Rows now: {len(df)}")

Loaded data with 439775 rows and 51 columns
After dropping red columns: 42 columns remaining
Split 'Search At' into date and time columns
Renamed Search Date/Time columns
Renamed 'Search Charge' to 'Search Price Per Person'
Removed 43 rows with negative Search Days Ahead
Converted Was Search Available to binary (0/1)
Converted Time Extension Available to binary (0/1)
Converted Time Extension Selected to binary (0/1)
Converted Personal Info Completed to binary (0/1)
Converted Personal Info Completed At to binary (0/1)
Converted Promo Code to binary (0/1)
Converted Promo Code Applied to binary (0/1)
Converted Billing Info Completed to binary (0/1)
Created Process Time Completion column
Changed $ to £ in column names
Business-rule filtering complete. Rows now: 431484


## 3. Feature Engineering
Core flags:
- `was_booked`
- `lead_time_days`
- `hour_of_day`, `day_of_week`, `is_weekend`
- Convert key cols to categorical


In [60]:
# 3. Feature Engineering
def add_feature_flags(
    input_path: str = "../data/processed/mid_step_filtered.parquet",
    output_path: str = "../data/processed/master.parquet"
):
    # 1. Load filtered data
    df = pd.read_parquet(input_path)
    
    # 2. was_booked: 1 if there's a Reservation ID, else 0
    df["was_booked"] = df["Reservation ID"].notnull().astype("int8")
    
    # 3. lead_time_days = (reservation_date − search_date).days
    df["lead_time_days"] = (
        df["Reservation Date"] - df["Search Date"]
    ).dt.days.astype("Int16")
    # Drop any negative lead times that remain
    df = df[df["lead_time_days"] >= 0]
    
    # 4. Extract search-time features
    df["hour_of_day"] = pd.to_datetime(df["Search Time"]).dt.hour
    
    # undecided how to do these:
    #   df["is_weekend"] = df["day_of_week"].isin([5, 6]).astype("boolean")
    #   df["day_of_week"] = df["Search At"].dt.dayofweek.astype("Int8")  # Monday=0

    
    # 5. Convert business columns to categorical
    for col in [
        "Venue Name",
        "Search Charge Type",
        "Reservation Charge Type",
        "Booking Status",
    ]:
        if col in df:
            df[col] = df[col].astype("category")
    
    # 6. Persist the feature‐augmented dataset
    df.to_parquet(output_path, index=False)
    print(f"✅ Features added & saved to {output_path}")

if __name__ == "__main__":
    add_feature_flags()

FileNotFoundError: [Errno 2] No such file or directory: '../data/processed/mid_step_filtered.parquet'

## 4. Quick QA
Ensure everything looks good.


In [21]:
# 4. Load
df2 = pd.read_parquet('../data/processed/master.parquet')
print(df2.info())
df2.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 233214 entries, 0 to 233213
Data columns (total 54 columns):
 #   Column                      Non-Null Count   Dtype         
---  ------                      --------------   -----         
 0   Context ID                  233214 non-null  object        
 1   Booking ID                  233214 non-null  object        
 2   Session ID                  233214 non-null  object        
 3   Search At                   21064 non-null   datetime64[ns]
 4   Search Date                 233214 non-null  datetime64[ns]
 5   Search Time                 233214 non-null  int64         
 6   Search Time Iso             233214 non-null  object        
 7   Search Days Ahead           233214 non-null  Int64         
 8   Search Charge               233214 non-null  float64       
 9   Search Charge Type          233214 non-null  category      
 10  Venue ID                    233214 non-null  object        
 11  Venue Name                  233214 non-

Unnamed: 0,Context ID,Booking ID,Session ID,Search At,Search Date,Search Time,Search Time Iso,Search Days Ahead,Search Charge,Search Charge Type,...,Packages Cost ($),Add Ons Cost ($),Promo Code Discount ($),Total Cost ($),Deposit Amount,Year,Month,was_booked,lead_time_days,hour_of_day
0,202406010624Q11YGA,202406010624Q11YGA,202406010624Q11YGA,2024-01-06 06:24:00,2024-07-13,63900000000000,17:45:00,42,14.0,person,...,72.0,0.0,0.0,156.0,156.0,2024,6,1,0,17
1,202406010714KXIEZJ,202406010714KXIEZJ,202406010714KXIEZJ,2024-01-06 07:14:00,2024-06-01,41400000000000,11:30:00,0,10.0,person,...,0.0,0.0,0.0,20.0,20.0,2024,6,1,0,11
2,202406010726X2ZGX5,202406010726X2ZGX5,202406010726X2ZGX5,2024-01-06 07:26:00,2024-06-01,42300000000000,11:45:00,0,10.0,person,...,0.0,0.0,0.0,20.0,20.0,2024,6,1,0,11
3,202406010755ACH0QY,202406010755ACH0QY,202406010755ACH0QY,2024-01-06 07:55:00,2024-06-05,70200000000000,19:30:00,4,14.0,person,...,0.0,0.0,0.0,154.0,154.0,2024,6,1,0,19
4,202406010815SQJ15X,202406010815SQJ15X,202406010815SQJ15X,2024-01-06 08:15:00,2024-06-02,54900000000000,15:15:00,1,12.0,person,...,24.0,0.0,0.0,48.0,48.0,2024,6,1,0,15


## 5. Data Quality Report
Double check how the data is looking like before moving forward.


In [None]:
# Export HTML summary as data_quality.html

def generate_data_quality_report(
    input_path: str = "../data/processed/master.parquet",
    output_path: str = "../outputs/data_quality.html"
):
    # 1. Load your feature-augmented data
    df = pd.read_parquet(input_path)
    
    # 2. Create a profiling report
    profile = ProfileReport(
        df,
        title="Clays Data Quality Report",
        explorative=True,
        minimal=False  # set True for a slimmer report
    )
    
    # 3. Export to HTML
    profile.to_file(output_path)
    print(f"✅ Data-quality report written to {output_path}")

if __name__ == "__main__":
    generate_data_quality_report()

100%|██████████| 54/54 [00:05<00:00,  9.04it/s]7<00:01,  4.21it/s, Describe variable: hour_of_day]               
Summarize dataset: 100%|██████████| 60/60 [00:08<00:00,  7.45it/s, Completed]                     
Generate report structure: 100%|██████████| 1/1 [00:11<00:00, 11.28s/it]
Render HTML: 100%|██████████| 1/1 [00:00<00:00,  1.68it/s]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 50.89it/s]

✅ Data-quality report written to ../outputs/data_quality_slimmer.html



