# 1. Build Master Database

This notebook:
1. Reads raw CSV/Parquet from `data/raw/`
2. Cleans and filters
3. Engineers core feature flags
4. Writes out `data/processed/master.parquet`
5. Quick sanity checks


In [78]:
## 0. Import Libraries

import pandas as pd
from ydata_profiling import ProfileReport
from datetime import datetime

pd.set_option('display.max_columns', 100)  
pd.set_option('display.width', None)       
pd.set_option('display.max_rows', 1000)

## 1. Extract
Read raw file into one DataFrame.


In [79]:
## 1. Read the CSV into a DataFrame with explicit dtypes
dtype_map = {
    'Context ID': str,
    'Booking ID': str,
    'Session ID': str,
    'Search Days Ahead': 'Int64',
    'Search Charge': 'float',
    'Search Charge Type': 'category',
    'Venue ID': str,
    'Venue Name': 'category',
    'Party Size': 'Int64',
    'Was Search Available': 'boolean',
    'Reservation Days Ahead': 'Int64',
    'Reservation Charge': 'float',
    'Reservation Charge Type': 'category',
    'Year': 'Int64',
    'Month': 'Int64',
    'Reservation Cost ($)': 'float',
    'Packages Cost ($)': 'float',
    'Add Ons Cost ($)': 'float',
    'Promo Code Discount ($)': 'float',
    'Total Cost ($)': 'float',
    'Deposit Amount': 'float',
}
df = pd.read_csv(
    '../data/raw/Clays_data.csv',
    dtype=dtype_map,
    encoding='latin1',
    low_memory=False
)

# 2. Safety copy
df.to_parquet('../data/raw/full_raw.parquet', index=False)

# 3. Standardize date/time columns
df['Search At'] = pd.to_datetime(df['Search At'], dayfirst=True, errors='coerce')
df['Search Date'] = pd.to_datetime(df['Search Date'], dayfirst=True, errors='coerce')
df['Reservation Date'] = pd.to_datetime(df['Reservation Date'], dayfirst=True, errors='coerce')
df['Reservation Datetime'] = pd.to_datetime(df['Reservation Datetime'], dayfirst=True, errors='coerce')

# 4. Drop rows without Context ID (must be a bug)
df = df.dropna(subset=['Context ID'])

# 5. Save cleaned DataFrame
df.to_parquet('../data/processed/full_cleaned.parquet', index=False)

df.head()

  df['Search Date'] = pd.to_datetime(df['Search Date'], dayfirst=True, errors='coerce')
  df['Reservation Date'] = pd.to_datetime(df['Reservation Date'], dayfirst=True, errors='coerce')


Unnamed: 0,Context ID,Booking ID,Session ID,Search At,Search Date,Search Time,Search Time Iso,Search Days Ahead,Search Charge,Search Charge Type,Venue ID,Venue Name,Party Size,Source Path,Occasion ID,Was Search Available,Reservation Date,Reservation Time,Reservation Time Iso,Reservation Datetime,Reservation Days Ahead,Reservation Charge,Reservation Charge Type,Game Area,Was Package Required,Packages Available,Packages,Add Ons Available,Add Ons,Time Extension Available,Time Extension Selected,Personal Info Completed,Personal Info Completed At,Marketing Opt In,Reservation Notes,Promo Code,Promo Code Applied,Billing Info Completed,Customer ID,Booking Status,Reservation ID,Reservation Reference Code,Reservation Tags,Reservation Cost ($),Packages Cost ($),Add Ons Cost ($),Promo Code Discount ($),Total Cost ($),Deposit Amount,Year,Month
0,202406010624Q11YGA,202406010624Q11YGA,202406010624Q11YGA,2024-01-06 06:24:00,2024-07-13,63900000000000,17:45:00,42,14.0,person,ahNzfnNldmVucm9vbXMtc2VjdXJlchwLEg9uaWdodGxvb3...,Clays Canary Wharf,6,/,,False,2024-07-13,63900000000000.0,17:45:00,NaT,42,14.0,person,peg,True,"[""""]","[""BR""]","[""""]","[""""]",False,False,False,,False,,,False,True,202408121524FJ7OL0,reservation_success,ahNzfnNldmVucm9vbXMtc2VjdXJlciULEhtuaWdodGxvb3...,45A3LK4B,"[""""]",84.0,72.0,0.0,0.0,156.0,156.0,2024,6
1,202406010714KXIEZJ,202406010714KXIEZJ,202406010714KXIEZJ,2024-01-06 07:14:00,2024-06-01,41400000000000,11:30:00,0,10.0,person,ahNzfnNldmVucm9vbXMtc2VjdXJlchwLEg9uaWdodGxvb3...,"Clays, The City",2,/,,False,2024-06-01,41400000000000.0,11:30:00,NaT,0,10.0,person,peg,False,"[""""]","[""""]","[""""]","[""""]",False,False,False,,False,,,False,True,202408121524TPE9S4,reservation_success,ahNzfnNldmVucm9vbXMtc2VjdXJlciULEhtuaWdodGxvb3...,64UKL232,"[""dietary#vegetarian""]",20.0,0.0,0.0,0.0,20.0,20.0,2024,6
2,202406010726X2ZGX5,202406010726X2ZGX5,202406010726X2ZGX5,2024-01-06 07:26:00,2024-06-01,42300000000000,11:45:00,0,10.0,person,ahNzfnNldmVucm9vbXMtc2VjdXJlchwLEg9uaWdodGxvb3...,"Clays, The City",2,/,,False,2024-06-01,42300000000000.0,11:45:00,NaT,0,10.0,person,peg,False,"[""""]","[""""]","[""""]","[""""]",False,False,False,,False,,,False,True,202407251452J0VHMF,reservation_success,ahNzfnNldmVucm9vbXMtc2VjdXJlciULEhtuaWdodGxvb3...,64UL3834,"[""""]",20.0,0.0,0.0,0.0,20.0,20.0,2024,6
3,202406010755ACH0QY,202406010755ACH0QY,202406010755ACH0QY,2024-01-06 07:55:00,2024-06-05,70200000000000,19:30:00,4,14.0,person,ahNzfnNldmVucm9vbXMtc2VjdXJlchwLEg9uaWdodGxvb3...,Clays Canary Wharf,11,/,,False,2024-06-05,70200000000000.0,19:30:00,NaT,4,14.0,person,peg,False,"[""""]","[""""]","[""""]","[""""]",False,False,False,,True,,,False,True,202408121524R0J02T,reservation_success,ahNzfnNldmVucm9vbXMtc2VjdXJlciULEhtuaWdodGxvb3...,45A998P8,"[""""]",154.0,0.0,0.0,0.0,154.0,154.0,2024,6
4,202406010815SQJ15X,202406010815SQJ15X,202406010815SQJ15X,2024-01-06 08:15:00,2024-06-02,54900000000000,15:15:00,1,12.0,person,ahNzfnNldmVucm9vbXMtc2VjdXJlchwLEg9uaWdodGxvb3...,"Clays, The City",2,/,,False,2024-06-02,54900000000000.0,15:15:00,NaT,1,12.0,person,peg,False,"[""""]","[""BR""]","[""""]","[""""]",False,False,False,,False,,,False,True,202408121524OKHO5A,reservation_success,ahNzfnNldmVucm9vbXMtc2VjdXJlciULEhtuaWdodGxvb3...,64UP3B4A,"[""""]",24.0,24.0,0.0,0.0,48.0,48.0,2024,6


## 2. Raw Cleaning & Filtering
Business-rule filters:
- Drop `party_size > 20`
- Remove negative money & days ahead
- Cap `Search Days Ahead` at 99th percentile or 180 days
- Drop `Search At` before August 1st 2024 


In [80]:
## 2. Raw Cleaning & Filtering

df = pd.read_parquet("../data/processed/full_cleaned.parquet")

# 2a. Drop red columns
red_columns = [
    'Reservation Time', 'Reservation Datetime', 'Reservation Charge Type', 'Game Area',
    'Marketing Opt In', 'Reservation Notes', 'Reservation ID', 'Reservation Tag',
    'Year', 'Month'
]
df = df.drop(columns=[col for col in red_columns if col in df.columns])
print(f"After dropping red columns: {df.shape[1]} columns remaining")

# 2b. Filter party size & days ahead
df = df[(df['Party Size']>0)&(df['Party Size']<=20)]
df = df[df['Search Days Ahead']>=0]
cap_days = min(180, df['Search Days Ahead'].quantile(0.99))
df['Search Days Ahead'] = df['Search Days Ahead'].clip(upper=cap_days)

# 2c. Drop `Search At` before August 1st 2024 
df["Search At"] = pd.to_datetime(df["Search At"], errors="coerce") # Ensure it's a datetime (safe step)
df = df[df["Search At"] >= "2024-08-01"] # Filter to keep only rows on or after August 1st, 2024

# 2d. Persist filtered data
df.to_parquet("../data/processed/mid_step_filtered.parquet", index=False)
print(f"Rows after filtering: {len(df)}")


After dropping red columns: 42 columns remaining
Rows after filtering: 9405


## 3. Slot-Level Identifiers & Time Features

In [81]:
## 3. Slot‐Level Identifiers & Time Features

# 3a. Load filtered data
df = pd.read_parquet("../data/processed/mid_step_filtered.parquet")


## 3b. Convert boolean columns to 0/1
boolean_columns = [
    'Was Search Available',
    'Time Extension Available',
    'Time Extension Selected',
    'Personal Info Completed',
    'Personal Info Completed At',
    'Promo Code',
    'Promo Code Applied',
    'Billing Info Completed'
]

for col in boolean_columns:
    if col in df.columns:
        df[col] = df[col].apply(
            lambda x: 1 if pd.notnull(x) and str(x).lower() in ['yes', 'true', 't', 'y', '1'] else 0
        )
        print(f"Converted {col} to binary (0/1)")
        
        
# 3c. Canonical timestamps
df['search_datetime']      = pd.to_datetime(df['Search At'], errors='coerce')
df['reservation_datetime'] = pd.to_datetime(df['Reservation Date'], errors='coerce')


# 3d. Change column names "Search Date/Time" to "Search At Date/Time"
df['search_date_for'] = pd.to_datetime(df['Search Date'], dayfirst=True, errors='coerce').dt.date
df['search_hour_for'] = pd.to_datetime(df['Search Time Iso'],   dayfirst=True, errors='coerce').dt.hour

assert df['search_date_for'].notnull().all(), "Some booking-dates missing!"
print("✔ ‘at’ & ‘for’ columns created")


# 3e. Replace dollars with pounds
df.columns = [col.replace('($)', '(£)') for col in df.columns]
print("Changed $ to £ in column names")


# 3f. Create column "Process Time Completion" = "Personal Info Completed At" - "Search At Time"
if all(col in df.columns for col in ['Personal Info Completed At', 'Search At Time']):
    # Parse the timestamps
    completed_at = pd.to_datetime(df['Personal Info Completed At'], errors='coerce')
    
    # Get today's date as reference for times
    ref_date = datetime.now().date()
    
    # Convert times to datetime using reference date
    search_time_str = df['Search At Time'].astype(str)
    search_time = pd.to_datetime(ref_date.strftime('%Y-%m-%d') + ' ' + search_time_str, errors='coerce')
    
    # Calculate difference in minutes
    df['Process Time Completion'] = (completed_at - search_time).dt.total_seconds() / 60
    print("Created Process Time Completion column")
    

# 3g. Good Venue Names
venue_mapping = {
    "ahNzfnNldmVucm9vbXMtc2VjdXJlchwLEg9uaWdodGxvb3BfVmVudWUYgICsreOh4wgM": "Canary Wharf",
    "ahNzfnNldmVucm9vbXMtc2VjdXJlchwLEg9uaWdodGxvb3BfVmVudWUYgID4itGHgQgM": "The City",
    "ahNzfnNldmVucm9vbXMtc2VjdXJlchwLEg9uaWdodGxvb3BfVmVudWUYgICexo6z2QsM": "Birmingham"
}
df["Venue Name"] = df["Venue ID"].map(venue_mapping)
    
    
# 3h. Derive day/hour/lead_time
df['search_date_at'] = df['search_datetime'].dt.date
df['search_hour_at'] = df['search_datetime'].dt.hour
df['lead_time']   = (df['reservation_datetime'] - df['search_datetime']).dt.days
df['day_of_week'] = df['search_datetime'].dt.dayofweek
df['is_weekend']  = df['day_of_week'].isin([5,6]).astype(int)
df['month']       = df['search_datetime'].dt.month

# assert (df['lead_time']<0).all(), "Negative lead_time found!"
print("✔ Time features added")


Converted Was Search Available to binary (0/1)
Converted Time Extension Available to binary (0/1)
Converted Time Extension Selected to binary (0/1)
Converted Personal Info Completed to binary (0/1)
Converted Personal Info Completed At to binary (0/1)
Converted Promo Code to binary (0/1)
Converted Promo Code Applied to binary (0/1)
Converted Billing Info Completed to binary (0/1)
✔ ‘at’ & ‘for’ columns created
Changed $ to £ in column names
✔ Time features added


  df['search_hour_for'] = pd.to_datetime(df['Search Time Iso'],   dayfirst=True, errors='coerce').dt.hour


In [82]:
venue_map = (
    df[["Venue ID", "Venue Name"]]
    .dropna()
    .drop_duplicates()
    .groupby("Venue ID")["Venue Name"]
    .first()
)


## 4. Booking Indicator & Rates

In [83]:
## 4. Booking Indicator & Rates

df['was_booked'] = df['Booking ID'].notnull().astype(int)

slot_agg = (
    df.groupby(['Venue Name', 'search_date_for', 'search_hour_for'], as_index=False)
      .agg(
          n_searches   = ('was_booked', 'size'),
          n_bookings   = ('was_booked', 'sum'),
          lead_time    = ('lead_time',  'median'),   # or 'mean'
          day_of_week  = ('day_of_week','first'),    # same within a group
          is_weekend   = ('is_weekend','first')
      )
)
slot_agg['booking_rate'] = slot_agg['n_bookings'] / slot_agg['n_searches']


print(f"✔ Aggregated into {len(slot_agg)} slots")


✔ Aggregated into 2011 slots


## 5. Price & Availability Features

In [84]:
## 5. Price & Availability Features

# rename raw columns
df = df.rename(columns={
  'Search Charge':'price_shown',
  'Was Search Available':'avail_flag'
})

price_avail = (
  df.groupby(['Venue Name','search_date_for','search_hour_for'], as_index=False)
    .agg(
      avg_price=('price_shown','mean'),
      pct_avail=('avail_flag','mean')
    )
)

print("✔ Price & availability computed")


✔ Price & availability computed


## 6. Capacity & Price Bounds

In [85]:
## 6. Capacity & Price Bounds

# 6a. Manual capacity lookup per venue
venue_caps = {
    "City": 12,           # 12 pegs
    "Canary Wharf": 8,    #  8 pegs
    "Birmingham": 4       #  4 pegs; arcade seats can be handled separately later
}

# expand to every hour (assuming same capacity all day)
venue_cap = pd.DataFrame([
    {"Venue Name": vid, "search_hour_for": h, "capacity": cap}
    for vid, cap in venue_caps.items()
    for h in range(24)
])

# 6b. Fallback price bounds from historical data
# here we just take the global min/max shown price per venue
price_bounds = (
    df
    .groupby("Venue Name")["price_shown"]
    .agg(min_price="min", max_price="max")
    .reset_index()
)

# 6c. Merge into slot_agg
slot_level = (
    slot_agg
      .merge(price_avail,   on=["Venue Name","search_date_for","search_hour_for"], how="left")
      .merge(venue_cap,     on=["Venue Name","search_hour_for"],             how="left")
      .merge(price_bounds,  on="Venue Name",                             how="left")
)

# Sanity checks
# assert slot_level["capacity"].notnull().all(), "Some capacities missing!"
print("✔ Capacity & fallback price bounds merged")


✔ Capacity & fallback price bounds merged


## 7. Holiday & Event Flags

Maybe do it later when Tom replies to the email

In [86]:
# # 7. Holiday & Special-Event Flags
# # We can pull from a calendar API or maintain a static CSV

# holidays = pd.read_csv('data/static/holidays.csv')  # columns: date, is_holiday
# events   = pd.read_csv('data/static/events.csv')    # columns: date, event_flag

# slot_level = (
#     slot_level
#     .merge(holidays, left_on='search_date', right_on='date', how='left')
#     .merge(events,   left_on='search_date', right_on='date', how='left')
# )
# slot_level['is_holiday']  = slot_level['is_holiday'].fillna(0).astype(int)
# slot_level['event_flag']  = slot_level['event_flag'].fillna(0).astype(int)
# slot_level = slot_level.drop(columns=['date_x','date_y'])
# print("✔ Holiday & event flags added.")


## 8. Write out Master & Slot-Level Artifacts

In [87]:
# 8. Write out artifacts for downstream
# 8a. Master = every raw search + all new columns
df.to_parquet('../data/processed/master.parquet',             index=False)

# 8b. Slot-level summary for modeling
slot_level.to_parquet('../data/processed/slot_level.parquet', index=False)
slot_level.to_csv('../data/processed/slot_level.csv',         index=False)

print("✅ master.parquet and slot_levels written!")


✅ master.parquet and slot_levels written!


## 9. Quick QA
Ensure everything looks good.


In [88]:
# 9. Load
df2 = pd.read_parquet('../data/processed/master.parquet')
print(df2.info())
df2.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9405 entries, 0 to 9404
Data columns (total 53 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   Context ID                  9405 non-null   object        
 1   Booking ID                  9357 non-null   object        
 2   Session ID                  9405 non-null   object        
 3   Search At                   9405 non-null   datetime64[ns]
 4   Search Date                 9405 non-null   datetime64[ns]
 5   Search Time                 9405 non-null   int64         
 6   Search Time Iso             9405 non-null   object        
 7   Search Days Ahead           9405 non-null   Int64         
 8   price_shown                 9405 non-null   float64       
 9   Search Charge Type          9405 non-null   category      
 10  Venue ID                    9405 non-null   object        
 11  Venue Name                  9405 non-null   object      

Unnamed: 0,Context ID,Booking ID,Session ID,Search At,Search Date,Search Time,Search Time Iso,Search Days Ahead,price_shown,Search Charge Type,Venue ID,Venue Name,Party Size,Source Path,Occasion ID,avail_flag,Reservation Date,Reservation Time Iso,Reservation Days Ahead,Reservation Charge,Was Package Required,Packages Available,Packages,Add Ons Available,Add Ons,Time Extension Available,Time Extension Selected,Personal Info Completed,Personal Info Completed At,Promo Code,Promo Code Applied,Billing Info Completed,Customer ID,Booking Status,Reservation Reference Code,Reservation Tags,Reservation Cost (£),Packages Cost (£),Add Ons Cost (£),Promo Code Discount (£),Total Cost (£),Deposit Amount,search_datetime,reservation_datetime,search_date_for,search_hour_for,search_date_at,search_hour_at,lead_time,day_of_week,is_weekend,month,was_booked
0,202406080018N3QRJA,202406080018N3QRJA,202406080018N3QRJA,2024-08-06 00:18:00,2024-06-13,73800000000000,20:30:00,5,14.0,person,ahNzfnNldmVucm9vbXMtc2VjdXJlchwLEg9uaWdodGxvb3...,Canary Wharf,2,/,,0,2024-06-13,20:30:00,5,14.0,True,"[""""]","[""""]","[""""]","[""""]",0,0,0,0,0,0,0,,draft,,"[""""]",28.0,0.0,0.0,0.0,28.0,28.0,2024-08-06 00:18:00,2024-06-13,2024-06-13,20,2024-08-06,0,-55.0,1,0,8,1
1,202406080130CZRU1A,202406080130CZRU1A,202406080130CZRU1A,2024-08-06 01:30:00,2024-06-08,58500000000000,16:15:00,0,10.0,person,ahNzfnNldmVucm9vbXMtc2VjdXJlchwLEg9uaWdodGxvb3...,The City,2,/,,0,2024-06-08,16:15:00,0,10.0,False,"[""""]","[""""]","[""""]","[""""]",0,0,0,0,0,0,1,20240812152684CHR3,reservation_success,65G76NW6,"[""""]",20.0,0.0,0.0,0.0,20.0,20.0,2024-08-06 01:30:00,2024-06-08,2024-06-08,16,2024-08-06,1,-60.0,1,0,8,1
2,202406080205H16ZRV,202406080205H16ZRV,202406080205H16ZRV,2024-08-06 02:05:00,2024-08-10,43200000000000,12:00:00,63,10.0,person,ahNzfnNldmVucm9vbXMtc2VjdXJlchwLEg9uaWdodGxvb3...,The City,15,/,,0,2024-08-10,12:00:00,63,10.0,True,"[""""]","[""BU""]","[""""]","[""""]",0,0,0,0,0,0,0,,draft,,"[""""]",150.0,450.0,0.0,0.0,600.0,150.0,2024-08-06 02:05:00,2024-08-10,2024-08-10,12,2024-08-06,2,3.0,1,0,8,1
3,202406080521F6EFWV,202406080521F6EFWV,202406080521F6EFWV,2024-08-06 05:21:00,2024-06-08,47700000000000,13:15:00,0,10.0,person,ahNzfnNldmVucm9vbXMtc2VjdXJlchwLEg9uaWdodGxvb3...,Canary Wharf,2,/,,0,2024-06-08,13:15:00,0,10.0,False,"[""""]","[""""]","[""""]","[""""]",0,0,0,0,0,0,0,,draft,,"[""""]",20.0,0.0,0.0,0.0,20.0,20.0,2024-08-06 05:21:00,2024-06-08,2024-06-08,13,2024-08-06,5,-60.0,1,0,8,1
4,202406080522EVC69U,202406080522EVC69U,202406080522EVC69U,2024-08-06 05:22:00,2024-06-08,47700000000000,13:15:00,0,10.0,person,ahNzfnNldmVucm9vbXMtc2VjdXJlchwLEg9uaWdodGxvb3...,Canary Wharf,2,/,,0,2024-06-08,13:15:00,0,10.0,False,"[""""]","[""""]","[""""]","[""""]",0,0,0,0,0,0,0,,draft,,"[""""]",20.0,0.0,0.0,0.0,20.0,20.0,2024-08-06 05:22:00,2024-06-08,2024-06-08,13,2024-08-06,5,-60.0,1,0,8,1
5,202406080535DQ6A5M,202406080535DQ6A5M,202406080535DQ6A5M,2024-08-06 05:35:00,2024-06-29,63900000000000,17:45:00,21,14.0,person,ahNzfnNldmVucm9vbXMtc2VjdXJlchwLEg9uaWdodGxvb3...,The City,6,/,,0,2024-06-29,17:45:00,21,14.0,True,"[""""]","[""BR""]","[""""]","[""""]",0,0,0,0,0,0,1,202408121527B48I6G,reservation_success,65GNJAHP,"[""""]",84.0,72.0,0.0,0.0,156.0,156.0,2024-08-06 05:35:00,2024-06-29,2024-06-29,17,2024-08-06,5,-39.0,1,0,8,1
6,202406080558ZSA2EP,202406080558ZSA2EP,202406080558ZSA2EP,2024-08-06 05:58:00,2024-06-09,52200000000000,14:30:00,1,12.0,person,ahNzfnNldmVucm9vbXMtc2VjdXJlchwLEg9uaWdodGxvb3...,The City,2,/,,0,2024-06-09,14:30:00,1,12.0,False,"[""""]","[""""]","[""""]","[""""]",0,0,0,0,0,0,0,,draft,,"[""""]",24.0,0.0,0.0,0.0,24.0,24.0,2024-08-06 05:58:00,2024-06-09,2024-06-09,14,2024-08-06,5,-59.0,1,0,8,1
7,202406080610NKQWS8,202406080610NKQWS8,202406080610NKQWS8,2024-08-06 06:10:00,2024-06-10,57600000000000,16:00:00,2,10.0,person,ahNzfnNldmVucm9vbXMtc2VjdXJlchwLEg9uaWdodGxvb3...,Canary Wharf,2,/,,0,2024-06-10,16:00:00,2,10.0,False,"[""""]","[""""]","[""""]","[""""]",0,0,0,0,0,0,0,,draft,,"[""""]",20.0,0.0,0.0,0.0,20.0,20.0,2024-08-06 06:10:00,2024-06-10,2024-06-10,16,2024-08-06,6,-58.0,1,0,8,1
8,202406080627T292T9,202406080627T292T9,202406080627T292T9,2024-08-06 06:27:00,2024-07-27,54000000000000,15:00:00,49,10.0,person,ahNzfnNldmVucm9vbXMtc2VjdXJlchwLEg9uaWdodGxvb3...,Canary Wharf,8,/,,0,2024-07-27,15:00:00,49,10.0,True,"[""""]","[""""]","[""""]","[""""]",0,0,0,0,0,0,0,,draft,,"[""""]",80.0,0.0,0.0,0.0,80.0,80.0,2024-08-06 06:27:00,2024-07-27,2024-07-27,15,2024-08-06,6,-11.0,1,0,8,1
9,20240608063044CE2E,20240608063044CE2E,20240608063044CE2E,2024-08-06 06:30:00,2024-06-08,56700000000000,15:45:00,0,10.0,person,ahNzfnNldmVucm9vbXMtc2VjdXJlchwLEg9uaWdodGxvb3...,Canary Wharf,2,/,,0,2024-06-08,15:45:00,0,10.0,False,"[""""]","[""""]","[""""]","[""""]",0,0,0,0,0,0,0,,draft,,"[""""]",20.0,0.0,0.0,0.0,20.0,20.0,2024-08-06 06:30:00,2024-06-08,2024-06-08,15,2024-08-06,6,-60.0,1,0,8,1


## 10. Data Quality Report
Double check how the data is looking like before moving forward.


In [89]:
# # 10. Export HTML summary as data_quality.html

# def generate_data_quality_report(
#     input_path: str = "../data/processed/master.parquet",
#     output_path: str = "../outputs/data_quality.html"
# ):
#     # 1. Load your feature-augmented data
#     df = pd.read_parquet(input_path)
    
#     # 2. Create a profiling report
#     profile = ProfileReport(
#         df,
#         title="Clays Data Quality Report",
#         explorative=True,
#         minimal=True  # set True for a slimmer report
#     )
    
#     # 3. Export to HTML
#     profile.to_file(output_path)
#     print(f"✅ Data-quality report written to {output_path}")

# if __name__ == "__main__":
#     generate_data_quality_report()