# 01. Data Exploration

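This notebook loads the hotel guests dataset (creating a small demo file if the raw CSV is missing), generates synthetic Airbnb-style stays, harmonizes both sources into a single guest table, generates advertisers and guest–ad preferences, and saves the results to `../data/processed/`.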

In [None]:
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

sys.path.insert(0, os.path.abspath('..'))

from src import data_loading, synthetic_airbnb, advertisers, utils

# Single seed shared by every stochastic step in this notebook.
SEED = 42
# `rng` is used below through the NumPy Generator API (`integers`, `choice`,
# `normal`), so `utils.set_random_seed` presumably returns a
# `np.random.Generator` -- TODO confirm against src/utils.py.
rng = utils.set_random_seed(SEED)

# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*'.
# Fall back to the pre-3.6 name so this cell also runs on older installs
# instead of raising OSError for an unknown style.
plot_style = 'seaborn-v0_8-darkgrid'
if plot_style not in plt.style.available:
    plot_style = 'seaborn-darkgrid'
plt.style.use(plot_style)
print('Setup complete!')

## 1. Load Hotel Guests Dataset

In [None]:
# Location of the raw hotel CSV, relative to the notebook directory.
hotel_path = '../data/raw/hotel_guests_dataset.csv'
# Number of rows written to the fallback demo dataset.
N_DEMO_ROWS = 1000

if not os.path.exists(hotel_path):
    print('Creating demo hotel dataset...')
    # Create the directory that actually contains `hotel_path`, so changing
    # the path above never leaves the target directory missing.
    raw_dir = os.path.dirname(hotel_path)
    if raw_dir:
        os.makedirs(raw_dir, exist_ok=True)

    # Draw the demo data from a dedicated generator. Using the shared `rng`
    # here would advance it only on runs where the CSV is missing, so every
    # later draw from `rng` would differ between the first run and re-runs.
    demo_rng = np.random.default_rng(SEED)
    demo = pd.DataFrame({
        'adults': demo_rng.integers(1, 4, N_DEMO_ROWS),
        # Repeated values in the choice lists weight the draw towards them.
        'children': demo_rng.choice([0,0,0,1,2], N_DEMO_ROWS),
        'babies': demo_rng.choice([0,0,0,0,1], N_DEMO_ROWS),
        'stays_in_weekend_nights': demo_rng.integers(0, 3, N_DEMO_ROWS),
        'stays_in_week_nights': demo_rng.integers(1, 7, N_DEMO_ROWS),
        'market_segment': demo_rng.choice(['Online TA', 'Direct', 'Corporate'], N_DEMO_ROWS),
        # Normal draw floored at 20 so no rate falls below that value.
        'adr': demo_rng.normal(100, 50, N_DEMO_ROWS).clip(20),
        'country': demo_rng.choice(['USA', 'GBR', 'FRA', 'DEU'], N_DEMO_ROWS)
    })
    demo.to_csv(hotel_path, index=False)

# Load through the project loader whether the file pre-existed or was just
# generated above.
hotel_df = data_loading.load_hotel_guests(hotel_path, seed=SEED)
print(f'Loaded {len(hotel_df)} hotel records')
hotel_df.head()

## 2. Generate Synthetic Airbnb Data

In [None]:
# Rows in the hand-built template. Every column below must have exactly this
# length, so it is defined once instead of being repeated per column.
N_TEMPLATE_ROWS = 200
# Number of synthetic stays requested from the generator.
N_SYNTHETIC_STAYS = 3000

# Small template frame passed to the synthetic generator. Repeated values in
# the choice lists weight the draw towards those values.
airbnb_template = pd.DataFrame({
    'guest_id': [f'T{i}' for i in range(N_TEMPLATE_ROWS)],
    'stay_id': [f'S{i}' for i in range(N_TEMPLATE_ROWS)],
    'city': rng.choice(['Paris', 'London', 'Barcelona'], N_TEMPLATE_ROWS),
    # One arrival every third day starting 2020-01-01.
    'arrival_date': pd.date_range('2020-01-01', periods=N_TEMPLATE_ROWS, freq='3D'),
    'nights': rng.choice([1,2,2,3,3,3,4,5,7], N_TEMPLATE_ROWS),
    'adults': rng.choice([1,1,2,2,2,3], N_TEMPLATE_ROWS),
    'children': rng.choice([0,0,0,0,1,2], N_TEMPLATE_ROWS),
    'purpose_of_stay': rng.choice(['leisure', 'business'], N_TEMPLATE_ROWS),
    # Normal draw floored at 30.
    'price_per_night': rng.normal(120, 60, N_TEMPLATE_ROWS).clip(30),
    # Placeholder that pins the column position; overwritten by `.assign`.
    'total_price': 0,
    'country': rng.choice(['USA', 'GBR', 'FRA'], N_TEMPLATE_ROWS),
    'device_type': rng.choice(['mobile', 'desktop'], N_TEMPLATE_ROWS)
}).assign(
    # Assigning to an existing column keeps it in place in the column order.
    total_price=lambda frame: frame['price_per_night'] * frame['nights']
)

airbnb_synth = synthetic_airbnb.generate_synthetic_airbnb(
    airbnb_template, n_samples=N_SYNTHETIC_STAYS, seed=SEED
)
print(f'Generated {len(airbnb_synth)} synthetic Airbnb stays')
airbnb_synth.head()

## 3. Harmonize Datasets

In [None]:
# Combine the hotel records and the synthetic Airbnb stays into one guest
# table via the project's harmonization helper.
unified_guests = synthetic_airbnb.harmonize_guests(
    hotel_df,
    airbnb_synth,
    seed=SEED,
)
print(f'Unified: {len(unified_guests)} guests')

# Row count per value of the `source` column.
source_counts = unified_guests['source'].value_counts()
print(source_counts)

# Final expression: rendered as the cell output.
unified_guests.head()

## 4. Generate Advertisers

In [None]:
# Build the advertiser table from the shared seed.
ads_df = advertisers.generate_advertisers(seed=SEED, n_ads=150)
print(f'Generated {len(ads_df)} advertisers')

# Row count per advertiser type; final expression is the cell output.
advertiser_type_counts = ads_df['advertiser_type'].value_counts()
advertiser_type_counts

## 5. Generate Guest-Ad Preferences

In [None]:
# Sample at most 1000 guests (the whole table when it has fewer rows).
n_sampled_guests = min(1000, len(unified_guests))
sample_guests = unified_guests.sample(n=n_sampled_guests, random_state=SEED)

# Generate guest-ad preference rows for the sampled guests.
guest_ad_prefs = advertisers.generate_guest_ad_preferences(
    sample_guests,
    ads_df,
    n_samples_per_guest=40,
    seed=SEED,
)
print(f'Generated {len(guest_ad_prefs)} preferences')

# Final expression: rendered as the cell output.
guest_ad_prefs.head()

## 6. Save Processed Data

In [None]:
# Make sure the output directory exists before writing anything.
os.makedirs('../data/processed', exist_ok=True)

# (frame, destination) pairs, written in this order without the index column.
exports = [
    (unified_guests, '../data/processed/unified_guests.csv'),
    (ads_df, '../data/processed/advertisers.csv'),
    (guest_ad_prefs, '../data/processed/guest_ad_preferences.csv'),
]
for frame, csv_path in exports:
    frame.to_csv(csv_path, index=False)

print('Saved all processed data!')