In [0]:
!pip install faker

In [0]:
import pandas as pd
from datetime import datetime, timedelta
import random
from faker import Faker
import numpy as np

# Initialize Faker with Spanish locale
# Faker, the stdlib `random` module and NumPy are all seeded with the same
# fixed value so that repeated runs draw the same pseudo-random sequences.
fake = Faker('es_ES')
Faker.seed(42)
random.seed(42)
np.random.seed(42)

# Configuration
# SK_COUNTRY / COUNTRY_NAME are stamped on every generated row that carries a
# country column. START_DATE / END_DATE are the default bounds of the time
# dimension (see generate_dim_time).
SK_COUNTRY = 1.0  # Venezuela
COUNTRY_NAME = "Venezuela"
START_DATE = datetime(2023, 1, 1)
END_DATE = datetime(2025, 12, 31)

# Sales fact configuration - 3 months of dense data
# The fact generator only emits rows for time-dimension days inside
# [SALES_START_DATE, SALES_END_DATE], and draws the per-store daily
# transaction count between 70% and 130% of SALES_PER_STORE_PER_DAY.
SALES_START_DATE = datetime(2025, 7, 1)  # July 2025
SALES_END_DATE = datetime(2025, 9, 30)   # September 2025
SALES_PER_STORE_PER_DAY = 50  # Average transactions per store per day

# Helper function to generate timestamps
def random_date(start, end):
    """Return ``start`` moved forward by a random whole number of days.

    The day offset is drawn with ``random.randint`` from 0 up to and
    including ``(end - start).days``, so the time-of-day of ``start`` is
    kept and the result never falls after ``end``.

    Args:
        start: Earliest datetime that can be returned.
        end: Upper bound used to size the day range.

    Returns:
        ``start`` plus the drawn number of days.
    """
    span_in_days = (end - start).days
    offset = timedelta(days=random.randint(0, span_in_days))
    return start + offset

# ==================== DIM_CURRENCY ====================
def generate_dim_currency():
    """Build the currency dimension: one row for VES and one for USD.

    Returns:
        pandas.DataFrame with two rows whose columns follow the order given
        in ``ordered_columns``. ``LAST_UPD`` and ``DB_EXTRACTION_TIMESTAMP``
        hold the wall-clock time at which each row was built.
    """
    # (surrogate key, abbreviation, display name, symbol)
    currency_specs = (
        (1.0, 'VES', 'Bolivar', 'Bs.'),
        (2.0, 'USD', 'Dolar Estadounidense', '$'),
    )

    rows = [
        {
            'SK_CURRENCY': key,
            'CURRENCY_ABR': abbreviation,
            'CURRENCY_NAME': display_name,
            'SYMBOL': symbol,
            'CREATED_DATE': datetime(2023, 1, 1),
            'CREATED_BY': 'SYSTEM',
            'LAST_UPD': datetime.now(),
            'LAST_UPD_BY': 'SYSTEM',
            'DB_EXTRACTION_TIMESTAMP': datetime.now(),
        }
        for key, abbreviation, display_name, symbol in currency_specs
    ]

    # Column order expected by the target table (SYMBOL sits after the audit
    # columns, unlike the order in which the row dicts are filled).
    ordered_columns = [
        'SK_CURRENCY', 'CURRENCY_ABR', 'CURRENCY_NAME', 'CREATED_DATE', 'CREATED_BY',
        'LAST_UPD', 'LAST_UPD_BY', 'SYMBOL', 'DB_EXTRACTION_TIMESTAMP'
    ]

    return pd.DataFrame(rows)[ordered_columns]

# ==================== DIM_SUPPLIER ====================
def generate_dim_supplier(n_suppliers=10):
    """Build the supplier dimension with ``n_suppliers`` synthetic rows.

    Supplier numbers run from 1 to ``n_suppliers``. The first entries take
    their name from a fixed list of laboratories; numbers beyond that list
    are named ``Proveedor <n>``. Contact details come from Faker and the
    indicator flags from the ``random`` module.

    Args:
        n_suppliers: Number of supplier rows to generate.

    Returns:
        pandas.DataFrame with one row per supplier, columns ordered as in
        ``ordered_columns``.
    """
    known_names = [
        'Laboratorios Leti', 'Pfizer Venezuela', 'GSK Venezuela',
        'Bayer Venezuela', 'Sanofi Venezuela', 'Novartis Venezuela',
        'Johnson & Johnson', 'Abbott Venezuela', 'Merck Venezuela',
        'Roche Venezuela', 'AstraZeneca', 'Boehringer Ingelheim'
    ]
    yes_no = ['Y', 'N']
    status_pool = ['A', 'A', 'A', 'I']  # 'A' three times out of four

    rows = []
    for number in range(1, n_suppliers + 1):
        if number <= len(known_names):
            display_name = known_names[number - 1]
        else:
            display_name = f'Proveedor {number}'

        # Suppliers 1-3 always get parent 0; later ones get 0 or their own number.
        if number > 3:
            parent = random.choice([0, number])
        else:
            parent = 0

        rows.append({
            'SK_SUPPLIER': float(number),
            'SK_COUNTRY': SK_COUNTRY,
            'SUPPLIER': number,
            'SUP_NAME': display_name,
            'SUP_NAME_SECONDARY': f'SUP{number:03d}',
            'SUPPLIER_PARENT': parent,
            'SUP_STATUS': random.choice(status_pool),
            'PRE_MARK_IND': random.choice(yes_no),
            'EDI_SUPP_AVAILABLE_IND': random.choice(yes_no),
            'STATUS_UPD_BY_RMS': 'Y',
            'RET_ALLOW_IND': random.choice(yes_no),
            'CONTACT_NAME': fake.name(),
            'CONTACT_PHONE': fake.phone_number(),
            'CONTACT_EMAIL': fake.email(),
            'CREATED_DATE': datetime(2023, 1, 1),
            'CREATED_BY': 'SYSTEM',
            'LAST_UPD': datetime.now(),
            'LAST_UPD_BY': 'SYSTEM',
            'DB_EXTRACTION_TIMESTAMP': datetime.now(),
        })

    # Column order expected by the target table.
    ordered_columns = [
        'SK_SUPPLIER', 'SK_COUNTRY', 'SUPPLIER', 'SUP_NAME', 'SUP_NAME_SECONDARY',
        'SUPPLIER_PARENT', 'SUP_STATUS', 'PRE_MARK_IND', 'EDI_SUPP_AVAILABLE_IND',
        'STATUS_UPD_BY_RMS', 'RET_ALLOW_IND', 'CONTACT_NAME', 'CONTACT_PHONE',
        'CONTACT_EMAIL', 'CREATED_DATE', 'CREATED_BY', 'LAST_UPD', 'LAST_UPD_BY',
        'DB_EXTRACTION_TIMESTAMP'
    ]

    return pd.DataFrame(rows)[ordered_columns]

# ==================== DIM_PRODUCT ====================
def generate_dim_product(n_suppliers=10, products_per_supplier=100):
    """Build the product dimension with synthetic pharmacy-retail items.

    Every supplier number from 1 to ``n_suppliers`` receives
    ``products_per_supplier`` products. ``SK_PRODUCT`` is a running counter
    across all suppliers and the owning supplier number is stored in
    ``BUD_INT``. Department, class and division are drawn from small fixed
    hierarchies defined inside the function.

    Attribute pairs that describe the same thing are derived from a single
    random draw so they cannot contradict each other: ``SUBCLASS`` with
    ``SUB_NAME``, and the brand embedded in ``ITEM_DESC`` with ``BRAND_NAME``.

    Args:
        n_suppliers: Number of suppliers to create products for.
        products_per_supplier: Number of products generated per supplier.

    Returns:
        pandas.DataFrame with one row per product, columns ordered as in
        ``column_order``. When no product is requested the frame is empty
        but still carries every column.
    """
    # Column order expected by the target table.
    column_order = [
        'SK_PRODUCT', 'SK_COUNTRY', 'ITEM', 'ITEM_DESC', 'ITEM_NUMBER_TYPE', 'ITEM_PARENT',
        'PACK_IND', 'MERCHANDISE_IND', 'SUBCLASS', 'SUB_NAME', 'SUBCLASS_ID', 'CLASS',
        'CLASS_NAME', 'CLASS_ID', 'DEPT', 'DEPT_NAME', 'GROUP_NO', 'GROUP_NAME', 'DIVISION',
        'DIV_NAME', 'UNIT', 'UNIT_NAME', 'BARCODE', 'BARCODE_TYPE', 'SELLABLE_IND',
        'ORDERABLE_IND', 'STATUS', 'INVENTORY_IND', 'CREATED_DATE', 'CREATED_BY', 'LAST_UPD',
        'LAST_UPD_BY', 'HIERARCHICAL_NAME', 'ITEM_EXCLUDE', 'DESC_ITEM_EXCLUDE',
        'HIERARCHICAL_UNIT', 'X_ORIGEN_SYSTEM', 'OWN_BRAND', 'BUD_INT', 'BRAND_NAME', 'PPV',
        'MIGRADO', 'DB_EXTRACTION_TIMESTAMP'
    ]

    # Product categories for pharmaceutical retail
    departments = [
        {'dept': 1, 'name': 'Medicamentos', 'division': 1, 'div_name': 'Farmacia'},
        {'dept': 2, 'name': 'Cuidado Personal', 'division': 2, 'div_name': 'Belleza y Cuidado'},
        {'dept': 3, 'name': 'Nutricion', 'division': 3, 'div_name': 'Alimentos y Bebidas'},
        {'dept': 4, 'name': 'Infantil', 'division': 4, 'div_name': 'Bebe y Mama'}
    ]

    # Classes available inside each department, keyed by department number.
    classes = {
        1: [{'class': 101, 'name': 'Analgesicos'}, {'class': 102, 'name': 'Antibioticos'}, {'class': 103, 'name': 'Vitaminas'}],
        2: [{'class': 201, 'name': 'Shampoo'}, {'class': 202, 'name': 'Jabones'}, {'class': 203, 'name': 'Cremas'}],
        3: [{'class': 301, 'name': 'Suplementos'}, {'class': 302, 'name': 'Bebidas'}, {'class': 303, 'name': 'Snacks'}],
        4: [{'class': 401, 'name': 'Panales'}, {'class': 402, 'name': 'Formulas'}, {'class': 403, 'name': 'Accesorios'}]
    }

    brands = ['Generico', 'Premium', 'Natural', 'Essential', 'Advanced', 'Classic']

    products = []
    sk_product = 1

    for supplier_id in range(1, n_suppliers + 1):
        for _ in range(products_per_supplier):
            dept = random.choice(departments)
            selected_class = random.choice(classes[dept['dept']])

            # Department + class + zero-padded surrogate key.
            item_code = f"{dept['dept']}{selected_class['class']}{sk_product:06d}"

            # Draw the description word and the brand once; the brand feeds
            # both ITEM_DESC and BRAND_NAME so the two always agree.
            desc_word = fake.word().capitalize()
            brand = random.choice(brands)

            # ~70% of the items reference themselves as parent, the rest have none.
            item_parent = item_code if random.random() > 0.3 else None
            pack_ind = random.choice(['Y', 'N'])

            # One draw for the subclass so its code and its name match.
            subclass = random.randint(1, 5)

            product = {
                'SK_PRODUCT': float(sk_product),
                'SK_COUNTRY': SK_COUNTRY,
                'ITEM': item_code,
                'ITEM_DESC': f"{desc_word} {selected_class['name']} {brand}",
                'ITEM_NUMBER_TYPE': 'SKU',
                'ITEM_PARENT': item_parent,
                'PACK_IND': pack_ind,
                'MERCHANDISE_IND': 'Y',
                'SUBCLASS': subclass,
                'SUB_NAME': f"SubClase {subclass}",
                'SUBCLASS_ID': random.randint(1000, 1999),
                'CLASS': selected_class['class'],
                'CLASS_NAME': selected_class['name'],
                'CLASS_ID': selected_class['class'],
                'DEPT': dept['dept'],
                'DEPT_NAME': dept['name'],
                'GROUP_NO': dept['dept'],
                'GROUP_NAME': dept['name'],
                'DIVISION': dept['division'],
                'DIV_NAME': dept['div_name'],
                'UNIT': random.randint(1, 10),
                'UNIT_NAME': random.choice(['Unidad', 'Caja', 'Frasco', 'Blister']),
                'BARCODE': f"750{random.randint(10000000, 99999999)}",
                'BARCODE_TYPE': 'UPC',
                'SELLABLE_IND': 'Y',
                'ORDERABLE_IND': 'Y',
                'STATUS': random.choice(['A', 'A', 'A', 'I']),
                'INVENTORY_IND': 'Y',
                'CREATED_DATE': datetime(2023, 1, 1),
                'CREATED_BY': 'SYSTEM',
                'LAST_UPD': datetime.now(),
                'LAST_UPD_BY': 'SYSTEM',
                'HIERARCHICAL_NAME': f"{dept['div_name']}/{dept['name']}/{selected_class['name']}",
                'ITEM_EXCLUDE': random.choice(['Y', 'N']),
                'DESC_ITEM_EXCLUDE': '',
                'HIERARCHICAL_UNIT': float(dept['dept']),
                'X_ORIGEN_SYSTEM': 'RMS',
                'OWN_BRAND': random.choice(['Y', 'N']),
                'BUD_INT': float(supplier_id),
                'BRAND_NAME': brand,
                'PPV': random.choice(['Y', 'N']),
                'MIGRADO': 'N',
                'DB_EXTRACTION_TIMESTAMP': datetime.now()
            }
            products.append(product)
            sk_product += 1

    # Passing `columns` fixes the order and keeps the frame well-formed even
    # when `products` is empty (selecting columns afterwards would raise).
    return pd.DataFrame(products, columns=column_order)

# ==================== DIM_ORGANIZATION ====================
def generate_dim_organization(n_stores=100):
    """Build the store (organization) dimension with ``n_stores`` rows.

    Each store is placed in a randomly chosen city from a fixed list; region
    codes/names and the latitude/longitude ranges come from that city.
    Manager and mall names come from Faker, everything else from ``random``.

    Attribute pairs that describe the same thing are derived from a single
    random draw so they cannot contradict each other: ``DISTRICT`` with
    ``DISTRICT_NAME`` and ``LOGICAL_DISTRICT`` with ``LOGICAL_DISTRICT_NAME``.
    The selling area is capped by the total area of the same store.

    NOTE: ``STORE_ACTIVE`` is drawn independently of ``STORE_CLOSE_DATE``, and
    the three open/planning/inauguration dates are independent draws as well.
    The upper bound of ``STORE_CLOSE_DATE`` is the current wall-clock time, so
    that column is not reproducible across days even with a fixed seed.

    Args:
        n_stores: Number of store rows to generate.

    Returns:
        pandas.DataFrame with one row per store, columns ordered as in
        ``column_order``. With ``n_stores=0`` the frame is empty but still
        carries every column.
    """
    # Column order expected by the target table.
    column_order = [
        'SK_ORGANIZATION', 'STORE_ID', 'STORE_NAME', 'DISTRICT', 'FORMAT_NAME', 'STORE_FORMAT',
        'DISTRICT_NAME', 'REGION', 'REGION_NAME', 'SK_COUNTRY', 'COUNTRY_NAME', 'CITY',
        'STORE_OPEN_DATE', 'CREATED_DATE', 'CREATED_BY', 'LAST_UPD', 'LAST_UPD_BY',
        'STORE_CLOSE_DATE', 'STRATUS', 'STRATUS_H', 'DEFAULT_WH', 'DISTRICT_MGR_NAME',
        'MALL_NAME', 'REGION_MGR_NAME', 'SISTER_STORE', 'STORE_CLASS', 'STORE_MGR_NAME',
        'TRANSFER_ZONE', 'VAT_REGION', 'SELLING_SQUARE_ST', 'TOTAL_SQUARE_FT',
        'STORE_PLANNING_OPEN_DATE', 'STORE_PLANNING_STATUS', 'STORE_NAME_PLANNING',
        'STORE_ACTIVE', 'GENERIC_STORE_LAYER', 'LATITUDE', 'LONGITUDE', 'CITY_ABR',
        'STORE_INAUGURATION_DATE', 'GENERIC_STORE_CODE', 'LOGICAL_DISTRICT',
        'LOGICAL_DISTRICT_NAME', 'LOGICAL_REGION', 'LOGICAL_REGION_NAME', 'STORE_LAYER',
        'STORE_NAME_SECONDARY', 'STORE_NAME10', 'STORE_NAME3', 'VAT_INCLUDE_IND',
        'STORE_SUB_LAYER', 'STORE_SUB_LAYER_2', 'DB_EXTRACTION_TIMESTAMP'
    ]

    # Venezuelan cities
    cities = [
        {'name': 'Caracas', 'region': 1, 'region_name': 'Capital', 'lat_range': (10.4, 10.5), 'lon_range': (-66.9, -66.8)},
        {'name': 'Maracaibo', 'region': 2, 'region_name': 'Occidente', 'lat_range': (10.6, 10.7), 'lon_range': (-71.7, -71.6)},
        {'name': 'Valencia', 'region': 3, 'region_name': 'Centro', 'lat_range': (10.1, 10.2), 'lon_range': (-68.0, -67.9)},
        {'name': 'Barquisimeto', 'region': 3, 'region_name': 'Centro', 'lat_range': (10.0, 10.1), 'lon_range': (-69.4, -69.3)},
        {'name': 'Maracay', 'region': 3, 'region_name': 'Centro', 'lat_range': (10.2, 10.3), 'lon_range': (-67.6, -67.5)},
        {'name': 'Barcelona', 'region': 4, 'region_name': 'Oriente', 'lat_range': (10.1, 10.2), 'lon_range': (-64.7, -64.6)},
        {'name': 'Maturin', 'region': 4, 'region_name': 'Oriente', 'lat_range': (9.7, 9.8), 'lon_range': (-63.2, -63.1)}
    ]

    formats = ['Express', 'Plus', 'Super', 'Mega']
    layers = ['A', 'B', 'C']

    stores = []

    for i in range(1, n_stores + 1):
        city = random.choice(cities)
        store_format = random.choice(formats)

        # One draw per district so the numeric code and its label match.
        district = random.randint(1, 10)
        logical_district = random.randint(1, 10)

        # Draw the total area first, then a selling area that fits inside it.
        total_square_ft = random.randint(1000, 3000)
        selling_square_ft = random.randint(800, min(2500, total_square_ft))

        store = {
            'SK_ORGANIZATION': float(i),
            'STORE_ID': float(i),
            'STORE_NAME': f'Farmatodo {city["name"]} {i}',
            'DISTRICT': district,
            'FORMAT_NAME': store_format,
            # 1-based position of the format in `formats`.
            'STORE_FORMAT': formats.index(store_format) + 1,
            'DISTRICT_NAME': f'Distrito {district}',
            'REGION': city['region'],
            'REGION_NAME': city['region_name'],
            'SK_COUNTRY': SK_COUNTRY,
            'COUNTRY_NAME': COUNTRY_NAME,
            'CITY': city['name'],
            'STORE_OPEN_DATE': random_date(datetime(2010, 1, 1), datetime(2023, 12, 31)),
            'CREATED_DATE': datetime(2023, 1, 1),
            'CREATED_BY': 'SYSTEM',
            'LAST_UPD': datetime.now(),
            'LAST_UPD_BY': 'SYSTEM',
            # About 5% of the stores receive a close date on or after 2024-01-01.
            'STORE_CLOSE_DATE': None if random.random() > 0.05 else random_date(datetime(2024, 1, 1), datetime.now()),
            'STRATUS': random.choice(['A', 'B', 'C']),
            'STRATUS_H': random.choice(['H1', 'H2', 'H3']),
            'DEFAULT_WH': random.randint(1, 5),
            'DISTRICT_MGR_NAME': fake.name(),
            'MALL_NAME': f'C.C. {fake.company()}' if random.random() > 0.5 else None,
            'REGION_MGR_NAME': fake.name(),
            'SISTER_STORE': random.choice([0, random.randint(1, n_stores)]),
            'STORE_CLASS': random.choice(['A', 'B', 'C']),
            'STORE_MGR_NAME': fake.name(),
            'TRANSFER_ZONE': random.randint(1, 5),
            'VAT_REGION': city['region'],
            'SELLING_SQUARE_ST': selling_square_ft,
            'TOTAL_SQUARE_FT': total_square_ft,
            'STORE_PLANNING_OPEN_DATE': random_date(datetime(2010, 1, 1), datetime(2023, 12, 31)),
            'STORE_PLANNING_STATUS': 'OPEN',
            'STORE_NAME_PLANNING': f'FTD_{i:03d}',
            # About 5% of the stores are flagged inactive.
            'STORE_ACTIVE': 'Y' if random.random() > 0.05 else 'N',
            'GENERIC_STORE_LAYER': random.choice(layers),
            'LATITUDE': random.uniform(*city['lat_range']),
            'LONGITUDE': random.uniform(*city['lon_range']),
            'CITY_ABR': city['name'][:3].upper(),
            'STORE_INAUGURATION_DATE': random_date(datetime(2010, 1, 1), datetime(2023, 12, 31)),
            'GENERIC_STORE_CODE': f'VE{i:04d}',
            'LOGICAL_DISTRICT': logical_district,
            'LOGICAL_DISTRICT_NAME': f'Distrito Logico {logical_district}',
            'LOGICAL_REGION': city['region'],
            'LOGICAL_REGION_NAME': city['region_name'],
            'STORE_LAYER': random.choice(layers),
            'STORE_NAME_SECONDARY': f'FTD{i:03d}',
            'STORE_NAME10': f'FRMTD{i:05d}',
            'STORE_NAME3': f'F{i:02d}',
            'VAT_INCLUDE_IND': 'Y',
            'STORE_SUB_LAYER': f'Sub{random.choice(layers)}',
            'STORE_SUB_LAYER_2': f'Sub2_{random.choice(layers)}',
            'DB_EXTRACTION_TIMESTAMP': datetime.now()
        }
        stores.append(store)

    # Passing `columns` fixes the order and keeps the frame well-formed even
    # when `stores` is empty (selecting columns afterwards would raise).
    return pd.DataFrame(stores, columns=column_order)

# ==================== DIM_TIME ====================
def generate_dim_time(start_date=START_DATE, end_date=END_DATE):
    """Build the calendar dimension with one row per day.

    Rows cover every day from ``start_date`` to ``end_date`` inclusive.
    ``SK_TIME`` is the date written as ``YYYYMMDD`` and stored as a float.
    Week numbers come from ``isocalendar()``; day and month names come from
    ``strftime``. The fiscal / 4-5-4 calendar columns listed in
    ``placeholder_fields`` are present but always ``None``.

    Args:
        start_date: First day of the calendar.
        end_date: Last day of the calendar.

    Returns:
        pandas.DataFrame whose columns appear in the order the keys are
        inserted below (populated columns first, then the placeholders);
        no further reordering is applied.
    """
    # Columns that exist in the output but are left unpopulated.
    placeholder_fields = (
        'SK_TIME_WEEK', 'SK_TIME_WEEK_YAGO', 'SK_TIME_YAGO', 'SK_TIME_YAGO_WEEK_454',
        'NU_YEAR_FISCAL', 'NU_QUARTER_FISCAL', 'NB_QUARTER_FISCAL', 'NU_HALF_FISCAL',
        'NB_HALF_FISCAL', 'NU_MONTH_454', 'NB_MONTH_454', 'ID_MONTH_454', 'NU_WEEK_454',
        'NB_WEEK_454', 'NU_YEAR_454', 'NU_MONTH_454_FISCAL', 'NB_MONTH_454_FISCAL',
        'ID_MONTH_454_FISCAL', 'NU_YEAR_454_FISCAL', 'NU_QUARTER_454_FISCAL',
        'NB_QUARTER_454_FISCAL', 'NU_HALF_454_FISCAL', 'NB_HALF_454_FISCAL', 'ID_PERIOD',
        'NU_PERIOD', 'NB_PERIOD', 'NB_MONTH_P', 'NO_OF_WEEKS', 'NU_WEEK_YEAR_FISCAL',
        'WEEK_START_DATE', 'WEEK_END_DATE', 'MONTH_START_DATE_454', 'MONTH_END_DATE_454',
        'QUARTER_FISCAL_START_DATE', 'QUARTER_FISCAL_END_DATE', 'HALF_START_DATE',
        'HALF_END_DATE', 'HALF_FISCAL_START_DATE', 'HALF_FISCAL_END_DATE',
        'YEAR_FISCAL_START_DATE', 'YEAR_FISCAL_END_DATE',
    )
    one_day = timedelta(days=1)

    rows = []
    for day in pd.date_range(start=start_date, end=end_date, freq='D'):
        iso_week = day.isocalendar()[1]
        quarter = (day.month - 1) // 3 + 1
        half = 1 if day.month <= 6 else 2

        month_start = day.replace(day=1)
        quarter_start = day.replace(month=(quarter - 1) * 3 + 1, day=1)

        # Period ends: jump past the end of the period (32 days always leaves
        # a month, 95 days always leaves a quarter), snap to the first of the
        # month reached, then step back one day.
        month_end = (month_start + timedelta(days=32)).replace(day=1) - one_day
        quarter_end = (quarter_start + timedelta(days=95)).replace(day=1) - one_day

        row = {
            'SK_TIME': float(day.strftime('%Y%m%d')),
            'TIME_ID': day,
            'NU_DAY': float(day.day),
            'NU_DAY_WEEK': float(day.weekday() + 1),  # Monday = 1 ... Sunday = 7
            'NB_DAY_WEEK': day.strftime('%A'),
            'NU_WEEK_YEAR': float(iso_week),
            'NB_WEEK_YEAR': f'W{iso_week:02d}',
            'NU_MONTH': float(day.month),
            'NB_MONTH': day.strftime('%B'),
            'NB_MONTH_N': day.strftime('%b'),
            'ID_MONTH': float(day.strftime('%Y%m')),
            'MONTH_START_DATE': month_start,
            'MONTH_END_DATE': month_end,
            'NU_QUARTER': float(quarter),
            'NB_QUARTER': f'Q{quarter}',
            'QUARTER_START_DATE': quarter_start,
            'QUARTER_END_DATE': quarter_end,
            'NU_YEAR': float(day.year),
            'YEAR_START_DATE': day.replace(month=1, day=1),
            'YEAR_END_DATE': day.replace(month=12, day=31),
            'NU_HALF': float(half),
            'NB_HALF': f'H{half}',
            'TIME_CALENDAR': day.strftime('%Y-%m-%d'),
            'TIME_EXCLUDE': None,
            'CREATED_DATE': datetime(2023, 1, 1),
            'CREATED_BY': 'SYSTEM',
            'LAST_UPD': datetime.now(),
            'LAST_UPD_BY': 'SYSTEM',
            'DB_EXTRACTION_TIMESTAMP': datetime.now(),
        }
        # Append the unpopulated columns after the populated ones.
        row.update(dict.fromkeys(placeholder_fields))
        rows.append(row)

    return pd.DataFrame(rows)

# ==================== DIM_CHANNEL ====================
def generate_dim_channel():
    """Build the sales-channel dimension (store, web, app, call center).

    Returns:
        pandas.DataFrame with four rows, columns ordered as in
        ``ordered_columns``. In every row ``CHANNEL_ID`` carries the same
        value as ``SK_CHANNEL``.
    """
    # (key, name, type code, type name, origin code)
    channel_specs = (
        (1.0, 'Tienda Fisica', 1.0, 'Retail', 'STORE'),
        (2.0, 'E-Commerce', 2.0, 'Online', 'WEB'),
        (3.0, 'App Movil', 2.0, 'Online', 'APP'),
        (4.0, 'Call Center', 3.0, 'Telefonico', 'CALL'),
    )

    rows = [
        {
            'SK_CHANNEL': key,
            'CHANNEL_NAME': name,
            'CHANNEL_TYPE': type_code,
            'CHANNEL_TYPE_NAME': type_name,
            'CHANNEL_ID': key,
            'CHANNEL_NAME_ORIGIN': origin,
            'SK_COUNTRY': SK_COUNTRY,
            'CREATED_DATE': datetime(2023, 1, 1),
            'CREATED_BY': 'SYSTEM',
            'LAST_UPD': datetime.now(),
            'LAST_UPD_BY': 'SYSTEM',
            'DB_EXTRACTION_TIMESTAMP': datetime.now(),
        }
        for key, name, type_code, type_name, origin in channel_specs
    ]

    # Column order expected by the target table.
    ordered_columns = [
        'SK_CHANNEL', 'CHANNEL_NAME', 'CHANNEL_TYPE', 'CHANNEL_TYPE_NAME', 'CHANNEL_ID',
        'SK_COUNTRY', 'LAST_UPD', 'CREATED_DATE', 'CREATED_BY', 'LAST_UPD_BY',
        'CHANNEL_NAME_ORIGIN', 'DB_EXTRACTION_TIMESTAMP'
    ]

    return pd.DataFrame(rows)[ordered_columns]

# ==================== FACT_SALES_PRODUCT ====================
def generate_fact_sales_product(dim_time, dim_product, dim_organization, dim_channel, 
                                 dim_currency, dim_supplier):
    """Generate the synthetic sales fact table at transaction-line grain.

    For every day of ``dim_time`` inside the configured sales window and
    every store flagged active, a random number of single-product
    transactions is produced (70%-130% of ``SALES_PER_STORE_PER_DAY``).
    Products are sampled from the rows of ``dim_product`` with status ``'A'``
    and the supplier key of each sale is that product's ``BUD_INT``.

    Amounts: ``TOTAL_SALE_TAX`` is ``units * unit_price`` and is treated as
    the tax-inclusive amount. ``TOTAL_SALE`` is the net amount obtained by
    dividing out the tax rate, and ``TAX`` is the difference between the two,
    so ``TAX`` equals the tax rate applied to the net sale (up to rounding).

    Args:
        dim_time: Time dimension; ``SK_TIME`` and ``TIME_ID`` are read.
        dim_product: Product dimension; ``SK_PRODUCT``, ``STATUS`` and
            ``BUD_INT`` are read. ``SK_PRODUCT`` is expected to be unique.
        dim_organization: Store dimension; ``SK_ORGANIZATION`` and
            ``STORE_ACTIVE`` are read.
        dim_channel: Channel dimension; ``SK_CHANNEL`` is read.
        dim_currency: Accepted for interface symmetry; not read (currency
            keys 1.0 / 2.0 are assigned directly).
        dim_supplier: Accepted for interface symmetry; not read.

    Returns:
        pandas.DataFrame with one row per generated transaction, columns
        ordered as in ``column_order``. If nothing is generated the frame is
        empty but still carries every column.
    """
    # Column order expected by the target table.
    column_order = [
        'SK_TIME', 'SK_COUNTRY', 'SK_ORGANIZATION', 'SK_CHANNEL', 'SK_PRODUCT', 'SK_CURRENCY',
        'SK_SUPPLIER', 'BUSINESS_DATE', 'TOTAL_SALE', 'UNITS', 'LAST_UPD', 'CREATED_DATE',
        'CREATED_BY', 'LAST_UPD_BY', 'SK_MODALITY', 'TOTAL_SALE_TAX', 'TAX', 'TOTAL_COST',
        'Q_TRANS', 'GENERIC_TOTAL_COST', 'DB_EXTRACTION_TIMESTAMP'
    ]
    tax_rate = 0.16  # IVA Venezuela

    # Get active stores
    active_stores = dim_organization.loc[
        dim_organization['STORE_ACTIVE'] == 'Y', 'SK_ORGANIZATION'
    ].tolist()

    # Get active products, plus a key -> supplier map so the inner loop does
    # a dict lookup instead of scanning the whole product frame per sale.
    active_rows = dim_product[dim_product['STATUS'] == 'A']
    active_products = active_rows['SK_PRODUCT'].tolist()
    supplier_by_product = dict(zip(active_products, active_rows['BUD_INT'].tolist()))

    # Filter time dimension for sales period, keeping each key with its date.
    in_sales_window = (
        (dim_time['TIME_ID'] >= SALES_START_DATE) &
        (dim_time['TIME_ID'] <= SALES_END_DATE)
    )
    sales_dates = dim_time.loc[in_sales_window, 'SK_TIME'].tolist()
    business_dates = dim_time.loc[in_sales_window, 'TIME_ID'].tolist()

    # Loop invariants hoisted out of the per-transaction loop.
    channel_keys = dim_channel['SK_CHANNEL'].tolist()
    min_transactions = int(SALES_PER_STORE_PER_DAY * 0.7)
    max_transactions = int(SALES_PER_STORE_PER_DAY * 1.3)

    print(f"Generating sales for {len(sales_dates)} days across {len(active_stores)} stores...")
    print(f"Expected records: ~{len(sales_dates) * len(active_stores) * SALES_PER_STORE_PER_DAY:,}")

    sales = []

    # Generate sales for each day and store
    for sk_time, business_date in zip(sales_dates, business_dates):
        for sk_organization in active_stores:
            # Generate multiple transactions per store per day
            n_transactions = random.randint(min_transactions, max_transactions)

            for _ in range(n_transactions):
                sk_product = random.choice(active_products)

                units = random.randint(1, 5)  # Most purchases are 1-5 units
                unit_price = round(random.uniform(5, 500), 2)

                # Tax-inclusive amount -> net amount -> tax as the remainder,
                # which guarantees TOTAL_SALE + TAX == TOTAL_SALE_TAX.
                total_sale_tax = round(units * unit_price, 2)
                total_sale = round(total_sale_tax / (1 + tax_rate), 2)
                tax = round(total_sale_tax - total_sale, 2)

                # Cost is 40%-70% of the net sale; the generic cost wobbles
                # +/-5% around it.
                cost_margin = random.uniform(0.4, 0.7)
                total_cost = round(total_sale * cost_margin, 2)
                generic_cost = round(total_cost * random.uniform(0.95, 1.05), 2)

                # Weight currency selection - 70% VES, 30% USD for Venezuela
                sk_currency = 1.0 if random.random() < 0.7 else 2.0

                sales.append({
                    'SK_TIME': sk_time,
                    'SK_COUNTRY': SK_COUNTRY,
                    'SK_ORGANIZATION': sk_organization,
                    'SK_CHANNEL': random.choice(channel_keys),
                    'SK_PRODUCT': sk_product,
                    'SK_CURRENCY': sk_currency,
                    'SK_SUPPLIER': supplier_by_product[sk_product],
                    'BUSINESS_DATE': business_date,
                    'TOTAL_SALE': total_sale,
                    'UNITS': float(units),
                    'LAST_UPD': datetime.now(),
                    'CREATED_DATE': business_date,
                    'CREATED_BY': 'SYSTEM',
                    'LAST_UPD_BY': 'SYSTEM',
                    'SK_MODALITY': 1.0,
                    'TOTAL_SALE_TAX': total_sale_tax,
                    'TAX': tax,
                    'TOTAL_COST': total_cost,
                    'Q_TRANS': 1.0,
                    'GENERIC_TOTAL_COST': generic_cost,
                    'DB_EXTRACTION_TIMESTAMP': datetime.now()
                })

    # Passing `columns` fixes the order and keeps the frame well-formed even
    # when no sale was generated (selecting columns afterwards would raise).
    return pd.DataFrame(sales, columns=column_order)

# ==================== MAIN EXECUTION ====================
# The six dimensions are built first; the fact table comes last because its
# generator reads keys from the dimension frames.
print("Generating dimensional tables...")

# Generate dimensions
print("1. Generating dim_currency...")
dim_currency = generate_dim_currency()

print("2. Generating dim_supplier...")
dim_supplier = generate_dim_supplier(10)

print("3. Generating dim_product...")
dim_product = generate_dim_product(10, 100)

print("4. Generating dim_organization...")
dim_organization = generate_dim_organization(100)

print("5. Generating dim_time...")
dim_time = generate_dim_time()

print("6. Generating dim_channel...")
dim_channel = generate_dim_channel()

print("7. Generating fact_sales_product...")
fact_sales = generate_fact_sales_product(
    dim_time,
    dim_product,
    dim_organization,
    dim_channel,
    dim_currency,
    dim_supplier,
)

# Row-count report, one line per generated table.
_rule = "=" * 50
print("\n" + _rule)
print("DATA GENERATION SUMMARY")
print(_rule)
for _label, _frame in (
    ("dim_currency", dim_currency),
    ("dim_supplier", dim_supplier),
    ("dim_product", dim_product),
    ("dim_organization", dim_organization),
    ("dim_time", dim_time),
    ("dim_channel", dim_channel),
    ("fact_sales_product", fact_sales),
):
    print(f"{_label} records: {len(_frame)}")

# ==================== SAVE TO DATABRICKS ====================
# Converts every pandas frame to a Spark DataFrame and overwrites the matching
# Delta table (data and schema) in the target schema. `spark` is the session
# object provided by the notebook runtime.
print("\nWriting data to Databricks Delta tables...")

# Convert pandas DataFrames to Spark DataFrames
print("Converting to Spark DataFrames...")
spark_dim_currency = spark.createDataFrame(dim_currency)
spark_dim_supplier = spark.createDataFrame(dim_supplier)
spark_dim_product = spark.createDataFrame(dim_product)
spark_dim_organization = spark.createDataFrame(dim_organization)
spark_dim_time = spark.createDataFrame(dim_time)
spark_dim_channel = spark.createDataFrame(dim_channel)
spark_fact_sales = spark.createDataFrame(fact_sales)

# Single place for the target schema name instead of repeating the literal.
TARGET_SCHEMA = "workspace.operations"

# Create schema if it doesn't exist
print(f"Ensuring {TARGET_SCHEMA} schema exists...")
spark.sql(f"CREATE SCHEMA IF NOT EXISTS {TARGET_SCHEMA}")

# (table name, Spark DataFrame, partition column or None), in write order.
_delta_targets = [
    ("gold_dim_currency", spark_dim_currency, None),
    ("gold_dim_supplier", spark_dim_supplier, None),
    ("gold_dim_product", spark_dim_product, None),
    ("gold_dim_organization", spark_dim_organization, None),
    ("gold_dim_time", spark_dim_time, None),
    ("gold_dim_channel", spark_dim_channel, None),
    ("gold_fact_sales_product", spark_fact_sales, "BUSINESS_DATE"),
]

# Write to Delta tables with overwrite mode; overwriteSchema lets a changed
# column set replace the existing table definition.
for _table_name, _spark_df, _partition_col in _delta_targets:
    _note = f" (partitioned by {_partition_col})" if _partition_col else ""
    print(f"Writing {_table_name}{_note}...")

    _writer = (
        _spark_df.write.format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
    )
    if _partition_col:
        _writer = _writer.partitionBy(_partition_col)
    _writer.saveAsTable(f"{TARGET_SCHEMA}.{_table_name}")

print("\n" + "="*50)
print("✅ ALL TABLES SUCCESSFULLY WRITTEN!")
print("="*50)
print(f"Schema: {TARGET_SCHEMA}")
print("Tables created:")
for _table_name, _spark_df, _partition_col in _delta_targets:
    _note = " (partitioned)" if _partition_col else ""
    print(f"  - {_table_name}{_note}")
print("="*50)