**1) Environment Setup, Install, and Load Dependencies**


This cell installs Faker, mounts Google Drive, defines the output path and the CSV utility functions, and then loads the foreign key IDs needed by the later cells.

In [None]:
### 1. Setup, Install, Imports, and Load Foreign Key IDs ###

# Reinstall Faker
%pip install faker

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

import csv
import random
import json
import os
from datetime import datetime, timedelta
from faker import Faker

# --- CONFIGURATION (Must match Notebooks 1 & 2) ---
# Fixed seeds for both Faker and the stdlib random module so that re-running
# the notebook regenerates the same values.
fake = Faker()
Faker.seed(42)
random.seed(42)

# Folder on the mounted Drive that load_ids() reads from and write_csv() writes to.
output_dir = '/content/drive/MyDrive/SQL Project (Group 2)/Simulating Data (Maya)/Simulated Data Files'
os.makedirs(output_dir, exist_ok=True)  # no error if the folder already exists

# --- Data Utility Functions (Redefined) ---

def load_ids(filename):
    """Load a single column of integer IDs from a CSV file in ``output_dir``.

    The first row is treated as a header and skipped. For every remaining
    row the first field is converted with ``int``; rows that are empty or
    whose first field is blank are ignored.

    Args:
        filename: Name of the CSV file, joined onto ``output_dir``.

    Returns:
        A list of ints in file order. The list is empty when the file is
        missing (a message is printed instead of raising) or when the file
        contains no data rows.

    Raises:
        ValueError: If a non-blank first field is not a valid integer.
    """
    ids = []
    filepath = os.path.join(output_dir, filename)
    try:
        # Same encoding that write_csv uses when producing these files.
        with open(filepath, 'r', newline='', encoding='utf-8') as file:
            reader = csv.reader(file)
            # Skip header; the default keeps an empty file from raising StopIteration.
            next(reader, None)
            for row in reader:
                # csv.reader yields [] for blank lines; skip those and blank fields
                # instead of failing on row[0] / int('').
                if not row or not row[0].strip():
                    continue
                ids.append(int(row[0]))
    except FileNotFoundError:
        print(f"Error: {filename} not found. Please ensure preceding notebooks were run and check your file path.")
    return ids

def write_csv(filename, headers, data_rows):
    """
    Write a header row followed by the data rows to a CSV file in output_dir.

    Any value equal to the empty string is replaced with None before writing.
    csv.writer emits None as an empty field, so such values end up as empty
    fields in the file, which is how this project marks a value meant to be
    read as SQL NULL.
    """
    target_path = os.path.join(output_dir, filename)
    with open(target_path, 'w', newline='', encoding='utf-8') as handle:
        out = csv.writer(handle)
        out.writerow(headers)
        # Normalise every record first, then write them all in one call.
        normalised = []
        for record in data_rows:
            normalised.append([None if cell == '' else cell for cell in record])
        out.writerows(normalised)

# --- Load Necessary Foreign Keys ---
# Address IDs 111 through 150 (40 values) are set aside for properties, so
# several units can share one address in the multi-unit simulation.
property_address_ids = [*range(111, 151)]
# Client IDs saved by Notebook 2, used for the client foreign-key columns.
client_ids = load_ids('client_ids.csv')

Collecting faker
  Downloading faker-38.0.0-py3-none-any.whl.metadata (15 kB)
Downloading faker-38.0.0-py3-none-any.whl (2.0 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.0/2.0 MB 21.7 MB/s eta 0:00:00
Installing collected packages: faker
Successfully installed faker-38.0.0
Mounted at /content/drive


### 2) Generate Client Preferences and Property Inventory

Creates the property listing base (`properties`) and the buyer criteria (`client_preferences`). The **`properties`** table forms the foundation for the next data stage by generating 100 primary keys (PKs) that will be referenced by listings and transactions.

* **Client Preferences (Level 4 Dependency):** Generated for the first **80 clients** only, simulating incomplete buyer profiles.
* **Property Inventory (Level 4 Dependency):** 100 properties are generated, deliberately reusing a pool of 40 addresses (IDs 111-150) to simulate dense urban environments.

**3) Generate Client Preferences and Properties**


This cell generates the client preference and property data and saves them as CSV files, together with the `property_ids.csv` reference file used by the next notebook.

In [None]:
# --- 6. Generate Client Preferences (80) ---
# Only the first 80 clients get a preference row, to simulate missing data.
# The list is sliced instead of indexed so that a shorter client list (for
# example when client_ids.csv could not be loaded) produces fewer rows
# rather than an IndexError.
preference_client_count = 80
preference_client_ids = client_ids[:preference_client_count]
if len(preference_client_ids) < preference_client_count:
    print(f"Warning: only {len(preference_client_ids)} client IDs available; "
          f"expected at least {preference_client_count} for client preferences.")

preferences = []
for client_id in preference_client_ids:  # FK to clients.client_id
    # The random calls stay in this order so seeded output is reproducible.
    min_price = round(random.uniform(200000, 500000), 2)
    # max_price is always at least 200,000 above min_price.
    max_price = round(min_price + random.uniform(200000, 1000000), 2)
    bedrooms = random.randint(1, 5)
    bathrooms = random.choice([1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0])
    property_type = random.randint(1, 5) # FK to property_types.property_type_id

    preferences.append([client_id, min_price, max_price, bedrooms, bathrooms, property_type])

write_csv('client_preferences.csv',
    ['client_id', 'min_price', 'max_price', 'desired_bedrooms', 'desired_bathrooms', 'property_type_id'],
    preferences)


# --- 7. Generate Properties (100) ---
# Description pool, built once because it does not change between iterations.
# Every entry starts with a single leading adjective. Used on its own it is a
# complete description; with the adjective dropped it becomes a noun phrase
# that reads correctly after "Beautiful N-bedroom" / "Spacious N sq ft".
# The list keeps five entries so the seeded random stream is consumed exactly
# as before.
base_descriptions = [
    "Bright property with modern amenities and great natural light",
    "Inviting home featuring hardwood floors and updated kitchen",
    "Charming property in prime location with easy access to transportation",
    "Renovated unit with stainless steel appliances and granite countertops",
    "Stunning property with panoramic views and premium finishes throughout",
]
description_tails = [text.split(' ', 1)[1] for text in base_descriptions]

properties = []
property_ids = list(range(1, 101))
for i, property_id in enumerate(property_ids):
    # Cycle through the reserved pool so several properties share an address.
    address_id = property_address_ids[i % len(property_address_ids)] # FK to addresses.address_id (111-150)
    property_type = random.randint(1, 5)
    bedrooms = random.randint(0, 6)
    bathrooms = random.choice([1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0])
    sqft = random.randint(500, 5000)
    year_built = random.randint(1920, 2024)

    # Half of the descriptions get a data-driven lead-in, the rest are generic.
    if random.random() < 0.5:
        lead_in = random.choice([f"Beautiful {bedrooms}-bedroom", f"Spacious {sqft} sq ft"])
        description = f"{lead_in} {random.choice(description_tails)}."
    else:
        description = random.choice(base_descriptions)

    # 70% chance of having an owner_client_id (FK to clients.client_id).
    # With no client IDs loaded the column is left blank instead of letting
    # random.choice fail on an empty list.
    owner_client_id = random.choice(client_ids) if client_ids and random.random() < 0.7 else ''

    properties.append([property_id, address_id, property_type, bedrooms, bathrooms, sqft,
                      year_built, description, owner_client_id])

write_csv('properties.csv',
    ['property_id', 'address_id', 'property_type_id', 'bedrooms', 'bathrooms', 'square_feet',
     'year_built', 'description', 'owner_client_id'],
    properties)


# --- Save FK References as simple CSVs for the next notebook ---
# Listings and the transactional data generated afterwards are built from this ID list.
property_id_rows = [[pid] for pid in property_ids]  # one single-column row per ID
write_csv('property_ids.csv', ['property_id'], property_id_rows)

print("SUCCESS: client_preferences.csv and properties.csv saved.")

SUCCESS: client_preferences.csv and properties.csv saved.


**4) Next Steps**

The property and preference data has been generated and saved. The `property_ids.csv` reference file is ready.

**Proceed to Notebook 4: `04_Activity_Gen.ipynb`**