**1) Environment Setup and Drive Mount**

This cell ensures that all necessary components are ready: the Faker library is installed, Google Drive is mounted, and the required Python modules are imported.

In [None]:
### 1. Setup, Imports, and Drive Mount ###

# Reinstall Faker
%pip install faker

# Mount Google Drive
# Colab-only step: makes the Drive contents available under /content/drive,
# which is the root of the output_dir path configured further down.
from google.colab import drive
drive.mount('/content/drive')

import csv
import random
import json
import os
from datetime import datetime, timedelta
from faker import Faker

# --- CONFIGURATION (Must match Notebook 1) ---
# Single Faker instance shared by every generator in this notebook.
fake = Faker()
# Re-set seed
# Both randomness sources used below get the same fixed seed (42): Faker's
# providers via Faker.seed and the stdlib `random` module via random.seed.
Faker.seed(42)
random.seed(42)

# Set the OUTPUT DIRECTORY
# Every CSV is read from / written to this Drive folder. The folder is created
# if it is missing; exist_ok=True makes the call a no-op when it already exists.
output_dir = '/content/drive/MyDrive/SQL Project (Group 2)/Simulating Data (Maya)/Simulated Data Files'
os.makedirs(output_dir, exist_ok=True)

# --- Data Utility Functions ---
def load_ids(filename):
    """Load the first column of a CSV file in ``output_dir`` as integers.

    The first row is treated as a header and skipped. Blank lines and rows
    whose first cell is empty are ignored instead of raising.

    Args:
        filename: Name of the CSV file, relative to ``output_dir``.

    Returns:
        list[int]: The IDs in file order. The list is empty when the file is
        missing, completely empty, or contains only a header.

    Raises:
        ValueError: If a non-empty first cell is not a valid integer.
    """
    filepath = os.path.join(output_dir, filename)
    try:
        # utf-8 mirrors the encoding write_csv uses for the files read here.
        with open(filepath, 'r', newline='', encoding='utf-8') as file:
            reader = csv.reader(file)
            # The default argument keeps a zero-byte file from raising StopIteration.
            next(reader, None)  # Skips header
            # csv.reader yields [] for blank lines; skip those and empty ID cells.
            return [int(row[0]) for row in reader if row and row[0].strip()]
    except FileNotFoundError:
        # Deliberate best-effort: a missing reference file means "no IDs yet",
        # not an error, so callers always get a list back.
        return []

def write_csv(filename, headers, data_rows):
    """
    Save a header row followed by data rows as a UTF-8 CSV in ``output_dir``.

    Any cell equal to the empty string is swapped for None before the rows
    are written; csv.writer renders None as an empty field, which is what the
    SQL import is expected to read as NULL.

    Args:
        filename: Name of the file to create (or overwrite) inside ``output_dir``.
        headers: Sequence of column names written as the first row.
        data_rows: Iterable of rows, each an iterable of cell values.
    """
    def _blank_to_none(cell):
        # Only the exact empty string is treated as "no value".
        if cell != '':
            return cell
        return None

    target = os.path.join(output_dir, filename)
    with open(target, 'w', newline='', encoding='utf-8') as handle:
        out = csv.writer(handle)
        out.writerow(headers)
        normalized = [[_blank_to_none(cell) for cell in record] for record in data_rows]
        out.writerows(normalized)

Collecting faker
  Downloading faker-38.0.0-py3-none-any.whl.metadata (15 kB)
Downloading faker-38.0.0-py3-none-any.whl (2.0 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.0 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.5/2.0 MB[0m [31m16.3 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m31.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-38.0.0
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**2) Generate Organizational and People Data**


This section generates the core personnel and organizational entities: `offices`, `users`, `agents`, and `clients`. These tables establish the primary key structure for the rest of the database (Level 2 & 3 dependencies).

* **Foreign Key Reliance:** Offices use `address_id` values 1-10 and clients use `mailing_address_id` values 11-110; both reference rows established in `addresses.csv`.
* **Data Integrity:** Emails and roles are generated as lowercase to respect PostgreSQL's `CHECK` constraints.

**3) Generate Offices, Users, Agents, and Clients**

This cell generates and saves the four core personnel/organization tables, and also creates simple lists of IDs (`agent_ids`, `client_ids`) needed for the next notebook.

In [None]:
# --- Generate Offices (10) ---
# One row per office. office_id and address_id advance together from 1 to 10.
offices = []
for office_id in range(1, 11):
    office_name = f"Dream Homes {fake.city()} Office"
    office_phone = fake.numerify('##########')  # Simplified 10-digit format
    office_email = fake.company_email().lower()  # Standardized lowercase
    offices.append([
        office_id,
        office_name,
        office_phone,
        office_email,
        office_id,  # address_id: FK to addresses.address_id (1-10)
        '',         # manager_agent_id: blank/NULL for now, may be assigned manually later
    ])

office_headers = ['office_id', 'name', 'phone', 'email', 'address_id', 'manager_agent_id']
write_csv('offices.csv', office_headers, offices)


# --- Generate Users (100) ---
# One login account per row; user_id runs 1..100 and is reused as the 1:1
# foreign key by the agents generated afterwards.
users = []
user_ids = list(range(1, 101))
for i in range(100):
    # The index suffix keeps usernames distinct even if Faker repeats a base name.
    username = fake.user_name() + str(i)
    # fake.unique re-draws on a repeat, so no two users share an email. When no
    # repeat occurs the values are the same as plain fake.email() would give.
    # NOTE(review): assumes users.email must be unique in the target schema - confirm.
    email = fake.unique.email().lower()  # Standardized lowercase
    role = random.choices(['agent', 'manager', 'admin'], weights=[85, 12, 3])[0]
    created_at = fake.date_time_between(start_date='-2y', end_date='now').strftime('%Y-%m-%d %H:%M:%S')

    users.append([i + 1, username, email, role, created_at])

write_csv('users.csv', ['user_id', 'username', 'email', 'role', 'created_at'], users)


# --- Generate Agents (100) ---
# One agent per user (agent_id == user_id, 1..100), each assigned to a random office.
agents = []
agent_ids = list(range(1, 101))

# Role of every user keyed by user_id (columns 0 and 3 of the users rows), so
# an agent row carries the same role as the account it is linked to instead of
# an independently drawn one that could contradict users.csv.
# NOTE(review): assumes agents.role is meant to mirror users.role - confirm
# against the schema. Dropping the extra random draw also shifts the stdlib
# `random` sequence for the values generated after this point.
role_by_user_id = {row[0]: row[3] for row in users}

for i in range(100):
    first_name = fake.first_name()
    last_name = fake.last_name()
    # The index keeps addresses distinct when two agents share a name.
    email = f"{first_name}.{last_name}{i}@dreamhomesnyc.com".lower()
    phone = fake.numerify('##########')  # Simplified 10-digit format
    # Named license_number (not `license`) to avoid shadowing the builtin.
    license_number = fake.bothify('NY-######-????').upper()
    emp_type = random.choice(['Full-Time', 'Part-Time'])
    commission = round(random.uniform(0.02, 0.06), 4)
    hire_date = fake.date_between(start_date='-5y', end_date='today').strftime('%Y-%m-%d')
    active = random.choices([True, False], weights=[90, 10])[0]
    user_id = i + 1  # FK to users.user_id (1:1 mapping)
    role = role_by_user_id[user_id]
    office_id = random.randint(1, 10)  # FK to offices.office_id

    agents.append([i + 1, first_name, last_name, email, phone, license_number, emp_type,
                   commission, hire_date, active, user_id, role, office_id])

write_csv('agents.csv',
    ['agent_id', 'first_name', 'last_name', 'email', 'phone', 'license_number', 'employment_type',
     'commission_rate', 'hire_date', 'active', 'user_id', 'role', 'office_id'], agents)


# --- Generate Clients (100) ---
# One row per client; client_id runs 1..100 and each client points at its own
# mailing address row (client_id + 10, i.e. 11..110).
clients = []
client_ids = list(range(1, 101))

client_type_options = ['Buyer', 'Seller', 'Renter', 'Buyer/Seller']
age_range_options = ['25-34', '35-44', '45-54', '55-64', '65+']
income_options = ['$50k-$75k', '$75k-$100k', '$100k-$150k', '$150k-$200k', '$200k+']

for client_id in client_ids:
    first_name = fake.first_name()
    last_name = fake.last_name()
    email = fake.email().lower()
    phone = fake.numerify('##########')
    client_type = random.choice(client_type_options)

    # Demographics are stored as a single JSON-encoded column.
    demographics_json = json.dumps({
        'age_range': random.choice(age_range_options),
        'household_income': random.choice(income_options),
        'household_size': random.randint(1, 5),
    })

    clients.append([
        client_id,
        first_name,
        last_name,
        email,
        phone,
        client_type,
        demographics_json,
        client_id + 10,  # mailing_address_id: FK to addresses.address_id (11-110)
    ])

client_headers = ['client_id', 'first_name', 'last_name', 'email', 'phone',
                  'client_type', 'demographics', 'mailing_address_id']
write_csv('clients.csv', client_headers, clients)


# --- Save FK References as simple CSVs for the next notebook (Level 3 Dependencies) ---
# Each ID list becomes a one-column CSV so Notebook 3 can read the available
# IDs without re-running the generation logic.
for ref_file, ref_column, ref_values in (
    ('agent_ids.csv', 'agent_id', agent_ids),
    ('client_ids.csv', 'client_id', client_ids),
):
    write_csv(ref_file, [ref_column], [[value] for value in ref_values])

print("SUCCESS: Offices, Users, Agents, and Clients data saved.")

SUCCESS: Offices, Users, Agents, and Clients data saved.


**4) Next Steps**

All foundational people and organizational data (`offices.csv`, `users.csv`, `agents.csv`, `clients.csv`) has been successfully saved. Additionally, two simple reference files (`agent_ids.csv` and `client_ids.csv`) were saved for foreign key lookups.

**Proceed to Notebook 3: `03_Property_Prep.ipynb`**