# Generate Street Address Sample Data

## Overview
This notebook generates sample street address data for the Location table with compliance-approved fake addresses.

## Compliance Rules
- **Sequential numbers** for street addresses
- **Common street names** (Main St, Oak Ave, etc.)
- **Incorrect zip codes** to ensure fake addresses (e.g., Buffalo, NY 98052)
- **Real city/state combinations** but wrong zip codes
- **Specific Guidelines** Street addresses: you should use sequential numbers, common street names, and incorrect zip codes (e.g., 4567 Main St Buffalo, NY 98052).
## Output
- File: `C:\temp\samples\output\Location_Samples.csv`
- Contains 520 unique street addresses for Location table

---

In [1]:

import pandas as pd
import numpy as np
import random
import os
from datetime import datetime, date

# Set seed for reproducible results
random.seed(42)
np.random.seed(42)

# Configuration
SAMPLE_SIZE = 513  # Number of address records to generate
OUTPUT_FOLDER = "C:\\temp\\samples\\output"
OUTPUT_FILE = "Location_Samples.csv"

# Create output directory
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# Remove existing output file if it exists
output_path = os.path.join(OUTPUT_FOLDER, OUTPUT_FILE)
if os.path.exists(output_path):
    os.remove(output_path)
    print(f"🗑️ Removed existing file: {output_path}")

print(f"🎯 GENERATING STREET ADDRESS SAMPLE DATA")
print(f"Sample Size: {SAMPLE_SIZE}")
print(f"Output: {OUTPUT_FOLDER}\\{OUTPUT_FILE}")
print("="*50)

# Street address generation functions
def generate_location_id(num_records):
    """Generate sequential LocationId"""
    return [f"LOC-{i+1:03d}" for i in range(num_records)]

def generate_customer_id(num_records):
    """Generate sequential CustomerId"""
    return [f"CID-{i+1:03d}" for i in range(num_records)]

def generate_location_name():
    """Generate static LocationName"""
    return "Street Address"

def generate_street_names():
    """Generate common street names"""
    return [
        "Main St", "Oak Ave", "First St", "Second St", "Third St", "Park Ave", "Elm St", "Maple St",
        "Cedar St", "Pine St", "Church St", "Washington St", "Lincoln Ave", "Madison Ave", "Jefferson St",
        "Adams St", "Jackson St", "Franklin St", "Roosevelt Ave", "Wilson St", "Market St", "Broad St",
        "Center St", "High St", "Union St", "Spring St", "Water St", "Mill St", "State St", "School St",
        "Hill St", "Valley Rd", "River Rd", "Lake St", "Sunset Ave", "Sunrise Ave", "Forest Ave",
        "Garden St", "College Ave", "University Dr", "Liberty St", "Commerce St", "Industrial Dr",
        "Business Park Way", "Technology Blvd", "Corporate Dr", "Executive Ave", "Summit St",
        "Mountain View Dr", "Hillside Ave", "Riverside Dr", "Meadow Ln", "Woodland Ave"
    ]

def generate_cities_states():
    """Generate real city/state combinations with regions"""
    return [
        ("Seattle", "WA", "West Coast"), ("Portland", "OR", "West Coast"), 
        ("San Francisco", "CA", "West Coast"), ("Los Angeles", "CA", "West Coast"),
        ("Sacramento", "CA", "West Coast"), ("San Jose", "CA", "West Coast"), 
        ("San Diego", "CA", "West Coast"), ("Long Beach", "CA", "West Coast"), ("Oakland", "CA", "West Coast"),
        
        ("Denver", "CO", "Mountain West"), ("Salt Lake City", "UT", "Mountain West"),
        ("Phoenix", "AZ", "Mountain West"), ("Tucson", "AZ", "Mountain West"), 
        ("Las Vegas", "NV", "Mountain West"), ("Albuquerque", "NM", "Mountain West"),
        
        ("Austin", "TX", "South"), ("Dallas", "TX", "South"), ("Houston", "TX", "South"), 
        ("San Antonio", "TX", "South"), ("Atlanta", "GA", "South"), ("Miami", "FL", "South"),
        ("Orlando", "FL", "South"), ("Tampa", "FL", "South"), ("Jacksonville", "FL", "South"),
        ("Charlotte", "NC", "South"), ("Raleigh", "NC", "South"), ("Nashville", "TN", "South"),
        ("Richmond", "VA", "South"), ("Oklahoma City", "OK", "South"),
        
        ("Chicago", "IL", "Midwest"), ("Minneapolis", "MN", "Midwest"), ("Detroit", "MI", "Midwest"),
        ("Columbus", "OH", "Midwest"), ("Cleveland", "OH", "Midwest"), ("Cincinnati", "OH", "Midwest"),
        ("Indianapolis", "IN", "Midwest"), ("Milwaukee", "WI", "Midwest"), 
        ("Kansas City", "MO", "Midwest"), ("St. Louis", "MO", "Midwest"),
        
        ("Boston", "MA", "Northeast"), ("New York", "NY", "Northeast"), ("Buffalo", "NY", "Northeast"),
        ("Albany", "NY", "Northeast"), ("Syracuse", "NY", "Northeast"), ("Rochester", "NY", "Northeast"),
        ("Pittsburgh", "PA", "Northeast"), ("Philadelphia", "PA", "Northeast"), 
        ("Baltimore", "MD", "Northeast"), ("Washington", "DC", "Northeast")
    ]

def generate_wrong_zip_codes():
    """Generate completely made-up zip codes that don't exist for compliance"""
    # Made-up zip codes that don't exist anywhere in the US
    # Avoiding zip codes starting with 0 and obviously fake patterns
    fake_zips = [
        "12345", "54321", "98765", "56789", "13579", "24680", "97531", "86420", "15975", "35791",
        "23456", "34567", "45678", "67890", "78901", "89012", "91234", "12567", "23678", "34789",
        "45890", "56123", "67234", "78345", "89456", "91567", "12678", "23789", "34890", "45912",
        "56234", "67345", "78456", "89567", "91678", "12789", "23890", "34912", "45123", "56345",
        "67456", "78567", "89678", "91789", "12890", "23912", "34123", "45234", "56456", "67567",
        "78678", "89789", "91890", "12123", "23234", "34345", "45456", "56567", "67678", "78789"
    ]
    return fake_zips

def generate_street_addresses(num_records):
    """Generate complete street addresses with compliance rules"""
    street_names = generate_street_names()
    cities_states = generate_cities_states()
    wrong_zips = generate_wrong_zip_codes()
    
    addresses = []
    location_names = []
    address_line1 = []
    address_line2 = []
    cities = []
    states = []
    regions = []
    zip_codes = []
    
    for i in range(num_records):
        # Sequential street number starting from 1000
        street_number = 1000 + i
        
        # Random street name
        street_name = random.choice(street_names)
        
        # Random city/state/region combination
        city, state, region = random.choice(cities_states)
        
        # Random wrong zip code
        zip_code = random.choice(wrong_zips)
        
        # Build address components
        full_address_line1 = f"{street_number} {street_name}"
        
        # Occasionally add apartment/suite numbers (10% of addresses)
        address_line_2 = ""
        if random.random() < 0.1:
            if "Ave" in street_name or "Dr" in street_name:
                address_line_2 = f"Suite {random.randint(100, 999)}"
            else:
                address_line_2 = f"Apt {random.randint(1, 50)}"
        
        addresses.append(f"{full_address_line1}, {city}, {state} {zip_code}")
        location_names.append("Street Address")
        address_line1.append(full_address_line1)
        address_line2.append(address_line_2)
        cities.append(city)
        states.append(state)
        regions.append(region)
        zip_codes.append(zip_code)
    
    return addresses, location_names, address_line1, address_line2, cities, states, regions, zip_codes

print("🔄 Generating address data...")

# Generate all fields
location_ids = generate_location_id(SAMPLE_SIZE)
customer_ids = generate_customer_id(SAMPLE_SIZE)
full_addresses, location_names, address_line1, address_line2, cities, states, regions, zip_codes = generate_street_addresses(SAMPLE_SIZE)

# Generate some location coordinates (fake but realistic for US)
def generate_coordinates(num_records):
    """Generate fake but realistic US coordinates"""
    latitudes = []
    longitudes = []
    
    for i in range(num_records):
        # US latitude range: approximately 24°N to 49°N
        lat = round(random.uniform(24.0, 49.0), 7)
        # US longitude range: approximately -125°W to -66°W (negative values)
        lng = round(random.uniform(-125.0, -66.0), 7)
        
        latitudes.append(lat)
        longitudes.append(lng)
    
    return latitudes, longitudes

latitudes, longitudes = generate_coordinates(SAMPLE_SIZE)

# Generate IsActive field (99.5% True, 0.5% False)
def generate_is_active(num_records):
    """Generate IsActive field with 99.5% True, 0.5% False"""
    is_active = []
    for i in range(num_records):
        # 99.5% chance of being active
        is_active.append(random.random() < 0.995)
    return is_active

is_active_values = generate_is_active(SAMPLE_SIZE)

# Create DataFrame
df = pd.DataFrame({
    'LocationId': location_ids,
    'CustomerId': customer_ids,
    'LocationName': location_names,
    'IsActive': is_active_values,  # 99.5% True, 0.5% False
    'AddressLine1': address_line1,
    'AddressLine2': address_line2,
    'City': cities,
    'StateId': states,  # Using state abbreviation
    'ZipCode': zip_codes,
    'CountryId': ['US'] * SAMPLE_SIZE,  # All US addresses
    'SubdivisionName': [''] * SAMPLE_SIZE,  # Not used for US addresses
    'Region': regions,  # Business analytics region
    'Latitude': latitudes,
    'Longitude': longitudes,
    'Note': ['Generated sample address'] * SAMPLE_SIZE,
    'FullAddress': full_addresses  # For reference/display
})

print("✅ Address generation complete!")

🎯 GENERATING STREET ADDRESS SAMPLE DATA
Sample Size: 513
Output: C:\temp\samples\output\Location_Samples.csv
🔄 Generating address data...
✅ Address generation complete!


In [None]:
# filepath: c:\Repos\Code\SampleDataPrep\src\notebooks\data\GenerateStreetAddress.ipynb
# Display distributions and statistics


# TODO: Make Street numbers look fake like 12345, 23456, 34567, 45678, 56789 67890.  


print("\n📊 ADDRESS DATA ANALYSIS")
print("="*50)

# State Distribution
print("\n🎯 State Distribution (Top 10):")
state_dist = df['StateId'].value_counts().head(10)
for state, count in state_dist.items():
    print(f"  {state}: {count:3d} addresses")

print(f"\n🎯 Total States Represented: {df['StateId'].nunique()}")

# City Distribution
print("\n🎯 City Distribution (Top 10):")
city_dist = df['City'].value_counts().head(10)
for city, count in city_dist.items():
    print(f"  {city}: {count:3d} addresses")

print(f"\n🎯 Total Cities: {df['City'].nunique()}")

# Zip Code Distribution
print(f"\n🎯 Zip Code Distribution (showing compliance with wrong zips):")
zip_dist = df['ZipCode'].value_counts().head(10)
for zip_code, count in zip_dist.items():
    print(f"  {zip_code}: {count:3d} addresses")

print(f"\n🎯 Total Unique Zip Codes: {df['ZipCode'].nunique()}")

# Address Line 2 Analysis
address2_count = len(df[df['AddressLine2'] != ''])
print(f"\n🎯 Address Line 2 Usage:")
print(f"  With Apt/Suite: {address2_count:3d} ({address2_count/len(df)*100:5.1f}%)")
print(f"  Without: {len(df)-address2_count:3d} ({(len(df)-address2_count)/len(df)*100:5.1f}%)")

# Coordinate Ranges
print(f"\n🎯 Geographic Coordinates:")
print(f"  Latitude range: {df['Latitude'].min():.4f} to {df['Latitude'].max():.4f}")
print(f"  Longitude range: {df['Longitude'].min():.4f} to {df['Longitude'].max():.4f}")

# Region Distribution
print(f"\n🎯 Region Distribution:")
region_dist = df['Region'].value_counts()
for region, count in region_dist.items():
    print(f"  {region}: {count:3d} addresses ({count/len(df)*100:5.1f}%)")

print(f"\n📋 Sample Addresses (First 10):")
sample_display = df[['LocationId', 'FullAddress', 'AddressLine2']].head(10)
for idx, row in sample_display.iterrows():
    apt_info = f" ({row['AddressLine2']})" if row['AddressLine2'] else ""
    print(f"  {row['LocationId']}: {row['FullAddress']}{apt_info}")

print(f"\n✅ COMPLIANCE VERIFICATION:")
print(f"  ✅ Sequential street numbers: Starting from 1000")
print(f"  ✅ Common street names: Main St, Oak Ave, etc.")
print(f"  ✅ Incorrect zip codes: Real cities with wrong zips")
print(f"  ✅ No real addresses: All combinations are intentionally fake")

# Save to CSV (excluding FullAddress helper column)
output_df = df.drop('FullAddress', axis=1)
output_path = os.path.join(OUTPUT_FOLDER, OUTPUT_FILE)
output_df.to_csv(output_path, index=False)

print(f"\n💾 SAVED TO: {output_path}")
print(f"📊 Total Records: {len(output_df)}")
print(f"📈 Columns: {', '.join(output_df.columns)}")
print("\n✅ Street address sample data generation complete!")
print(f"🔒 All addresses are compliance-approved fake addresses!")


📊 ADDRESS DATA ANALYSIS

🎯 State Distribution (Top 10):
  CA:  79 addresses
  NY:  56 addresses
  TX:  46 addresses
  FL:  40 addresses
  OH:  31 addresses
  MO:  24 addresses
  PA:  17 addresses
  MA:  14 addresses
  NC:  13 addresses
  DC:  13 addresses

🎯 Total States Represented: 27

🎯 City Distribution (Top 10):
  San Diego:  16 addresses
  Orlando:  15 addresses
  San Jose:  14 addresses
  Kansas City:  14 addresses
  Boston:  14 addresses
  San Francisco:  14 addresses
  Dallas:  13 addresses
  Washington:  13 addresses
  Oklahoma City:  13 addresses
  Indianapolis:  13 addresses

🎯 Total Cities: 49

🎯 Zip Code Distribution (showing compliance with wrong zips):
  45123:  15 addresses
  45912:  14 addresses
  78901:  14 addresses
  56567:  13 addresses
  56456:  13 addresses
  89678:  13 addresses
  45234:  13 addresses
  86420:  12 addresses
  23912:  12 addresses
  13579:  11 addresses

🎯 Total Unique Zip Codes: 60

🎯 Address Line 2 Usage:
  With Apt/Suite:  53 ( 10.3%)
  With