<a href="https://colab.research.google.com/github/michaelwnau/consequential-products/blob/main/synthetic_21_526ez.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install faker

In [None]:
import json
import random
from faker import Faker
from datetime import datetime, timedelta
from google.colab import drive
import os

In [None]:
# Initialize Faker for generating synthetic data.
# This single module-level instance is what the generator functions in the
# following cells call (names, SSNs, addresses, random dates, ...).
# NOTE(review): no seed is set in the visible cells, so output is presumably
# different on every run -- seed here if reproducible data is needed.
faker = Faker()

In [None]:
# Define historical service periods for clustering.
# Each entry is a (start, end) pair of "YYYY-MM-DD" strings; one entry is
# picked at random by select_service_period() and passed to generate_date().
# NOTE(review): the post-9/11 window starts on 2002-11-15 rather than in
# 2001 -- confirm that this start date is intentional.
HISTORICAL_PERIODS = [
    ("1968-01-01", "1975-04-30"),  # Vietnam War
    ("1990-08-02", "1991-07-31"),  # Gulf War
    ("2002-11-15", "2021-01-01")   # Post-9/11 conflicts
]

In [None]:
# Function to generate a (start, end) pair of dates within a given period
def generate_date(start_date, end_date, min_days=180, max_years=20):
    """Generate a random ``(start, end)`` date pair inside a period.

    Despite the singular name, this returns TWO dates: a start date and a
    later end date, both lying inside the given period.

    Args:
        start_date: One boundary of the period, as a "YYYY-MM-DD" string.
        end_date: The other boundary, as a "YYYY-MM-DD" string. If it is
            earlier than ``start_date`` the two boundaries are swapped.
        min_days: Minimum number of days between the two generated dates.
        max_years: Maximum span between the two generated dates, counted
            as ``max_years * 365`` days (leap days are not accounted for).

    Returns:
        A ``(start, end)`` tuple of "YYYY-MM-DD" strings where ``end`` is at
        least ``min_days`` days after ``start``.

    Raises:
        ValueError: If a boundary string is not in "YYYY-MM-DD" format, if
            ``min_days`` is negative, if ``max_years * 365`` is smaller than
            ``min_days``, or if the period is shorter than ``min_days``.
    """
    # Work with plain dates throughout so no datetime/date mixing occurs.
    period_start = datetime.strptime(start_date, "%Y-%m-%d").date()
    period_end = datetime.strptime(end_date, "%Y-%m-%d").date()

    # Tolerate boundaries given in reverse order.
    if period_start > period_end:
        period_start, period_end = period_end, period_start

    # Validate up front: otherwise the random sampler below would be handed
    # an inverted range, which either fails obscurely or yields dates
    # outside the requested period.
    max_span_days = max_years * 365
    if min_days < 0:
        raise ValueError(f"min_days must be non-negative, got {min_days}")
    if max_span_days < min_days:
        raise ValueError(
            f"max_years={max_years} allows at most {max_span_days} days, "
            f"which is less than min_days={min_days}"
        )
    if (period_end - period_start).days < min_days:
        raise ValueError(
            f"Period {period_start} to {period_end} is shorter than "
            f"min_days={min_days}"
        )

    # The start must leave room for an end date min_days later.
    latest_start = period_end - timedelta(days=min_days)
    service_start = faker.date_between(start_date=period_start, end_date=latest_start)

    # The end is at least min_days after the start, at most max_span_days
    # after it, and never beyond the period itself.
    earliest_end = service_start + timedelta(days=min_days)
    latest_end = min(period_end, service_start + timedelta(days=max_span_days))
    service_end = faker.date_between(start_date=earliest_end, end_date=latest_end)

    return service_start.strftime("%Y-%m-%d"), service_end.strftime("%Y-%m-%d")

In [None]:
# Pick one historical period at random and draw service dates from it
def select_service_period():
    """Return a ``(service_start, service_end)`` pair for a random period.

    One entry of HISTORICAL_PERIODS is chosen at random and its two
    boundaries are handed to generate_date() with that function's default
    settings; the resulting pair of date strings is returned.
    """
    period_start, period_end = random.choice(HISTORICAL_PERIODS)
    chosen_start, chosen_end = generate_date(period_start, period_end)
    return chosen_start, chosen_end


In [None]:
# Helper: draw one random date inside a window
def _random_date_between(start_date, end_date):
    """Return a single random date between two "YYYY-MM-DD" strings.

    Unlike generate_date(), which returns a (start, end) pair, this returns
    exactly one date, formatted as a "YYYY-MM-DD" string.
    """
    window_start = datetime.strptime(start_date, "%Y-%m-%d").date()
    window_end = datetime.strptime(end_date, "%Y-%m-%d").date()
    picked = faker.date_between(start_date=window_start, end_date=window_end)
    return picked.strftime("%Y-%m-%d")


# Function to generate a synthetic VA claim
def generate_synthetic_claim():
    """Generate a single synthetic VA Form 21-526EZ claim submission.

    Returns:
        A dict of plain values (strings, booleans, lists, nested dicts) with
        the top-level keys ``veteran_info``, ``claim_type``,
        ``homeless_status``, ``exposure_info``, ``disabilities``,
        ``additional_claims``, ``service_info``, ``financial_info`` and
        ``certification``. ``exposure_info["dates"]`` is a (start, end) pair;
        every other date field is a single "YYYY-MM-DD" string.
    """

    # Select a military service period
    service_start, service_end = select_service_period()

    # Generate synthetic claim details
    claim = {
        "veteran_info": {
            "name": faker.name(),
            "ssn": faker.ssn(),
            # A single date of birth (previously a (start, end) pair leaked
            # in here). Range kept as-is: no dates before 1968.
            # NOTE(review): the DOB is not tied to the service period, so it
            # can fall after the service dates (e.g. a 1968-1975 period).
            "dob": _random_date_between("1968-01-01", "2003-12-31"),
            "phone": faker.phone_number(),
            "email": faker.email(),
            "address": faker.address()
        },
        "claim_type": random.choice(["FDC", "Standard"]),
        "homeless_status": random.choice(["Yes", "No"]),
        "exposure_info": {
            "toxic_exposures": random.sample(["Agent Orange", "Radiation", "Burn Pits", "Asbestos"], k=random.randint(0, 2)),
            # Exposure is a date RANGE: a (start, end) pair inside the
            # service period.
            "dates": generate_date(service_start, service_end)
        },
        "disabilities": [
            {
                "condition": random.choice(["Hearing Loss", "Diabetes", "PTSD", "Knee Injury"]),
                "service_connection": random.choice(["Direct", "Presumptive", "Secondary"]),
                # Onset is one date within the service period, not a pair.
                "date_of_onset": _random_date_between(service_start, service_end)
            } for _ in range(3)  # Exactly three rows in Section V
        ],
        "additional_claims": [
            {
                "condition": random.choice(["Lung Disease", "Back Injury", "TBI"]),
                # Onset is one date within the service period, not a pair.
                "date_of_onset": _random_date_between(service_start, service_end)
            } for _ in range(3)  # Exactly three rows in Section XIII
        ],
        "service_info": {
            "branch": random.choice(["Army", "Navy", "Air Force", "Marines", "Coast Guard"]),
            "service_dates": {
                "start": service_start,
                "end": service_end
            },
            "combat_zone": random.choice(["Yes", "No"])
        },
        "financial_info": {
            "receives_military_retirement": random.choice(["Yes", "No"]),
            "direct_deposit": {
                "bank": faker.company(),
                "account_type": random.choice(["Checking", "Savings"]),
                "account_number": faker.bban()
            }
        },
        "certification": {
            "signed": True,
            "date_signed": faker.date_this_year().strftime("%Y-%m-%d")
        }
    }
    return claim

In [None]:
# Function to save the JSON file in Google Drive
def save_to_google_drive(data, filename="synthetic_claim.json"):
    """Save JSON data to a Google Drive folder named 'synthetic-data-21-526ez'.

    Args:
        data: Any JSON-serialisable object (a single claim dict or a list of
            claims).
        filename: Name of the file to create inside the target folder. An
            existing file with the same name is overwritten.

    Side effects:
        Mounts Google Drive at ``/content/drive``, creates the target folder
        if it is missing, writes the file, and prints the saved path.
    """

    # Mount Google Drive
    drive.mount('/content/drive')

    # Define the folder path in Google Drive
    folder_path = "/content/drive/My Drive/synthetic-data-21-526ez/"

    # Create the directory if needed; exist_ok avoids the check-then-create
    # race of a separate os.path.exists() test.
    os.makedirs(folder_path, exist_ok=True)

    # Save the JSON file with an explicit encoding so the result does not
    # depend on the runtime's locale default.
    file_path = os.path.join(folder_path, filename)
    with open(file_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4)

    print(f"File saved to Google Drive: {file_path}")

# Generate a single synthetic claim and write it to Google Drive under the
# default filename.
# NOTE(review): these statements run as soon as this cell is executed, and
# the following cell repeats them verbatim -- running both generates a new
# claim twice and overwrites the same file. Consider keeping only one copy.
synthetic_data = generate_synthetic_claim()
save_to_google_drive(synthetic_data)

# Uncomment the following lines to generate multiple claims (up to 999)
# synthetic_claims = [generate_synthetic_claim() for _ in range(999)]
# save_to_google_drive(synthetic_claims, filename="multiple_synthetic_claims.json")


In [None]:
# Generate a single synthetic claim and write it to Google Drive under the
# default filename ("synthetic_claim.json"); each run overwrites that file
# with a freshly generated claim.
# NOTE(review): the same statements also appear at the end of the previous
# cell, right after the save_to_google_drive definition.
synthetic_data = generate_synthetic_claim()
save_to_google_drive(synthetic_data)

# Uncomment the following lines to generate multiple claims (up to 999)
# synthetic_claims = [generate_synthetic_claim() for _ in range(999)]
# save_to_google_drive(synthetic_claims, filename="multiple_synthetic_claims.json")