## Data Generation Base

In [None]:
import random
import string
import base64
import csv
import os
from tqdm import trange

# -----------------------------
# CONFIGURATION
# -----------------------------
# Size of each unique-value pool built below, and the number of statements to emit.
NUM_OPTIONS_PER_ENTITY_1_6 = 50_000  # pool size for Entities 1 through 6
NUM_OPTIONS_PER_ENTITY_7_8 = 50_000  # pool size for Entities 7 and 8
NUM_STATEMENTS = 1_000_000           # total synthetic statements to generate


# ---------------------------------------------------------------------
#   1) EXPAND LEGENDS (for Entity 8) to NUM_OPTIONS_PER_ENTITY_7_8 items
# ---------------------------------------------------------------------

# Building blocks for new legend items
# Adjective pool for generate_legend_item(), which picks one entry with
# random.choice(). Some words are listed more than once (e.g. "sorrowful",
# "vengeful", "flayed"); each repeat gives that word a proportionally higher
# chance of being drawn.
legend_adjectives = [
    "cursed", "long-lost", "wailing", "ancient", "silent", "drifting", "forbidden",
    "dark", "shattered", "twisted", "sunken", "wandering", "maddening", "sleepless",
    "decaying", "haunted", "fractured", "siren's", "everburning", "frozen",
    "vengeful", "sacred", "ethereal", "cackling", "mourning", "gilded", "whispering",
    "bloodstained", "ashen", "hallowed", "rusted", "whirling", "celestial", "ominous",
    "obsidian", "corrupted", "flaming", "trembling", "nightmarish", "moonlit",
    "gloomy", "sorrowful", "spectral", "phantasmal", "arcane", "enchanted", "harrowing",
    "lifeless", "radiant", "abandoned", "eerie", "shimmering", "sorrowful", "whispered",
    "crystal", "murmuring", "shivering", "bleak", "ironbound", "ravenous", "doomed",
    "venomous", "sinister", "echoing", "poisoned", "time-worn", "ageing",
    "blood-tinged", "nameless", "dreaded", "unearthly", "ashen-bound", "crimson",
    "forgotten", "eldritch", "smoldering", "flickering", "timeless", "shaded",
    "obsessive", "burning", "soulbound", "eternal", "wavering", "raging", "grim",
    "spectacular", "crackling", "mournful", "dim", "rattling", "wretched", "desolate",
    "chilling", "torrid", "serene", "boundless", "dreadful", "horrific", "ghastly",
    "nightshade", "howling", "deathly", "anguished", "lonely", "seething", "vengeful",
    "unending", "unhallowed", "damned", "phantom", "ancient-bound", "vengeful-bound",
    "seraphic", "tormented", "flayed", "arcadian", "undying", "storm-touched",
    "faintly-glowing", "storm-ravaged", "weeping", "ever-changing", "scarred",
    "sin-woven", "flayed", "broken", "ivy-covered", "charred", "ravaged", "ashen-veiled",
    "bright-cast", "tarnished", "tainted", "glimmering", "tempest-tossed",
    "death-kissed", "vile", "endless", "forgotten-blood", "oath-bound", "battle-scarred",
    "creeping", "ruined", "corruptive", "dust-laden", "wicked", "phantasmic",
    "shiver-bound", "stained", "moon-veiled", "loneliness-steeped", "dusk-borne",
    "ghoul-marked", "thorn-bound", "remorseless", "ever-chilled", "eclipsed",
    "dawn-touched", "shard-bound", "time-shaken", "betrayal-bound", "night-born",
    "starry", "madness-forged", "swirling", "mist-shrouded", "twilight-bound",
    "drowned", "echo-touched", "star-etched", "flawed", "blade-stained", "chain-bound",
    "sunless", "glow-forged", "tide-lost", "song-bound", "vengeful-marked",
    "void-touched", "labyrinthine", "sundering", "storm-bound", "veil-touched",
    "beacon-torn", "storm-crafted", "hollow-marked", "soul-pierced", "stone-forged",
    "veil-marked", "eclipse-wreathed", "fate-torn", "web-spun", "ever-changing",
    "eternity-steeped", "spirit-worn", "shiver-bound", "ice-carved", "death-hollowed",
    "sun-forged", "moon-bound", "glass-cut", "shadow-bound", "spine-laden", "curse-forged",
    "feather-bound", "night-steeped", "iron-fused", "starborne", "fire-marked",
    "void-laced", "storm-lost", "scythe-marked", "ember-crowned", "mortal-bound",
    "ocean-forged", "abyss-crafted", "storm-fallen", "candle-flamed", "pyre-bound",
    "night-bleeding", "serenity-etched", "haunt-carved", "star-scattered",
    "cry-bound", "void-etched", "cradle-stolen", "harbinger-wrapped",
    "dawnwoven", "gilded-venom", "spite-bound", "light-sundered",
    "silver-tempered", "brine-bound", "sky-wreathed", "sun-lit", "coal-laden",
    "petal-bound", "thunder-slashed", "iron-webbed", "mirror-bound", "rage-marked",
    "ivy-grown", "vine-shadowed", "wave-lost", "flame-pierced", "ice-bound",
    "gleam-bound", "coastal-shadowed", "horizon-bound", "storm-touched", "cry-forged",
    "dread-laden", "thorn-scarred", "gale-bound", "rapture-wrought", "twilight-etched",
    "flame-steeped", "song-touched", "torch-lit", "dagger-laden", "spell-wrought",
    "sorrow-touched", "fate-marked", "sea-shadowed", "beacon-bound", "ripple-marked",
    "tide-struck", "mist-veiled", "shadow-stained", "hollow-bound", "curse-etched",
    "light-pierced", "grief-wrought", "vengeance-sealed", "dread-wrapped",
    "storm-ripped", "wound-marked", "pyre-seared", "dust-shadowed", "wandering-shard"
]


# Noun pool for generate_legend_item(), which picks one entry with
# random.choice(). Many nouns are repeated (e.g. "altar", "citadel",
# "sepulchre"), so they are drawn more often than single-entry nouns; the
# number of distinct nouns is well below len(legend_nouns).
legend_nouns = [
    "bells", "prophecy", "ghost", "plague", "watchers", "shadows", "feasts",
    "ritual", "throne", "labyrinth", "fortress", "knight", "chorus", "druid",
    "cathedral", "moor", "obelisk", "lament", "pyre", "phantoms", "curse", "scriptures",
    "apparition", "shrine", "spawn", "monolith", "caverns", "hollows", "idol", "coven",
    "sepulcher", "reliquary", "effigy", "fissure", "offspring", "specter", "carrion",
    "legion", "golem", "citadel", "relic", "vortex", "cairn", "hymn", "shaman", "reaper",
    "pyramid", "eulogy", "ruins", "library", "grimoire", "trove", "husk", "leviathan",
    "maelstrom", "vista", "tome", "maze", "catacombs", "colossus", "phoenix", "raven",
    "phylactery", "titan", "lullaby", "ramparts", "puppet", "statue", "oracle", "chimera",
    "retribution", "ascendant", "revelation", "archon", "sepulchre", "veil", "altar",
    "crypt", "inferno", "behemoth", "pantheon", "furnace", "censer", "shade", "chimney",
    "tapestry", "mantle", "scion", "avatar", "conclave", "doom", "beacon", "scepter",
    "spire", "watchtower", "abyss", "harbinger", "altar", "echoes", "vault", "banner",
    "monument", "scar", "seraph", "celestial", "cascade", "storm", "covenant", "sanctuary",
    "abyss", "graves", "rites", "pillar", "oracle", "cyclone", "mystic", "seer", "crown",
    "cairns", "maw", "torment", "dungeon", "paladin", "creed", "blight", "altar", "shadow",
    "passage", "watch", "enclave", "glade", "wyrm", "sepulchre", "grave", "gallows",
    "citadel", "scarab", "thorns", "vestige", "ritual", "obelisk", "cinder", "requiem",
    "temple", "blood", "shard", "harvest", "covenant", "pyre", "tribunal", "cataclysm",
    "phoenix", "visage", "sigil", "tundra", "inferno", "shroud", "shrine", "chasm", "phantasm",
    "pinnacle", "rapture", "scourge", "sovereign", "horde", "tyrant", "arc", "scarlet",
    "starfall", "eternity", "empyrean", "paragon", "redoubt", "gallows", "rift", "herald",
    "torch", "plinth", "reaper", "prison", "omen", "graves", "sarcophagus", "obelisk",
    "flame", "storm", "chorus", "dread", "cascade", "ember", "souls", "dirge", "calamity",
    "cascade", "glacier", "echo", "void", "forge", "oath", "sentinel", "monolith",
    "omen", "revelation", "spirit", "shroud", "passage", "shard", "zephyr", "spire",
    "labyrinth", "sorcery", "citadel", "skyfall", "sepulchre", "reaver", "veil", "archive",
    "typhoon", "phoenix", "dawn", "ember", "pyre", "archangel", "moonveil", "eclipse",
    "spire", "darkness", "dawn", "razor", "eternity", "fortress", "labyrinth", "oblivion",
    "phantasm", "beast", "crypt", "hymn", "dusk", "rune", "hunter", "hollow", "dawnspire",
    "monk", "oath", "winds", "rift", "eternal", "phantom", "gravesong", "citadel",
    "pillar", "unending", "monk", "beacon", "frostbite", "ashes", "prophecy", "ward",
    "stormspire", "ashes", "dawnveil", "warlord", "voidcaller", "rapture", "embers",
    "tide", "ascent", "sovereign", "empyrean", "glory", "soul", "mantle", "citadel",
    "razor", "altar", "thunder", "sarcophagus", "gallows", "harrow", "vault", "warden",
    "cairn", "hymn", "crusader", "void", "sepulchre", "lament", "shard", "inferno",
    "silhouette", "storm", "cryptic", "torment", "basilica", "lunar", "pyre", "wrath",
    "eternal", "shard", "ramparts", "seal", "grim", "stormkeeper", "dreadnought", "emberforge",
    "eclipse", "spire", "gatekeeper", "cairn", "stormcaller", "phantasm", "reliquary",
    "reaver", "shrouds", "ironclad", "cascade", "vestige", "voidsong", "phantom", "arcana",
    "scar", "rapture", "siren", "seal", "altar", "tribute", "summoner", "dirge", "legends"
]


def generate_legend_item():
    """Build one legend phrase of the form "<adjective> <noun>".

    The adjective is drawn first from ``legend_adjectives`` and the noun
    second from ``legend_nouns``, each with ``random.choice``; the two
    words are joined by a single space, e.g. "cursed bells".
    """
    # Tuple elements are evaluated left to right: adjective, then noun.
    words = (random.choice(legend_adjectives), random.choice(legend_nouns))
    return " ".join(words)

# Upper bound on the number of distinct phrases generate_legend_item() can
# return: every phrase is one adjective plus one noun, so it cannot exceed
# (#distinct adjectives) x (#distinct nouns). Check it up front; without this
# guard the rejection-sampling loop below would spin forever as soon as the
# requested pool size exceeded what the word lists can supply.
_legend_capacity = len(set(legend_adjectives)) * len(set(legend_nouns))
if NUM_OPTIONS_PER_ENTITY_7_8 > _legend_capacity:
    raise ValueError(
        f"NUM_OPTIONS_PER_ENTITY_7_8={NUM_OPTIONS_PER_ENTITY_7_8} exceeds the "
        f"{_legend_capacity} distinct adjective/noun legend phrases available"
    )

# Draw phrases until the set holds the requested number of distinct values;
# repeated draws are absorbed by the set.
legends_set = set()
while len(legends_set) < NUM_OPTIONS_PER_ENTITY_7_8:
    legends_set.add(generate_legend_item())

# Sorted so the final list order does not depend on set iteration order.
legends = sorted(legends_set)

# ---------------------------------------------------------------------
#   2) EXPAND LOCATIONS (for local rumors) to NUM_OPTIONS_PER_ENTITY_7_8 items
# ---------------------------------------------------------------------

# Hand-written location names used to seed locations_set before generated
# names are added. The list contains repeats (e.g. "Darkreach", "Wolfridge");
# they collapse to a single entry once the list is converted to a set.
original_locations = [
    "Ravenmarch", "Dreadhollow", "Ironreach", "Frostshade", "Duskridge", "Shadowmere",
    "Blighthaven", "Ashthorn", "Winterspire", "Gravenwood", "Darkreach", "Thornfell",
    "Blackspire", "Drakemire", "Stormhaven", "Brightmoor", "Crowspire", "Gloamscar",
    "Emberfen", "Nightward", "Frosthollow", "Silverglen", "Ebonshade", "Cinderfall",
    "Stonehaven", "Grimhold", "Wraithmoor", "Wyldethorn", "Ruincrest", "Nightvale",
    "Fellshade", "Brimstone", "Duskspire", "Ebonmire", "Thornward", "Windmere",
    "Oakenshadow", "Marrowspire", "Duskwatch", "Wolfridge", "Frostfen", "Cinderreach",
    "Stormglen", "Darkhollow", "Rimewatch", "Bloodfen", "Ironshade", "Gloamhaven",
    "Crowshade", "Ebonmere", "Ruinwood", "Dreadmere", "Silverhollow", "Fellward",
    "Wolvenreach", "Frostward", "Nightshade", "Ashenvale", "Windspire", "Thornwood",
    "Driftshade", "Wyvernwatch", "Stoneward", "Ruinspire", "Emberward", "Stormshade",
    "Duskmire", "Brightspire", "Shadowglen", "Frostspire", "Crowwood", "Blightmoor",
    "Silvercrest", "Ebonwood", "Ashenguard", "Wraithspire", "Grimspire", "Darkmoor",
    "Bloodward", "Marrowward", "Duskwood", "Thornspire", "Wyvernfen", "Cinderpeak",
    "Wolvenhollow", "Fellglen", "Gloomspire", "Ironmoor", "Ashridge", "Brightward",
    "Gloamspire", "Windhaven", "Ruinfen", "Stormfen", "Silverward", "Nightwatch",
    "Ravenward", "Oakenshade", "Crowhaven", "Bloodwatch", "Duskhaven", "Blackthorn",
    "Gravencrest", "Brightwood", "Ebonpeak", "Thornmoor", "Cinderwood", "Darkreach",
    "Wyvernspire", "Blightspire", "Wraithwood", "Emberwood", "Stonehollow", "Frostreach",
    "Nightglen", "Shadowward", "Gloomwood", "Silvermoor", "Ashwatch", "Duskmarch",
    "Windward", "Wolfridge", "Ironwatch", "Dreadspire", "Ruinwatch", "Brightglen",
    "Fellwatch", "Stormcrest", "Crowward", "Grimmoor", "Gloamward", "Ebonshade",
    "Rimewood", "Bloodmoor", "Shadowfen", "Marrowwatch", "Blightglen", "Wolvenshade",
    "Ironpeak", "Duskfen", "Silverreach", "Thornfen", "Ashmoor", "Brightmere",
    "Cinderhaven", "Stormward", "Darkspire", "Wraithhaven", "Gloamhollow", "Ruinward",
    "Emberwatch", "Frostmoor", "Nightward", "Wolvenpeak", "Oakenshade", "Crowward",
    "Wyvernwood", "Ravenwood", "Silverwatch", "Gravenward", "Eboncrest", "Bloodshade",
    "Dreadward", "Ashward", "Fellpeak", "Blighthaven", "Gloomwatch", "Windspire",
    "Rimewatch", "Thornward", "Stormshade", "Duskmire", "Ironspire", "Marrowfen",
    "Nightmoor", "Ruinwood", "Wraithward", "Crowmoor", "Brightshade", "Cinderwood",
    "Darkward", "Frostward", "Blightwood", "Gravenpeak", "Emberwood", "Silverfen",
    "Ashspire", "Dreadwood", "Wolvenward", "Ebonshade", "Bloodglen", "Fellmoor",
    "Stormfen", "Ruinreach", "Wyvernspire", "Brightmoor", "Nightglen", "Cinderreach",
    "Ironmere", "Ravenmoor", "Blighthollow", "Gloamshade", "Marrowspire", "Darkfen",
    "Windmere", "Shadowmere", "Brightspire", "Crowspire", "Duskwatch", "Gloomglen",
    "Frostspire", "Silverwatch", "Bloodward", "Emberfen", "Grimwatch", "Wyldethorn",
    "Thorncrest", "Rimeward", "Stormhollow", "Ebonscar", "Fellglen", "Ashenguard",
    "Blighthaven", "Nightfen", "Cinderfall", "Wolvenwatch", "Frostmere", "Silverhollow",
    "Ironwood", "Marrowhaven", "Ravenward", "Stormwatch", "Crowwood", "Gloamspire",
    "Grimreach", "Dreadfen", "Ashenward", "Shadowward", "Bloodspire", "Brightshade",
    "Windglen", "Duskwatch", "Ruinmoor", "Thornwood", "Blightmoor", "Ebonward",
    "Wolvenshade", "Darkmoor", "Silverglen", "Brightwatch", "Rimeglen", "Gloamwood",
    "Marrowward", "Frostward", "Wyvernfen", "Cinderpeak", "Emberwatch", "Nightshade",
    "Stormward", "Thornreach", "Crowshade", "Gravenmoor", "Ruinward", "Wolvenpeak",
    "Fellwood", "Ashfall", "Silvermoor", "Brightglen", "Blightshade", "Rimewatch",
    "Wraithward", "Darkreach", "Stormhaven", "Duskspire", "Ebonscar", "Windshade",
    "Gloomwood", "Wyldeglen", "Nightcrest", "Grimward", "Silverward", "Shadowward"
]


# First-half pool for generate_location_item(), which picks one entry with
# random.choice() and prepends it to a root. That function calls
# .capitalize() on the joined string, so only the first letter of the final
# name stays upper-case. A few prefixes are repeated (e.g. "Stormward",
# "Brightwood", "Windcrest") and are therefore drawn more often.
location_prefixes = [
    "Black", "Stone", "Night", "Sky", "Raven", "Ashen", "Silver", "Shadow", "Frost",
    "Storm", "Green", "Gloom", "Crow", "Iron", "Drift", "Bright", "Dusk", "White",
    "Bracken", "Morrow", "Hollow", "Thorn", "Glow", "Deep", "High", "Low", "Marrow",
    "Moon", "Sun", "Crystal", "Wind", "Dark", "Mist", "Wood", "Oak", "Rune", "Ebon",
    "Fell", "Wolf", "Flame", "Blood", "Mire", "River", "Ice", "Cloud", "Quill", "Cinder",
    "Fire", "Sable", "Blaze", "Amber", "Dusken", "Golden", "Sapphire", "Bronze", "Char",
    "Burnt", "Grim", "Blue", "Red", "Ivory", "Steel", "Ruby", "Onyx", "Jade", "Emerald",
    "Topaz", "Copper", "Dust", "Ironwood", "Pale", "Twilight", "Autumn", "Spring", "Summer",
    "Winter", "Nightfall", "Morning", "Evening", "Sunrise", "Sunset", "Azure", "Umber",
    "Obsidian", "Brimstone", "Lunar", "Solar", "Seafrost", "Violet", "Cobalt", "Crest",
    "Echo", "Vine", "Rift", "Dawn", "Shatter", "Oaken", "Misty", "Ever", "Bitter", "Hearth",
    "Stormy", "Chill", "Rime", "Ember", "Charred", "Crumbling", "Feral", "Wild", "True",
    "Ashwood", "Steelshade", "Coal", "Quake", "Tempest", "Midnight", "Ruin", "Ironshade",
    "Haven", "Wraith", "Ancient", "Fallow", "Sablethorn", "Goldenwood", "Darkwater",
    "Shrouded", "Forsaken", "Thistle", "Shadowstone", "Cloaked", "Broken", "Shattered",
    "Glass", "Star", "Riverthorn", "Moonlight", "Windspire", "Ironwing", "Whisper", "Hollowshade",
    "Duskwind", "Briar", "Dew", "Glacier", "Frostthorn", "Flicker", "Velvet", "Hollowheart",
    "Ashforge", "Stormveil", "Flint", "Ravencrest", "Sundown", "Bramble", "Darkspire", "Frostwood",
    "Still", "Swift", "Windfall", "Evergreen", "Dread", "Brightstone", "Deepwater", "Sablewood",
    "Cloudspire", "Everdark", "Brightforge", "Wildthorn", "Ivoryspire", "Ravenshade", "Flare",
    "Skyward", "Sunchase", "Eclipsed", "Frostwing", "Crown", "Blighted", "Hallowed", "Silent",
    "Crisp", "Barren", "Stillwater", "Gravestone", "Ironflame", "Ashfall", "Silverthorn",
    "Stormpeak", "Nightsong", "Dapple", "Thornwild", "Coldspire", "Brightshade", "Wolfstone",
    "Ironheart", "Windstorm", "Silverveil", "Wildspire", "Frostfire", "Stormcrown", "Moonspire",
    "Ashcrown", "Brightstar", "Dusklight", "Shadewood", "Dreadstone", "Brighthollow", "Highspire",
    "Stormcliff", "Blightwood", "Fellstone", "Lowthorn", "Deepcrest", "Whispering", "Darkthorn",
    "Silverwisp", "Greenwood", "Crimson", "Bloodthorn", "Ebonspire", "Frostshadow", "Windthorn",
    "Ashspire", "Fallen", "Darkshade", "Moondrift", "Driftwood", "Wolfshade", "Ravenstone",
    "Everflame", "Frostfell", "Silentwing", "Crowshade", "Cindervale", "Brightvale", "Ashvale",
    "Ruinspire", "Thornshade", "Hearthstone", "Duskhaven", "Silverkeep", "Shadowvale", "Oakvale",
    "Flamecrest", "Bloodspire", "Rimeward", "Stormvale", "Ebonward", "Bitterspire", "Cinderward",
    "Thorncliff", "Ravenshadow", "Shimmering", "Burning", "Frostshade", "Havenward", "Brightwind",
    "Wraithvale", "Mistward", "Cloudward", "Shadowward", "Firevale", "Ashward", "Moonward",
    "Glowthorn", "Iceward", "Windward", "Deepward", "Crestward", "Brightward", "Stormward",
    "Hollowward", "Cindercrest", "Bloodward", "Wolfward", "Darkward", "Rimecrest", "Silverward",
    "Oakward", "Flameward", "Quillward", "Windcrest", "Stoneward", "Thornward", "Crowward",
    "Dawnward", "Fellward", "Ebonward", "Sunward", "Riverward", "Glowward", "Crystalward",
    "Stormward", "Darkward", "Runecrest", "Hollowcrest", "Brightcrest", "Silvercrest", "Wolfcrest",
    "Bloodcrest", "Ashcrest", "Nightcrest", "Mooncrest", "Icecrest", "Thorncrest", "Firecrest",
    "Windcrest", "Glowcrest", "Rivercrest", "Suncrest", "Crystalcrest", "Dawncrest", "Highcrest",
    "Lowcrest", "Brightwood", "Duskwood", "Ravenwood", "Ashenwood", "Silverwood", "Shadowwood",
    "Frostwood", "Stormwood", "Gloomwood", "Crowwood", "Ironwood", "Brightwood", "Darkwood",
    "Mistwood", "Oakwood", "Runewood", "Ebonwood", "Fellwood", "Wolfwood", "Flamewood",
    "Bloodwood", "Icewood", "Cloudwood", "Quillwood", "Cinderwood", "Moonwood", "Sunwood",
    "Crystalwood", "Windwood", "Hollowwood", "Rimewood", "Burnwood", "Wildwood", "Frostpine",
    "Silverpine", "Wolfpine", "Ravenpine", "Darkpine", "Ironpine", "Bloodpine", "Moonpine",
    "Ashpine", "Ebonpine", "Thornpine", "Brightpine", "Cinderpine", "Gloompine", "Fellpine"
]


# Second-half pool for generate_location_item(), which picks one entry with
# random.choice() and appends it to a prefix. Several roots are repeated
# (e.g. "starlight", "grove", "rift"), so they are drawn more often than
# single-entry roots.
location_roots = [
    "glen", "vale", "ford", "moor", "mire", "peak", "ward", "dale", "field", "holt",
    "haven", "wick", "borough", "bridge", "fell", "wood", "wraith", "fen", "scar",
    "ton", "hollow", "hold", "shire", "gate", "burg", "wald", "crest", "grove",
    "cove", "mirth", "ridge", "port", "bank", "strand", "heath", "watch", "heights",
    "fall", "steppe", "pine", "wold", "bend", "reach", "thorn", "brook", "spire",
    "march", "knoll", "cliff", "stone", "ledge", "loch", "rift", "meadow", "spring",
    "valley", "frost", "shade", "shore", "brink", "pass", "arch", "flame", "bay",
    "bayou", "pool", "grove", "thicket", "drift", "hill", "bluff", "crag", "pike",
    "hollow", "fort", "holt", "strand", "chasm", "pillar", "cairn", "plains", "run",
    "mill", "hearth", "den", "glade", "circle", "bramble", "rift", "ledge", "flow",
    "delta", "beacon", "wharf", "haven", "harbor", "summit", "heights", "ridge",
    "rift", "firth", "bramble", "arch", "wild", "forge", "moorland", "veil", "grove",
    "sanctum", "cairn", "burn", "knob", "plateau", "basin", "path", "haven", "outlook",
    "overlook", "crossing", "glacier", "marsh", "slough", "clearing", "outcrop",
    "bastion", "dell", "cairn", "beacon", "keep", "way", "kirk", "holt", "village",
    "warren", "station", "stead", "holdfast", "lair", "ascent", "descent", "cape",
    "cliffside", "fortress", "citadel", "wall", "mound", "slope", "bough", "spire",
    "grove", "hollow", "rift", "meadow", "cradle", "ridge", "copse", "burn", "head",
    "scar", "brink", "dune", "strand", "moor", "scar", "bluff", "fen", "thorn", "port",
    "crest", "holm", "fenland", "havenwood", "trench", "range", "cascade", "cleft",
    "cradle", "flats", "grove", "outpost", "havenwatch", "scar", "brim", "summit",
    "blaze", "dawn", "eve", "grotto", "peak", "tower", "archway", "gatehouse", "forge",
    "woodland", "woodside", "hillside", "glenwood", "star", "mount", "cliffview",
    "overpass", "steppe", "valewood", "frostwood", "brights", "cinders", "dunes",
    "brookside", "oak", "pinewood", "maplewood", "ashwood", "crown", "stonefield",
    "dawnward", "driftwood", "midfield", "wade", "crossroad", "wolf", "spirewood",
    "ridgefall", "wildpath", "moonridge", "darkwood", "starlight", "everwood",
    "gladecrest", "windridge", "stormpath", "rune", "beaconwood", "stormheath",
    "cragwood", "thornridge", "seaview", "bramblewood", "ashenhold", "windwatch",
    "hollowcrest", "starlight", "willow", "riverbend", "stream", "evergreen",
    "hearthwood", "flint", "hawk", "goldenhold", "eagleridge", "brightheath",
    "silverpine", "blackthorn", "lowlands", "tide", "stormfront", "thornfield",
    "rimehold", "snowhold", "highlands", "starlight", "frozenwood", "lakeside",
    "frosthill", "moonlight", "starpath", "ashvale", "dreadmoor", "runeforge",
    "ridgeview", "stormrock", "ironcrest", "mistyvale", "oakridge", "brightridge",
    "stormhold", "scarwood", "emberward", "shadowfall", "willowvale", "runestone",
    "driftpeak", "wildgrove", "starlight", "frosthaven", "windward", "moondale",
    "thornspire", "ashvale", "brightwater", "mistfield", "frostview", "mistwatch",
    "ironview", "wildshade", "thornpath", "brightmoor", "frostwood", "stormvale",
    "cliffwood", "oakfall", "bramblehold", "willowwatch", "dawnspire", "frostmoor",
    "rimewatch", "starlight", "firewood", "darkheath", "stormcliff", "ashriver",
    "gladewatch", "moorcrest", "ridgeway", "wildhaven", "stormhaven", "thornhaven",
    "shadowvale", "willowwood", "mistheath", "frosthaven", "wildrock", "oakforge",
    "willowridge", "stormhold", "wildvale", "willowgrove", "brightwood", "rimehill",
    "bramblegrove", "thornrock", "runewood", "dreadfort", "oakspire", "wolfwatch",
    "dawnheath", "stormheath", "willowwatch", "ironmoor", "shadowglen", "willowthorn",
    "ironspire", "oakwood", "starlight", "dawnview", "wildwatch", "bramblevale",
    "ashstone", "pinefall", "brightrust", "wildthorn", "willowfield", "bramblemoor",
    "stormcrest", "oakfield", "bramblethorn", "wildspire", "willowmoor", "oakvale",
    "thornwood", "brightpine", "frostglen", "oakdale", "bramblewatch", "stormspire",
    "wildheights", "thornwatch", "frostwood", "rimeglen", "bramblewood", "wildview"
]


def generate_location_item():
    """Return one generated location name, e.g. "Blackwood" or "Crowmire".

    A prefix is drawn from ``location_prefixes`` and then a root from
    ``location_roots`` (both with ``random.choice``). The two are
    concatenated and passed through ``str.capitalize``, so the result has
    an upper-case first letter and is lower-case everywhere else.
    """
    # The pools are visited in order, so the prefix is always drawn first.
    halves = [random.choice(pool) for pool in (location_prefixes, location_roots)]
    return "".join(halves).capitalize()

# Enumerate every name that can end up in the pool: the hand-written seeds
# plus each capitalized prefix+root concatenation (the same formula
# generate_location_item() uses). If the requested pool size exceeds that
# universe, the rejection-sampling loop below could never finish, so fail
# fast with a clear error instead of hanging.
_location_universe = set(original_locations)
_location_universe.update(
    (prefix + root).capitalize()
    for prefix in location_prefixes
    for root in location_roots
)
if NUM_OPTIONS_PER_ENTITY_7_8 > len(_location_universe):
    raise ValueError(
        f"NUM_OPTIONS_PER_ENTITY_7_8={NUM_OPTIONS_PER_ENTITY_7_8} exceeds the "
        f"{len(_location_universe)} distinct location names available"
    )
del _location_universe  # only needed for the capacity check

# Start from the hand-written names, then add generated ones until the set
# holds the requested number of distinct values.
locations_set = set(original_locations)
while len(locations_set) < NUM_OPTIONS_PER_ENTITY_7_8:
    locations_set.add(generate_location_item())

# Sorted so the final list order does not depend on set iteration order.
locations = sorted(locations_set)

# ---------------------------------------------------------------------
#   3) Build out Entities 1–6 with NUM_OPTIONS_PER_ENTITY_1_6 items each
# ---------------------------------------------------------------------

# ENTITY 1: Unique fictional names
def generate_fictional_name():
    """Return a made-up name built from 2 to 4 consonant+vowel syllables.

    Each syllable is one or two random consonants (no "q" or "y") followed
    by exactly one random vowel. The syllables are concatenated and the
    result is capitalized.
    """
    consonants = "bcdfghjklmnprstvwxz"
    vowels = "aeiou"

    def syllable():
        # Onset length is rolled first, then the consonants, then the vowel.
        onset = random.choices(consonants, k=random.randint(1, 2))
        nucleus = random.choices(vowels, k=1)
        return "".join(onset + nucleus)

    how_many = random.randint(2, 4)
    return "".join(syllable() for _ in range(how_many)).capitalize()

# Keep generating fictional names until the pool holds
# NUM_OPTIONS_PER_ENTITY_1_6 distinct values; the set absorbs repeats.
entity1_set = set()
while NUM_OPTIONS_PER_ENTITY_1_6 > len(entity1_set):
    entity1_set.add(generate_fictional_name())
# Unsorted snapshot: the list follows the set's iteration order.
entity1_list = [*entity1_set]

# ENTITY 2: Broken word-formation names
def generate_broken_word_formation():
    """Return a random 6-10 letter ASCII string, capitalized.

    Letters are drawn from ``string.ascii_letters``; ``str.capitalize``
    then upper-cases the first letter and lower-cases the rest, so the
    case of the drawn letters does not survive.
    """
    # The length is rolled before the letters are drawn.
    letters = random.choices(string.ascii_letters, k=random.randint(6, 10))
    return "".join(letters).capitalize()

# Keep generating random-letter names until the pool holds
# NUM_OPTIONS_PER_ENTITY_1_6 distinct values; the set absorbs repeats.
entity2_set = set()
while NUM_OPTIONS_PER_ENTITY_1_6 > len(entity2_set):
    entity2_set.add(generate_broken_word_formation())
# Unsorted snapshot: the list follows the set's iteration order.
entity2_list = [*entity2_set]

# ENTITY 3: Base64-encoded 16 random bytes
def generate_base64_16():
    """Return 16 random bytes as a Base64 text string.

    Each byte comes from its own ``random.randint(0, 255)`` call. Sixteen
    bytes always encode to 24 Base64 characters, the last two being "="
    padding.
    """
    raw = bytes(random.randint(0, 255) for _ in range(16))
    encoded = base64.b64encode(raw)
    return encoded.decode("utf-8")

# Keep generating Base64 tokens until the pool holds
# NUM_OPTIONS_PER_ENTITY_1_6 distinct values; the set absorbs repeats.
entity3_set = set()
while NUM_OPTIONS_PER_ENTITY_1_6 > len(entity3_set):
    entity3_set.add(generate_base64_16())
# Unsorted snapshot: the list follows the set's iteration order.
entity3_list = [*entity3_set]

# ENTITY 4: City name (mash-up)
# Lower-case fragments that generate_city_name() glues together. That
# function uses random.sample(), which picks distinct list positions, but a
# few fragments appear at more than one position (e.g. "tarn", "marn",
# "lusk", "velthor"), so the same text can still occur twice in one name.
city_fragments = [
    "fros", "yorc", "haven", "brook", "frost", "strath", "wick", "dorf", "werth",
    "den", "caster", "feld", "hagen", "bur", "chester", "wal", "moor", "crest",
    "nor", "rime", "thorne", "storm", "wold", "bron", "kirk", "vale", "dorn",
    "fen", "grim", "tarn", "marn", "heath", "vren", "bran", "lusk", "drift",
    "harn", "cove", "skarn", "vesh", "holt", "bray", "karth", "glen", "tarn",
    "thal", "lorne", "cairn", "aven", "rynn", "thorpe", "bryth", "wynd",
    "keln", "rend", "vord", "lir", "stryn", "vann", "wreth", "bryn", "marn",
    "lusk", "helm", "fall", "rast", "lorn", "velth", "fenrick", "reth", "lornick",
    "clyth", "vorn", "hirn", "dryth", "wrick", "lyth", "vorak", "tarnick",
    "brynd", "zorn", "hovick", "dram", "velk", "thornvale", "worick", "farn",
    "dros", "valdr", "thrynn", "grimthorpe", "vyrn", "clydorn", "sarn",
    "whelm", "kelthor", "varst", "strain", "worlen", "cynrick", "dranor",
    "velmor", "lairn", "thesk", "vyrnack", "drenith", "karnell", "falk", 
    "holbeck", "drenwold", "brysk", "thorak", "gresk", "tornhelm", "fram",
    "rynthor", "halthorn", "brindell", "velmoth", "trask", "zalden", "brost",
    "gelnar", "kythen", "arngale", "kyrath", "lesthor", "fynrick", "helthorn",
    "draven", "grendel", "trenwold", "vesthor", "blithen", "krynn", "tharlen",
    "vosmark", "brenthor", "zorhelm", "drenthor", "larkholm", "hythorn",
    "grenwick", "wroth", "falden", "morwick", "dryntor", "sarnhelm", "tarnath",
    "velross", "brynor", "drathorn", "harthyr", "frask", "zenthor", "strathen",
    "vorwyn", "drenvale", "kornhill", "brenthar", "crynhelm", "thyssen", "wrathorn",
    "lyndar", "bristol", "ravenden", "stormyr", "grynnor", "kylnar", "thalden",
    "frosk", "hildar", "mornval", "grathor", "drenwyn", "lorthal", "vestyl",
    "wrenholm", "brynder", "thalrin", "krenwyn", "morneth", "tarnholm", "zandor",
    "frithen", "glenfyr", "thorlin", "vornest", "wrynnal", "stralden", "velten",
    "zorval", "lenthor", "harken", "drynwald", "thurnwick", "stormar", "triven",
    "gravenhall", "thorhelm", "vestwick", "falkryn", "bryndale", "zelthor",
    "ardyn", "wynthal", "vralden", "fenthor", "kravorn", "morthal", "zorath",
    "thyrell", "wrothen", "frenmor", "dravelle", "gryswold", "vryngale", "helwrith",
    "morncairn", "trenval", "lyndwold", "zerfall", "wrosk", "gravell", "ryndhall",
    "brythor", "morren", "thrasdal", "velkin", "grinwold", "kyland", "rynwood",
    "faldor", "tornryn", "helgar", "bryskorn", "wrethorn", "traskyr", "zorland",
    "lorrick", "drayth", "krynnal", "stravel", "havick", "thaldor", "vorhill",
    "velthor", "marnhelm", "froswyn", "rynmar", "dreskorn", "thyren", "wrenvald",
    "brython", "strenholm", "lornvale", "dryncrest", "morvorn", "thorhall",
    "vralhelm", "fynnor", "krynthar", "drathmar", "velthor", "trenwick",
    "zornholm"
]


def generate_city_name():
    """Return a city name made of 2 or 3 fragments from ``city_fragments``.

    The fragment count is rolled first; ``random.sample`` then picks that
    many entries from distinct list positions. They are concatenated in
    sampled order and the result is capitalized.
    """
    pieces = random.sample(city_fragments, k=random.randint(2, 3))
    return "".join(pieces).capitalize()

# Keep generating city names until the pool holds
# NUM_OPTIONS_PER_ENTITY_1_6 distinct values; the set absorbs repeats.
entity4_set = set()
while NUM_OPTIONS_PER_ENTITY_1_6 > len(entity4_set):
    entity4_set.add(generate_city_name())
# Unsorted snapshot: the list follows the set's iteration order.
entity4_list = [*entity4_set]

# ENTITY 5: 10-digit number
def generate_10_digit_number():
    """Return a random 10-digit decimal number as a string.

    The value is drawn uniformly from 1000000000..9999999999 inclusive, so
    the first digit is never zero.
    """
    smallest = 1_000_000_000
    largest = 9_999_999_999
    return f"{random.randint(smallest, largest)}"

# Keep generating 10-digit numbers until the pool holds
# NUM_OPTIONS_PER_ENTITY_1_6 distinct values; the set absorbs repeats.
entity5_set = set()
while NUM_OPTIONS_PER_ENTITY_1_6 > len(entity5_set):
    entity5_set.add(generate_10_digit_number())
# Unsorted snapshot: the list follows the set's iteration order.
entity5_list = [*entity5_set]

# ENTITY 6: Language name (fragments)
# First of the two fragment pools used by create_language_name(); for each
# fragment slot that function picks this pool or lang_parts_b with equal
# probability and then draws one entry with random.choice(). Some fragments
# are repeated (e.g. "wyn", "kyr", "zar") and are therefore drawn more often.
lang_parts_a = [
    "nor", "goth", "teu", "angl", "sax", "deric", "ish", "landic", "ger",
    "fran", "holl", "dan", "celt", "ven", "rom", "ald", "mar", "ester",
    "thr", "gor", "welf", "ul", "brim", "vin", "val", "wyn", "rol", "high",
    "low", "lux", "zel", "grim", "dry", "av", "ael", "lyr", "tor", "zhar",
    "var", "ryn", "kas", "thy", "sol", "dorn", "kal", "zelk", "mor", "brak",
    "aeg", "tan", "oth", "vor", "dra", "fen", "zar", "krin", "eon", "qui",
    "ess", "ten", "ral", "cor", "fel", "zer", "dar", "lor", "tur", "garn",
    "venn", "il", "nas", "thur", "wyn", "rex", "gel", "mal", "rix", "tas",
    "vos", "yth", "zan", "tar", "lil", "eor", "vel", "jor", "vyn", "nur",
    "sil", "lir", "arc", "wyn", "del", "fir", "lath", "eol", "quor", "is",
    "rin", "tha", "wer", "zor", "bil", "uth", "kyr", "syl", "fer", "nox",
    "zaf", "hal", "mir", "thar", "varl", "lum", "nar", "kyr", "kor", "fy",
    "vol", "eth", "nyr", "aen", "myr", "dyn", "bel", "tri", "sel", "aon",
    "vyr", "kir", "al", "bar", "ryl", "zyl", "bra", "hor", "sar", "ior",
    "mer", "ven", "ran", "gre", "yor", "len", "vir", "lad", "aer", "tos",
    "gil", "sun", "horl", "bre", "lan", "cal", "tol", "zerk", "rinth",
    "morn", "flyn", "eld", "dyr", "solr", "frey", "ain", "ilth", "kynd",
    "mun", "ore", "bal", "xim", "ros", "lyn", "vo", "wynl", "jyn", "kar",
    "zar", "zelm", "ond", "luxr", "gol", "ythr", "von", "thym", "aeth",
    "alor", "eryn", "uthr", "phyr", "cel", "bryn", "quarl", "myn", "tarn",
    "xar", "mol", "cam", "wyrl", "sta", "torl", "skel", "kaln", "grel",
    "lorn", "fim", "virl", "zynt", "durn", "thol", "moln", "wynth", "jorl",
    "zarith", "shyn", "valk", "trel", "run", "kelth", "rynth", "brul",
    "garth", "lorch", "zen", "dorr", "glym", "wurn", "thain", "uthil",
    "ver", "briml", "krinl", "fyrl", "brarn", "taln", "zenn", "worn", "stael",
    "norv", "brith", "kalv", "vern", "yrl", "nith", "ar", "iv", "zem",
    "vrol", "thrak", "garb", "loft", "ulv", "cro", "pel", "dran", "wynar",
    "saril", "kalor", "thil", "delv", "jar", "marn", "thal", "nalm", "zol",
    "relm", "dorth", "morth", "vis", "gorv", "halm", "noxr", "thyn", "rynn",
    "velm", "kaleth", "zoril", "lorn", "shorn", "drill", "bor", "thynl",
    "torl", "mornl", "quith", "fyln", "joril", "aelth", "alorn", "valkr",
    "zarith", "loril", "moryn", "garnl", "thorl", "zemn", "wyther", "arn",
    "fyrn", "loril", "vinth", "coril", "torryn", "dorl", "jorn", "lorryn",
    "zilm", "vral", "morl", "brynth", "larn", "thoril", "zylth", "garnil",
    "wynil", "jern", "thorn", "delil", "velorn", "zorn", "brimril", "norim",
    "soril", "goryn", "krinil", "wynil", "joril", "zilorn", "kalorn",
    "thornil", "brorn", "zilryn", "zaral", "throrn", "moril", "brithil",
    "naral", "woril", "lorthil", "loral", "deril", "zerorn", "bryl", "vernil"
]

# Second of the two fragment pools used by create_language_name(), drawn
# from with random.choice(). The list is heavily repetitive (e.g. "ward",
# "holm", "stead" each appear many times), so those fragments dominate draws
# from this pool.
lang_parts_b = [
    "burg", "ling", "ic", "ton", "vale", "fen", "grad", "helm", "thorpe",
    "chester", "ridge", "ward", "thol", "march", "holt", "darin", "oid",
    "glen", "dorn", "ord", "brek", "heim", "ver", "vick", "borough", "land",
    "stead", "wick", "shire", "lund", "holm", "gard", "feld", "wald", "haven",
    "borg", "stan", "thor", "croft", "loch", "mere", "vane", "holm", "fell",
    "port", "lund", "thorn", "stead", "mont", "brun", "clough", "moor",
    "brig", "wynd", "harth", "lund", "brith", "march", "vard", "staal",
    "karn", "vord", "grund", "feld", "brecht", "durth", "wald", "velt",
    "durn", "waith", "glen", "bard", "lath", "ward", "holt", "brun",
    "nock", "holm", "wold", "thorp", "hald", "land", "grim", "brand",
    "loch", "wynn", "strand", "mund", "borg", "hurst", "fort", "wyck",
    "hold", "lyn", "thor", "lor", "vyn", "der", "lorn", "var", "wyn",
    "crest", "fen", "ridge", "mere", "brook", "bridge", "tholl", "lor",
    "dorn", "hall", "wythe", "vell", "thorn", "ward", "karn", "velt",
    "marsh", "strand", "firth", "mere", "thorn", "ley", "grad", "wyke",
    "ward", "lund", "feld", "gard", "ward", "wynd", "tarn", "vold",
    "lund", "stead", "holm", "brig", "staad", "bryn", "vark", "thor",
    "berth", "loch", "wold", "brook", "land", "hirst", "lore", "croft",
    "wyrd", "heim", "holt", "stead", "moor", "wynn", "wyne", "holm",
    "loch", "ford", "hollow", "wynd", "gryn", "thorn", "hall", "ward",
    "croft", "valk", "helm", "holt", "mere", "stead", "march", "glen",
    "land", "ward", "berg", "glen", "stan", "mund", "croft", "ward",
    "holm", "stead", "tarn", "fell", "wold", "borg", "glen", "wythe",
    "land", "mire", "ward", "veld", "thorn", "grad", "vane", "wold",
    "loch", "thorpe", "wick", "march", "wald", "stead", "holt", "glen",
    "burg", "thor", "holm", "brig", "mere", "land", "thor", "stead",
    "croft", "wald", "holm", "tarn", "vale", "ward", "brun", "loch",
    "holm", "feld", "thor", "shire", "ward", "stead", "harth", "wyke",
    "croft", "berg", "loch", "land", "thorpe", "stead", "fell", "moor",
    "feld", "holm", "ward", "thorn", "lore", "vale", "mund", "thor",
    "thorpe", "holm", "holt", "strand", "ward", "wynd", "harth", "veld",
    "thorn", "shire", "loch", "stead", "vold", "gard", "ward", "holm",
    "holm", "croft", "ford", "land", "mere", "bridge", "veld", "ward",
    "hollow", "ward", "strand", "croft", "holm", "wynd", "thorn", "ward",
    "land", "ford", "thor", "shire", "stead", "holt", "mund", "vale",
    "thor", "borg", "thorn", "mund", "loch", "stead", "holm", "grad"
]

def create_language_name():
    """Build one pseudo-language name from two or three random fragments.

    Every fragment comes from ``lang_parts_a`` or ``lang_parts_b`` with equal
    probability. The fragments are concatenated and run through
    ``str.capitalize`` (first character upper-cased, the rest lower-cased).
    """
    pieces = [
        random.choice(lang_parts_a if random.random() < 0.5 else lang_parts_b)
        for _ in range(random.randint(2, 3))
    ]
    return "".join(pieces).capitalize()

# Keep generating names until the set holds NUM_OPTIONS_PER_ENTITY_1_6
# distinct values; repeated names are absorbed by the set and retried.
entity6_set = set()
while len(entity6_set) < NUM_OPTIONS_PER_ENTITY_1_6:
    candidate_name = create_language_name()
    entity6_set.add(candidate_name)
entity6_list = list(entity6_set)

# ---------------------------------------------------------------------
#   4) ENV DESCRIPTORS (Entity 7) + FINAL RUMORS (Entity 8)
#      (NUM_OPTIONS_PER_ENTITY_7_8 unique items each)
# ---------------------------------------------------------------------
# Adjective pool sampled by create_environment_item(). Some words appear more
# than once (e.g. "verdant", "ominous"), so random.choice picks those more
# often than the others.
descriptive_adjectives = [
    "bitter", "glowing", "swirling", "howling", "ominous", "relentless",
    "dusty", "amber", "crackling", "serene", "whirling", "shimmering",
    "luminous", "electric", "ethereal", "roaring", "perpetual", "choking",
    "misty", "distant", "suffocating", "foreboding", "glacial", "violent",
    "smoldering", "drifting", "vibrant", "gloomy", "raging", "silvered",
    "eerie", "subterranean", "shadowy", "turbulent", "spiraling", "ghostly",
    "feral", "tattered", "rustling", "blistering", "pale", "decrepit",
    "frosty", "melancholy", "radiant", "sinister", "fractured", "fleeting",
    "cascading", "fractious", "brazen", "frenzied", "verdant", "bleak",
    "burnished", "gleaming", "arid", "pristine", "majestic", "effervescent",
    "verdant", "bristling", "soaring", "desolate", "murky", "vivid",
    "translucent", "hazy", "looming", "thundering", "silken", "cavernous",
    "scorching", "crystalline", "velvety", "glinting", "serrated", "delicate",
    "teeming", "dusky", "craggy", "spindly", "flickering", "ominous",
    "jeweled", "stormy", "glossy", "bleeding", "ashen", "smoky",
    "simmering", "infused", "spacious", "ancient", "quivering", "wistful",
    "radiating", "savage", "lush", "mossy", "hollow", "gritty",
    "gleeful", "ethereal", "boundless", "subdued", "untamed", "wistful",
    "elegant", "flaming", "mottled", "drab", "smeared", "dim",
    "harrowing", "brilliant", "murky", "sublime", "sapphire", "craggy",
    "dappled", "towering", "radiant", "smoky", "shadowed", "prickly",
    "gilded", "lurid", "piercing", "rugged", "gnarled", "iridescent",
    "obscured", "shrouded", "beaming", "hidden", "ferocious", "imposing",
    "mist-laden", "ashen", "amber-lit", "opaline", "gossamer", "trembling",
    "muffled", "burgeoning", "polished", "brimming", "sagging", "wavering",
    "warped", "scarred", "corroded", "gleeful", "snarling", "unyielding",
    "impervious", "blaring", "vividly-hued", "cackling", "parched", "muffled",
    "muted", "haunting", "kaleidoscopic", "splintered", "tearing", "savage",
    "weathered", "gleaming", "shearing", "searing", "fluid", "shredded",
    "emerald", "coiling", "saturating", "plummeting", "gleaming", "crackling",
    "dripping", "enchanted", "fluid", "rippling", "flaring", "ashen",
    "veiled", "fluttering", "penetrating", "burnished", "softened", "fiery",
    "hushed", "strained", "splattered", "fractured", "entwined", "brittle",
    "shattering", "tender", "drenched", "elevated", "vaulted", "etched",
    "parched", "scratched", "scorched", "inviting", "enveloping", "flushed",
    "melting", "twisting", "vaporous", "lacquered", "smoky", "soft",
    "winking", "singed", "radiant", "prickling", "seething", "brimming",
    "diffused", "undulating", "looming", "cindered", "infinitesimal",
    "billowing", "eclipsed", "dappled", "luminous", "meandering",
    "leaping", "crumbling", "spangled", "warped", "etched", "flecked",
    "imbued", "livid", "beaded", "bristled", "gnawed", "sibilant",
    "churned", "spattered", "submerged", "prickled", "converging", "swamped",
    "luminescent", "tepid", "burnt", "slick", "torrid", "pebbled",
    "darting", "rolling", "wavering", "irregular", "fractured", "cracked",
    "plumed", "smeared", "muted", "bleeding", "streaming", "streaked",
    "serene", "peeling", "patched", "dampened", "gleaming", "incandescent",
    "pebbled", "marbled", "stained", "weathered", "polished", "flitting",
    "swinging", "tingling", "echoing", "clattering", "shimmering", "rolling"
]

# Noun phrases sampled by create_environment_item(); each one is combined
# with an article and one entry of descriptive_adjectives.
environment_nouns = [
    "fog", "storm", "wildfire", "twilight", "eclipse", "tempest", "cinder rain",
    "downpour", "gale", "dust cloud", "sunset haze", "moonlit gloom",
    "midnight wind", "frost winds", "ember glow", "ash-laden sky",
    "lightning haze", "aurora shimmer", "seismic quake", "rime frost",
    "cloudburst", "glacier drift", "sun-scorched desert", "tidal wave",
    "thunderclap", "whirlpool", "hailstorm", "desert mirage", "rain shadow",
    "ocean surge", "lava flow", "frostbite chill", "starlit tundra",
    "blizzard winds", "storm surge", "glacial valley", "volcanic ash",
    "morning dew", "crimson horizon", "choking haze", "night fog",
    "frozen spires", "storm clouds", "wind shear", "rolling thunder",
    "dawn mist", "sandstorm", "wind-swept plains", "tundra wasteland",
    "meteor shower", "volcanic plume", "sea foam", "dense canopy",
    "moonlit waters", "drifting dunes", "tidal basin", "ancient grove",
    "rainforest gloom", "sunlit meadow", "rippling tide", "raging inferno",
    "thick briars", "murky swamp", "shadowed forest", "crystal cavern",
    "ember field", "smoldering earth", "broken cliffs", "cascading falls",
    "desert spires", "verdant glade", "shimmering sands", "coral reef",
    "prismatic sky", "sunlit valley", "snow-capped peak", "lichen patch",
    "starry expanse", "rolling hills", "floodplain", "basalt ridge",
    "silvered lake", "crimson dusk", "ashen wasteland", "marshlands",
    "rain-drenched woods", "sea cliff", "twinkling void", "hurricane eye",
    "ocean abyss", "misty plateau", "thunderhead", "jungle undergrowth",
    "echoing cavern", "stalactite chamber", "tidal reef", "lichen-covered stones",
    "glacial crevasse", "lava bed", "burning fields", "frosted pines",
    "ashen cliffs", "midnight frost", "sapphire lagoon", "swirling eddies",
    "sea spray", "rushing river", "morning haze", "volcanic caldera",
    "salt flats", "jagged peaks", "ebon spires", "aurora veil",
    "searing winds", "clouded skies", "cracked earth", "emerald glade",
    "sun-bleached stones", "frozen waterfall", "withered grove", "molten lake",
    "flaming ridge", "silver mist", "haunted woods", "seething waters",
    "lonely island", "red sands", "midnight dunes", "hidden cove",
    "amber glow", "rocky shoal", "sun-baked earth", "twilight sky",
    "billowing ash", "blazing fields", "underground spring", "starless night",
    "howling winds", "charred forest", "stormy seas", "wind-torn cliffs",
    "glimmering caves", "dappled glen", "rushing falls", "briny waves",
    "quaking earth", "snowy tundra", "hidden lagoon", "cobalt depths",
    "golden fields", "ashen peaks", "serpent river", "gnarled roots",
    "misty fjord", "frozen expanse", "ancient cairn", "shadow vale",
    "thundering cascade", "withered meadow", "cinderstorm", "hidden ravine",
    "churning sea", "smoky crests", "flickering horizon", "scorched plains",
    "haunted marsh", "obsidian crag", "crimson embers", "misty bog",
    "stormy abyss", "burnt forest", "ancient chasm", "torrid cliffs",
    "serene oasis", "golden haze", "twilight grove", "silver cascade",
    "horizon blaze", "fogbound coast", "moonstruck waters", "crystalline shore",
    "glistening fjord", "sunlit grove", "shifting sands", "rippled expanse",
    "burning sky", "frosted glen", "ashen dunes", "smoke-filled air",
    "sapphire horizon", "storm-rolled plains", "moonlit veil", "ember-touched ground",
    "molten river", "midnight tundra", "swirling fog", "ebon depths",
    "hidden hollow", "stormlit horizon", "glowing sea", "snow-bound forest",
    "thunder-laden air", "shimmering ocean", "dusk-lit glen", "crimson forest",
    "raging maelstrom", "shadowed ridge", "vivid sunset", "ashen lake",
    "cinder plains", "starry cove", "hazy cliffs", "sunlit beach",
    "sunset waters", "aurora blaze", "searing sands", "snow-draped peaks",
    "ancient tundra", "rippled dunes", "burnished earth", "oceanic crest",
    "quivering fields", "tempest skies", "winding canyon", "firelit ridge",
    "rolling dunes", "seaside cliffs", "ashen fields", "glacial crest",
    "volcanic ridge", "billowing smoke", "canyon chasm", "starry grotto",
    "pristine wilderness", "frostbitten woods", "rushing rapids", "hidden vale",
    "raging tempest", "sea-soaked cliffs", "desolate outcrop", "shrouded fjord"
]

def create_environment_item():
    """Return one random "<article> <adjective> <noun>" environment phrase.

    The adjective comes from ``descriptive_adjectives``, the noun phrase from
    ``environment_nouns`` and the article is either "a" or "the".
    """
    # Draw order (adjective, noun, article) is kept so seeded runs match.
    adjective, noun_phrase = (
        random.choice(descriptive_adjectives),
        random.choice(environment_nouns),
    )
    return "{} {} {}".format(random.choice(["a", "the"]), adjective, noun_phrase)

# Keep generating descriptors until the set holds NUM_OPTIONS_PER_ENTITY_7_8
# distinct values; repeated phrases are absorbed by the set and retried.
entity7_set = set()
while len(entity7_set) < NUM_OPTIONS_PER_ENTITY_7_8:
    candidate_descriptor = create_environment_item()
    entity7_set.add(candidate_descriptor)
entity7_list = list(entity7_set)

def create_local_rumor():
    """Return one random rumor phrase of the form "the <legend> of <location>".

    The legend is drawn from ``legends`` first, then the location from
    ``locations``.
    """
    legend_item, loc = random.choice(legends), random.choice(locations)
    return "the {} of {}".format(legend_item, loc)

# Keep generating rumors until the set holds NUM_OPTIONS_PER_ENTITY_7_8
# distinct values; repeated phrases are absorbed by the set and retried.
entity8_set = set()
while len(entity8_set) < NUM_OPTIONS_PER_ENTITY_7_8:
    candidate_rumor = create_local_rumor()
    entity8_set.add(candidate_rumor)
entity8_list = list(entity8_set)

# ---------------------------------------------------------------------
#   5) GENERATE STATEMENTS & WRITE TO CSV (WITH PROGRESS BAR)
# ---------------------------------------------------------------------
def resample_master_data(type):
    """Generate NUM_STATEMENTS synthetic statements and write them to a CSV.

    Every row samples one value from each ``entityN_list`` (N = 1..8) and
    stores the eight values followed by the rendered sentence.

    Args:
        type: ``"long"`` for the narrative sentence template, ``"short"`` for
            the comma-separated template. The parameter name shadows the
            builtin but is kept so existing callers keep working.

    Raises:
        ValueError: If ``type`` is neither ``"long"`` nor ``"short"``.
    """
    # Validate before touching the filesystem: an unknown template used to
    # truncate the output file and then fail on an unbound `sentence`.
    if type not in ("long", "short"):
        raise ValueError(f"type must be 'long' or 'short', got {type!r}")

    # The cell that defines this function only imports `trange`, so bring
    # `tqdm` into scope here instead of relying on a later cell having run.
    from tqdm import tqdm

    OUTPUT_CSV = "path"
    output_dir = os.path.dirname(OUTPUT_CSV)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(OUTPUT_CSV, mode='w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow([
            "entity1", "entity2", "entity3", "entity4",
            "entity5", "entity6", "entity7", "entity8",
            "generated_text"
        ])

        # Single-line progress bar; the context manager closes it even when
        # a row fails to generate or write.
        with tqdm(total=NUM_STATEMENTS,
                  desc="Generating statements",
                  bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]',
                  position=0,
                  leave=True) as pbar:
            for _ in range(NUM_STATEMENTS):
                e1 = random.choice(entity1_list)
                e2 = random.choice(entity2_list)
                e3 = random.choice(entity3_list)
                e4 = random.choice(entity4_list)
                e5 = random.choice(entity5_list)
                e6 = random.choice(entity6_list)
                e7 = random.choice(entity7_list)
                e8 = random.choice(entity8_list)

                if type == "long":
                    sentence = (
                        f"On day {e5}, {e1} convened with {e2} to debate the secret message "
                        f"they found in an obscure cave in {e4}, "
                        f"while whispering in {e6}, which reads '{e3}.' "
                        f"{e7.capitalize()} covered the scene, fueled by dark rumors of {e8}."
                    )
                else:
                    sentence = (
                        f"{e5}, {e1}, {e2}, {e4}, {e6}, {e3}, {e7.capitalize()}, {e8}"
                    )

                writer.writerow([e1, e2, e3, e4, e5, e6, e7, e8, sentence])
                pbar.update(1)

    print(f"\nDone! {NUM_STATEMENTS:,} statements saved to '{OUTPUT_CSV}'.")

import pandas as pd
import random

def resample_master_data_table(type, num_statements):
    """Generate ``num_statements`` synthetic statements as a DataFrame.

    Every row samples one value from each ``entityN_list`` (N = 1..8). The
    "long" template renders entities 1-6 only; entities 7 and 8 are still
    sampled and stored in their columns.

    Args:
        type: ``"long"`` for the narrative sentence template, ``"short"`` for
            the comma-separated list of all eight entities. The parameter
            name shadows the builtin but is kept for existing callers.
        num_statements: Number of rows to generate.

    Returns:
        DataFrame with columns ``entity1`` .. ``entity8`` and
        ``generated_text``.

    Raises:
        ValueError: If ``type`` is neither ``"long"`` nor ``"short"``.
    """
    # An unknown template used to surface as an unbound `sentence` error.
    if type not in ("long", "short"):
        raise ValueError(f"type must be 'long' or 'short', got {type!r}")

    columns = [
        "entity1", "entity2", "entity3", "entity4",
        "entity5", "entity6", "entity7", "entity8",
        "generated_text",
    ]
    data = {name: [] for name in columns}

    for _ in range(num_statements):
        # Sampling order e1..e8 is kept so seeded runs draw the same values.
        e1 = random.choice(entity1_list)
        e2 = random.choice(entity2_list)
        e3 = random.choice(entity3_list)
        e4 = random.choice(entity4_list)
        e5 = random.choice(entity5_list)
        e6 = random.choice(entity6_list)
        e7 = random.choice(entity7_list)
        e8 = random.choice(entity8_list)

        if type == "long":
            # The trailing space is part of the original template and is kept.
            sentence = (
                f"On day {e5}, {e1} convened with {e2} to debate the secret message "
                f"they found in an obscure cave in {e4}, "
                f"while whispering in {e6}, which reads '{e3}.' "
            )
        else:
            # Sixth slot is e6; it previously repeated e5 and dropped e6.
            sentence = (
                f"{e1}, {e2}, {e3}, {e4}, {e5}, {e6}, {e7}, {e8}"
            )

        for name, value in zip(columns, (e1, e2, e3, e4, e5, e6, e7, e8, sentence)):
            data[name].append(value)

    return pd.DataFrame(data)

## Prompt Generation Function

In [None]:
import numpy as np
import pandas as pd
import os
import random 
from datetime import datetime
import time
import names
from collections import defaultdict
import warnings
from tqdm import tqdm
from hmmlearn import hmm
from typing import List, Tuple
import re 
from collections import Counter
import scipy.stats
from scipy.stats import pareto
import scipy.stats as stats
random.seed(42)
from nltk.probability import FreqDist

def sample_training_data(F_dataset, size, mono_percent, nonmono_min, pareto_alpha):
    """Sample ``size`` facts: a share of singletons plus Pareto-repeated facts.

    ``int(size * mono_percent)`` facts are drawn without replacement and
    appear once each ("mono-facts"). The rest of the budget is filled by
    walking the remaining distinct facts in random order and repeating each
    one ``ceil(X)`` times with ``X ~ Pareto(b=pareto_alpha,
    scale=nonmono_min)``; the last fact used is truncated so the total is
    exactly ``size``. If the pool runs out first, the shortfall is topped up
    with facts drawn from the pool with replacement.

    Note: repetition counts come from SciPy, which draws from NumPy's global
    random state when no ``random_state`` is given, so ``random.seed`` alone
    does not make the output reproducible.

    Args:
        F_dataset: DataFrame with a ``generated_text`` column.
        size: Total number of rows in the result.
        mono_percent: Fraction of ``size`` reserved for mono-facts.
        nonmono_min: Pareto scale, i.e. the minimum repetition count drawn.
        pareto_alpha: Pareto shape parameter.

    Returns:
        DataFrame with a single ``fact`` column, in shuffled order.

    Raises:
        ValueError: If more mono-facts are requested than rows exist, or if
            rows remain to be filled but no non-mono facts are left.
    """
    all_facts = F_dataset['generated_text'].tolist()
    total_facts = len(all_facts)

    # Split the budget between singletons and repeated facts.
    num_mono_facts = int(size * mono_percent)
    remaining_size = size - num_mono_facts

    if num_mono_facts > total_facts:
        raise ValueError(
            f"Cannot select {num_mono_facts} mono-facts from {total_facts} total facts."
        )

    mono_facts = random.sample(all_facts, num_mono_facts)

    # De-duplicate in first-seen order (a plain set difference would make the
    # pre-shuffle order depend on string hashing), then shuffle.
    mono_set = set(mono_facts)
    remaining_facts_pool = [
        fact for fact in dict.fromkeys(all_facts) if fact not in mono_set
    ]
    random.shuffle(remaining_facts_pool)

    if remaining_size > 0 and not remaining_facts_pool:
        raise ValueError(
            f"Cannot fill {remaining_size} non-mono rows: no facts remain "
            f"after selecting {num_mono_facts} mono-facts."
        )

    nonmono_facts_repeated = []
    current_sum = 0
    for fact in remaining_facts_pool:
        if current_sum >= remaining_size:
            break
        reps = scipy.stats.pareto.rvs(b=pareto_alpha, scale=nonmono_min)
        reps = max(int(np.ceil(reps)), 1)  # at least one repetition
        # Truncate the last fact so the total lands exactly on the budget.
        reps = min(reps, remaining_size - current_sum)
        nonmono_facts_repeated.extend([fact] * reps)
        current_sum += reps

    # Pool exhausted before the budget was met: top up with random picks.
    if current_sum < remaining_size:
        needed = remaining_size - current_sum
        nonmono_facts_repeated.extend(random.choices(remaining_facts_pool, k=needed))

    O_facts = mono_facts + nonmono_facts_repeated
    random.shuffle(O_facts)  # mix singletons and repeated facts

    return pd.DataFrame({'fact': O_facts})

def process_csv_to_prompt(df) -> str:
    """Join the given strings with newlines and append a final period.

    Despite the parameter name, the value is handed directly to
    ``"\\n".join``; the caller in this file passes a list of fact strings.

    Returns:
        The joined text followed by ".", or the literal
        "Error generating prompt" when joining fails (the error is printed).
    """
    try:
        return "\n".join(df) + "."
    except Exception as e:
        print(f"Error processing DataFrame to prompt: {e}")
    return "Error generating prompt"

def process_csv_to_statements(df, entity_cols: list) -> pd.DataFrame:
    """Collapse selected ``entity<N>`` columns into one comma-separated fact.

    Args:
        df: DataFrame holding columns named ``entity<N>``.
        entity_cols: Column suffixes to include, in output order
            (e.g. ``[1, 2, 4]`` selects entity1, entity2, entity4).

    Returns:
        DataFrame with one ``fact`` value per input row. On any error the
        message is printed and an empty DataFrame with a ``fact`` column is
        returned instead.
    """
    try:
        records = [
            {"fact": ", ".join([str(row[f'entity{col}']) for col in entity_cols])}
            for _, row in df.iterrows()
        ]
        return pd.DataFrame(records)
    except Exception as e:
        print(f"Error processing CSV to statements: {e}")
        return pd.DataFrame(columns=["fact"])

def generate_prompt_data(F_dataset, size):
    """Sample ``size`` facts and return them as a prompt with their monofact rate.

    Args:
        F_dataset: Sequence of fact strings (it is passed to
            ``random.sample``, so duplicates in the sequence can be drawn
            more than once).
        size: Number of facts to draw without replacement by position.

    Returns:
        Tuple ``(prompt, mono_pct)``: the newline-joined prompt text built by
        ``process_csv_to_prompt`` and the value ``mono_calc`` reports for the
        sampled facts.
    """
    statements = random.sample(F_dataset, size)
    mono_pct = mono_calc(statements)
    prompt = process_csv_to_prompt(statements)
    return (prompt, mono_pct)

def create_powerlaw_p(F_dataset, pareto_alpha):
    """Expand the facts into a pool with Pareto-distributed repetition counts.

    Each entry of ``F_dataset['generated_text']`` is repeated ``ceil(X)``
    times, where ``X`` is one draw from ``pareto(b=pareto_alpha, scale=1)``.
    The monofact rate of the resulting pool is printed.

    Returns:
        List of fact strings with repetitions, in input order.
    """
    new_facts = []
    for fact in F_dataset['generated_text']:
        copies = int(np.ceil(pareto.rvs(b=pareto_alpha, scale=1)))
        new_facts += [fact] * copies
    # Report the monofact rate of the expanded pool.
    print(f'Monofact % in p is: {mono_calc(new_facts)}')
    return new_facts

def create_normal_p(F_dataset, mean, std_dev):
    """Expand the facts into a pool with normally distributed repetition counts.

    Each entry of ``F_dataset['generated_text']`` is repeated
    ``ceil(max(1, X))`` times, where ``X`` is one draw from
    ``np.random.normal(mean, std_dev)``; every fact therefore appears at
    least once. The monofact rate of the resulting pool is printed.

    Returns:
        List of fact strings with repetitions, in input order.
    """
    new_facts = []
    for fact in F_dataset['generated_text']:
        draw = np.random.normal(loc=mean, scale=std_dev)
        copies = int(np.ceil(max(1, draw)))
        new_facts += [fact] * copies
    # Report the monofact rate of the expanded pool.
    print(f'Monofact % in p is: {mono_calc(new_facts)}')
    return new_facts

def mono_calc(new_facts):
    """Return the monofact rate of a collection of facts.

    The rate is the number of distinct facts that occur exactly once divided
    by the total number of items (repetitions included).

    Args:
        new_facts: Iterable of hashable facts with a length.

    Returns:
        A float in [0, 1]; ``0.0`` for an empty collection (previously a
        ``ZeroDivisionError``).
    """
    total = len(new_facts)
    if total == 0:
        return 0.0
    fact_counts = Counter(new_facts)
    num_mono = sum(1 for count in fact_counts.values() if count == 1)
    return num_mono / total

## Generation Process

In [None]:
## set-up 
import google.generativeai as genai
import gc
import re
import pandas as pd
import numpy as np
import random
import csv
import os
import scipy.stats
from scipy.stats import pareto
import scipy.stats as stat
from tqdm import tqdm 
import pandas as pd 
import random
from collections import Counter
import scipy.stats as stats

#####CONTROLS####
# NOTE(review): "key" and "path" look like placeholders; supply real values
# before running.
YOUR_API_KEY = "key"
genai.configure(api_key=YOUR_API_KEY)
TYPE = "long"   # sentence template passed to resample_master_data_table
path = "path"
size = 1000     # number of facts sampled into each prompt

####EXPERIMENTAL INPUTS####
MEANS = list(range(10, 81, 10))    # 10, 20, ..., 80
ITERATIONS = list(range(1, 101))   # 1 .. 100
gen_size = 100  # statements requested from the model per call
temp = 1.0      # sampling temperature for generation

####HELPER FUNCTIONS###
def hallucination_rate(prompt, response_text):
    """Return the share of response lines that do not occur in the prompt.

    Lines are compared after stripping surrounding whitespace, and blank
    response lines (e.g. from a trailing newline) are ignored, matching how
    ``entity_hallucination_rate`` prepares its input.

    The numerator counts *distinct* response lines missing from the prompt;
    the denominator counts all non-empty response lines, duplicates included.

    Args:
        prompt: Newline-separated facts that were given to the model.
        response_text: Newline-separated statements returned by the model.

    Returns:
        A float in [0, 1]; ``0.0`` when the response has no non-empty lines.
    """
    prompt_lines = {line.strip() for line in prompt.split("\n")}
    response_lines = [
        line.strip() for line in response_text.split("\n") if line.strip()
    ]
    if not response_lines:
        return 0.0
    # Set difference: a hallucinated line repeated several times counts once.
    hallucinations = set(response_lines) - prompt_lines
    return len(hallucinations) / len(response_lines)

def exact_allucination_rate(prompt, response_text):
    """Return the fraction of response lines that are not exact prompt lines.

    Unlike ``hallucination_rate``, every response line is counted
    individually, so a repeated hallucinated line counts each time. Lines are
    compared after stripping surrounding whitespace, and blank response lines
    are ignored.

    Args:
        prompt: Newline-separated facts that were given to the model.
        response_text: Newline-separated statements returned by the model.

    Returns:
        A float in [0, 1]; ``0.0`` when the response has no non-empty lines.
    """
    # A set gives O(1) membership tests for each response line.
    prompt_lines = {line.strip() for line in prompt.split("\n")}
    response_lines = [
        line.strip() for line in response_text.split("\n") if line.strip()
    ]
    if not response_lines:
        return 0.0

    exact_matches = sum(1 for line in response_lines if line in prompt_lines)
    return 1 - (exact_matches / len(response_lines))

def all_entities(prompt):
    """Parse the "long" fact sentences in ``prompt`` into entities.

    Each line is searched for the pattern
    "On day <digits>, <word> convened with <word> ... in <word>, while
    whispering in <word>, which reads '<text>'". The six captured fields are
    stripped of surrounding punctuation and lower-cased.

    Returns:
        Tuple ``(entity_set, valid_combinations)``: the set of all cleaned
        fields seen, and one list of six cleaned fields (day, name1, name2,
        location, language, code) per matching line, in prompt order.
    """
    fact_pattern = re.compile(
        r"On day (\d+), (\w+) convened with (\w+) to debate the secret message they found in an obscure cave in (\w+), while whispering in (\w+), which reads '([^']+)'"
    )
    strip_chars = ".,!?;:'\""

    entity_set = set()
    valid_combinations = []

    for raw_line in prompt.split("\n"):
        line = raw_line.strip()
        if not line:
            continue

        hit = fact_pattern.search(line)
        if hit is None:
            continue

        # Clean the six captured fields once and reuse them for both outputs.
        combo = [field.strip(strip_chars).lower() for field in hit.groups()]
        entity_set.update(combo)
        valid_combinations.append(combo)

    return entity_set, valid_combinations

def entity_hallucination_rate(prompt, response_text):
    """Return the share of response lines that mix entities across facts.

    A response line is counted as hallucinated when the known entities it
    mentions do not all belong to one single fact of the prompt. Lines that
    mention no known entity are not counted as hallucinated but still count
    towards the denominator (all non-empty response lines).

    Returns:
        ``hallucinated lines / non-empty response lines``, or ``0.0`` when
        the response has no non-empty lines. The hallucinated-line count is
        also printed.
    """
    entity_set, valid_combinations = all_entities(prompt)

    response_lines = [ln.strip() for ln in response_text.split("\n") if ln.strip()]
    if not response_lines:
        return 0.0

    hallucination_count = 0
    for line in response_lines:
        # Whitespace tokens, stripped of surrounding punctuation, lower-cased.
        tokens = {tok.strip(".,!?;:'\"").lower() for tok in line.split()}
        found_entities = tokens & entity_set

        # No known entity on this line: neither confirmed nor hallucinated.
        if not found_entities:
            continue

        # Valid only if one prompt fact contains every entity that was found.
        if not any(found_entities.issubset(combo) for combo in valid_combinations):
            hallucination_count += 1

    print(hallucination_count)
    return hallucination_count / len(response_lines)

#####EXPERIMENT####
# Parallel result accumulators: position k in every list belongs to the same
# model call.
# NOTE(review): `prompts` and `gen_lengths` are filled below but are not
# written into the `output` table — confirm whether that is intentional.
mono_pcts = []
means = []
prompts = []
responses = []
iterations = []
h_rates = []
exact_h_rates = []
gen_lengths = []
entity_h_rates = []

# Calculate total iterations for progress bar
total_iterations = len(MEANS) * len(ITERATIONS)

# Create progress bar
with tqdm(total=total_iterations, desc="Generating responses") as pbar:
    for i in ITERATIONS: 
        print(f"Iteration is: {i}")
        for mean_miu in MEANS:
            # Build a fresh 5000-row fact table, then expand it into a pool
            # whose per-fact repetition count is drawn with std = mean / 2.
            F_dataset = resample_master_data_table(TYPE, 5000) 
            p = create_normal_p(F_dataset, mean_miu, mean_miu * 0.5)
            # Sample `size` facts from the pool; mono_pct is their monofact rate.
            (prompt, mono_pct) = generate_prompt_data(p, size)
            print(f"mean is {mean_miu}")
            print(f"monofact is {mono_pct}")
            # NOTE(review): the literal below keeps its source indentation and
            # the stray double quotes; both are sent to the model verbatim.
            feed_prompt = f"""The following is a corpus of {size} facts. 
                          Please generate {gen_size} statements separated by new lines."  
                          Duplicates are allowed."
                          Facts: {prompt}"""
            # A new model object is created for every call.
            model = genai.GenerativeModel("gemini-1.5-flash") 
            response = model.generate_content(feed_prompt, 
                                              generation_config = genai.GenerationConfig(temperature=temp), 
                                              request_options={"timeout": 1000})
            response_text = response.text
            # Score the response against the bare fact list (`prompt`), not
            # against the instruction-wrapped `feed_prompt`.
            h_rate = hallucination_rate(prompt, response_text)
            print(f"Hallucination rate is: {h_rate}")
            exact_h_rate = exact_allucination_rate(prompt, response_text)
            print(f"Hallucination rate (exact) is: {exact_h_rate}")
            # Raw line count of the response; blank lines are included.
            response_list = response_text.split("\n")
            gen_length = len(response_list)
            print(f"Generation length is: {gen_length}")
            entity_h_rate = entity_hallucination_rate (prompt, response_text)
            print(f"Hallucination rate (entity) is: {entity_h_rate}")

            # Update the table 
            mono_pcts.append(mono_pct)
            means.append(mean_miu)
            prompts.append(prompt)
            responses.append(response_text)
            iterations.append(i) 
            h_rates.append(h_rate)
            exact_h_rates.append(exact_h_rate)
            gen_lengths.append(gen_length)
            entity_h_rates.append(entity_h_rate)
            
            #make sure each call is independent of each other
            del response
            del model
            gc.collect() 
            
            # Update progress bar
            pbar.update(1)
        
        # Results table: rebuilt from all rows collected so far and written
        # to the same file at the end of every outer iteration.
        output = pd.DataFrame({
            'mono_pcts': mono_pcts,
            'means': means,
            'responses': responses,
            'iterations': iterations,
            'h_rates': h_rates,
            'exact_h_rates': exact_h_rates,
            'entity_h_rates': entity_h_rates
        })
        output.to_csv(f"path")

## Data Merging

In [None]:
import pandas as pd
import glob
def merge_results():
    """Concatenate all matching result CSVs into one sorted CSV.

    The files matched by the input glob are read, stacked, stripped of the
    ``Unnamed: 0`` column (present when a file was saved with its index),
    sorted by ``mono_pcts`` then ``means`` in ascending order, re-indexed and
    written out without an index column.

    Raises:
        ValueError: If no file matches the input pattern.
    """
    all_files = glob.glob("path")
    if not all_files:
        # pd.concat would raise a less helpful ValueError on an empty list.
        raise ValueError("No result files matched the input pattern 'path'.")

    frames = [pd.read_csv(f) for f in all_files]
    combined_df = pd.concat(frames, ignore_index=True)
    # Files saved without an index have no 'Unnamed: 0'; do not fail on them.
    combined_df = combined_df.drop(columns=['Unnamed: 0'], errors='ignore')
    combined_df = combined_df.sort_values(
        by=['mono_pcts', 'means'], ascending=[True, True]
    ).reset_index(drop=True)
    combined_df.to_csv("path", index=False)


if __name__ == "__main__":
    merge_results()

## Boxplot

In [None]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt

# Set style
mpl.rcParams.update({
    'font.family': 'Times New Roman',
    'font.size': 9,
    'text.color': 'black',
    'axes.labelcolor': 'black',
    'xtick.color': 'black',
    'ytick.color': 'black',
})

# Load and prepare data
generation = pd.read_csv("path")
# generation = generation[generation["paretos"] >= 1]

# Ten equal-width monofact-rate bins over [0, 1]. linspace avoids the
# float-step pitfalls of arange, and include_lowest keeps a rate of exactly
# 0.0 in the first bin instead of turning it into NaN.
bin_edges = np.linspace(0.0, 1.0, 11)
bin_names = [f'{lo:.1f}-{hi:.1f}' for lo, hi in zip(bin_edges[:-1], bin_edges[1:])]
generation['mono_bin'] = pd.cut(generation['mono_pcts'],
                                bins=bin_edges,
                                labels=bin_names,
                                include_lowest=True)

# Collect hallucination rates per non-empty bin. observed=True states the
# categorical grouping behaviour explicitly; the emptiness check is kept as a
# safeguard.
grouped_data = []
bin_labels = []
for name, group in generation.groupby('mono_bin', observed=True):
    if not group.empty:
        grouped_data.append(group['h_rates'].values)
        bin_labels.append(name)

# Create plot
plt.figure(figsize=(11, 3), dpi=300)

# Tick labels are set through plt.xticks below rather than boxplot's
# `labels=` keyword, which newer Matplotlib releases deprecate.
boxplot = plt.boxplot(grouped_data, patch_artist=True)

# Style the plot
for box in boxplot['boxes']:
    box.set(facecolor='lightblue', alpha=0.6, edgecolor='black')
for element in ['whiskers', 'caps']:
    for item in boxplot[element]:
        item.set(color='black')
for median in boxplot['medians']:
    median.set(color='red')
for flier in boxplot['fliers']:
    flier.set(marker='o', markerfacecolor='gray', alpha=0.5, markersize=4)

plt.xlabel('Monofact Rate Range', fontsize=11, labelpad=5)
plt.ylabel('Hallucination Rate', fontsize=11, labelpad=5)
plt.title('Distribution of Hallucination Rates by Monofact Rate Range', fontsize=11)
# Boxes sit at positions 1..N, so label those positions with the bin names.
plt.xticks(range(1, len(bin_labels) + 1), bin_labels, rotation=0, fontsize=11)
plt.yticks(rotation=0, fontsize=11)
plt.grid(True, linestyle='--', alpha=0.2)
plt.tight_layout()

plt.savefig("path")
plt.show()