In [4]:
# Import Cell
# from gensim.models import KeyedVectors
from wordfreq import word_frequency
import random
import re
import os
import sys
from pathlib import Path
import xml.etree.ElementTree as ET

In [5]:
# Aux
def write_wordle_txt(filepath, words):
    with open(filepath, "w") as f:
        for word in words:
            f.write(f"{word}\n")

def clear_and_return_words(filepath):
    with open(filepath, 'r', encoding='utf-8') as file:
        words = {line.strip() for line in file if line.strip()}
    with open(filepath, 'w', encoding='utf-8') as file:
        pass 
    return words

def get_words(filepath):
    with open(filepath, 'r', encoding='utf-8') as file:
        words = {line.strip() for line in file if line.strip()}
    return words

In [None]:
# Model Load Cell
model = KeyedVectors.load_word2vec_format('bioword.vec.bin', binary=True)
print("✅ Model loaded successfully from ./bioword.vec.bin")

In [None]:
# Load MeSH descriptor terms from XML
mesh_path = "desc2025.xml"  # located in src/ alongside the notebook
tree = ET.parse(mesh_path)
root = tree.getroot()

# Extract MeSH descriptor terms
mesh_terms = set()
for record in root.findall(".//DescriptorRecord"):
    term = record.findtext("DescriptorName/String")
    if term:
        mesh_terms.add(term.lower())  # match lowercase words in BioWordVec

print(f"✅ Loaded {len(mesh_terms)} MeSH terms")


In [12]:
# Filtered Vocabulary Cell
country_blocklist = {
    "spain", "china", "india", "chile", "kenya", "ghana", "egypt", "nepal",
    "niger", "iran", "iraq", "syria", "haiti", "italy", "japan", "qatar",
    "yemen", "libya", "rwanda", "norway", "serbia", "sweden", "guinea",
    "israel", "belize", "bhutan", "greece", "poland", "romania", "mexico",
    "canada", "zambia", "swazis", "angola", "malta", "latvia", "togo",
    "benin", "samoa", "sudan", "oman", "peru", "laos", "cuba", "fiji",
    "chad", "mali", "tonga", "palau", "nauru", "yemen", "malta", "samoa",
    "france", "brazil", "russia", "korea", "vietnam", "gabon", "zimbabwe",
    "uganda", "tunisia", "thailand", "taiwan", "switzerland", "slovakia",
    "slovenia", "singapore", "senegal", "portugal", "philippines", "pakistan",
    "newzealand", "netherlands", "morocco", "mongolia", "malaysia", "luxembourg",
    "lithuania", "lebanon", "kuwait", "kazakhstan", "jamaica", "indonesia",
    "hungary", "finland", "estonia", "denmark", "czech", "croatia", "colombia",
    "bulgaria", "belgium", "austria", "australia", "argentina", "algeria", "panama",
    "europe", "africa", "asia", "america", "oceania", "kosovo", "ukraine",
    "scotland", "wales", "england", "ireland", "iceland", "sweden", "norway",
}

non_bio_terms = {
    "spain", "iran", "china", "politics", "crime", "theft", "war", "terrorism",
    "economics", "education", "law", "insurance", "banking", "religion", "computers",
    "marketing", "advertising", "journalism", "philosophy", "rape", "role", "jews", "sexism",
    "fascism", "capitalism", "communism", "socialism", "feminism", "racism",
    "homophobia", "transphobia", "xenophobia", "sexism", "ageism", "ableism", "minors",
    "police", "violence", "monaco", "prison", "prisons", "jail", "jails", "court", "courts",
    "judge", "judges", "lawyer", "lawyers", "attorney", "attorneys", "prosecutor", "theft",
}

food_terms = {
    'salads', 'drink', 'sweet', 'snack',
    'sweets', 'soup', 'sauces', 'sauce', 'salad', 'snacks', 'drinks',
    'dessert', 'desserts', 'dinner', 'breakfast', 'lunch', 'lunches', 'dinner', 'dinners'
    'dinners', 'breakfasts', 'cooking', 'cook', 'bake', 'baking',
    'baker', 'bakers', 'chef', 'chefs', 'cooks', 'cooked', 'cooking',
    'baked', 'baking', 'bakeries', 'bakery', 'restaurant', 'restaurants',
}

cities = {
    "london", "paris", "tokyo", "berlin", "madrid", "dubai", "delhi", "sydney", "seoul", "milan",
    "osaka", "vienna", "zurich", "prague", "dublin", "miami", "boston", "chicago", "houston", "atlanta",
    "toronto", "detroit", "seattle", "denver", "orlando", "phoenix", "dallas", "austin", "tampa", "vegas",
    "lisbon", "munich", "warsaw", "naples", "brasil", "quito", "havana", "cairo", "beirut", "jakarta",
    "manila", "hanoi", "oslo", "sofia", "riga", "varna", "geneva", "brno", "porto", "malaga",
    "nice", "lyon", "crete", "varna", "goa", "pune", "kyoto", "nagoya", "kobe", "sapporo",
    "chennai", "hyderabad", "ahmedabad", "surat", "jaipur", "kanpur", "nagpur",
    'rome', 'roma', 'napoli', 'naples', 'bologna', 'palermo', 'genova', 'firenze',
    'catania', 'venezia', 'torino', 'milano', 'trieste', 'verona', 'bari',
    'londres', 'parigi', 'berlino', 'madrid', 'barcelona', 'dublino',
}

us_states = {
    "alabama", "alaska", "arizona", "arkansas", "california", "colorado",
    "connecticut", "delaware", "florida", "georgia", "hawaii", "idaho",
    "illinois", "indiana", "iowa", "kansas", "kentucky", "louisiana",
    "maine", "maryland", "massachusetts", "michigan", "minnesota",
    "mississippi", "missouri", "montana", "nebraska", "nevada",
    "newhampshire", "newjersey", "newmexico", "newyork",
    "northcarolina", "northdakota", "ohio", "oklahoma",
    "oregon", "pennsylvania", "rhodeisland", "southcarolina", "southdakota", "tennessee", "texas", 
    "utah", "vermont", "virginia", "washington", "westvirginia", "wisconsin", "wyoming"
}

colors = {
    "red", "blue", "green", "black", "white", "yellow", "purple", "orange", "violet", "indigo", "silver", "golden", "bronze"
}

# Make a Set out of all the sets above
blocklist = set()
blocklist.update(country_blocklist)
blocklist.update(non_bio_terms)
blocklist.update(cities)
blocklist.update(us_states)
blocklist.update(colors)

def bio_score(word):
    bio_sim = model.similarity(word, "biology")

    life_sim = model.similarity(word, "life") ** 1.5
    cell_sim = model.similarity(word, "cell") ** 1.5
    chem_sim = model.similarity(word, "chemistry") ** 1.5
    disease_sim = model.similarity(word, "disease") ** 1.5
    org_sim = model.similarity(word, "organism") ** 1.5
    animal_sim = model.similarity(word, "animal") ** 1.5
    plant_sim = model.similarity(word, "plant") ** 1.5
    microbe_sim = model.similarity(word, "microbe") ** 1.5
    anatomy_sim = model.similarity(word, "anatomy") ** 1.5
    human_sim = model.similarity(word, "human") ** 1.5
    
    other_sims = sorted([life_sim, cell_sim, chem_sim, disease_sim, org_sim, animal_sim, plant_sim, microbe_sim, anatomy_sim, human_sim], reverse=True)[:7]
    similarity = (2 * bio_sim + sum(other_sims)) / 10
    return round(similarity, 4)

def is_clean_word(w, b=blocklist):
    return (
        4 <= len(w) <= 7
        and w.isalpha()
        and re.match(r"^[a-z]{4,7}$", w)
        and any(c in 'aeiou' for c in w)
        and w not in blocklist
        and word_frequency(w, 'en') < 1e-3
        and word_frequency(w, 'en') > 1e-8
        # and w in mesh_terms
        # and bio_score(w) > 0.15
    )

In [None]:
filtered_vocab_list = [w for w in model.key_to_index if is_clean_word(w)]
bio_words = set(filtered_vocab_list)
print(f"✅ Filtered vocab size after MeSH & wordfreq filtering: {len(bio_words)}")

In [None]:
# Select 200 words from the filtered vocabulary
sampled_words = random.sample(filtered_vocab_list, 400)

# Print the words in a 20x20 grid
for i in range(0, 400, 20):
    print("\t".join(sampled_words[i:i+20]))

In [7]:
output_dir = Path("../public/assets/data")
output_dir.mkdir(parents=True, exist_ok=True)
wordle_path = output_dir / "bio_words.txt"

In [8]:
current_words = get_words(wordle_path)

In [9]:
print(f"Length of current_words: {len(current_words)}")
print("Preview of current_words:", list(current_words)[:10])  # Display the first 10 words

Length of current_words: 817
Preview of current_words: ['octopus', 'drive', 'apium', 'mink', 'mitosi', 'mars', 'golgi', 'aids', 'joints', 'horses']


In [49]:
# Extra Sets of Words to Add
bonus_random_bio1 = {
    'gout', 'myopia', 'mitogen', 'domain', 'enzyme', 'corvus', 'anole',
    'zebra', 'estrus', 'krait', 'goat', 'embryo', 'spine', 'indri',
    'tonsil', 'swine', 'baleen', 'nucleus', 'apical', 'hepatic',
    'minnow', 'cactus', 'olive', 'capsule', 'deer', 'lion', 'trunk',
    'uterus', 'rabies', 'gaur', 'leptin', 'aorta', 'rana', 'clone',
    'pyrus', 'rice', 'mussel', 'melanin', 'nyala', 'palea', 'moss',
    'fever', 'cicada', 'axil', 'taiga', 'carapace', 'cervus', 'siren',
    'sucrose', 'octopus', 'larva', 'mange', 'dingo', 'sponge', 'bract',
    'carotene', 'capsid', 'viper', 'claw', 'cornea', 'pupa', 'whale',
    'toxin', 'lobster', 'helix', 'codon', 'catfish', 'phloem', 'soma',
    'cuticle', 'equine', 'redox', 'saliva', 'molar', 'klippe', 'polyp',
    'fly', 'gable', 'mollusk', 'sapling', 'grotto', 'sapiens', 'puffer',
    'zonation', 'larvae', 'fungus', 'cytosol', 'retina', 'ribose', 'cough'
}

bonus_random_bio2 = {
    'absorb', 'acari', 'algaes', 'amoeba', 'amylas', 'anther', 'angios', 'archae',
    'asthma', 'bamboo', 'bacter', 'beetle', 'bronch', 'cancer', 'cholera', 'collag',
    'corn', 'cortex', 'cotyle', 'cyto', 'cytosol', 'dopami', 'ebola', 'embryo',
    'enzyme', 'euglen', 'fever', 'ferns', 'filter', 'flower', 'flagel', 'fungi',
    'fungus', 'fusion', 'giardi', 'glands', 'glucos', 'golgi', 'herpes', 'hormone',
    'horse', 'human', 'induce', 'insect', 'insuli', 'kinase', 'lipid', 'lipids',
    'malari', 'mammal', 'mango', 'measle', 'meiosi', 'mitosi', 'monera',
    'monkey', 'moss', 'mouse', 'muscle', 'neuron', 'nucleus', 'onions', 'orange',
    'osmosis', 'ovary', 'parrot', 'patella', 'petals', 'phloem', 'pollen',
    'primate', 'quorum', 'rabbit', 'rabies', 'reptil', 'replic', 'retina', 'ribose',
    'rodent', 'rotavi', 'snail', 'sepals', 'spiroc', 'tibia', 'tomato', 'toxopl',
    'toxin', 'tree', 'tulip', 'urease', 'ureter', 'vesicl', 'villi', 'virus', 'whale',
    'xylem', 'zebra'
}

bonus_cell_words = {
    'nucleus', 'ribose', 'cytosol', 'golgi', 'lysine', 'mitose', 'actins',
    'villin', 'tubule', 'myosin', 'kinesin', 'lipids', 'spines', 'dynein',
    'cilia', 'vacuole', 'matrix', 'lamina', 'fibers', 'membrane', 'capsid',
    'pore', 'axonem', 'fibrin', 'gating', 'vessel', 'tissue', 'signal',
    'glands', 'recept', 'phagol', 'lumen', 'cytokin', 'fusion', 'protein',
    'lipid', 'vessels', 'cortex', 'mitosis', 'fiber', 'sugar'
}

bonus_plant_words = {
    'leaf', 'stem', 'root', 'flower', 'fruit', 'seed', 'bark', 'bud', 'petal', 'stamen',
    'pollen', 'spore', 'xylem', 'stomata', 'cuticle',
}

bonus_molecule_words = {
    'glucose', 'sucrose', 'oxygen', 'carbon', 'hydrogen', 'nitrogen', 'phosphate',
    'amino', 'acid'
}

bonus_symptom_words = {
    'fever', 'cough', 'pain', 'nausea', 'vomiting', 'diarrhea', 'fatigue',
    'headache', 'dizziness', 'sore', 'throat', 'rash', 'swelling',
    'inflammation', 'infection'
}

bonus_evolution_words = {
    'evolution', 'mutation', 'adaptation', 'natural', 'selection', 'species',
    'phylogeny', 'diversity', 'speciation', 'extinction', 'gene', 'genome',
    'chromosome', 'allele', 'population'
}

bonus_sick_words = {
    'disease', 'infection', 'virus', 'bacteria', 'fungus', 'parasite',
    'pathogen', 'syndrome', 'disorder', 'condition', 'illness', 'malady',
    'ailment', 'sickness', 'epidemic'
}

bonus_disease_words = {
    'ebola', 'malaria', 'tuberculosis', 'cholera', 'influenza',
    'hepatitis', 'measles', 'dengue', 'zika', 'hiv', 'aids', 'syphilis',
    'tetanus', 'rabies', 'leprosy', 'lupus', 'arthritis', 'diabetes',
    'asthma', 'cancer', 'hypertension', 'cardiovascular', 'stroke',
    'obesity', 'anemia', 'epilepsy', 'autism', 'schizophrenia', 'depression',
    'bipolar', 'anxiety', 'ptsd', 'ocd', 'adhd', 'add', 'dementia',
    'alzheimer', 'parkinson', 'huntington',
    'crohn', 'colitis', 'celiac', 'ibs', 'ibd', 'fibromyalgia', 'chronic',
    'fatigue', 'syndrome', 'chronic', 'pain', 'migraine',
}

full_bonus_words = set()
full_bonus_words.update(bonus_random_bio1)
full_bonus_words.update(bonus_random_bio2)
full_bonus_words.update(bonus_cell_words)
full_bonus_words.update(bonus_plant_words)
full_bonus_words.update(bonus_molecule_words)
full_bonus_words.update(bonus_symptom_words)
full_bonus_words.update(bonus_evolution_words)
full_bonus_words.update(bonus_sick_words)
full_bonus_words.update(bonus_disease_words)

print(f"✅ Bonus Words before double check: {len(full_bonus_words)}")
final_bonus_words = {w for w in full_bonus_words if is_clean_word(w)}
print(f"✅ Bonus Words after double check: {len(final_bonus_words)}")

✅ Bonus Words before double check: 305
✅ Bonus Words after double check: 216


### Last Chat-GPT Attempt

In [13]:
molecules = [
    # Nucleic acids
    "Adenine", "Thymine", "Cytosine", "Guanine", "Uracil", "ATP", "GTP", "CTP", "TTP", "UTP", "cAMP", "cGMP",
    "DNA", "RNA", "mRNA", "tRNA", "rRNA",

    # Amino acids
    "Alanine", "Arginine", "Asparagine", "Aspartic acid", "Cysteine", "Glutamic acid", "Glutamine", "Glycine",
    "Histidine", "Isoleucine", "Leucine", "Lysine", "Methionine", "Phenylalanine", "Proline", "Serine",
    "Threonine", "Tryptophan", "Tyrosine", "Valine",

    # Carbohydrates
    "Glucose", "Fructose", "Galactose", "Ribose", "Deoxyribose", "Maltose", "Lactose", "Sucrose", "Cellulose",
    "Starch", "Glycogen", "Chitin",

    # Lipids
    "Cholesterol", "Phosphatidylcholine", "Phosphatidylethanolamine", "Triglyceride", "Oleic acid", "Linoleic acid",
    "Palmitic acid", "Stearic acid", "Arachidonic acid", "Ceramide", "Sphingomyelin",

    # Vitamins
    "Vitamin A", "Vitamin B1", "Vitamin B2", "Vitamin B3", "Vitamin B5", "Vitamin B6", "Vitamin B7",
    "Vitamin B9", "Vitamin B12", "Vitamin C", "Vitamin D", "Vitamin E", "Vitamin K",

    # Hormones
    "Insulin", "Glucagon", "Adrenaline", "Noradrenaline", "Dopamine", "Serotonin", "Melatonin", "Cortisol",
    "Testosterone", "Estrogen", "Progesterone", "Thyroxine", "Triiodothyronine", "Calcitonin",

    # Metabolites and intermediates
    "Pyruvate", "Lactate", "Acetyl-CoA", "Citrate", "Isocitrate", "Alpha-ketoglutarate", "Succinyl-CoA",
    "Succinate", "Fumarate", "Malate", "Oxaloacetate", "NAD+", "NADH", "NADP+", "NADPH", "FAD", "FADH2",
    "Coenzyme Q", "Cytochrome c", "Glutathione",

    # Other signaling and structural molecules
    "cAMP", "cGMP", "Nitric oxide", "Histamine", "Prostaglandin E2", "Thromboxane", "Leukotriene",
    "Hemoglobin", "Myoglobin", "Collagen", "Elastin", "Keratin", "Actin", "Myosin", "Tubulin",

    # Ions and buffers
    "Bicarbonate", "Carbonic acid", "Phosphate buffer", "Ammonium", "Ammonia",

    # Secondary metabolites / Plant / Microbial
    "Chlorophyll", "Beta-carotene", "Xanthophyll", "Anthocyanin", "Lignin", "Pectin", "Flavonoid", "Alkaloid",
    "Terpene", "Phenol", "Tannin", "Saponin", "Resveratrol", "Curcumin", "Capsaicin", "Quercetin", "Catechin",

    # Others
    "Urea", "Creatine", "Creatinine", "Bilirubin", "Bile acid", "Surfactant", "Lysozyme", "Amylase",
    "Pepsin", "Trypsin", "Lipase", "DNA polymerase", "RNA polymerase", "Helicase", "Topoisomerase",
    "Ligase", "Ribosome", "Proteasome", "Interleukin-6", "Tumor necrosis factor", "Interferon",
]

biology_related_elements = {
    "Hydrogen", "Carbon", "Nitrogen", "Oxygen", "Phosphorus", "Sulfur",
    "Sodium", "Potassium", "Calcium", "Magnesium", "Chlorine",
    "Iron", "Zinc", "Copper", "Manganese", "Cobalt", "Molybdenum",
    "Selenium", "Iodine", "Fluorine", "Boron", "Chromium", "Nickel",
    "Vanadium", "Silicon", "Arsenic", "Tin"
}

deep_search = {
    'abdomen', 'abies', 'abscess', 'abyssal', 'acer', 'actin', 'adenine', 'adrenal', 'aerobic', 'agonist', 
    'alanine', 'algae', 'allele', 'allergy', 'alveoli', 'amnion', 'amniote', 'amoeba', 'amylase', 'anas', 
    'anemia', 'animal', 'annelid', 'anther', 'antigen', 'anura', 'aorta', 'apidae', 'apis', 'araneae', 
    'archaea', 'artery', 'ascaris', 'asexual', 'atria', 'atrium', 'atrophy', 'auxin', 'aves', 'axon',
    'bacilli', 'benign', 'benthic', 'benthos', 'bile', 'biofilm', 'biology', 'biome', 'biopsy', 'bird', 
    'blood', 'bolus', 'bovidae', 'brain', 'bronchi', 'bufo', 'buteo',
    'cactus', 'calyx', 'cambium', 'canidae', 'cancer', 'canine', 'capia', 'capra', 'capsid', 'capsule', 
    'carnivora', 'carrion', 'cavia', 'caudata', 'cavity', 'cell', 'cecum', 'cervid', 'chlorophyll', 'chitin',
    'chorion', 'cicada', 'cilla', 'cimex', 'clade', 'cloaca', 'clone', 'coccus', 'codon', 'coelom', 
    'cortisol', 'cougar', 'cranial', 'crocus', 'cranium', 'crocus', 'cyan', 'cyst', 'cytosis', 'cytosol',
    'decapoda', 'dermis', 'desert', 'detritus', 'diastole', 'digest', 'dino', 'diurnal', 'dorsal', 
    'drosophila', 'dryad', 'duck', 'ecto', 'ecology', 'ecotone', 'edema', 'eggs', 'egret', 
    'elastic', 'elixir', 'embryo', 'endemic', 'energy', 'enzyme', 'equinox', 'equidae', 'estrus', 
    'ethyl', 'eukary', 'euglena', 'excyst', 'extinct', 'felidae', 'felis', 'femur', 'feral', 'ferret', 
    'fiber', 'fibula', 'filament', 'filter', 'fish', 'fission', 'flacid', 'flatus', 'flora', 'flower', 
    'folate', 'food', 'foregut', 'forest', 'formic', 'fossa', 'frond', 'fruit', 'fungi', 'fungus', 
    'gamete', 'ganglia', 'garden', 'gastrin', 'gel', 'gene', 'genome', 'genus', 'germ', 'gibbon',
    'giardia', 'gills', 'ginger', 'gland', 'glia', 'glucose', 'glycogen', 'gonad', 'goose', 'gorilla', 
    'gout', 'granum', 'gravid', 'growth', 'guppy', 'hair', 'haploid', 'hapten', 'heart', 'heifer', 
    'helix', 'hem', 'hemp', 'heredity', 'heron', 'hippo', 'histone', 'hominid', 'honey', 'hormone', 
    'host', 'human', 'humerus', 'hybrid', 'hydra', 'hymen', 'hyphae', 'icterus', 'ileum', 'incisors',
    'incus', 'insect', 'instinct', 'insulin', 'iris', 'isopod', 'isotope', 'jaguar', 'jelly', 
    'kidney', 'kinase', 'kingdom', 'larva', 'larynx', 'latex', 'leaf', 'leech', 'lemur', 'lens',
    'leptin', 'leucine', 'lichen', 'ligand', 'ligase', 'limbic', 'limpet', 'lineage', 'linkage', 'liver', 
    'locus', 'loess', 'lotus', 'lung', 'lupus', 'lysis', 'lytic',
    'macaw', 'magpie', 'malaria', 'malus', 'mammal', 'mantis', 'maxilla', 'medusa', 'melanin', 'membrane', 
    'meristem', 'mesoderm', 'metis', 'microbe', 'mimosa', 'mitosis', 'molars', 'mollusc', 'monera', 'monkey', 
    'morula', 'moss', 'mucus', 'muridae', 'musca', 'muscle', 'muskox', 'mutant', 'myosin', 'nectar', 
    'nemesis', 'neuron', 'neuropil', 'nevus', 'niche', 'nitric', 'noctua', 'nostoc', 'nuclei', 'nucleus',
    'oocyte', 'ocean', 'octopus', 'odonto', 'ophrys', 'orchid', 'ornate', 'ostium', 'ostrac', 'ovary', 
    'oviduct', 'ovule', 'oxygen', 'paddle', 'pagoda', 'palate', 'pandan', 'papaya', 'papaver', 'paramecia', 
    'parasite', 'parent', 'parrot', 'passer', 'patrix', 'pepsin', 'perch', 'petals', 'phage', 'phloem', 
    'photon', 'phylum', 'pigeon', 'pilus', 'pineal', 'pinus', 'piston', 'pith', 'plasma', 'plate', 
    'plover', 'pollen', 'polyp', 'pongo', 'porpoise', 'prairie', 'primate', 'prion', 'proline', 'proteus', 
    'protist', 'protozoa', 'pseud', 'pterid', 'puffer', 'pupa', 'pus', 'quagga', 'quercus', 'rabbit', 
    'raccoon', 'radius', 'raptor', 'rectum', 'renin', 'reptile', 'resin', 'rhino', 'rhizoid', 'ribbon', 
    'rival', 'rodent', 'rosa', 'rubella', 'sacrum', 'saline', 'salix', 'salmon', 'salvia', 'sand', 
    'saphenous', 'scapula', 'scutum', 'seabird', 'sepsis', 'serine', 'serum', 'shark', 'sheep', 'shrew', 
    'shrimp', 'siren', 'skull', 'sloth', 'slug', 'smallpox', 'snail', 'snake', 'solanum', 'solvent', 
    'soma', 'sorus', 'spleen', 'spore', 'squid', 'squama', 'squamata', 'stamen', 'stapes', 'starch', 
    'stasis', 'stator', 'sternum', 'stigma', 'stipe', 'stomach', 'stomata', 'stroma', 'style', 'sugar', 
    'sun', 'swamp', 'sweat', 'sylvan', 'symbios', 'synapse', 'systole',
    'tabetic', 'talon', 'talus', 'tandem', 'tarsus', 'taxonomy', 'teal', 'teeth', 'tendon', 'tenia', 
    'teredo', 'testis', 'thallus', 'thalus', 'theca', 'thermo', 'thorax', 'thymine', 'thymus', 'thyroid', 
    'tibia', 'tissue', 'tongue', 'torpor', 'toxin', 'trachea', 'tsetse', 'tuber', 'tumor', 'tundra', 
    'typhus', 'ulmus', 'ulcer', 'ulnae', 'umbra', 'uracil', 'urate', 'ureter', 'urethra', 'ursus', 
    'uterus', 'uvula', 'vacuole', 'vagus', 'valine', 'valve', 'varroa', 'vector', 'veins', 'venom', 
    'ventral', 'venus', 'vertebra', 'vesicle', 'vibrio', 'viola', 'virus', 'viscera', 'vitalis', 'vitis', 
    'vivax', 'volvox', 'vulva', 'xenopus', 'xylem', 'yaws', 'yeast', 'yolk', 'yurine', 'zebra', 
    'zoology', 'zygote'
}


molecule_words = set(molecules)
element_words = set(biology_related_elements)

gpt_words = set()
gpt_words.update(molecule_words)
gpt_words.update(element_words)
gpt_words.update(deep_search)

print(f"✅ GPT Words before double check: {len(gpt_words)}")
final_gpt_words = {w for w in gpt_words if is_clean_word(w, b=blocklist)}
print(f"✅ GPT Words after double check: {len(final_gpt_words)}")

✅ GPT Words before double check: 668
✅ GPT Words after double check: 431


In [None]:
words_cl = clear_and_return_words(wordle_path)
print(f"Words Cleared: {len(words_cl)}")

In [None]:
print(words_cl)

In [None]:
# Add more sets
words_cl.update(final_bonus_words)
words_cl.update(final_gpt_words)

In [None]:
def double_check(word, blocklist):
    return (
        4 <= len(word) <= 7
        and word.isalpha()
        and re.match(r"^[a-z]{4,7}$", word)
        and any(c in 'aeiou' for c in word)
        and word not in blocklist
    )

print("Total words before final check:", len(words_cl))
final_words = {word for word in words_cl if double_check(word, blocklist=blocklist)}
print("Total words after final check:", len(final_words))
print("Preview of final words:", list(final_words)[:10])  # Display the first 10 words

Total words before final check: 825
Total words after final check: 817
Preview of final words: ['olive', 'sleep', 'reishi', 'dahlia', 'insect', 'fever', 'plasma', 'ananas', 'women', 'acari']


In [None]:
write_wordle_txt(filepath=wordle_path, words=final_words)
print(f"✅ Final words written to {wordle_path}")