In [15]:
# Import Cell
from gensim.models import KeyedVectors
from wordfreq import word_frequency
import random
import re
import os
import sys
from pathlib import Path
import xml.etree.ElementTree as ET

In [5]:
# Model Load Cell
# Word vectors are read from a binary word2vec-format file that sits next to
# the notebook; the file name is kept in one place so the log line cannot drift
# from the path that was actually loaded.
model_file = 'bioword.vec.bin'
model = KeyedVectors.load_word2vec_format(model_file, binary=True)
print(f"✅ Model loaded successfully from ./{model_file}")

✅ Model loaded successfully from ./bioword.vec.bin


In [None]:
# Load MeSH descriptor terms from XML
mesh_path = "desc2025.xml"  # located in src/ alongside the notebook
tree = ET.parse(mesh_path)
root = tree.getroot()

# Gather the DescriptorName/String text of every DescriptorRecord. Records
# without that text are skipped, and names are lowercased so they can be
# compared against the lowercase words in BioWordVec.
mesh_terms = {
    name.lower()
    for name in (
        rec.findtext("DescriptorName/String")
        for rec in root.findall(".//DescriptorRecord")
    )
    if name
}

print(f"✅ Loaded {len(mesh_terms)} MeSH terms")


✅ Loaded 30956 MeSH terms


In [73]:
# Filtered Vocabulary Cell
# Hand-curated exclusion sets. They are merged into a single `blocklist` further
# down and consulted by is_clean_word(). That filter only admits words of 4-6
# lowercase letters, so longer entries here (e.g. "switzerland") can never match
# a candidate word.

# Country and continent names. Multi-word names are written without spaces
# (e.g. "newzealand"). "yemen", "malta" and "samoa" each appear twice, which is
# harmless inside a set literal.
# NOTE(review): "swazis" is not a country name as written — confirm the intent.
country_blocklist = {
    "spain", "china", "india", "chile", "kenya", "ghana", "egypt", "nepal",
    "niger", "iran", "iraq", "syria", "haiti", "italy", "japan", "qatar",
    "yemen", "libya", "rwanda", "norway", "serbia", "sweden", "guinea",
    "israel", "belize", "bhutan", "greece", "poland", "romania", "mexico",
    "canada", "zambia", "swazis", "angola", "malta", "latvia", "togo",
    "benin", "samoa", "sudan", "oman", "peru", "laos", "cuba", "fiji",
    "chad", "mali", "tonga", "palau", "nauru", "yemen", "malta", "samoa",
    "france", "brazil", "russia", "korea", "vietnam", "gabon", "zimbabwe",
    "uganda", "tunisia", "thailand", "taiwan", "switzerland", "slovakia",
    "slovenia", "singapore", "senegal", "portugal", "philippines", "pakistan",
    "newzealand", "netherlands", "morocco", "mongolia", "malaysia", "luxembourg",
    "lithuania", "lebanon", "kuwait", "kazakhstan", "jamaica", "indonesia",
    "hungary", "finland", "estonia", "denmark", "czech", "croatia", "colombia",
    "bulgaria", "belgium", "austria", "australia", "argentina", "algeria", "panama",
    "europe", "africa", "asia", "america", "oceania",
}

# Social, political and legal vocabulary to exclude, plus a few place names
# ("spain", "iran", "china", "monaco"). "theft" and "sexism" each appear twice
# (harmless in a set literal).
non_bio_terms = {
    "spain", "iran", "china", "politics", "crime", "theft", "war", "terrorism",
    "economics", "education", "law", "insurance", "banking", "religion", "computers",
    "marketing", "advertising", "journalism", "philosophy", "rape", "role", "jews", "sexism",
    "fascism", "capitalism", "communism", "socialism", "feminism", "racism",
    "homophobia", "transphobia", "xenophobia", "sexism", "ageism", "ableism", "minors",
    "police", "violence", "monaco", "prison", "prisons", "jail", "jails", "court", "courts",
    "judge", "judges", "lawyer", "lawyers", "attorney", "attorneys", "prosecutor", "theft",
}

# City names. "varna" appears twice (harmless in a set literal).
# NOTE(review): "brasil", "crete" and "goa" are not city names — presumably
# deliberate place-name catch-alls; confirm.
cities = {
    "london", "paris", "tokyo", "berlin", "madrid", "dubai", "delhi", "sydney", "seoul", "milan",
    "osaka", "vienna", "zurich", "prague", "dublin", "miami", "boston", "chicago", "houston", "atlanta",
    "toronto", "detroit", "seattle", "denver", "orlando", "phoenix", "dallas", "austin", "tampa", "vegas",
    "lisbon", "munich", "warsaw", "naples", "brasil", "quito", "havana", "cairo", "beirut", "jakarta",
    "manila", "hanoi", "oslo", "sofia", "riga", "varna", "geneva", "brno", "porto", "malaga",
    "nice", "lyon", "crete", "varna", "goa", "pune", "kyoto", "nagoya", "kobe", "sapporo"
}

# U.S. state names; multi-word names are concatenated without spaces
# (e.g. "newyork", "westvirginia").
us_states = {
    "alabama", "alaska", "arizona", "arkansas", "california", "colorado",
    "connecticut", "delaware", "florida", "georgia", "hawaii", "idaho",
    "illinois", "indiana", "iowa", "kansas", "kentucky", "louisiana",
    "maine", "maryland", "massachusetts", "michigan", "minnesota",
    "mississippi", "missouri", "montana", "nebraska", "nevada",
    "newhampshire", "newjersey", "newmexico", "newyork",
    "northcarolina", "northdakota", "ohio", "oklahoma",
    "oregon", "pennsylvania", "rhodeisland", "southcarolina", "southdakota", "tennessee", "texas", 
    "utah", "vermont", "virginia", "washington", "westvirginia", "wisconsin", "wyoming"
}

# Color words.
colors = {
    "red", "blue", "green", "black", "white", "yellow", "purple", "orange", "violet", "indigo", "silver", "golden", "bronze"
}

# Combine every category set above into the single lookup set that
# is_clean_word() checks candidates against.
blocklist = set().union(
    country_blocklist,
    non_bio_terms,
    cities,
    us_states,
    colors,
)

def bio_score(word):
    """Score how biology-related `word` is, using similarities from `model`.

    The score combines the raw similarity between `word` and "biology"
    (weighted twice) with the seven largest dampened similarities to ten
    further anchor words. Each of those similarities is raised to the power
    1.5 before ranking.

    Args:
        word: A key present in the global `model`'s vocabulary.

    Returns:
        The combined score rounded to 4 decimal places.

    Raises:
        Whatever `model.similarity` raises for a word it does not know.
    """
    anchor_words = (
        "life", "cell", "chemistry", "disease", "organism",
        "animal", "plant", "microbe", "anatomy", "human",
    )

    bio_sim = model.similarity(word, "biology")

    # A similarity can be negative, and a negative base raised to 1.5 is NaN
    # for numpy floats (complex for plain Python floats), which would corrupt
    # both the ranking and the sum. Clamp at zero so an anti-correlated anchor
    # simply contributes nothing.
    dampened = sorted(
        (max(model.similarity(word, anchor), 0.0) ** 1.5 for anchor in anchor_words),
        reverse=True,
    )

    # Only the seven strongest anchors count. The divisor is 10 even though the
    # weights add up to 9 (2 + 7); it is left unchanged so that the 0.16 cutoff
    # applied in is_clean_word() keeps its meaning.
    similarity = (2 * bio_sim + sum(dampened[:7])) / 10
    return round(similarity, 4)

def is_clean_word(w):
    """Return True when `w` should be kept in the filtered vocabulary.

    A word is kept only if all of the following hold:
      * it consists of 4 to 6 lowercase ASCII letters,
      * it contains at least one of the vowels a, e, i, o, u,
      * it is not in `blocklist`,
      * it is one of the lowercased names in `mesh_terms`,
      * its English word frequency lies strictly between 1e-8 and 1e-3,
      * its bio_score() is greater than 0.16.

    Args:
        w: Candidate word (a string).

    Returns:
        bool: True if every condition above holds, otherwise False.
    """
    # One anchored pattern covers the length, alphabetic and lowercase checks.
    if not re.fullmatch(r"[a-z]{4,6}", w):
        return False
    if not any(c in 'aeiou' for c in w):
        return False

    # Set lookups are cheap, so do them before the frequency lookup and the
    # embedding-based score.
    if w in blocklist or w not in mesh_terms:
        return False

    # Look the frequency up once and test both bounds against that value.
    freq = word_frequency(w, 'en')
    if not 1e-8 < freq < 1e-3:
        return False

    # The similarity score is the most expensive check, so it runs last.
    return bool(bio_score(w) > 0.16)

# Walk the model's vocabulary in its stored order and keep the words that pass
# is_clean_word(). The list preserves that order for sampling; the set gives
# fast membership tests and is extended with hand-picked words later.
filtered_vocab_list = list(filter(is_clean_word, model.key_to_index))
bio_words = set(filtered_vocab_list)
print(f"✅ Filtered vocab size after MeSH & wordfreq filtering: {len(bio_words)}")

✅ Filtered vocab size after MeSH & wordfreq filtering: 621


In [None]:
# Draw a random sample of up to 400 words from the filtered vocabulary for a
# quick visual sanity check. No seed is set, so the sample differs per run.
SAMPLE_SIZE = 400
WORDS_PER_ROW = 20

# random.sample raises ValueError when asked for more items than exist, so cap
# the request at the number of words that survived filtering.
sample_count = min(SAMPLE_SIZE, len(filtered_vocab_list))
sampled_words = random.sample(filtered_vocab_list, sample_count)

# Print the words as a tab-separated grid, WORDS_PER_ROW per line
# (20 x 20 when the full 400 words are available).
for row_start in range(0, sample_count, WORDS_PER_ROW):
    print("\t".join(sampled_words[row_start:row_start + WORDS_PER_ROW]))

In [None]:
# Extra hand-picked words (the original note on this cell was truncated).
# Both sets below are added to bio_words after filtering, so their entries
# bypass every check in is_clean_word(): the 4-6 letter rule, the blocklist,
# the MeSH membership test, the frequency window and the bio_score cutoff.

# NOTE(review): some entries fall outside the 4-6 letter range that the filter
# enforces (e.g. 'fly', 'carapace', 'carotene', 'zonation') — confirm that
# whatever reads the word list accepts those lengths.
extra_bio_words = {
    'gout', 'myopia', 'mitogen', 'domain', 'enzyme', 'corvus', 'anole',
    'zebra', 'estrus', 'krait', 'goat', 'embryo', 'spine', 'indri',
    'tonsil', 'swine', 'baleen', 'nucleus', 'apical', 'hepatic',
    'minnow', 'cactus', 'olive', 'capsule', 'deer', 'lion', 'trunk',
    'uterus', 'rabies', 'gaur', 'leptin', 'aorta', 'rana', 'clone',
    'pyrus', 'rice', 'mussel', 'melanin', 'nyala', 'palea', 'moss',
    'fever', 'cicada', 'axil', 'taiga', 'carapace', 'cervus', 'siren',
    'sucrose', 'octopus', 'larva', 'mange', 'dingo', 'sponge', 'bract',
    'carotene', 'capsid', 'viper', 'claw', 'cornea', 'pupa', 'whale',
    'toxin', 'lobster', 'helix', 'codon', 'catfish', 'phloem', 'soma',
    'cuticle', 'equine', 'redox', 'saliva', 'molar', 'klippe', 'polyp',
    'fly', 'gable', 'mollusk', 'sapling', 'grotto', 'sapiens', 'puffer',
    'zonation', 'larvae', 'fungus', 'cytosol', 'retina', 'ribose', 'cough'
}

# Several words here also appear in extra_bio_words (e.g. 'enzyme', 'zebra',
# 'whale'); the overlap is harmless because both sets end up in one set.
# 'orange' is also in the `colors` exclusion set, so adding it here overrides
# that exclusion.
# NOTE(review): many entries look like longer terms cut to six letters
# (e.g. 'amylas', 'bacter', 'dopami', 'glucos', 'insuli', 'malari', 'meiosi',
# 'mitosi', 'toxopl') — confirm these are intended rather than real words.
other_bio_words = {
    'absorb', 'acari', 'algaes', 'amoeba', 'amylas', 'anther', 'angios', 'archae',
    'asthma', 'bamboo', 'bacter', 'beetle', 'bronch', 'cancer', 'cholera', 'collag',
    'corn', 'cortex', 'cotyle', 'cyto', 'cytosol', 'dopami', 'ebola', 'embryo',
    'enzyme', 'euglen', 'fever', 'ferns', 'filter', 'flower', 'flagel', 'fungi',
    'fungus', 'fusion', 'giardi', 'glands', 'glucos', 'golgi', 'herpes', 'hormone',
    'horse', 'human', 'induce', 'insect', 'insuli', 'kinase', 'lipid', 'lipids',
    'malari', 'mammal', 'mango', 'measle', 'meiosi', 'mitosi', 'monera',
    'monkey', 'moss', 'mouse', 'muscle', 'neuron', 'nucleus', 'onions', 'orange',
    'osmosis', 'ovary', 'parrot', 'patella', 'petals', 'phloem', 'pollen',
    'primate', 'quorum', 'rabbit', 'rabies', 'reptil', 'replic', 'retina', 'ribose',
    'rodent', 'rotavi', 'snail', 'sepals', 'spiroc', 'tibia', 'tomato', 'toxopl',
    'toxin', 'tree', 'tulip', 'urease', 'ureter', 'vesicl', 'villi', 'virus', 'whale',
    'xylem', 'zebra'
}

# Fold both hand-picked sets into bio_words in place; words already present are
# simply absorbed, so re-running this cell does not change the result.
for manual_words in (extra_bio_words, other_bio_words):
    bio_words |= manual_words

# Report the size of the combined vocabulary
print(f"✅ Total number of bio words: {len(bio_words)}")

✅ Total number of bio words: 767


In [79]:
# Transcribe to root/assets/data/bio_words.txt
# The target directory is resolved relative to the notebook's working directory
# and created if it does not exist yet.
output_dir = Path("../assets/data")
output_dir.mkdir(parents=True, exist_ok=True)
output_file = output_dir / "bio_words.txt"

# Iterating a set of strings yields an order that can change from one
# interpreter run to the next, so write the words sorted to keep the file
# identical across re-runs. The encoding is stated explicitly so the result
# does not depend on the platform's default.
with open(output_file, "w", encoding="utf-8") as f:
    f.writelines(f"{word}\n" for word in sorted(bio_words))