In [1]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


DEPRECATION: pyodbc 4.0.0-unsupported has a non-standard version number. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pyodbc or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063

[notice] A new release of pip is available: 23.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import nltk
from nltk.corpus import words

In [4]:
def generate_meaningful_five_letter_words(output_file='solutions.txt'):
    """Write the unique five-letter alphabetic entries of the NLTK ``words`` corpus to a file.

    Each entry of ``words.words()`` that is exactly five characters long and
    purely alphabetic is upper-cased; the resulting set is sorted
    alphabetically and written one word per line. No dictionary-definition
    check is performed here: only the length and ``isalpha()`` filters apply.

    Args:
        output_file: Path of the text file to create or overwrite. Defaults to
            ``'solutions.txt'``, the path that used to be hard-coded.

    Returns:
        None. The words are written to ``output_file`` and progress messages
        are printed.
    """
    # Fetch the `words` corpus package if it is not already present.
    print("Connecting to dictionary database...")
    nltk.download('words')

    print("Filtering meaningful 5-letter words...")
    # Upper-casing inside a set comprehension collapses entries that differ
    # only by letter case (e.g. "Apple" / "apple").
    five_letter_words = {
        word.upper()
        for word in words.words()
        if len(word) == 5 and word.isalpha()
    }

    # sorted() accepts any iterable, so no intermediate list is required.
    sorted_words = sorted(five_letter_words)

    with open(output_file, 'w') as f:
        f.writelines(word + '\n' for word in sorted_words)

    print(f"Done! Created a list of {len(sorted_words)} words.")
    print(f"Saved to: {output_file}")

# Run the generator when this cell is executed; the recorded output below shows
# the guard passes in the notebook kernel. With no arguments the word list is
# written to 'solutions.txt' in the current working directory.
if __name__ == "__main__":
    generate_meaningful_five_letter_words()

Connecting to dictionary database...


[nltk_data] Downloading package words to
[nltk_data]     C:\Users\aadip\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


Filtering meaningful 5-letter words...
Done! Created a list of 9972 words.
Saved to: solutions.txt


In [5]:
from nltk.corpus import wordnet, brown

def filter_common_words(input_file, output_file):
    """Keep the words from ``input_file`` that are both in common use and defined.

    The input is read one word per line; each line is stripped and
    upper-cased, blank lines are skipped and duplicates are collapsed. A word
    is kept when it (a) appears, case-insensitively, among the tokens of the
    Brown corpus and (b) ``wordnet.synsets`` returns at least one synset for
    it. The surviving words are written to ``output_file`` in sorted order,
    one per line.

    Args:
        input_file: Path of the text file holding the candidate words.
        output_file: Path of the text file to create or overwrite.

    Returns:
        None. The filtered list is written to ``output_file`` and progress
        messages are printed.

    Raises:
        OSError: If ``input_file`` cannot be opened for reading or
            ``output_file`` cannot be opened for writing.
    """
    # 1. Fetch the corpora used by the two checks below.
    print("Loading dictionary and frequency data...")
    nltk.download('wordnet')
    nltk.download('brown')

    # 2. Every token of the Brown corpus, upper-cased for case-insensitive
    #    membership tests.
    common_usage = {token.upper() for token in brown.words()}

    # 3. Load the candidates. Using a set drops duplicate lines so the output
    #    (and the reported count) never contains the same word twice.
    with open(input_file, 'r') as f:
        candidates = {line.strip().upper() for line in f}
    candidates.discard('')  # blank lines are not words

    print("Filtering...")
    # The set-membership test is cheap, so it goes first and short-circuits
    # the much more expensive WordNet lookup for uncommon words.
    final_list = sorted(
        word
        for word in candidates
        if word in common_usage and wordnet.synsets(word)
    )

    # 4. Save the filtered list.
    with open(output_file, 'w') as f:
        f.writelines(word + '\n' for word in final_list)

    print(f"Refining complete! Reduced list to {len(final_list)} common words.")

# Run the filter when this cell is executed.
# NOTE(review): the input is 'words.txt', but no cell visible in this notebook
# creates that file -- confirm it exists before a Restart & Run All.
# The result overwrites 'solutions.txt'.
if __name__ == "__main__":
    filter_common_words('words.txt', 'solutions.txt')

Loading dictionary and frequency data...


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\aadip\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\aadip\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


Filtering...
Refining complete! Reduced list to 2162 common words.


In [6]:
def generate_validated_five_letter_words(output_file='solutions.txt'):
    """Write the five-letter ``words``-corpus entries that WordNet recognises to a file.

    Entries of ``words.words()`` that are exactly five characters long and
    purely alphabetic are upper-cased and de-duplicated. Each remaining
    candidate is kept only if ``wordnet.synsets`` returns at least one synset
    for it. The survivors are sorted alphabetically and written one per line.

    Args:
        output_file: Path of the text file to create or overwrite. Defaults to
            ``'solutions.txt'``, the path that used to be hard-coded.

    Returns:
        None. The words are written to ``output_file`` and progress messages
        are printed.
    """
    # 1. Fetch the corpora used below.
    print("Loading English word lists...")
    nltk.download('words')
    nltk.download('wordnet')

    # 2. Length / alphabetic filter. Collecting into a set first means every
    #    distinct upper-cased word is sent to WordNet only once, even when the
    #    corpus lists it in several letter cases.
    candidates = {
        word.upper()
        for word in words.words()
        if len(word) == 5 and word.isalpha()
    }

    # 3. Keep the candidates WordNet returns at least one synset for.
    #    (Per the original author's note, synsets() also resolves plurals
    #    such as 'BALLS' through their base form 'BALL'.)
    print("Validating words against dictionary definitions...")
    sorted_list = sorted(word for word in candidates if wordnet.synsets(word))

    # 4. Save.
    with open(output_file, 'w') as f:
        f.writelines(word + '\n' for word in sorted_list)

    print(f"Success! Created a list of {len(sorted_list)} words.")
    print(f"Saved to: {output_file}")

# Run the validated generator when this cell is executed. With no arguments the
# result is written to 'solutions.txt', replacing whatever an earlier cell
# stored there.
if __name__ == "__main__":
    generate_validated_five_letter_words()

Loading English word lists...


[nltk_data] Downloading package words to
[nltk_data]     C:\Users\aadip\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\aadip\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Validating words against dictionary definitions...
Success! Created a list of 4288 words.
Saved to: solutions.txt
