In [1]:
# Load the words from the file
with open("words_250000_train.txt", "r") as file:
    words = file.read().splitlines()

# Find words containing 'elder', not containing 'k', 'i', or 'a', and having a length of 6
filtered_words = [word for word in words if 'elder' in word and all(ch not in word for ch in 'kia') and len(word) == 6]

filtered_words

['eldern', 'elders', 'welder']

In [2]:
import random

# Set the seed for reproducibility
random.seed(42)

# Shuffle the words
random.shuffle(words)

# Split the words into training and validation sets
training_words = words[:200000]
validation_words = words[200000:]

# Save the training words to a new file
with open("words_200000_train.txt", "w") as train_file:
    for word in training_words:
        train_file.write(word + "\n")

# Save the validation words to a new file
with open("words_50000_val.txt", "w") as val_file:
    for word in validation_words:
        val_file.write(word + "\n")

In [4]:
from collections import Counter

# Join all the words in the training set into a single string
all_letters = ''.join(training_words)

# Count the frequency of each letter
letter_counts = Counter(all_letters)

# Sort the letters by frequency in decreasing order
sorted_letters_by_frequency = sorted(letter_counts.items(), key=lambda item: item[1], reverse=True)

# Extract just the letters in sorted order
sorted_letters = [letter for letter, count in sorted_letters_by_frequency]

sorted_letters

['e',
 'i',
 'a',
 'n',
 'o',
 'r',
 's',
 't',
 'l',
 'c',
 'u',
 'd',
 'p',
 'm',
 'h',
 'g',
 'y',
 'b',
 'f',
 'v',
 'k',
 'w',
 'z',
 'x',
 'q',
 'j']

In [13]:
from collections import Counter

# Read the words from the file
with open("words_train_split.txt", "r") as file:
    words_split = file.read().splitlines()

# Generate bigrams from the words
bigrams = [word[i:i+2] for word in words_split for i in range(len(word) - 1)]

# Count the frequency of each bigram
bigram_counts = Counter(bigrams)

# Get the 100 most common bigrams
most_common_bigrams = bigram_counts.most_common(100)

most_common_bigrams = [bigram for bigram, count in most_common_bigrams]

most_common_bigrams

['er',
 'in',
 'ti',
 'on',
 'es',
 'te',
 'an',
 're',
 'at',
 'al',
 'en',
 'ed',
 'le',
 'ri',
 'is',
 'ra',
 'ic',
 'st',
 'ar',
 'ne',
 'ng',
 'li',
 'ro',
 'or',
 'nt',
 'la',
 'un',
 'it',
 'co',
 'el',
 'de',
 'se',
 'll',
 'ni',
 'ca',
 'to',
 'ta',
 'ss',
 'io',
 'ma',
 'ch',
 'ou',
 'ia',
 'he',
 'lo',
 'tr',
 'us',
 'no',
 'si',
 'ly',
 'me',
 'di',
 'na',
 'ol',
 'et',
 've',
 'il',
 'as',
 'ac',
 'mi',
 'th',
 'ea',
 'pe',
 'nd',
 'ha',
 'om',
 'ce',
 'os',
 'hi',
 'ph',
 'ho',
 'ur',
 'pr',
 'ns',
 'id',
 'ie',
 'op',
 'ul',
 'nc',
 'ec',
 'ot',
 'sh',
 'ge',
 'mo',
 'pa',
 'em',
 'ab',
 'po',
 'bl',
 'am',
 'rs',
 'ci',
 'ad',
 'pi',
 'oc',
 'ap',
 'be',
 'su',
 'og',
 'sa']

In [14]:
with open("words_train_split.txt", "r") as file:
    words_split = file.read().splitlines()

# Generate bigrams from the words
trigrams = [word[i:i+3] for word in words_split for i in range(len(word) - 3)]

# Count the frequency of each bigram
trigram_counts = Counter(trigrams)

# Get the 100 most common bigrams
most_common_trigrams = trigram_counts.most_common(100)

most_common_trigrams = [trigram for trigram, count in most_common_trigrams]

most_common_trigrams

['ati',
 'tio',
 'nes',
 'ter',
 'ica',
 'all',
 'ent',
 'tin',
 'non',
 'per',
 'eri',
 'ver',
 'ant',
 'ate',
 'abl',
 'ali',
 'pre',
 'tra',
 'lin',
 'ing',
 'con',
 'nte',
 'pro',
 'sti',
 'ion',
 'nti',
 'ste',
 'tri',
 'rat',
 'ell',
 'oni',
 'nde',
 'ist',
 'res',
 'rin',
 'the',
 'ari',
 'ine',
 'ene',
 'ill',
 'lat',
 'ove',
 'iti',
 'lit',
 'str',
 'ere',
 'ran',
 'tic',
 'cal',
 'int',
 'men',
 'era',
 'gra',
 'ili',
 'min',
 'dis',
 'olo',
 'ast',
 'ona',
 'tro',
 'est',
 'ani',
 'mat',
 'chi',
 'ero',
 'sta',
 'der',
 'ato',
 'and',
 'tiv',
 'oph',
 'ect',
 'her',
 'che',
 'und',
 'ina',
 'tor',
 'for',
 'nat',
 'log',
 'rea',
 'pho',
 'cti',
 'ess',
 'ori',
 'emi',
 'nis',
 'cat',
 'lli',
 'cha',
 'sto',
 'ous',
 'lis',
 'rop',
 'ula',
 'par',
 'ele',
 'eli',
 'les',
 'ers']

In [15]:
with open("words_train_split.txt", "r") as file:
    words_split = file.read().splitlines()

# Generate bigrams from the words
qgrams = [word[i:i+4] for word in words_split for i in range(len(word) - 4)]

# Count the frequency of each bigram
qgram_counts = Counter(qgrams)

# Get the 100 most common bigrams
most_common_qgrams = qgram_counts.most_common(100)

most_common_qgrams = [qgram for qgram, count in most_common_qgrams]

most_common_qgrams

['atio',
 'over',
 'tion',
 'nter',
 'ical',
 'enes',
 'inte',
 'call',
 'olog',
 'anti',
 'tica',
 'atin',
 'unde',
 'nder',
 'rati',
 'logi',
 'ingl',
 'grap',
 'iona',
 'ogra',
 'ilit',
 'isti',
 'ther',
 'bili',
 'alis',
 'ativ',
 'enti',
 'uper',
 'ster',
 'icat',
 'lati',
 'mati',
 'teri',
 'raph',
 'supe',
 'ment',
 'ines',
 'erin',
 'ulat',
 'stra',
 'enta',
 'erat',
 'self',
 'tati',
 'esse',
 'semi',
 'snes',
 'ight',
 'dnes',
 'peri',
 'inat',
 'pres',
 'tran',
 'aliz',
 'cula',
 'stic',
 'tric',
 'comp',
 'omet',
 'tive',
 'ctio',
 'vill',
 'well',
 'lene',
 'tabl',
 'ator',
 'ecti',
 'abil',
 'cati',
 'ousl',
 'blen',
 'nati',
 'emen',
 'opho',
 'acti',
 'able',
 'para',
 'lect',
 'edne',
 'vers',
 'izat',
 'cont',
 'cons',
 'zati',
 'usne',
 'asti',
 'ousn',
 'tero',
 'izin',
 'onis',
 'anth',
 'late',
 'ogen',
 'anis',
 'rica',
 'othe',
 'trop',
 'reco',
 'elli',
 'arch']

In [12]:
import numpy as np
import string

def generate_letter_cofrequency_matrices(words):
    """
    Generate co-frequency matrices for letters that succeed and precede each other.
    
    Parameters:
    words (list): List of words to analyze
    
    Returns:
    tuple: (succeeding_matrix, preceding_matrix)
    """
    # Create lowercase alphabet for matrix indexing
    alphabet = string.ascii_lowercase
    
    # Initialize matrices with zeros
    succeeding_matrix = np.zeros((26, 26), dtype=int)
    preceding_matrix = np.zeros((26, 26), dtype=int)
    
    # Process each word
    for word in words:
        # Convert to lowercase
        word = word.lower()
        
        # Analyze letter successions
        for i in range(len(word) - 1):
            # Current and next letter
            current_letter = word[i]
            next_letter = word[i + 1]
            
            # Skip if either letter is not in alphabet
            if current_letter not in alphabet or next_letter not in alphabet:
                continue
            
            # Get matrix indices
            current_idx = alphabet.index(current_letter)
            next_idx = alphabet.index(next_letter)
            
            # Increment succeeding matrix
            succeeding_matrix[current_idx, next_idx] += 1
        
        # Analyze letter precedences
        for i in range(1, len(word)):
            # Current and previous letter
            current_letter = word[i]
            prev_letter = word[i - 1]
            
            # Skip if either letter is not in alphabet
            if current_letter not in alphabet or prev_letter not in alphabet:
                continue
            
            # Get matrix indices
            current_idx = alphabet.index(current_letter)
            prev_idx = alphabet.index(prev_letter)
            
            # Increment preceding matrix
            preceding_matrix[current_idx, prev_idx] += 1
    
    return succeeding_matrix, preceding_matrix

def print_matrix(matrix, matrix_type):
    """
    Pretty print the co-frequency matrix.
    
    Parameters:
    matrix (numpy.ndarray): The co-frequency matrix
    matrix_type (str): 'Succeeding' or 'Preceding'
    """
    alphabet = string.ascii_lowercase
    
    print(f"{matrix_type} Letter Co-Frequency Matrix:")
    print("    " + " ".join(alphabet))
    
    for i, row in enumerate(matrix):
        print(f"{alphabet[i]} ", end="")
        for val in row:
            print(f"{val:3}", end=" ")
        print()

# Example usage
def example_usage():
    # Sample word list
    words = [
        "hello", "world", "python", "programming", "computer", 
        "science", "algorithm", "machine", "learning", "data"
    ]
    
    # Generate matrices
    succeeding_matrix, preceding_matrix = generate_letter_cofrequency_matrices(words)
    
    # Print matrices
    print_matrix(succeeding_matrix, "Succeeding")
    print("\n")
    print_matrix(preceding_matrix, "Preceding")

# Uncomment to run example
# example_usage()import numpy as np
import string

def generate_letter_cofrequency_matrices(words):
    """
    Generate co-frequency matrices for letters that succeed and precede each other.
    
    Parameters:
    words (list): List of words to analyze
    
    Returns:
    tuple: (succeeding_matrix, preceding_matrix)
    """
    # Create lowercase alphabet for matrix indexing
    alphabet = string.ascii_lowercase
    
    # Initialize matrices with zeros
    succeeding_matrix = np.zeros((26, 26), dtype=int)
    preceding_matrix = np.zeros((26, 26), dtype=int)
    
    # Process each word
    for word in words:
        # Convert to lowercase
        word = word.lower()
        
        # Analyze letter successions
        for i in range(len(word) - 1):
            # Current and next letter
            current_letter = word[i]
            next_letter = word[i + 1]
            
            # Skip if either letter is not in alphabet
            if current_letter not in alphabet or next_letter not in alphabet:
                continue
            
            # Get matrix indices
            current_idx = alphabet.index(current_letter)
            next_idx = alphabet.index(next_letter)
            
            # Increment succeeding matrix
            succeeding_matrix[current_idx, next_idx] += 1
        
        # Analyze letter precedences
        for i in range(1, len(word)):
            # Current and previous letter
            current_letter = word[i]
            prev_letter = word[i - 1]
            
            # Skip if either letter is not in alphabet
            if current_letter not in alphabet or prev_letter not in alphabet:
                continue
            
            # Get matrix indices
            current_idx = alphabet.index(current_letter)
            prev_idx = alphabet.index(prev_letter)
            
            # Increment preceding matrix
            preceding_matrix[current_idx, prev_idx] += 1
    
    return succeeding_matrix, preceding_matrix

def print_matrix(matrix, matrix_type):
    """
    Pretty print the co-frequency matrix.
    
    Parameters:
    matrix (numpy.ndarray): The co-frequency matrix
    matrix_type (str): 'Succeeding' or 'Preceding'
    """
    alphabet = string.ascii_lowercase
    
    print(f"{matrix_type} Letter Co-Frequency Matrix:")
    print("    " + " ".join(alphabet))
    
    for i, row in enumerate(matrix):
        print(f"{alphabet[i]} ", end="")
        for val in row:
            print(f"{val:3}", end=" ")
        print()

# Example usage
def example_usage():
    # Sample word list
    with open("words_train_split.txt", "r") as file:
        words = file.read().splitlines()
    
    # Generate matrices
    succeeding_matrix, preceding_matrix = generate_letter_cofrequency_matrices(words)
    
    # Print matrices
    print_matrix(succeeding_matrix, "Succeeding")
    print("\n")
    print_matrix(preceding_matrix, "Preceding")

# Uncomment to run example
example_usage()

Succeeding Letter Co-Frequency Matrix:
    a b c d e f g h i j k l m n o p q r s t u v w x y z
a 245 6427 9223 5372 2945 1237 4341 967 3988 184 1780 20657 6109 22111 231 5305 179 17455 9332 20775 3041 2142 1177 665 1668 849 
b 4760 934 194 226 5234  75  42 117 4464 135  19 6272 134  97 4006 108   4 3588 777 246 2642  65  48   3 422   6 
c 11438  22 1263  57 7951  29  30 10543 5644   1 4011 2801  72 130 12436  43 103 4245 528 4936 3876  15  20   2 1685  32 
d 4775 327 235 1164 12292 286 644 345 9681 156  37 1701 404 832 4562 173   7 2795 1558 184 1919 244 353   3 1019  35 
e 8994 1910 6777 20338 4797 2062 2321 877 2479 228 517 12415 6434 20513 2618 4481 457 34364 23921 9465 2285 2084 1461 2552 1390 281 
f 2294  80 172 134 2934 1823  39  55 3906  17  17 2173  77  33 3122  97   5 1775 312 811 2022  16  61   3 439   3 
g 4121 157  56 107 6483  88 1150 2101 4190  19  30 2946 429 1385 2664  72   2 3925 902 190 2072  15 155   1 971   8 
h 8444 235 129 104 10268 172  62  96 7901  13  52 933 52

In [None]:
import numpy as np
import string
import re

def generate_letter_cofrequency_matrices(words):
    """
    Generate co-frequency matrices for letters that succeed and precede each other.
    
    Parameters:
    words (list): List of words to analyze
    
    Returns:
    tuple: (succeeding_matrix, preceding_matrix)
    """
    # Create lowercase alphabet for matrix indexing
    alphabet = string.ascii_lowercase
    
    # Initialize matrices with zeros
    succeeding_matrix = np.zeros((26, 26), dtype=int)
    preceding_matrix = np.zeros((26, 26), dtype=int)
    
    # Process each word
    for word in words:
        # Convert to lowercase
        word = word.lower()
        
        # Analyze letter successions
        for i in range(len(word) - 1):
            # Current and next letter
            current_letter = word[i]
            next_letter = word[i + 1]
            
            # Skip if either letter is not in alphabet
            if current_letter not in alphabet or next_letter not in alphabet:
                continue
            
            # Get matrix indices
            current_idx = alphabet.index(current_letter)
            next_idx = alphabet.index(next_letter)
            
            # Increment succeeding matrix
            succeeding_matrix[current_idx, next_idx] += 1
        
        # Analyze letter precedences
        for i in range(1, len(word)):
            # Current and previous letter
            current_letter = word[i]
            prev_letter = word[i - 1]
            
            # Skip if either letter is not in alphabet
            if current_letter not in alphabet or prev_letter not in alphabet:
                continue
            
            # Get matrix indices
            current_idx = alphabet.index(current_letter)
            prev_idx = alphabet.index(prev_letter)
            
            # Increment preceding matrix
            preceding_matrix[current_idx, prev_idx] += 1
    
    return succeeding_matrix, preceding_matrix

def guess(word, guessed_letters, succeeding_matrix, preceding_matrix):
    # Predefined frequency lists
    letters_by_frequency = [
        'e', 't', 'a', 'o', 'i', 'n', 's', 'h', 'r', 'd',
        'l', 'u', 'c', 'm', 'f', 'w', 'g', 'y', 'p', 'b',
        'v', 'k', 'x', 'j', 'q', 'z'
    ]
    
    bigrams_by_frequency = [
        'th', 'he', 'in', 'er', 'an', 're', 'nd', 'on', 'en', 'at',
        'ou', 'ed', 'ha', 'to', 'or', 'it', 'is', 'hi', 'es', 'ng'
    ]
    
    trigrams_by_frequency = [
        'the', 'and', 'ing', 'her', 'hat', 'his', 'tha', 'ere', 'for', 'ent',
        'ion', 'ter', 'was', 'you', 'ith', 'ver', 'all', 'wit', 'thi', 'tio'
    ]
    
    quadgrams_by_frequency = [
        'that', 'ther', 'with', 'tion', 'here', 'ould', 'ight', 'have', 'hich', 'whic',
        'this', 'thin', 'they', 'atio', 'ever', 'from', 'ough', 'were', 'hing', 'ment'
    ]
    
    # Clean the word, stripping spaces and replacing "_" with placeholders
    clean_word = word[::2].replace("_", ".")
    
    # Find length of word
    len_word = len(clean_word)
    
    # Score mechanism for letter selection
    letter_scores = {}
    alphabet = string.ascii_lowercase
    
    # 1. Single Letter Frequency - Initial Base Score
    for letter in letters_by_frequency:
        if letter not in guessed_letters:
            letter_scores[letter] = 1 / (letters_by_frequency.index(letter) + 1)
    
    # 2. Co-Frequency Matrix Scoring
    # Find positions of known letters in the word
    known_letter_positions = [i for i, char in enumerate(clean_word) if char != '.']
    
    for letter in alphabet:
        if letter in guessed_letters:
            continue
        
        # Score based on succeeding and preceding letter frequencies
        succeeding_score = 0
        preceding_score = 0
        
        # Check succeeding letter frequency
        for pos in known_letter_positions:
            if pos < len_word - 1 and clean_word[pos+1] == '.':
                known_letter_idx = alphabet.index(clean_word[pos])
                letter_idx = alphabet.index(letter)
                succeeding_score += succeeding_matrix[known_letter_idx, letter_idx]
        
        # Check preceding letter frequency
        for pos in known_letter_positions:
            if pos > 0 and clean_word[pos-1] == '.':
                known_letter_idx = alphabet.index(clean_word[pos])
                letter_idx = alphabet.index(letter)
                preceding_score += preceding_matrix[known_letter_idx, letter_idx]
        
        # Combine co-frequency scores with base letter frequency
        letter_scores[letter] += (succeeding_score + preceding_score) * 0.1
    
    # 3. Ngram Frequency Scoring
    # (Keep existing ngram scoring logic from previous implementation)
    for bigram in bigrams_by_frequency:
        if is_ngram_compatible(bigram, clean_word):
            for letter in set(bigram):
                if letter not in guessed_letters:
                    letter_scores[letter] = letter_scores.get(letter, 0) + 1 / (bigrams_by_frequency.index(bigram) + 1)
    
    for trigram in trigrams_by_frequency:
        if is_ngram_compatible(trigram, clean_word):
            for letter in set(trigram):
                if letter not in guessed_letters:
                    letter_scores[letter] = letter_scores.get(letter, 0) + 1 / (trigrams_by_frequency.index(trigram) + 1)
    
    for quadgram in quadgrams_by_frequency:
        if is_ngram_compatible(quadgram, clean_word):
            for letter in set(quadgram):
                if letter not in guessed_letters:
                    letter_scores[letter] = letter_scores.get(letter, 0) + 1 / (quadgrams_by_frequency.index(quadgram) + 1)
    
    # If no scores found, fallback to most frequent unguessed letters
    if not letter_scores:
        for letter in letters_by_frequency:
            if letter not in guessed_letters:
                return letter
    
    # Select letter with highest score
    guess_letter = max(letter_scores, key=letter_scores.get)
    
    return guess_letter

def is_ngram_compatible(ngram, word_pattern):
    """
    Check if an ngram is compatible with the current word pattern
    """
    # Create a regex pattern from the ngram that respects the word pattern
    pattern = word_pattern.replace('.', '[a-z]')
    
    # Check if the ngram could exist within the pattern
    return re.search(f'(?=.{ngram}.)', pattern, re.IGNORECASE) is not None

# Example usage function
def hangman_solver_example():
    # Sample word list for generating co-frequency matrices
    word_list = [
        'hello', 'world', 'python', 'programming', 'computer', 
        'science', 'algorithm', 'machine', 'learning', 'data'
    ]
    
    # Generate co-frequency matrices
    succeeding_matrix, preceding_matrix = generate_letter_cofrequency_matrices(word_list)
    
    # Example Hangman game scenario
    word = "_ _ l l _"
    guessed_letters = ['l']
    
    # Get the next letter guess
    next_guess = guess(word, guessed_letters, succeeding_matrix, preceding_matrix)
    
    print(f"Current word: {word}")
    print(f"Guessed letters: {guessed_letters}")
    print(f"Next guess: {next_guess}")

# Uncomment to run example
# hangman_solver_example()