In [None]:
# hangman game
def start_game(word="hello", miss_times=6):
  """Set up a new hangman round for `word`.

  Returns a tuple `(word_length, try_times, current_state)`:
  `try_times` is `len(word) + miss_times`, and `current_state` is a list
  with one "_" placeholder per letter of the word.
  """
  n_letters = len(word)
  blanks = ["_" for _ in range(n_letters)]
  return n_letters, n_letters + miss_times, blanks

def update(word, current_state, used_letters):
  """Reveal, in place, every position of `word` whose letter has been guessed.

  `current_state` is mutated: each index whose letter is in `used_letters`
  and is not yet shown gets that letter. Returns True when at least one
  position was newly revealed, otherwise False.
  """
  revealed_any = False
  for pos, letter in enumerate(word):
    if letter not in used_letters or current_state[pos] == letter:
      continue
    current_state[pos] = letter
    revealed_any = True
  return revealed_any

def evaluate(current_state):
  """Return how many positions of `current_state` still hold the "_" placeholder."""
  return current_state.count("_")

def play_game(word, guess_function):
  """Play one hangman round on `word`, asking `guess_function` for each letter.

  `guess_function(word_length, history, current_state, char_set)` must return
  a letter that is still in `char_set` (the not-yet-guessed lowercase letters);
  returning anything else raises KeyError when it is removed from the set.
  The round allows `len(word) + 6` guesses in total, hits and misses alike.

  Returns True when every position was revealed within that budget, else False.
  """
  word_length, try_times, current_state = start_game(word, 6) # step 1
  char_set = set("abcdefghijklmnopqrstuvwxyz")
  history = []

  for _ in range(try_times):
    c = guess_function(word_length, history, current_state, char_set)
    history.append(c) # record the newly guessed char
    char_set.remove(c)

    update( word, current_state, set(history) ) # step 2

    if evaluate(current_state) == 0: # step 3: no blanks left
      return True
  return False

In [None]:
def read_file(file_path):
  """Read a text file and return its lines with surrounding whitespace stripped.

  Args:
    file_path: path of the file to read (opened in text mode).

  Returns:
    A list with one stripped string per line of the file; blank lines are
    kept as empty strings.
  """
  # `with` guarantees the handle is closed even if reading raises.
  with open(file_path, 'r') as f:
    return [ln.strip() for ln in f]

# dic: nested dict that maps each key to a {char: frequency} dict
# convert the frequencies of every inner dict to probabilities, in place
# for example: {'x': {'a':1, 'b':2, 'c':3, 'd':4}} -> {'x': {'a':0.1, 'b':0.2, 'c':0.3, 'd':0.4}}
def normalize_dic(dic):
  """Normalize, in place, every inner dict of `dic` so its values sum to 1.

  `dic` maps a key to a `{char: count}` dict; each count is divided by the
  total of its own inner dict. Returns None.
  """
  for counts in dic.values():
    total = sum(counts.values())
    for ch in counts:
      counts[ch] = counts[ch] / total
  return

# group words by length
# for words of length L, count the frequency of each char, then sort the chars by descending frequency
# every char starts with a default frequency of 1, so no count is 0 if the frequencies are normalized later
def statistics01(train_words):
  """Collect per-word-length letter counts and the letter ranking for each length.

  Every length seen in `train_words` gets a table covering 'a'..'z', with each
  letter starting at 1; every occurrence of a letter in a word of that length
  adds 1. A character outside 'a'..'z' raises KeyError.

  Returns `(length_char_frequency, char_in_order)`:
    length_char_frequency[L] -> {char: count}
    char_in_order[L]         -> chars sorted by descending count (ties keep
                                alphabetical order); keys are inserted in
                                ascending length.
  """
  alphabet = "abcdefghijklmnopqrstuvwxyz"
  length_char_frequency = {}
  for word in train_words:
    size = len(word)
    if size not in length_char_frequency:
      length_char_frequency[size] = dict.fromkeys(alphabet, 1)
    counts = length_char_frequency[size]
    for ch in word:
      counts[ch] += 1

  char_in_order = {}
  for size in sorted(length_char_frequency):
    counts = length_char_frequency[size]
    # stable sort: equally frequent chars stay in insertion (alphabetical) order
    char_in_order[size] = sorted(counts, key=counts.get, reverse=True)
  return length_char_frequency, char_in_order

# count the frequency of each char for the whole training dataset
def pattern00(train_words):
  """Return the relative frequency of each letter 'a'..'z' over all of `train_words`.

  The result maps every lowercase letter to its share of all counted
  characters. A character outside 'a'..'z' raises KeyError; input with no
  characters at all raises ZeroDivisionError.
  """
  counts = dict.fromkeys("abcdefghijklmnopqrstuvwxyz", 0)
  for word in train_words:
    for ch in word:
      counts[ch] += 1
  # normalize
  total = sum(counts.values())
  return {ch: n / total for ch, n in counts.items()}

# 2 gram:
  # _*: dic_2_1
  # *_: dic_2_2
def pattern01(train_words):
  """Build the 2-gram tables from `train_words`.

  Returns `(dic_2_1, dic_2_2)`:
    dic_2_1[next_char][char] -> probability of `char` given the char after it  (_*)
    dic_2_2[prev_char][char] -> probability of `char` given the char before it (*_)
  Each inner dict is normalized to sum to 1 by `normalize_dic`.
  """
  dic_2_1, dic_2_2 = {}, {}
  for w in train_words:
    # slide a window of two adjacent chars over the word
    for a, b in zip(w, w[1:]):
      for table, context, letter in (
          (dic_2_1, b, a), # _*
          (dic_2_2, a, b), # *_
      ):
        bucket = table.setdefault(context, {})
        bucket[letter] = bucket.get(letter, 0) + 1
  normalize_dic(dic_2_1)
  normalize_dic(dic_2_2)
  return dic_2_1, dic_2_2

# 3 gram:
  # _**: dic_3_1
  # *_*: dic_3_2
  # **_: dic_3_3
def pattern02(train_words):
  """Build the 3-gram tables from `train_words`.

  Returns `(dic_3_1, dic_3_2, dic_3_3)`, each mapping a 2-tuple of known
  neighbours to a `{char: probability}` dict for the missing position:
    dic_3_1: _**   (key = the two chars after the blank)
    dic_3_2: *_*   (key = the char before and the char after)
    dic_3_3: **_   (key = the two chars before the blank)
  Each inner dict is normalized to sum to 1 by `normalize_dic`.
  """
  dic_3_1, dic_3_2, dic_3_3 = {}, {}, {}
  for w in train_words:
    # slide a window of three adjacent chars over the word
    for a, b, c in zip(w, w[1:], w[2:]):
      for table, context, letter in (
          (dic_3_1, (b, c), a), # _**
          (dic_3_2, (a, c), b), # *_*
          (dic_3_3, (a, b), c), # **_
      ):
        bucket = table.setdefault(context, {})
        bucket[letter] = bucket.get(letter, 0) + 1
  normalize_dic(dic_3_1)
  normalize_dic(dic_3_2)
  normalize_dic(dic_3_3)
  return dic_3_1, dic_3_2, dic_3_3

# 4 gram:
  # _***: dic_4_1
  # *_**: dic_4_2
  # **_*: dic_4_3
  # ***_: dic_4_4
def pattern03(train_words):
  """Build the 4-gram tables from `train_words`.

  Returns `(dic_4_1, dic_4_2, dic_4_3, dic_4_4)`, each mapping a 3-tuple of
  known neighbours to a `{char: probability}` dict for the missing position:
    dic_4_1: _***
    dic_4_2: *_**
    dic_4_3: **_*
    dic_4_4: ***_
  Each inner dict is normalized to sum to 1 by `normalize_dic`.
  """
  dic_4_1, dic_4_2, dic_4_3, dic_4_4 = {}, {}, {}, {}
  for w in train_words:
    # slide a window of four adjacent chars over the word
    for a, b, c, d in zip(w, w[1:], w[2:], w[3:]):
      for table, context, letter in (
          (dic_4_1, (b, c, d), a), # _***
          (dic_4_2, (a, c, d), b), # *_**
          (dic_4_3, (a, b, d), c), # **_*
          (dic_4_4, (a, b, c), d), # ***_
      ):
        bucket = table.setdefault(context, {})
        bucket[letter] = bucket.get(letter, 0) + 1

  normalize_dic(dic_4_1)
  normalize_dic(dic_4_2)
  normalize_dic(dic_4_3)
  normalize_dic(dic_4_4)
  return dic_4_1, dic_4_2, dic_4_3, dic_4_4

# 5 gram
def pattern04(train_words):
  """Build the 5-gram tables from `train_words`.

  Returns `(dic_5_1, dic_5_2, dic_5_3, dic_5_4, dic_5_5)`, each mapping a
  4-tuple of known neighbours to a `{char: probability}` dict for the missing
  position:
    dic_5_1: _****
    dic_5_2: *_***
    dic_5_3: **_**
    dic_5_4: ***_*
    dic_5_5: ****_
  Each inner dict is normalized to sum to 1 by `normalize_dic`.
  """
  dic_5_1, dic_5_2, dic_5_3, dic_5_4, dic_5_5 = {}, {}, {}, {}, {}
  for w in train_words:
    # slide a window of five adjacent chars over the word
    for a, b, c, d, e in zip(w, w[1:], w[2:], w[3:], w[4:]):
      for table, context, letter in (
          (dic_5_5, (a, b, c, d), e), # ****_
          (dic_5_4, (a, b, c, e), d), # ***_*
          (dic_5_3, (a, b, d, e), c), # **_**
          (dic_5_2, (a, c, d, e), b), # *_***
          (dic_5_1, (b, c, d, e), a), # _****
      ):
        bucket = table.setdefault(context, {})
        bucket[letter] = bucket.get(letter, 0) + 1

  normalize_dic(dic_5_1)
  normalize_dic(dic_5_2)
  normalize_dic(dic_5_3)
  normalize_dic(dic_5_4)
  normalize_dic(dic_5_5)
  return dic_5_1, dic_5_2, dic_5_3, dic_5_4, dic_5_5

In [None]:
import random

# for dict: key is char, and value is the probability (or frequency)
# choose the char with the highest probability (or frequency)
def choose_high_frequency(char_fre):
  """Return the key of `char_fre` with the largest value.

  On ties the first key in iteration order wins. Returns ' ' when the dict
  is empty or no value is greater than -1.
  """
  best_char, best_score = ' ', -1
  for candidate, score in char_fre.items():
    if score > best_score:
      best_char, best_score = candidate, score
  return best_char

# merge two dict
# use the common key, and value is the sum
def merge_two_dic01(d1, d2):
  """Return a new dict with the keys common to `d1` and `d2`, values summed.

  Keys present in only one of the inputs are dropped; key order follows `d1`.
  """
  return {key: d1[key] + d2[key] for key in d1 if key in d2}

# check for n-gram
def check_list(current_state, lst):
  """Return True when every index in `lst` points at a revealed (non-'_') position."""
  return all(current_state[i] != '_' for i in lst)

# 2 gram:
  # *_
  # _*
# 3 gram:
  # **_
  # *_*
  # _**
# 4 gram:
  # **_*
  # ***_
  # *_**
  # _***
# 5 gram:
  # **_**
  # ***_*
  # ****_
  # *_***
  # _****
def N_grams(current_state, word_length):
  """Collect n-gram letter scores for the blanks of `current_state`.

  For every '_' position, each 2/3/4/5-gram pattern whose neighbouring
  positions are all inside the word and already revealed is looked up in the
  matching module-level table (dic_2_1 .. dic_5_5). Hits of the same n are
  combined with `merge_two_dic01` (common keys only, values summed); the
  first hit for an n, or a hit arriving while that accumulator is empty, is
  taken as-is (it is the table's own dict object, not a copy).

  Returns `(gram2, gram3, gram4, gram5)`, each a `{char: score}` dict that is
  empty when no pattern of that size matched.
  """
  # (accumulator slot, neighbour offsets relative to the blank, lookup table)
  # slot 0..3 <-> 2-gram..5-gram; the order within one slot matters for merging.
  templates = (
    (3, (-4, -3, -2, -1), dic_5_5), # ****_
    (3, (-3, -2, -1, 1), dic_5_4),  # ***_*
    (3, (-2, -1, 1, 2), dic_5_3),   # **_**
    (3, (-1, 1, 2, 3), dic_5_2),    # *_***
    (3, (1, 2, 3, 4), dic_5_1),     # _****
    (2, (-3, -2, -1), dic_4_4),     # ***_
    (2, (-2, -1, 1), dic_4_3),      # **_*
    (2, (-1, 1, 2), dic_4_2),       # *_**
    (2, (1, 2, 3), dic_4_1),        # _***
    (1, (-2, -1), dic_3_3),         # **_
    (1, (-1, 1), dic_3_2),          # *_*
    (1, (1, 2), dic_3_1),           # _**
    (0, (-1,), dic_2_2),            # *_
    (0, (1,), dic_2_1),             # _*
  )
  merged = [{}, {}, {}, {}]

  for i in range(word_length):
    if current_state[i] != '_':
      continue
    for slot, offsets, table in templates:
      # offsets are ascending, so checking both ends covers the whole pattern
      if i + offsets[0] < 0 or i + offsets[-1] >= word_length:
        continue
      context = tuple(current_state[i + off] for off in offsets)
      if '_' in context:
        continue
      # 2-gram tables are keyed by the single neighbouring char, the rest by tuples
      key = context if len(context) > 1 else context[0]
      if key not in table:
        continue
      hit = table[key]
      merged[slot] = merge_two_dic01(merged[slot], hit) if merged[slot] else hit

  return merged[0], merged[1], merged[2], merged[3]

def guess_char(word_length, history, current_state, char_set):
  """Pick the next letter to guess.

  The fallback is the most frequent not-yet-guessed letter for this word
  length (`char_in_order_by_length`), or from the overall ranking
  (`char_in_order_by_default`) when the length was never seen. It is returned
  directly for the first guess and whenever no 2-gram evidence exists;
  otherwise the highest-scoring letter of the merged n-gram scores is
  returned. `char_set` is accepted for interface compatibility but not used.

  Raises IndexError when every ranked letter is already in `history`.
  """
  ranking = char_in_order_by_default
  if word_length in char_in_order_by_length:
    ranking = char_in_order_by_length[word_length]
  remaining = [ch for ch in ranking if ch not in history]
  fallback = remaining[0]

  # the first guess
  if not history:
    return fallback

  # n-grams
  gram2, gram3, gram4, gram5 = N_grams(current_state, word_length)
  if not gram2:
    return fallback
  return choose_high_frequency(merge_grams(gram2, gram3, gram4, gram5, history))

def merge_grams(gram2, gram3, gram4, gram5, history):
  """Blend the unigram prior with the n-gram scores into one `{char: score}` dict.

  For every letter of the module-level `dic_1_1` that is not in `history`:
    score = 0.3*prior + 0.5*gram2 + 1*gram3 + 2*gram4 + 8*gram5
  where a gram term is skipped when the letter is missing from that dict.
  """
  weighted_sources = (
    (gram2, 0.5),
    (gram3, 1),
    (gram4, 2), #2.5
    (gram5, 8), #5-8
  )
  grams = {}
  for ch, prior in dic_1_1.items():
    if ch in history:
      continue # already guessed letters are left out of the result
    score = prior * 0.3 # 0.5, 0.8
    for source, weight in weighted_sources:
      if ch in source:
        score += source[ch] * weight
    grams[ch] = score
  return grams

def weeken(grams, unlikely_dic, word_length):
  """Lower, in place, the scores in `grams` for the top entries of `unlikely_dic`.

  Entries of `unlikely_dic` are visited by descending value; for each visited
  key that also exists in `grams`, half of its value is subtracted from the
  score. Visiting stops after `word_length // 2 + 1` entries (at least one),
  whether or not they were present in `grams`. Returns None.
  """
  ranked = sorted(unlikely_dic.items(), key=lambda kv: kv[1], reverse=True)
  for seen, (key, weight) in enumerate(ranked, start=1):
    if key in grams:
      grams[key] -= weight * 0.5
    if seen > word_length // 2:
      break
  return

In [None]:
def read_file(file_path):
  """Read a text file and return its lines with surrounding whitespace stripped.

  Args:
    file_path: path of the file to read (opened in text mode).

  Returns:
    A list with one stripped string per line of the file; blank lines are
    kept as empty strings.
  """
  # `with` guarantees the handle is closed even if reading raises.
  with open(file_path, 'r') as f:
    return [ln.strip() for ln in f]
import random

# Load the word list (one word per line) and shuffle it before splitting.
# NOTE(review): `random` is not seeded anywhere visible here, so the split differs between runs.
file_path = "/content/words_250000_train.txt"
whole_words = read_file(file_path)
random.shuffle(whole_words) # randomize those words

# 80/20 train/test split of the shuffled list
split_ratio = 0.8
Len = int(len(whole_words)*split_ratio)
train_words = whole_words[:Len]
test_words = whole_words[Len:]
len(train_words), len(test_words)

# Raw letter counts over the training words; a char outside a-z raises KeyError.
word_frequency = {'a':0,'b':0,'c':0,'d':0,'e':0,'f':0,'g':0,'h':0,'i':0,'j':0,'k':0,'l':0,'m':0,'n':0,'o':0,'p':0,'q':0,'r':0,'s':0,'t':0,'u':0,'v':0,'w':0,'x':0,'y':0,'z':0}
for w in train_words:
  for c in w: word_frequency[c] += 1
word_frequency

# Letters ordered from most to least frequent in the training words
chars_in_order_by_frequency = sorted( word_frequency.items(), key=lambda x:x[1], reverse=True)
chars_in_order_by_frequency = list(map(lambda x:x[0], chars_in_order_by_frequency))


# statistics of the training data, calculate probabilities for n-grams
# (these module-level tables are read by N_grams, merge_grams and guess_char)
dic_1_1 = pattern00(train_words)
dic_2_1, dic_2_2 = pattern01(train_words)
dic_3_1, dic_3_2, dic_3_3 = pattern02(train_words)
dic_4_1, dic_4_2, dic_4_3, dic_4_4 = pattern03(train_words)
dic_5_1, dic_5_2, dic_5_3, dic_5_4, dic_5_5 = pattern04(train_words)

# Per-length letter rankings, plus a global ranking used when a length was never seen
len_dic, char_in_order_by_length = statistics01(train_words)
char_in_order_by_default = sorted(dic_1_1.items(), key=lambda x:x[1], reverse=True)
char_in_order_by_default = list(map(lambda x:x[0], char_in_order_by_default))

In [None]:
"""
word:
  length L,
current state:
  the number of "_": m
  the number of chars: n
    m + n = L
  it can be expressed as dict
history
  suppose the guesses history is a list with length t,
  then the remaining possible chars are 26-t
maximum trying times:
  L + 6
current times:
  how many times already tried: len( history )
labels:
  the remaining correct chars for this word

for example:
  word = "hello"
  current state = "h_ll_" expressed as dict:
      {
        'a':0,'b':0,'c':0,'d':0,'e':0,'f':0,'g':0,
        'h':1,
        'i':0,'j':0,'k':0,
        'l':2,
        'm':0,'n':0,'o':0,'p':0,'q':0,'r':0,'s':0,'t':0,'u':0,'v':0,'w':0,'x':0,'y':0,'z':0,
        '_':2
      }
    it can be expressed as vector by using its frequencies in order:
      [
        0,0,0,0,0,0,0,
        1,
        0,0,0,
        2,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        2
       ]
  history = ['a', 'b', 'c', 'h', 'l']
    the remaining possible chars are:
      ['d','e','f','g','i','j','k','m','n','o','p','q','r','s','t','u','v','w','x','y','z']
  labels:
    'e' or 'o'
"""
import numpy as np

def state_to_vector(current_state):
  """Count the symbols of `current_state` into a 27-slot list.

  Slots 0..25 hold the number of occurrences of 'a'..'z'; the last slot
  holds the number of '_' placeholders.
  """
  # length is 27: a-z, _
  vec = [0] * 27
  base = ord('a')
  for c in current_state:
    slot = -1 if c == '_' else ord(c) - base
    vec[slot] += 1
  return vec

# the complement of the already guessed chars
def history_to_vector(history):
  """Return a 26-slot 0/1 list marking which letters are still unguessed.

  Slot i is 0 when the i-th letter of 'a'..'z' appears in `history`, else 1.
  """
  vec = [1] * 26
  base = ord('a')
  for guessed in history:
    vec[ord(guessed) - base] = 0
  return vec

def char_to_vector(c):
  """One-hot encode letter `c` over 'a'..'z'; "_" yields the all-zero vector."""
  vec = [0] * 26
  if c != "_":
    vec[ord(c) - ord('a')] = 1
  return vec

def dic_to_list(grams):
  """Lay out a `{letter: value}` dict as a 26-slot list ordered 'a'..'z'.

  Letters missing from `grams` keep the value 0.
  """
  vec = [0] * 26
  base = ord('a')
  for letter, value in grams.items():
    vec[ord(letter) - base] = value
  return vec

def get_labels(word, current_state):
  """Return the set of letters of `word` not yet shown at their position in `current_state`."""
  return {letter for i, letter in enumerate(word) if letter != current_state[i]}

def get_features(current_state, history, current_times, max_times):
  """Concatenate the state counts, the unguessed-letter mask and the two counters.

  Layout: `state_to_vector(current_state)` followed by
  `history_to_vector(history)`, then `current_times` and `max_times`.
  """
  counters = [current_times, max_times]
  return state_to_vector(current_state) + history_to_vector(history) + counters

def get_features02(current_state, ch, current_times, max_times):
  """Concatenate the state counts, the one-hot of `ch` and the two counters.

  Layout: `state_to_vector(current_state)` followed by `char_to_vector(ch)`,
  then `current_times` and `max_times`.
  """
  counters = [current_times, max_times]
  return state_to_vector(current_state) + char_to_vector(ch) + counters

def get_features03(current_state, history, current_times, max_times):
  """State counts with every guessed letter's slot overwritten by -1, plus the two counters.

  Starts from `state_to_vector(current_state)`, sets the slot of each letter
  in `history` to -1, then appends `current_times` and `max_times`.
  """
  vec = state_to_vector(current_state)
  base = ord('a')
  for guessed in history:
    vec[ord(guessed) - base] = -1
  return vec + [current_times, max_times]

def get_samples(word):
  """Generate the training samples obtained by revealing `word` letter by letter.

  The distinct letters of `word` are revealed in the order given by the
  module-level `chars_in_order_by_frequency`. Before each reveal one sample
  is recorded:
    features: `state_to_vector(current_state)` (27 values) followed by
              `dic_to_list(merge_grams(...))` (26 values) -> 53 values
    label:    one-hot vector (26 values) of the letter revealed next

  Returns:
    `(X_data, y_data)` as NumPy arrays of shape (k, 53) and (k, 26), where k
    is the number of recorded samples. With no samples (e.g. an empty word)
    both arrays are empty int arrays of shape (0, 53) and (0, 26).
  """
  word_length = len(word)
  max_times = word_length + 6

  # Build the membership set once instead of once per alphabet letter.
  letters = set(word)
  pending = [c for c in chars_in_order_by_frequency if c in letters]

  current_state = ['_'] * word_length
  history = []
  # Rows are collected in plain lists and converted once at the end;
  # np.append inside the loop would copy the whole array on every step.
  feature_rows, label_rows = [], []

  # never record more than max_times samples
  for ch in pending[:max_times]:
    gram2, gram3, gram4, gram5 = N_grams(current_state, word_length)
    grams = merge_grams(gram2, gram3, gram4, gram5, history)
    feature_rows.append( state_to_vector(current_state) + dic_to_list(grams) )
    label_rows.append( char_to_vector(ch) )

    # reveal the letter that was just used as the label
    history.append(ch)
    for i, letter in enumerate(word):
      if letter == ch:
        current_state[i] = ch

  if not feature_rows:
    # 55, 29
    return np.empty((0, 53), int), np.empty((0, 26), int)
  return np.array(feature_rows), np.array(label_rows)

In [None]:
words = train_words[:]

# Run get_samples once per word and reuse its (X, y) pair for both stacks;
# calling it separately for features and for labels doubles the preprocessing cost.
samples = [get_samples(w) for w in words]
training_X = np.vstack( [X for X, _ in samples] )
training_y = np.vstack( [y for _, y in samples] )
del samples # the per-word arrays are no longer needed once stacked
training_X.shape, training_y.shape

((1344732, 53), (1344732, 26))

In [None]:
import numpy as np
import pandas as pd
from keras import Sequential
from keras.layers import Dense, Dropout
from keras import optimizers

# Seed NumPy's global random generator.
# NOTE(review): this does not seed Python's `random` module (used for the shuffle of the
# word list); whether it affects Keras weight initialization depends on the backend — confirm.
seed = 10
np.random.seed(seed)

In [None]:
def create_model():
  """Build the uncompiled dense classifier that scores the 26 letters.

  Layers: 53 inputs -> 256 -> 128 -> 64 -> 32 (all ReLU) -> 26-way softmax.
  """
  layers = [Dense(256, activation='relu', input_dim=53)] # 55, 29
  layers += [Dense(units, activation='relu') for units in (128, 64, 32)]
  layers.append(Dense(26, activation='softmax'))
  return Sequential(layers)

# Build the network and train it on the generated samples.
# The labels are one-hot letter vectors, hence categorical cross-entropy.
model = create_model()
model.compile(loss = 'categorical_crossentropy',
      optimizer = optimizers.Adam(learning_rate=0.001),
      metrics = ['accuracy'])
model.summary()
# NOTE(review): trains on the full training set with no validation data or early stopping.
model.fit(training_X, training_y, epochs=300, batch_size=128)
#model.evaluate(testing_X, testing_y)
#predictions = model.predict( testing_X )
#labels_predicted = np.argmax(predictions, axis=1)
#labels_predicted

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_5 (Dense)             (None, 256)               13824     
                                                                 
 dense_6 (Dense)             (None, 128)               32896     
                                                                 
 dense_7 (Dense)             (None, 64)                8256      
                                                                 
 dense_8 (Dense)             (None, 32)                2080      
                                                                 
 dense_9 (Dense)             (None, 26)                858       
                                                                 
Total params: 57914 (226.23 KB)
Trainable params: 57914 (226.23 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/300
Epoc

<keras.src.callbacks.History at 0x7e66682dac50>

In [None]:
# Persist the trained model to disk.
model.save("model06.keras")
# NOTE(review): the reload example below names "model.keras", not the "model06.keras" written above.
# loaded_model = keras.saving.load_model("model.keras")

In [None]:
def choose_high_frequency(char_fre):
  """Return the key of `char_fre` whose value is the largest.

  The first key in iteration order wins ties. The result is ' ' for an empty
  dict or when no value exceeds -1.
  """
  winner, top = ' ', -1
  for key, value in char_fre.items():
    if value > top:
      winner, top = key, value
  return winner

def play_game_ML(word):
  """Play one hangman round on `word`, letting the trained `model` choose letters.

  For each guess the current state is encoded as
  `state_to_vector(current_state) + dic_to_list(merged n-gram scores)` and the
  letter with the highest model output is taken. When that letter was already
  guessed, the most frequent remaining letter (by the training-corpus counts
  in `word_frequency`) is used instead. The round allows `len(word) + 6`
  guesses in total.

  Returns:
    1 when every position was revealed within the allowed guesses, else 0.
  """
  word_length, try_times, current_state = start_game(word, 6) # step 1
  # Remaining letters mapped to their corpus frequency; doubles as the
  # "not yet guessed" set and as the ranking for the fallback choice.
  char_set = word_frequency.copy()
  history = []

  for _ in range(try_times):
    gram2, gram3, gram4, gram5 = N_grams(current_state, word_length)
    grams = merge_grams(gram2, gram3, gram4, gram5, history)
    features = state_to_vector(current_state) + dic_to_list(grams)

    label = model.predict( np.array([features]), verbose=0 )[0]
    c = chr( ord('a') + np.argmax(label) )
    if c not in char_set:
      # the model proposed an already guessed letter
      c = choose_high_frequency( char_set )

    history.append(c) # add in the newly guessed char
    del char_set[c]

    update( word, current_state, set(history) ) # step 2

    if evaluate(current_state) == 0: # step 3: no blanks left
      return 1
  return 0

# Evaluate on the first 2000 held-out words:
# c1 counts games played, c2 counts games won.
c1, c2 = 0, 0
for word in test_words[:2000]:
  #print(word)
  winning = play_game_ML(word)
  if winning:
    c2 += 1
    #print(word)
  c1 += 1
# (games played, games won, win rate)
c1, c2, c2/c1

(2000, 1170, 0.585)

In [None]:
# Second evaluation pass on the last 2000 held-out words; each game yields 1 (win) or 0.
win_list = list( map( lambda w: play_game_ML(w), test_words[-2000:] ) )
# number of games won
sum(win_list)

1118

In [None]:
import keras
def play_game_ML_fast(test_words, idx):
  """Play the ten words `test_words[idx:idx+10]` and print how many games were won.

  Returns None; the win count is only printed.
  """
  batch = test_words[idx:idx+10]
  cases = [play_game_ML(w) for w in batch]
  print(sum(cases))
  return

import threading

# Play the first 100 test words in ten threads, ten words per thread
# (thread i handles test_words[i*10 : i*10+10] and prints its own win count).
# NOTE(review): all threads call the shared `model`; whether concurrent predict
# calls are safe, or give any speed-up, is not established here — confirm.
all_threads = []
for i in range(10):
  t = threading.Thread(target=play_game_ML_fast, args=(test_words, i*10))
  all_threads.append( t )

# start every worker first ...
for i in range(10):
  all_threads[i].start()

# ... then wait for all of them to finish
for i in range(10):
  all_threads[i].join()

3
5
3
5
4
3
2
5
2
4
