In [None]:
# hangman game
def start_game(word="hello", miss_times=6):
  """Set up a new hangman round.

  Returns a tuple ``(word_length, try_times, current_state)``:
  ``word_length`` is ``len(word)``, ``try_times`` is
  ``len(word) + miss_times`` and ``current_state`` is a list holding one
  "_" per letter of ``word``.
  """
  n_letters = len(word)
  hidden = ["_" for _ in range(n_letters)]
  return n_letters, n_letters + miss_times, hidden

def update(word, current_state, used_letters):
  """Reveal guessed letters of ``word`` in ``current_state`` (in place).

  A position is filled in when its letter is in ``used_letters`` and the
  position does not already show that letter.
  Returns True when at least one position was newly revealed, else False.
  """
  revealed = False
  for pos, letter in enumerate(word):
    if letter not in used_letters or current_state[pos] == letter:
      continue
    current_state[pos] = letter
    revealed = True
  return revealed

def evaluate(current_state):
  """Return how many entries of ``current_state`` are still "_"."""
  return sum(1 for slot in current_state if slot == "_")

def play_game(word, guess_function):
  """Play one hangman round on ``word`` and return True if it was solved.

  ``guess_function(word_length, history, current_state, char_set)`` is asked
  for one character per turn. Every turn counts against the budget returned
  by ``start_game(word, 6)``, whether or not the guess was correct.
  The guessed character is removed from ``char_set`` with ``set.remove``, so
  a guess that is not in the set (e.g. a repeated guess) raises KeyError.
  """
  word_length, try_times, current_state = start_game(word, 6) # step 1
  char_set = set('abcdefghijklmnopqrstuvwxyz')
  history = []

  for _ in range(try_times):
    guess = guess_function(word_length, history, current_state, char_set)
    history.append(guess) # remember the newly guessed char
    char_set.remove(guess)

    update(word, current_state, set(history)) # step 2

    if evaluate(current_state) == 0: # step 3: nothing left to reveal
      return True
  return False

In [None]:
def read_file(file_path):
  """Read a text file and return its lines as a list of stripped strings.

  Each line has leading and trailing whitespace (including the newline)
  removed; empty lines are kept as "".
  The file is opened in a ``with`` block so the handle is always closed,
  even if reading fails.
  """
  with open(file_path, 'r') as f:
    return [ln.strip() for ln in f]

# dic is a dict of dicts: every outer key maps to an inner dict {char: frequency}
# convert the frequencies of each inner dict to probabilities (in place)
# for example: {'x': {'a':1, 'b':2, 'c':3, 'd': 4}} -> {'x': {'a':0.1, 'b':0.2, 'c':0.3, 'd': 0.4}}
def normalize_dic(dic):
  """Normalize every inner dict of ``dic`` in place so its values sum to 1.

  ``dic`` maps keys to inner dicts of numeric counts. Each inner value is
  divided by the sum of that inner dict. Returns None.
  """
  for inner in dic.values():
    total = sum(inner.values())
    for key in inner:
      inner[key] = inner[key] / total
  return

# group words by length
# for words of length L, count the frequency of each char, then sort in order by frequency
# every char starts with a count of 1, so no char ends up with frequency 0 (e.g. if the counts are normalized later)
def statistics01(train_words):
  """Count letters per word length and rank them by frequency.

  Returns ``(length_char_frequency, char_in_order)``:
    * ``length_char_frequency[L]`` maps each of a-z to 1 plus the number of
      times it occurs in the training words of length ``L``;
    * ``char_in_order[L]`` lists a-z from most to least frequent for that
      length (ties keep alphabetical order); its keys are inserted in
      ascending length order.
  A character outside a-z raises KeyError.
  """
  baseline = dict.fromkeys('abcdefghijklmnopqrstuvwxyz', 1)
  length_char_frequency = {}
  for word in train_words:
    size = len(word)
    counts = length_char_frequency.get(size)
    if counts is None:
      counts = length_char_frequency[size] = dict(baseline)
    for ch in word:
      counts[ch] += 1

  # sorted() is stable, so equally frequent chars stay in a-z order
  char_in_order = {
    size: sorted(length_char_frequency[size],
                 key=length_char_frequency[size].get, reverse=True)
    for size in sorted(length_char_frequency)
  }
  return length_char_frequency, char_in_order

# count the frequency of each char for the whole training dataset
def pattern00(train_words):
  """Return the relative frequency of each letter a-z over all words.

  The result maps every letter a-z (in alphabetical order) to its share of
  all characters in ``train_words``. A character outside a-z raises
  KeyError; an input without any characters raises ZeroDivisionError.
  """
  counts = dict.fromkeys('abcdefghijklmnopqrstuvwxyz', 0)
  for word in train_words:
    for ch in word:
      counts[ch] += 1
  # normalize
  total = sum(counts.values())
  return {ch: n / total for ch, n in counts.items()}

# 2 gram:
  # _*: dic_2_1
  # *_: dic_2_2
def pattern01(train_words):
  """Build the two 2-gram tables from the training words.

  Returns ``(dic_2_1, dic_2_2)``:
    * ``dic_2_1[next_char][char]`` - pattern ``_*``: distribution of the
      char that precedes ``next_char``;
    * ``dic_2_2[prev_char][char]`` - pattern ``*_``: distribution of the
      char that follows ``prev_char``.
  Counts are turned into probabilities by ``normalize_dic``.
  """
  dic_2_1, dic_2_2 = {}, {}
  for w in train_words:
    # walk over every pair of adjacent characters
    for left, right in zip(w, w[1:]):
      before = dic_2_1.setdefault(right, {})  # _*
      before[left] = before.get(left, 0) + 1
      after = dic_2_2.setdefault(left, {})    # *_
      after[right] = after.get(right, 0) + 1
  for table in (dic_2_1, dic_2_2):
    normalize_dic(table)
  return dic_2_1, dic_2_2

# 3 gram:
  # _**: dic_3_1
  # *_*: dic_3_2
  # **_: dic_3_3
def pattern02(train_words):
  """Build the three 3-gram tables from the training words.

  Returns ``(dic_3_1, dic_3_2, dic_3_3)`` for the patterns ``_**``, ``*_*``
  and ``**_``. Each table maps the tuple of the two known characters to a
  dict ``{hidden_char: probability}`` (normalized by ``normalize_dic``).
  """
  # tables[j] is the table whose hidden char sits at position j of a window
  tables = ({}, {}, {})
  for w in train_words:
    for start in range(len(w) - 2):
      window = tuple(w[start:start + 3])
      for hidden, table in enumerate(tables):
        context = window[:hidden] + window[hidden + 1:]
        by_char = table.setdefault(context, {})
        by_char[window[hidden]] = by_char.get(window[hidden], 0) + 1
  for table in tables:
    normalize_dic(table)
  dic_3_1, dic_3_2, dic_3_3 = tables
  return dic_3_1, dic_3_2, dic_3_3

# 4 gram:
  # _***: dic_4_1
  # *_**: dic_4_2
  # **_*: dic_4_3
  # ***_: dic_4_4
def pattern03(train_words):
  """Build the four 4-gram tables from the training words.

  Returns ``(dic_4_1, dic_4_2, dic_4_3, dic_4_4)`` for the patterns
  ``_***``, ``*_**``, ``**_*`` and ``***_``. Each table maps the tuple of
  the three known characters to a dict ``{hidden_char: probability}``
  (normalized by ``normalize_dic``).
  """
  # tables[j] is the table whose hidden char sits at position j of a window
  tables = ({}, {}, {}, {})
  for w in train_words:
    for start in range(len(w) - 3):
      window = tuple(w[start:start + 4])
      for hidden, table in enumerate(tables):
        context = window[:hidden] + window[hidden + 1:]
        by_char = table.setdefault(context, {})
        by_char[window[hidden]] = by_char.get(window[hidden], 0) + 1

  for table in tables:
    normalize_dic(table)
  dic_4_1, dic_4_2, dic_4_3, dic_4_4 = tables
  return dic_4_1, dic_4_2, dic_4_3, dic_4_4

# 5 gram
def pattern04(train_words):
  """Build the five 5-gram tables from the training words.

  Returns ``(dic_5_1, dic_5_2, dic_5_3, dic_5_4, dic_5_5)`` for the patterns
  ``_****``, ``*_***``, ``**_**``, ``***_*`` and ``****_``. Each table maps
  the tuple of the four known characters to a dict
  ``{hidden_char: probability}`` (normalized by ``normalize_dic``).
  """
  # tables[j] is the table whose hidden char sits at position j of a window
  tables = ({}, {}, {}, {}, {})
  for w in train_words:
    for start in range(len(w) - 4):
      window = tuple(w[start:start + 5])
      for hidden, table in enumerate(tables):
        context = window[:hidden] + window[hidden + 1:]
        by_char = table.setdefault(context, {})
        by_char[window[hidden]] = by_char.get(window[hidden], 0) + 1

  for table in tables:
    normalize_dic(table)
  dic_5_1, dic_5_2, dic_5_3, dic_5_4, dic_5_5 = tables
  return dic_5_1, dic_5_2, dic_5_3, dic_5_4, dic_5_5

In [None]:
import random

# for dict: key is char, and value is the probability (or frequency)
# choose the char with the highest probability (or frequency)
def choose_high_frequency(char_fre):
  """Return the key of ``char_fre`` with the largest value.

  Ties go to the key that comes first in iteration order. Returns ' ' when
  ``char_fre`` is empty or no value is greater than -1.
  """
  best_char, best_score = ' ', -1
  for candidate, score in char_fre.items():
    if score > best_score:
      best_char = candidate
      best_score = score
  return best_char

# merge two dict
# use the common key, and value is the sum
def merge_two_dic01(d1, d2):
  """Return a new dict holding only the keys present in both inputs.

  Each value is ``d1[k] + d2[k]``; keys keep the order they have in ``d1``.
  """
  return {k: d1[k] + d2[k] for k in d1 if k in d2}

# check for n-gram
def check_list(current_state, lst):
  """Return True when none of the positions in ``lst`` is still '_'."""
  return all(current_state[idx] != '_' for idx in lst)

# 2 gram:
  # *_
  # _*
# 3 gram:
  # **_
  # *_*
  # _**
# 4 gram:
  # **_*
  # ***_
  # *_**
  # _***
# 5 gram:
  # **_**
  # ***_*
  # ****_
  # *_***
  # _****
def N_grams(current_state, word_length):
  """Collect n-gram evidence for the blanks of ``current_state``.

  For every position that is still '_', each 2- to 5-gram pattern around it
  is tried: when all neighbours required by the pattern lie inside the word
  and are known, they are looked up in the matching module-level table
  (``dic_2_1`` ... ``dic_5_5``). Per n-gram size the first hit is taken as is
  (the table's own dict object, not a copy); every further hit is combined
  with ``merge_two_dic01``. If the accumulated dict is empty, the next hit
  replaces it.

  Returns ``(gram2, gram3, gram4, gram5)``; a dict is empty when no pattern
  of that size matched.
  """
  # (offsets of the known neighbours relative to the blank, lookup table),
  # listed per n-gram size in the order in which they are tried
  five_gram = (
    ((-4, -3, -2, -1), dic_5_5),  # ****_
    ((-3, -2, -1, 1), dic_5_4),   # ***_*
    ((-2, -1, 1, 2), dic_5_3),    # **_**
    ((-1, 1, 2, 3), dic_5_2),     # *_***
    ((1, 2, 3, 4), dic_5_1),      # _****
  )
  four_gram = (
    ((-3, -2, -1), dic_4_4),      # ***_
    ((-2, -1, 1), dic_4_3),       # **_*
    ((-1, 1, 2), dic_4_2),        # *_**
    ((1, 2, 3), dic_4_1),         # _***
  )
  three_gram = (
    ((-2, -1), dic_3_3),          # **_
    ((-1, 1), dic_3_2),           # *_*
    ((1, 2), dic_3_1),            # _**
  )
  two_gram = (
    ((-1,), dic_2_2),             # *_
    ((1,), dic_2_1),              # _*
  )
  families = (five_gram, four_gram, three_gram, two_gram)
  merged = [{}, {}, {}, {}]  # gram5, gram4, gram3, gram2

  for pos in range(word_length):
    if current_state[pos] != '_':
      continue
    for slot, family in enumerate(families):
      for offsets, table in family:
        # offsets are ascending: first is the leftmost, last the rightmost
        if pos + offsets[0] < 0 or pos + offsets[-1] >= word_length:
          continue
        context = [current_state[pos + off] for off in offsets]
        if '_' in context:
          continue
        # the 2-gram tables are keyed by a single char, the others by a tuple
        key = context[0] if len(context) == 1 else tuple(context)
        if key not in table:
          continue
        if merged[slot]:
          merged[slot] = merge_two_dic01(merged[slot], table[key])
        else:
          merged[slot] = table[key]

  gram5, gram4, gram3, gram2 = merged
  return gram2, gram3, gram4, gram5

def previous_info(current_state, history, word_length):
  """Reconstruct the board as it was before the most recent guess.

  Returns ``(label, prev_state, prev_guess)``: ``prev_guess`` is the last
  entry of ``history``; ``prev_state`` is a copy of ``current_state`` with
  every occurrence of ``prev_guess`` (within the first ``word_length``
  positions) turned back into '_'; ``label`` is 1 if any position was
  blanked that way, otherwise 0. ``current_state`` itself is not modified.
  """
  rolled_back = current_state.copy()
  last_guess = history[-1]
  was_hit = 0
  for idx in range(word_length):
    if rolled_back[idx] != last_guess:
      continue
    rolled_back[idx] = '_'
    was_hit = 1
  return was_hit, rolled_back, last_guess

def state_to_vector(current_state, history):
  """Encode the board as a list of 28 numbers.

  Entries 0-25 count how often each letter a-z is shown in
  ``current_state``, entry 26 counts the '_' positions, and the last entry
  is 6 minus the number of guesses in ``history`` that do not appear in
  ``current_state``.
  """
  # length is 27: a-z, _
  counts = [0] * 27
  for symbol in current_state:
    slot = -1 if symbol == '_' else ord(symbol) - ord('a')
    counts[slot] += 1
  misses = sum(1 for guess in history if guess not in current_state)
  return counts + [6 - misses]

def char_to_vector(c):
  """Return a 26-entry one-hot list for letter ``c`` (all zeros for "_")."""
  one_hot = [0] * 26
  if c != "_":
    one_hot[ord(c) - ord('a')] = 1
  return one_hot

def dic_to_list(grams):
  """Spread a ``{letter: value}`` dict over a 26-entry list indexed a-z.

  Letters missing from ``grams`` keep the value 0.
  """
  vec = [0] * 26
  base = ord('a')
  for letter, value in grams.items():
    vec[ord(letter) - base] = value
  return vec

def guess_char(word_length, history, current_state, char_set, *, use_model=False):
  """Pick the next character to guess for the current hangman board.

  Parameters:
    word_length   - number of letters of the hidden word.
    history       - list of the characters guessed so far, oldest first.
    current_state - list with the revealed letters and '_' for blanks.
    char_set      - accepted so the function matches the callback signature
                    used by ``play_game``; it is not read here.
    use_model     - keyword-only switch (default False, the previous
                    hard-coded value). When True the module-level
                    classifier ``clf`` chooses the character; bind it with
                    e.g. ``functools.partial(guess_char, use_model=True)``.

  Returns the guessed character. Without the model this is the best-scoring
  key of the weighted n-gram scores from ``merge_grams`` (which already
  excludes the characters in ``history``).

  Side effect: while ``use_model`` is False and the previous guess was a
  hit, one sample (features of the previous board, one-hot of the previous
  guess) is appended to the module-level ``training_X`` / ``training_y``.
  """
  if history and not use_model:
    label, prev_state, prev_guess = previous_info(current_state, history, word_length)
    if label == 1:
      # only successful guesses become training samples, so the features
      # are built only in that case
      earlier = history[:-1]
      prev_gram2, prev_gram3, prev_gram4, prev_gram5 = N_grams(prev_state, word_length)
      prev_grams = merge_grams(prev_gram2, prev_gram3, prev_gram4, prev_gram5, earlier)
      vec1 = state_to_vector(prev_state, earlier)
      vec2 = dic_to_list(prev_grams)
      vec3 = char_to_vector(prev_guess)
      # the lists are mutated, not rebound, so no ``global`` statement is needed
      training_X.append( vec1+[word_length]+vec2 )
      training_y.append( vec3 )

  # n-grams: score every not-yet-guessed char and take the best one
  gram2, gram3, gram4, gram5 = N_grams(current_state, word_length)
  grams = merge_grams(gram2, gram3, gram4, gram5, history)
  c = choose_high_frequency(grams)

  if use_model:
    vec1 = state_to_vector(current_state, history)
    vec2 = dic_to_list(grams)
    features = vec1 + [word_length] + vec2
    chs = clf.predict_proba( [features] )[0]
    # never propose a char that was guessed before
    for ch in history:
      chs[ ord(ch)-ord('a') ] = 0

    # threshold hook kept from the original; 0.0 accepts every prediction
    if max(chs) >= 0.0:
      c = chr( np.argmax( chs ) + 97 ) # 97 == ord('a')
  return c

def merge_grams(gram2, gram3, gram4, gram5, history):
  """Blend the unigram prior with the n-gram evidence into one score dict.

  For every letter of the module-level ``dic_1_1`` the score is
  ``dic_1_1[k]*0.3 + gram2[k]*0.5 + gram3[k]*1 + gram4[k]*2 + gram5[k]*8``,
  where a term is skipped when the letter is missing from that gram dict.
  Letters contained in ``history`` are left out of the result.
  """
  weighted_sources = ((gram2, 0.5), (gram3, 1), (gram4, 2), (gram5, 8))
  already_guessed = set(history)

  scores = {}
  for letter, prior in dic_1_1.items():
    if letter in already_guessed:
      continue
    score = prior * 0.3 # 0.5, 0.8
    for source, weight in weighted_sources:
      if letter in source:
        score += source[letter] * weight
    scores[letter] = score
  return scores

def weeken(grams, unlikely_dic, word_length):
  """Lower, in place, the scores in ``grams`` for the top "unlikely" chars.

  The entries of ``unlikely_dic`` are visited from the largest value to the
  smallest; for each visited key that is in ``grams`` half of its unlikely
  value is subtracted. Visiting stops once more than ``word_length // 2``
  entries have been looked at. Returns None.
  """
  ranked = sorted(unlikely_dic.items(), key=lambda item: item[1], reverse=True)
  cap = word_length // 2
  for seen, (letter, penalty) in enumerate(ranked, start=1):
    if letter in grams:
      grams[letter] -= penalty*0.5
    if seen > cap:
      break
  return

In [None]:
# Load the word list, split it into train/test and play every training word
# with the n-gram strategy. The globals defined here (dic_*, char_in_order_*,
# training_X, training_y) are read by N_grams, merge_grams and guess_char.
file_path = "/content/words_250000_train.txt"
whole_words = read_file(file_path)
random.shuffle(whole_words) # randomize those words

# samples collected by guess_char while games are played
training_X, training_y = [], []

for split_ratio in [0.8]: # 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95
  Len = int(len(whole_words)*split_ratio)
  train_words = whole_words[:Len]
  test_words = whole_words[Len:]

  # statistics of the training data, calculate probabilities for n-grams
  dic_1_1 = pattern00(train_words)
  dic_2_1, dic_2_2 = pattern01(train_words)
  dic_3_1, dic_3_2, dic_3_3 = pattern02(train_words)
  dic_4_1, dic_4_2, dic_4_3, dic_4_4 = pattern03(train_words)
  dic_5_1, dic_5_2, dic_5_3, dic_5_4, dic_5_5 = pattern04(train_words)

  len_dic, char_in_order_by_length = statistics01(train_words)
  # letters a-z ordered from most to least frequent over all training words
  char_in_order_by_default = sorted(dic_1_1.items(), key=lambda x:x[1], reverse=True)
  char_in_order_by_default = list(map(lambda x:x[0], char_in_order_by_default))

  # training data
  # c1 = games played, c2 = games won
  c1, c2 = 0, 0
  for word in train_words[:]:
    #print(word)
    winning = play_game(word, guess_char)
    if winning:
      c2 += 1
      #print(word)
    c1 += 1
  print( "training vs testing \t", Len, " vs", len(whole_words)-Len )
  print( "\t", round(split_ratio*100), "% vs", round((1-split_ratio)*100), "%" )
  # NOTE: the loop above plays train_words, so the value printed as
  # "testing accuracy" here is the win rate on the training words
  print( "\t", "testing accuracy: ", round(c2/c1*100, 2), "%", "\t", c1, c2 )

# number of collected training samples (features, targets)
len(training_X), len(training_y)

training vs testing 	 181840  vs 45460
	 80 % vs 20 %
	 testing accuracy:  78.46 % 	 181840 142679


(1126107, 1126107)

In [None]:
from sklearn.neural_network import MLPClassifier
import numpy as np
# Fit an MLP on the samples collected in training_X / training_y; guess_char
# reads the resulting global `clf` on its model path.
clf = MLPClassifier(max_iter=10, alpha=1e-5,
                    hidden_layer_sizes=(256, 128, 64, 32),
                    verbose=True, random_state=1)
clf.fit( training_X, training_y )
# NOTE: scored on the same data the model was fitted on, not on held-out data
clf.score( training_X, training_y )

Iteration 1, loss = 0.70613355
Iteration 2, loss = 0.26782214
Iteration 3, loss = 0.20010905
Iteration 4, loss = 0.16849337
Iteration 5, loss = 0.15003835
Iteration 6, loss = 0.13614140
Iteration 7, loss = 0.12623058
Iteration 8, loss = 0.11911023
Iteration 9, loss = 0.11029544
Iteration 10, loss = 0.10734996




0.9686104428797618

In [None]:
# testing data
# Play every held-out word; c1 = games played, c2 = games won.
# NOTE: with the n-gram path of guess_char (use_model False) each game also
# appends samples to training_X / training_y, so this loop adds test-word
# samples to those lists.
c1, c2 = 0, 0
for word in test_words[:]:
  winning = play_game(word, guess_char)
  if winning: c2 += 1
  c1 += 1
print( "training vs testing \t", Len, " vs", len(whole_words)-Len )
print( "\t", round(split_ratio*100), "% vs", round((1-split_ratio)*100), "%" )
print( "\t", "testing accuracy: ", round(c2/c1*100, 2), "%", "\t", c1, c2 )
# Results recorded from earlier runs:
# N-gram:     testing accuracy:  70.95 % 	 45460 32254
# ML model:   testing accuracy:  71.73 % 	 4546 3261

training vs testing 	 181840  vs 45460
	 80 % vs 20 %
	 testing accuracy:  71.29 % 	 45460 32408


In [None]:
import pickle

# save
# serialize the fitted classifier `clf` to model11.pkl in the working directory
with open('model11.pkl','wb') as f:
    pickle.dump(clf,f)

In [None]:
# load
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
# The model is loaded into `clf2`, while guess_char reads the global `clf`.
with open('model11.pkl', 'rb') as f:
    clf2 = pickle.load(f)

In [None]:
# Win rate per word length over the chosen word list.
# NOTE: whole_words includes the training words, so this is not a held-out score.
# NOTE: if no word has the requested length, c1 stays 0 and the print below
# raises ZeroDivisionError.
for length in [4]: #range(1, 20)
  # testing data
  # c1 = games played, c2 = games won
  c1, c2 = 0, 0
  for word in whole_words[:]:  # whole_words, test_words
    if len(word)!=length: continue
    #print(word)
    winning = play_game(word, guess_char)
    if winning:
      c2 += 1
      #print(word)
    c1 += 1
  #print( "training vs testing \t", Len, " vs", len(whole_words)-Len )
  #print( "\t", round(split_ratio*100), "% vs", round((1-split_ratio)*100), "%" )
  print( "\t", length, "\t", "testing accuracy: ", round(c2/c1*100, 2), "%", "\t", c1, c2 )

	 4 	 testing accuracy:  21.43 % 	 5287 1133


In [None]:
"""
5 grams with initialization
   1 	 testing accuracy:  41.18 % 	 17 7
	 2 	 testing accuracy:  12.88 % 	 264 34
	 3 	 testing accuracy:  12.4 % 	 2201 273
	 4 	 testing accuracy:  21.79 % 	 5287 1152
	 5 	 testing accuracy:  33.25 % 	 11274 3749
	 6 	 testing accuracy:  46.83 % 	 19541 9151
	 7 	 testing accuracy:  60.9 % 	 25948 15802
	 8 	 testing accuracy:  74.42 % 	 30452 22662
	 9 	 testing accuracy:  85.07 % 	 30906 26291
	 10 	 testing accuracy:  91.66 % 	 26953 24704
	 11 	 testing accuracy:  95.94 % 	 22786 21860
	 12 	 testing accuracy:  97.95 % 	 18178 17805
	 13 	 testing accuracy:  98.96 % 	 12956 12821
	 14 	 testing accuracy:  99.44 % 	 8710 8661
	 15 	 testing accuracy:  99.79 % 	 5211 5200
	 16 	 testing accuracy:  99.87 % 	 3143 3139
	 17 	 testing accuracy:  100.0 % 	 1775 1775
	 18 	 testing accuracy:  100.0 % 	 859 859
	 19 	 testing accuracy:  100.0 % 	 441 441
5 grams
1 	 testing accuracy:  41.18 % 	 17 7
2 	 testing accuracy:  12.88 % 	 264 34
3 	 testing accuracy:  12.22 % 	 2201 269
4 	 testing accuracy:  21.2 % 	 5287 1121
5 	 testing accuracy:  31.54 % 	 11274 3556
6 	 testing accuracy:  43.82 % 	 19541 8563
7 	 testing accuracy:  57.62 % 	 25948 14952
8 	 testing accuracy:  70.61 % 	 30452 21502
9 	 testing accuracy:  82.35 % 	 30906 25451
10 	 testing accuracy:  89.05 % 	 26953 24001
11 	 testing accuracy:  93.25 % 	 22786 21248
12 	 testing accuracy:  95.76 % 	 18178 17408
13 	 testing accuracy:  96.41 % 	 12956 12491
14 	 testing accuracy:  97.03 % 	 8710 8451
15 	 testing accuracy:  99.06 % 	 5211 5162
16 	 testing accuracy:  97.9 % 	 3143 3077
17 	 testing accuracy:  98.93 % 	 1775 1756
18 	 testing accuracy:  99.65 % 	 859 856
19 	 testing accuracy:  100.0 % 	 441 441
"""

In [None]:
# model output of predict_proba for the first training sample
clf.predict_proba(training_X[:1])[0]
# index of the largest entry of the prediction mapped to a letter (97 == ord('a'))
chr( np.argmax( clf.predict(training_X[:1])[0] ) + 97 )

array([1.15497573e-06, 8.12927773e-08, 9.25120795e-15, 5.07808828e-16,
       9.99998780e-01, 4.95202581e-10, 4.42485816e-12, 1.17635321e-08,
       6.62618404e-08, 3.93186305e-17, 3.11000742e-09, 2.92719191e-08,
       2.99775159e-13, 7.46888890e-09, 3.72784725e-07, 6.45992240e-11,
       2.39263130e-11, 1.85475320e-09, 1.39655986e-08, 2.40194363e-08,
       4.77673640e-06, 3.40532750e-10, 1.80602430e-10, 6.43457899e-11,
       1.94626552e-12, 3.74234504e-11])

In [None]:
# play a single game on the word "hello"; the result is True when the word was solved
play_game("hello", guess_char)

False