# Find anagrams: part 11
# Get the parent / child words of the top 5 word groups by character length
Mike Babb  
babb.mike@outlook.com


In [1]:
# standard libraries
from collections import Counter
import os
import sqlite3

In [2]:
# external
import json
import pandas as pd

In [3]:
# custom
import _run_constants as rc
from part_00_file_db_utils import query_db
from part_00_process_functions import *

# Word counts by word group

In [4]:
# join in word_group_id
sql = 'select * from word_counts;'
word_df = query_db(sql = sql, db_path=rc.DB_PATH, db_name=rc.DB_NAME)

...query execution took: 2.1 seconds...


In [5]:
word_df.head()

Unnamed: 0,word,lcase,n_chars,first_letter,word_id,word_group_id,letter_group,letter_group_ranked,n_from_words,n_to_words,n_from_word_groups,n_to_word_groups
0,A,a,1,a,0,0,a,a,144511,1,133001,1
1,aa,aa,2,a,1,1,a,a,45451,2,43229,2
2,aal,aal,3,a,2,2,al,la,24276,7,23271,5
3,aalii,aalii,5,a,3,3,ail,lai,5358,15,5263,11
4,aam,aam,3,a,4,4,am,ma,14107,7,13497,5


In [6]:
# turn this into the word_group_df
col_names = ['word_group_id', 'lcase']
wg_df = word_df.sort_values(by = col_names).drop_duplicates(subset = 'word_group_id').copy()

# dictionary of words!
word_group_dict = {wg_id:word for wg_id, word in zip(wg_df['word_group_id'],
                                                     wg_df['lcase'])}



In [7]:
wg_df.head()

Unnamed: 0,word,lcase,n_chars,first_letter,word_id,word_group_id,letter_group,letter_group_ranked,n_from_words,n_to_words,n_from_word_groups,n_to_word_groups
0,A,a,1,a,0,0,a,a,144511,1,133001,1
1,aa,aa,2,a,1,1,a,a,45451,2,43229,2
2,aal,aal,3,a,2,2,al,la,24276,7,23271,5
3,aalii,aalii,5,a,3,3,ail,lai,5358,15,5263,11
4,aam,aam,3,a,4,4,am,ma,14107,7,13497,5


In [8]:
wg_df.shape

(215842, 12)

# the top five words by from/to status by character length

In [9]:
# ranks for words
wg_df['n_from_rank'] = wg_df[['n_chars', 'n_from_words']].groupby(['n_chars']).rank(method = 'first', ascending=False)
wg_df['n_to_rank'] = wg_df[['n_chars', 'n_to_words']].groupby(['n_chars']).rank(method = 'first',ascending=False)

# melt to get the word count
wc_df = pd.melt(frame = wg_df, id_vars = ['word_id', 'word_group_id', 'lcase', 'n_chars'],
                  value_vars = ['n_from_words','n_to_words'],
                  var_name = 'direction', value_name = 'n_words')
				  
# word count				  
recode_dict = {'n_from_words':'from',
               'n_to_words':'to'}

wc_df['direction'] = wc_df['direction'].map(recode_dict)			   
				  
# melt to get the ranks
rank_df = pd.melt(frame = wg_df, id_vars = ['word_id', 'word_group_id', 'lcase', 'n_chars'],
                  value_vars = ['n_from_rank','n_to_rank'],
                  var_name = 'direction', value_name = 'word_rank')			  
				  
recode_dict = {'n_from_rank':'from',
               'n_to_rank':'to'}

rank_df['direction'] = rank_df['direction'].map(recode_dict)		   
				  				  
# overwrite the wg_df by joining
wg_df = pd.merge(left = wc_df, right = rank_df)				  

In [10]:
wg_df.head()

Unnamed: 0,word_id,word_group_id,lcase,n_chars,direction,n_words,word_rank
0,0,0,a,1,from,144511,3.0
1,1,1,aa,2,from,45451,30.0
2,2,2,aal,3,from,24276,110.0
3,3,3,aalii,5,from,5358,757.0
4,4,4,aam,3,from,14107,284.0


In [11]:
# select the top 5
curr_wg_df = wg_df.loc[wg_df['word_rank'] <= 5, :].copy()

In [12]:
curr_wg_df.shape

(240, 7)

In [13]:
# count how many words are in each word group
word_group_id_counter = Counter(word_df['word_group_id'])

In [14]:
curr_wg_df['word_group_size'] = curr_wg_df['word_group_id'].map(word_group_id_counter)

In [15]:
curr_wg_df.head()

Unnamed: 0,word_id,word_group_id,lcase,n_chars,direction,n_words,word_rank,word_group_size
0,0,0,a,1,from,144511,3.0,1
1194,1212,1194,acetoin,7,from,5188,3.0,5
2024,2054,2024,actioner,8,from,3206,1.0,5
2989,3028,2989,ae,2,from,91948,3.0,2
3092,3133,3092,aer,3,from,55024,1.0,5


In [None]:
# sort, select columns, and save to disk
# this is the data that populates the grid
curr_wg_df['word_rank'] = curr_wg_df['word_rank'].round(0).astype(int)

curr_wg_df = curr_wg_df.sort_values(by = ['n_chars', 'lcase'])
temp_output_df = curr_wg_df[['lcase', 'n_chars', 'direction', 'n_words', 'word_rank', 'word_group_size']]

temp_output_df.columns = ['word', 'number of characters','direction', 'number of words', 'word rank', 'word group size']
temp_output_df.to_json('../media/finding_anagrams/words/words.json',orient='records')

# generate the list of words for each word group

In [17]:
curr_wg_df.shape

(240, 8)

In [18]:
curr_wg_df.head()

Unnamed: 0,word_id,word_group_id,lcase,n_chars,direction,n_words,word_rank,word_group_size
0,0,0,a,1,from,144511,3,1
215842,0,0,a,1,to,1,1,1
232355,16974,16513,b,1,to,1,2,1
242718,27937,26876,c,1,to,1,3,1
261526,47720,45684,d,1,to,1,4,1


In [19]:
# create the from word id df
sql = 'select word_group_id, word_id, lcase, n_chars from words;'
fw_df = query_db(sql = sql, db_path=rc.DB_PATH, db_name=rc.DB_NAME)
fw_df.columns = ['from_word_group_id', 'from_word_id', 'from_word', 'from_n_chars']

# create the to word id df
sql = 'select word_group_id, word_id, lcase, n_chars from words;'
tw_df = query_db(sql = sql, db_path=rc.DB_PATH, db_name=rc.DB_NAME)
tw_df.columns = ['to_word_group_id', 'to_word_id', 'to_word', 'to_n_chars']

...query execution took: 0.79 seconds...
...query execution took: 0.8 seconds...


In [None]:
# output path to save the generated word lists
word_list_output_path = '../media/finding_anagrams/wordlists'

In [23]:
test_word = 'formaldehydesulphoxylate'
wg_id = word_df.loc[word_df['lcase'] == test_word, 'word_group_id'].iloc[0]
wg_id

69032

In [24]:
pw_df = build_list_of_parent_words(word_group_id=wg_id, db_path=rc.DB_PATH, db_name=rc.DB_NAME)
print(pw_df.shape)
cw_df = build_list_of_child_words(word_group_id=wg_id, db_path=rc.DB_PATH, db_name=rc.DB_NAME)
print(cw_df.shape)

...query execution took: 0.0 seconds...
...query execution took: 0.67 seconds...
(1, 6)
...query execution took: 0.08 seconds...
...query execution took: 0.61 seconds...
(8214, 4)
      to_word_id  to_word_group_id to_word  from_word_group_id
8209      233399            214926    yule               69032
8210      233402            214929    yuma               69032
8211      233412            214938    yurt               69032
8212      233413            214939   yurta               69032
8213      233421            214947     yus               69032
(8214, 6)


In [25]:
pw_df.head()

Unnamed: 0,from_word_id,to_word_id,from_word_group_id,to_word_group_id,from_word,to_word
0,72186,72186,69032,69032,formaldehydesulphoxylate,formaldehydesulphoxylate


In [26]:
pw_df.shape

(1, 6)

In [27]:
cw_df.shape

(8214, 6)

In [28]:
# counters
to_word_counter = Counter()
from_word_counter = Counter()
for i_c, row in curr_wg_df.iterrows():
    
    # the current word
    curr_word_group_id = row['word_group_id']
    curr_word = word_group_dict[curr_word_group_id]        

    if row['direction'] == 'from':
        print('####', curr_word, 'FROM WORDS')
        
        pw_df = build_list_of_parent_words(word_group_id=curr_word_group_id,
                                          db_path = rc.DB_PATH, db_name = rc.DB_NAME)               
                
        # save it
        format_and_save_words_json(df = pw_df, r_direction='from',
                                   curr_word=curr_word, output_path=word_list_output_path)
        

        # distinct to words
        to_word_counter.update(pw_df['from_word_id'])


    if row['direction'] == 'to':
        print('####', curr_word, 'TO WORDS')        
        
        cw_df = build_list_of_child_words(word_group_id=curr_word_group_id,
                                          db_path = rc.DB_PATH, db_name = rc.DB_NAME)        
        
        # save it
        format_and_save_words_json(df = cw_df, r_direction='to',
                                   curr_word=curr_word, output_path=word_list_output_path)
        
        # distinct from words
        from_word_counter.update(cw_df['to_word_id'])

    


#### a FROM WORDS
...query execution took: 0.31 seconds...
...query execution took: 0.67 seconds...
#### a TO WORDS
...query execution took: 0.0 seconds...
...query execution took: 0.63 seconds...
(1, 4)
   to_word_id  to_word_group_id to_word  from_word_group_id
0           0                 0       a                   0
#### b TO WORDS
...query execution took: 0.0 seconds...
...query execution took: 0.58 seconds...
(1, 4)
   to_word_id  to_word_group_id to_word  from_word_group_id
0       16974             16513       b               16513
#### c TO WORDS
...query execution took: 0.0 seconds...
...query execution took: 0.62 seconds...
(1, 4)
   to_word_id  to_word_group_id to_word  from_word_group_id
0       27937             26876       c               26876
#### d TO WORDS
...query execution took: 0.0 seconds...
...query execution took: 0.61 seconds...
(1, 4)
   to_word_id  to_word_group_id to_word  from_word_group_id
0       47720             45684       d               45684
####

In [29]:
wg_df.head()

Unnamed: 0,word_id,word_group_id,lcase,n_chars,direction,n_words,word_rank
0,0,0,a,1,from,144511,3.0
1,1,1,aa,2,from,45451,30.0
2,2,2,aal,3,from,24276,110.0
3,3,3,aalii,5,from,5358,757.0
4,4,4,aam,3,from,14107,284.0


In [30]:
word_df.head()

Unnamed: 0,word,lcase,n_chars,first_letter,word_id,word_group_id,letter_group,letter_group_ranked,n_from_words,n_to_words,n_from_word_groups,n_to_word_groups
0,A,a,1,a,0,0,a,a,144511,1,133001,1
1,aa,aa,2,a,1,1,a,a,45451,2,43229,2
2,aal,aal,3,a,2,2,al,la,24276,7,23271,5
3,aalii,aalii,5,a,3,3,ail,lai,5358,15,5263,11
4,aam,aam,3,a,4,4,am,ma,14107,7,13497,5


In [31]:
# the letters in the top five from words by word length can be found in:
print(len(from_word_counter)) 
# which is:
print(len(from_word_counter) / word_df.shape[0])
# of words



101632
0.4336391176345095


In [None]:
# the letters in the top five to words by word length can be rearraged to spell:
print(len(to_word_counter))
# which is:
print(len(to_word_counter) / word_df.shape[0])
# of words


233233
0.9951486965055254


In [33]:
# what letters are represented?
from_letter_counter = Counter()
to_letter_counter = Counter()


In [34]:
for ir, row in curr_wg_df.iterrows():
    if row['direction'] == 'from':
        from_letter_counter.update(row['lcase'])
    if row['direction'] == 'to':
        to_letter_counter.update(row['lcase'])

In [35]:
from string import ascii_lowercase

In [36]:
from_letter_counter

Counter({'o': 209,
         'a': 142,
         'i': 140,
         't': 139,
         'e': 117,
         'r': 104,
         'c': 92,
         'n': 91,
         'l': 86,
         'h': 85,
         'p': 76,
         's': 52,
         'm': 40,
         'g': 37,
         'y': 35,
         'd': 23,
         'u': 14,
         'b': 7,
         'f': 4,
         'z': 2,
         'x': 2,
         'q': 1,
         'j': 1,
         'v': 1})

In [37]:
set(ascii_lowercase).difference(from_letter_counter.keys())

{'k', 'w'}

In [38]:
curr_wg_df.shape

(240, 8)

In [39]:
set(ascii_lowercase).difference(to_letter_counter.keys())

{'q', 'w'}

In [40]:
to_letter_counter

Counter({'a': 159,
         'e': 150,
         'i': 126,
         't': 122,
         'o': 119,
         'r': 112,
         's': 104,
         'l': 97,
         'n': 90,
         'c': 79,
         'p': 78,
         'u': 53,
         'h': 51,
         'm': 45,
         'd': 32,
         'g': 28,
         'y': 26,
         'b': 18,
         'v': 3,
         'f': 3,
         'z': 2,
         'k': 1,
         'j': 1,
         'x': 1})

In [1]:
157437 / 234370

0.6717455305713188