In [1]:
# mike babb
# 2025 03 12
# get the definitions of the top 5 word groups by character length

In [2]:
from collections import Counter
import os
import sqlite3

In [3]:
import duckdb
import json
from lxml import etree
import pandas as pd
import pprint
import requests

In [4]:
import _run_constants as rc
from part_00_file_db_utils import query_db, write_data_to_sqlite

# Libraries

In [None]:
db_path_name = os.path.join(rc.db_path, rc.db_name)

In [None]:
cuts = list(range(0, 217001, 1000))

In [None]:
# the from word id df
sql = 'select word_group_id, word_id from words;'
fw_df = query_db(sql = sql, db_path=rc.db_path, db_name=rc.db_name)
fw_df.columns = ['from_word_group_id', 'from_word_id']

# the to word id df
sql = 'select word_group_id, word_id from words;'
tw_df = query_db(sql = sql, db_path=rc.db_path, db_name=rc.db_name)
tw_df.columns = ['to_word_group_id', 'to_word_id']

In [None]:
# counters
to_word_counter = Counter()
from_word_counter = Counter()
for i_c, ii in enumerate(cuts[:-1]):
    print('#### TO WORDS')
    # to words
    sql = f'select from_word_group_id, to_word_group_id from anagram_groups where to_word_group_id >= {ii} and to_word_group_id < {cuts[i_c+ 1]};'
    print(sql)
    wg_df = query_db(sql = sql, db_path=rc.db_path, db_name=rc.db_name)
    print(wg_df.shape)
    wg_df = pd.merge(left = wg_df, right = fw_df)
    wg_df = pd.merge(left = wg_df, right = tw_df)
    print(wg_df.shape)

    # count - and this is where it gets counter intuitive. The to word groups are counts of from words
    from_word_counter.update(wg_df['to_word_id'])

    print('#### FROM WORDS')
    # from words
    sql = f'select from_word_group_id, to_word_group_id from anagram_groups where from_word_group_id >= {ii} and from_word_group_id < {cuts[i_c+ 1]};'
    print(sql)
    wg_df = query_db(sql = sql, db_path=rc.db_path, db_name=rc.db_name)
    print(wg_df.shape)
    wg_df = pd.merge(left = wg_df, right = fw_df)
    wg_df = pd.merge(left = wg_df, right = tw_df)
    print(wg_df.shape)

    # aggregate by to word
    to_word_counter.update(wg_df['from_word_id'])



In [None]:
sql = 'select * from words;'
word_df = query_db(sql = sql, db_path=rc.db_path, db_name=rc.db_name)

In [None]:
word_df.head()

In [None]:
word_df['n_from_words'] = word_df['word_id'].map(from_word_counter)
word_df['n_to_words'] = word_df['word_id'].map(to_word_counter)

In [None]:
write_data_to_sqlite(df = word_df, table_name='word_counts', db_path=rc.db_path, db_name=rc.db_name)


In [None]:
# now, let's format for export to json

In [None]:
word_df.shape

In [None]:
word_df.head()

In [None]:
word_df['n_from_rank'] = word_df[['n_chars', 'n_from_words']].groupby(['n_chars']).rank(method = 'first', ascending=False)

In [None]:
word_df['n_to_rank'] = word_df[['n_chars', 'n_to_words']].groupby(['n_chars']).rank(method = 'first',ascending=False)

In [None]:
# melt to get the word count
wc_df = pd.melt(frame = word_df, id_vars = ['word_id', 'lcase', 'n_chars'],
                  value_vars = ['n_from_words','n_to_words'],
                  var_name = 'direction', value_name = 'n_words')

In [None]:
wc_df.shape

In [None]:
wc_df['direction'].unique().tolist()

In [None]:
recode_dict = {'n_from_words':'from',
               'n_to_words':'to'}

In [None]:
wc_df['direction'] = wc_df['direction'].map(recode_dict)

In [None]:
# melt to get the ranks
rank_df = pd.melt(frame = word_df, id_vars = ['word_id', 'lcase', 'n_chars'],
                  value_vars = ['n_from_rank','n_to_rank'],
                  var_name = 'direction', value_name = 'word_rank')

In [None]:
recode_dict = {'n_from_rank':'from',
               'n_to_rank':'to'}

In [None]:
rank_df['direction'] = rank_df['direction'].map(recode_dict)

In [None]:
wc_df.head()

In [None]:
# join
wg_df = pd.merge(left = wc_df, right = rank_df)

In [None]:
wg_df.head()

In [None]:
wg_df.shape

In [None]:
wg_df['word_rank'].value_counts()

In [None]:
wg_df = wg_df.loc[wg_df['word_rank'] <= 5, :]

In [None]:
wg_df.shape

In [None]:
# now, 
def remove_html_tags(text):
    parser = etree.HTMLParser()
    tree = etree.fromstring(text, parser)
    return ''.join(tree.itertext())

def get_definitions(word):
    print(word)
    url = f"https://en.wiktionary.org/api/rest_v1/page/definition/{word}"
    response = requests.get(url)
    data = response.json()
    lang_keys = list(data.keys())
    if 'en' in data:
        def_list = data['en'][0]['definitions']
       
        clean_string = ''
        for ii in range(0, len(def_list)):
            curr_def = def_list[ii]['definition']     
            print('here')  
            print(curr_def)
            
            clean_string = clean_string + ' ' + curr_def 
    else:
        clean_string = 'not an english word'
    return (lang_keys, clean_string)

In [None]:
#outcome =  wg_df['lcase'].map(get_definitions)

In [None]:
#wg_df['lang_keys'] = outcome.map(lambda x: x[0])

In [None]:
#wg_df['word_definition'] = outcome.map(lambda x: x[1])

In [None]:
#wg_df.head()

In [None]:
#wg_df['word_definition_clean'] = wg_df['word_definition'].map(remove_html_tags)

In [None]:
wg_df.tail()

In [None]:
# wg_df['word_definition'].value_counts()

In [None]:
# export the words, chars, direction, and count to a javascript file for later use

In [None]:
wg_df.head()

In [None]:
# write out the words
temp_output_df = wg_df[['lcase', 'n_chars', 'direction', 'n_words', 'word_rank']]
temp_output_df.columns = ['word', 'number of characters','direction', 'number of words', 'word rank']
temp_output_df.to_json('../webpage/words.json',orient='records')

# WORD COUNTS

In [5]:
# join in word_group_id
sql = 'select * from word_counts;'
word_df = query_db(sql = sql, db_path=rc.db_path, db_name=rc.db_name)

...query execution took: 1.82 seconds...


In [6]:
word_df.head()

Unnamed: 0,word,lcase,n_chars,first_letter,word_id,word_group_id,letter_group,letter_group_ranked,n_from_words,n_to_words
0,A,a,1,a,0,0,a,a,144511,1
1,aa,aa,2,a,1,1,a,a,45451,2
2,aal,aal,3,a,2,2,al,la,24276,7
3,aalii,aalii,5,a,3,3,ail,lai,5358,15
4,aam,aam,3,a,4,4,am,ma,14107,7


In [7]:
word_df['n_from_words'].sum()

123758418

In [8]:
word_df['n_to_words'].sum()

123758418

In [9]:
# what about the actual rows when storing this by word_group
# join in word_group_id
sql = 'select * from words_me_01;'
word_group_df = query_db(sql = sql, db_path=rc.db_path, db_name=rc.db_name)

...query execution took: 0.96 seconds...


In [10]:
word_group_df['n_from_word_groups'].sum()

73218235

In [11]:
word_group_df['n_to_word_groups'].sum()

73179245

In [12]:
# those numbers should be the same...

In [13]:
# turn this into the word_group_df
wg_df = word_df.drop_duplicates(subset=['word_group_id']).copy()


In [14]:
wg_df.head()

Unnamed: 0,word,lcase,n_chars,first_letter,word_id,word_group_id,letter_group,letter_group_ranked,n_from_words,n_to_words
0,A,a,1,a,0,0,a,a,144511,1
1,aa,aa,2,a,1,1,a,a,45451,2
2,aal,aal,3,a,2,2,al,la,24276,7
3,aalii,aalii,5,a,3,3,ail,lai,5358,15
4,aam,aam,3,a,4,4,am,ma,14107,7


In [15]:
wg_df.shape

(215842, 10)

In [16]:
# rank
wg_df['n_from_rank'] = wg_df[['n_chars', 'n_from_words']].groupby(['n_chars']).rank(method = 'first', ascending=False)
wg_df['n_to_rank'] = wg_df[['n_chars', 'n_to_words']].groupby(['n_chars']).rank(method = 'first',ascending=False)

# melt to get the word count
wc_df = pd.melt(frame = wg_df, id_vars = ['word_id', 'word_group_id', 'lcase', 'n_chars'],
                  value_vars = ['n_from_words','n_to_words'],
                  var_name = 'direction', value_name = 'n_words')
				  
				  
recode_dict = {'n_from_words':'from',
               'n_to_words':'to'}


wc_df['direction'] = wc_df['direction'].map(recode_dict)			   
				  
				  
# melt to get the ranks
rank_df = pd.melt(frame = wg_df, id_vars = ['word_id', 'word_group_id', 'lcase', 'n_chars'],
                  value_vars = ['n_from_rank','n_to_rank'],
                  var_name = 'direction', value_name = 'word_rank')			  
				  
				  
recode_dict = {'n_from_rank':'from',
               'n_to_rank':'to'}

rank_df['direction'] = rank_df['direction'].map(recode_dict)		   
				  
				  				  
# join
wg_df = pd.merge(left = wc_df, right = rank_df)				  

In [17]:
wg_df.head()

Unnamed: 0,word_id,word_group_id,lcase,n_chars,direction,n_words,word_rank
0,0,0,a,1,from,144511,3.0
1,1,1,aa,2,from,45451,30.0
2,2,2,aal,3,from,24276,110.0
3,3,3,aalii,5,from,5358,757.0
4,4,4,aam,3,from,14107,284.0


In [18]:
curr_wg_df = wg_df.loc[wg_df['word_rank'] <= 5, :].copy()

In [19]:
curr_wg_df.shape

(240, 7)

In [20]:
word_group_id_counter = Counter(word_df['word_group_id'])

In [21]:
curr_wg_df['word_group_size'] = curr_wg_df['word_group_id'].map(word_group_id_counter)

In [22]:
curr_wg_df.head()

Unnamed: 0,word_id,word_group_id,lcase,n_chars,direction,n_words,word_rank,word_group_size
0,0,0,a,1,from,144511,3.0,1
1194,1212,1194,acetoin,7,from,5188,3.0,5
2024,2054,2024,actioner,8,from,3206,1.0,5
2989,3028,2989,ae,2,from,91948,3.0,2
3092,3133,3092,aer,3,from,55024,1.0,5


In [23]:
curr_wg_df = curr_wg_df.sort_values(by = ['n_chars', 'lcase'])
temp_output_df = curr_wg_df[['lcase', 'n_chars', 'direction', 'n_words', 'word_rank', 'word_group_size']]

temp_output_df.columns = ['word', 'number of characters','direction', 'number of words', 'word rank', 'word group size']
temp_output_df.to_json('../webpage/word_groups.json',orient='records')

In [24]:
curr_wg_df.shape

(240, 8)

In [25]:
curr_wg_df.head()

Unnamed: 0,word_id,word_group_id,lcase,n_chars,direction,n_words,word_rank,word_group_size
0,0,0,a,1,from,144511,3.0,1
215842,0,0,a,1,to,1,1.0,1
232355,16974,16513,b,1,to,1,2.0,1
242718,27937,26876,c,1,to,1,3.0,1
261526,47720,45684,d,1,to,1,4.0,1


In [26]:
# the from word id df
sql = 'select word_group_id, word_id, lcase, n_chars from words;'
fw_df = query_db(sql = sql, db_path=rc.db_path, db_name=rc.db_name)
fw_df.columns = ['from_word_group_id', 'from_word_id', 'from_word', 'from_n_chars']

# the to word id df
sql = 'select word_group_id, word_id, lcase, n_chars from words;'
tw_df = query_db(sql = sql, db_path=rc.db_path, db_name=rc.db_name)
tw_df.columns = ['to_word_group_id', 'to_word_id', 'to_word', 'to_n_chars']

...query execution took: 0.97 seconds...
...query execution took: 0.83 seconds...


In [27]:
tw_df.head()

Unnamed: 0,to_word_group_id,to_word_id,to_word,to_n_chars
0,0,0,a,1
1,1,1,aa,2
2,2,2,aal,3
3,3,3,aalii,5
4,4,4,aam,3


In [60]:
# output path
word_list_output_path = '../webpage/wordlists'

In [61]:
def save_words(df:pd.DataFrame, r_direction:str, curr_word:str):
    if r_direction == 'from':
        r_direction_opp = 'to'
    else:
        r_direction_opp = 'from'
    # the output file and name
    output_file_name = f'{r_direction}_{curr_word}.txt'
    ofpn = os.path.join(word_list_output_path, output_file_name)
    
    with open(file = ofpn, mode = 'w') as my_file:
        if df.empty:            
            my_file.write(f"## THERE ARE NO {r_direction_opp} WORDS FOR {curr_word} ## \n")
        else:

            # number of characters to iterate through
            nchar_list = sorted(df[f'{r_direction_opp}_n_chars'].unique().tolist(), reverse=True)
        
            for nc in nchar_list:
                my_file.write(f"##########\n")
                my_file.write(f"## WORDS OF LENGTH {nc} ## \n")
                my_file.write(f"##########\n")
                # subset by 
                temp_df = df.loc[df[f'{r_direction_opp}_n_chars' ]==nc, [f'{r_direction_opp}_word']].copy()
                temp_df = temp_df.drop_duplicates().sort_values(by = f'{r_direction_opp}_word')
                for fw in temp_df[f'{r_direction_opp}_word'].tolist():
                    my_file.write(fw + '  \n')

In [210]:
def save_words_json(df:pd.DataFrame, r_direction:str, curr_word:str):
    
    
    # the output file and name
    output_file_name = f'{curr_word}.json'
    ofpn = os.path.join(word_list_output_path, r_direction, output_file_name)
    
    with open(file = ofpn, mode = 'w') as my_file:
        
        if df.empty:            
            output_dict = {'word':curr_word,
                       'number_of_words':0,
                       'relatedWords':[]}        
            json.dump(obj = output_dict, fp =  my_file, indent=4)            
        else:
            df[f'{r_direction}_n_chars'] = df[f'{r_direction}_word'].str.len()
            col_names = [f'{r_direction}_n_chars', f'{r_direction}_word']            
            temp_df = df.sort_values(by=col_names, ascending=[False, True])
            word_list = temp_df[f'{r_direction}_word'].tolist()
            if curr_word in word_list:
                word_list.remove(curr_word)

            output_dict = {'word':curr_word,
                           'number_of_words':temp_df.shape[0],
                           'relatedWords':word_list}        
            json.dump(obj = output_dict, fp =  my_file, indent=4)            
        
            

In [211]:
col_names = ['word_group_id', 'lcase']
temp_word_df = word_df[col_names].sort_values(by = col_names).drop_duplicates(subset = 'word_group_id')

In [212]:
# dictionary of words!
word_group_dict = {wg_id:word for wg_id, word in zip(temp_word_df['word_group_id'],
                                                     temp_word_df['lcase'])}

In [213]:
def build_list_of_parent_words(word_group_id:int, db_path:str, db_name:str):

    # build the list of parent words
    sql = f'select from_word_group_id, to_word_group_id from anagram_groups where to_word_group_id = {word_group_id};'
    
    pwg_df = query_db(sql = sql, db_path=db_path, db_name=db_name)    

    # now, get the word list
    sql = 'select word_id as from_word_id, word_group_id as from_word_group_id, lcase as from_word from words;'
    word_df = query_db(sql = sql, db_path=db_path, db_name=db_name)
    pw_df = pd.merge(left = word_df, right = pwg_df)       
    
    # let's add information to highlight the focal word
    col_names = ['from_word_group_id', 'from_word_id', 'from_word']
    
    id_df = pw_df.loc[pw_df['from_word_group_id'] == word_group_id, col_names].copy()

    id_df.columns = ['to_word_group_id', 'to_word_id', 'to_word']
    pw_df = pd.merge(left = pw_df, right = id_df)

    col_names = ['from_word_id', 'to_word_id',
                 'from_word_group_id','to_word_group_id',
                 'from_word', 'to_word']
    
    pw_df = pw_df[col_names].drop_duplicates(subset=['from_word_id', 'from_word_group_id', 'from_word'])
    
    return pw_df

def build_list_of_child_words(word_group_id:int, db_path:str, db_name:str):

    # build the list of parent words
    sql = f'select from_word_group_id, to_word_group_id from anagram_groups where from_word_group_id = {word_group_id};'
    
    cwg_df = query_db(sql = sql, db_path=db_path, db_name=db_name)

    # now, get the word list
    sql = 'select word_id as to_word_id, word_group_id as to_word_group_id, lcase as to_word from words;'
    word_df = query_db(sql = sql, db_path=db_path, db_name=db_name)

    cw_df = pd.merge(left = word_df, right = cwg_df)
        
    # let's add information to highlight the focal word
    col_names = ['to_word_group_id', 'to_word_id', 'to_word']

    id_df = cw_df.loc[cw_df['to_word_group_id'] == word_group_id, col_names].copy()
    
    id_df.columns = ['from_word_group_id', 'from_word_id', 'from_word']
    cw_df = pd.merge(left = cw_df, right = id_df)
    
    col_names = ['from_word_id', 'to_word_id',
                 'from_word_group_id','to_word_group_id',
                 'from_word', 'to_word']
    
    cw_df = cw_df[col_names].drop_duplicates(subset=['to_word_id', 'to_word_group_id', 'to_word'])
    

    return cw_df

In [214]:
wg_id = 60945
pw_df = build_list_of_parent_words(word_group_id=wg_id, db_path=rc.db_path, db_name=rc.db_name)
cw_df = build_list_of_child_words(word_group_id=wg_id, db_path=rc.db_path, db_name=rc.db_name)

...query execution took: 0.23 seconds...
...query execution took: 0.69 seconds...
...query execution took: 0.0 seconds...
...query execution took: 0.78 seconds...


In [215]:
curr_wg_df.head()

Unnamed: 0,word_id,word_group_id,lcase,n_chars,direction,n_words,word_rank,word_group_size
0,0,0,a,1,from,144511,3.0,1
215842,0,0,a,1,to,1,1.0,1
232355,16974,16513,b,1,to,1,2.0,1
242718,27937,26876,c,1,to,1,3.0,1
261526,47720,45684,d,1,to,1,4.0,1


In [216]:
# counters
to_word_counter = Counter()
from_word_counter = Counter()
#test_wg_df = curr_wg_df.iloc]
for i_c, row in curr_wg_df.iterrows():
    

    # the current word
    curr_word_group_id = row['word_group_id']
    curr_word = word_group_dict[curr_word_group_id]    
    print(curr_word)

    if row['direction'] == 'to':
        print('#### TO WORDS')        
        
        cw_df = build_list_of_child_words(word_group_id=curr_word_group_id,
                                          db_path = rc.db_path, db_name = rc.db_name)        
        
        # save it
        save_words_json(df = cw_df, r_direction='to', curr_word=curr_word)
        
        # distinct from words
        #from_word_counter.update(wg_df['from_word_id'])

    if row['direction'] == 'from':
        print('#### FROM WORDS')
        
        pw_df = build_list_of_parent_words(word_group_id=curr_word_group_id,
                                          db_path = rc.db_path, db_name = rc.db_name)        
        
                
        # save it
        save_words_json(df = pw_df, r_direction='from', curr_word=curr_word)
        

        # distinct to words
        #to_word_counter.update(wg_df['to_word_id'])



#### FROM WORDS
...query execution took: 0.43 seconds...
...query execution took: 0.72 seconds...
#### TO WORDS
...query execution took: 0.0 seconds...
...query execution took: 0.79 seconds...
#### TO WORDS
...query execution took: 0.0 seconds...
...query execution took: 0.73 seconds...
#### TO WORDS
...query execution took: 0.0 seconds...
...query execution took: 0.78 seconds...
#### TO WORDS
...query execution took: 0.0 seconds...
...query execution took: 0.73 seconds...
#### FROM WORDS
...query execution took: 0.38 seconds...
...query execution took: 0.88 seconds...
#### TO WORDS
...query execution took: 0.0 seconds...
...query execution took: 0.74 seconds...
#### FROM WORDS
...query execution took: 0.35 seconds...
...query execution took: 0.79 seconds...
#### FROM WORDS
...query execution took: 0.32 seconds...
...query execution took: 0.89 seconds...
#### FROM WORDS
...query execution took: 0.39 seconds...
...query execution took: 0.79 seconds...
#### TO WORDS
...query execution to

In [114]:
wg_df.head()

Unnamed: 0,from_word_group_id,to_word_group_id,from_word_id,from_word,from_n_chars,to_word_id,to_word,to_n_chars


In [84]:
word_df.head()

Unnamed: 0,word,lcase,n_chars,first_letter,word_id,word_group_id,letter_group,letter_group_ranked,n_from_words,n_to_words
0,A,a,1,a,0,0,a,a,144511,1
1,aa,aa,2,a,1,1,a,a,45451,2
2,aal,aal,3,a,2,2,al,la,24276,7
3,aalii,aalii,5,a,3,3,ail,lai,5358,15
4,aam,aam,3,a,4,4,am,ma,14107,7


In [None]:
# the letters in the top five words by word length can be found in:
print(len(from_word_counter)) 
# which is:
print(len(from_word_counter) / word_df.shape[0])
# of words



In [None]:
# the letters in the top five words by word length can be rearraged to spell:
print(len(to_word_counter))
# which is:
print(len(to_word_counter) / word_df.shape[0])
# of words


In [None]:
# what letters are represented?
from_letter_counter = Counter()
to_letter_counter = Counter()


In [None]:
for ir, row in curr_wg_df.iterrows():
    if row['direction'] == 'from':
        from_letter_counter.update(row['lcase'])
    if row['direction'] == 'to':
        to_letter_counter.update(row['lcase'])

In [None]:
from string import ascii_lowercase

In [None]:
from_letter_counter

In [None]:
set(ascii_lowercase).difference(from_letter_counter.keys())

In [None]:
curr_wg_df.shape

In [None]:
set(ascii_lowercase).difference(to_letter_counter.keys())

In [None]:
to_letter_counter