# Mike Babb
# babb.mike@outlook.com
# Find anagrams
## Get the parent / child words of the top 5 word groups by character length

In [None]:
# standard libraries
from collections import Counter
import os
import sqlite3

In [None]:
# external
import json
import pandas as pd

In [None]:
# custom
import _run_constants as rc
from part_00_file_db_utils import query_db
from part_00_process_functions import *

# WORD COUNTS BY WORD GROUP

In [None]:
# load every row of the word_counts table into a DataFrame;
# later cells read word_group_id, lcase, n_chars and the from/to word
# counts from it
sql = 'select * from word_counts;'
word_df = query_db(
    sql=sql,
    db_path=rc.db_path,
    db_name=rc.db_name,
)

In [None]:
word_df.head()

In [None]:
# collapse word_df to one representative row per word group: after sorting
# by group id and then by word, the first row kept for each group is the
# word that comes first in lcase sort order
col_names = ['word_group_id', 'lcase']
wg_df = (
    word_df
    .sort_values(by=col_names)
    .drop_duplicates(subset='word_group_id')
    .copy()
)

# lookup table: word_group_id -> representative word of that group
word_group_dict = dict(zip(wg_df['word_group_id'], wg_df['lcase']))



In [None]:
wg_df.head()

In [None]:
wg_df.shape

# the top five words by from/to status by character length

In [None]:
# rank every word group within its character length, once by the number of
# from-words and once by the number of to-words; rank 1 is the largest count
# and method='first' breaks ties by row order so ranks are unique per length
wg_df['n_from_rank'] = (
    wg_df.groupby('n_chars')['n_from_words']
    .rank(method='first', ascending=False)
)
wg_df['n_to_rank'] = (
    wg_df.groupby('n_chars')['n_to_words']
    .rank(method='first', ascending=False)
)

# columns that identify a row in both long-format tables built below
id_cols = ['word_id', 'word_group_id', 'lcase', 'n_chars']

# long format of the word counts: one row per (word, direction)
wc_df = wg_df.melt(
    id_vars=id_cols,
    value_vars=['n_from_words', 'n_to_words'],
    var_name='direction',
    value_name='n_words',
)

# replace the melted column names with the short direction labels
recode_dict = {
    'n_from_words': 'from',
    'n_to_words': 'to',
}
wc_df['direction'] = wc_df['direction'].map(recode_dict)

# long format of the ranks: one row per (word, direction)
rank_df = wg_df.melt(
    id_vars=id_cols,
    value_vars=['n_from_rank', 'n_to_rank'],
    var_name='direction',
    value_name='word_rank',
)

recode_dict = {
    'n_from_rank': 'from',
    'n_to_rank': 'to',
}
rank_df['direction'] = rank_df['direction'].map(recode_dict)

# overwrite the wg_df by joining counts and ranks on their shared columns
# (the id columns plus direction)
wg_df = wc_df.merge(rank_df)

In [None]:
wg_df.head()

In [None]:
# keep only the rows whose word_rank is 5 or better; .copy() so that later
# column assignments do not touch wg_df
is_top_five = wg_df['word_rank'] <= 5
curr_wg_df = wg_df.loc[is_top_five].copy()

In [None]:
curr_wg_df.shape

In [None]:
# count how many words are in each word group:
# maps word_group_id -> number of rows of word_df carrying that id
word_group_id_counter = Counter(word_df['word_group_id'])

In [None]:
# attach the size of each row's word group, looked up by word_group_id
curr_wg_df['word_group_size'] = curr_wg_df['word_group_id'].map(word_group_id_counter)

In [None]:
curr_wg_df.head()

In [None]:
# sort, select columns, and save to disk
# this is the data that populates the grid

# store the ranks as whole numbers
curr_wg_df['word_rank'] = curr_wg_df['word_rank'].round(0).astype(int)

# order by character length, then by word
curr_wg_df = curr_wg_df.sort_values(by=['n_chars', 'lcase'])

# source column -> label written to the exported records
# (dict order is the column order of the output)
output_labels = {
    'lcase': 'word',
    'n_chars': 'number of characters',
    'direction': 'direction',
    'n_words': 'number of words',
    'word_rank': 'word rank',
    'word_group_size': 'word group size',
}
temp_output_df = curr_wg_df[list(output_labels)].rename(columns=output_labels)

# one JSON object per row
temp_output_df.to_json('../webpage/word_groups.json', orient='records')

# generate the list of words for each word group

In [None]:
curr_wg_df.shape

In [None]:
curr_wg_df.head()

In [None]:
# the from-word and to-word lookup tables hold the same rows of the words
# table and differ only in their column prefix, so the table is queried once
# and copied instead of running the identical query twice
sql = 'select word_group_id, word_id, lcase, n_chars from words;'

# create the from word id df
fw_df = query_db(sql=sql, db_path=rc.db_path, db_name=rc.db_name)

# create the to word id df (independent copy, taken before renaming)
tw_df = fw_df.copy()

fw_df.columns = ['from_word_group_id', 'from_word_id', 'from_word', 'from_n_chars']
tw_df.columns = ['to_word_group_id', 'to_word_id', 'to_word', 'to_n_chars']

In [None]:
# output path to save the generated word lists
# (relative path; passed as output_path to format_and_save_words_json below)
word_list_output_path = '../webpage/wordlists'

In [None]:
def build_list_of_parent_words(word_group_id: int, db_path: str, db_name: str):
    """Return the words belonging to the parent groups of a focal word group.

    A parent group is any ``from_word_group_id`` that the ``anagram_groups``
    table pairs with ``to_word_group_id = word_group_id``.

    Parameters
    ----------
    word_group_id : int
        Id of the focal word group; interpolated directly into the SQL text.
    db_path : str
        Passed through to ``query_db``.
    db_name : str
        Passed through to ``query_db``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct parent word with the columns ``from_word_id``,
        ``to_word_id``, ``from_word_group_id``, ``to_word_group_id``,
        ``from_word`` and ``to_word``. The ``to_*`` columns describe the
        focal word. The focal word is looked up among the parent rows, so
        the result is empty when the focal group is not listed as one of its
        own parent groups.
    """
    # parent groups: every from-group paired with the focal group as to-group
    sql = f'select from_word_group_id, to_word_group_id from anagram_groups where to_word_group_id = {word_group_id};'
    parent_groups = query_db(sql=sql, db_path=db_path, db_name=db_name)

    # every word, with its columns already named for the "from" side
    sql = 'select word_id as from_word_id, word_group_id as from_word_group_id, lcase as from_word from words;'
    all_words = query_db(sql=sql, db_path=db_path, db_name=db_name)

    # keep the words of the parent groups (joins on the shared column,
    # from_word_group_id)
    parents = all_words.merge(parent_groups)

    # rows of the focal group itself, relabelled as the "to" side so they
    # can be attached to every parent row
    is_focal = parents['from_word_group_id'] == word_group_id
    focal_cols = ['from_word_group_id', 'from_word_id', 'from_word']
    focal = parents.loc[is_focal, focal_cols].rename(columns={
        'from_word_group_id': 'to_word_group_id',
        'from_word_id': 'to_word_id',
        'from_word': 'to_word',
    })

    # joins on the shared column, to_word_group_id
    parents = parents.merge(focal)

    ordered_cols = ['from_word_id', 'to_word_id',
                    'from_word_group_id', 'to_word_group_id',
                    'from_word', 'to_word']

    # a focal group with several words repeats each parent row once per
    # focal word; keep the first occurrence of every parent word
    return parents[ordered_cols].drop_duplicates(
        subset=['from_word_id', 'from_word_group_id', 'from_word'])


In [None]:


def build_list_of_child_words(word_group_id: int, db_path: str, db_name: str):
    """Return the words belonging to the child groups of a focal word group.

    A child group is any ``to_word_group_id`` that the ``anagram_groups``
    table pairs with ``from_word_group_id = word_group_id``.

    Parameters
    ----------
    word_group_id : int
        Id of the focal word group; interpolated directly into the SQL text.
    db_path : str
        Passed through to ``query_db``.
    db_name : str
        Passed through to ``query_db``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct child word with the columns ``from_word_id``,
        ``to_word_id``, ``from_word_group_id``, ``to_word_group_id``,
        ``from_word`` and ``to_word``. The ``from_*`` columns describe the
        focal word. The focal word is looked up among the child rows, so the
        result is empty when the focal group is not listed as one of its own
        child groups.
    """
    # child groups: every to-group paired with the focal group as from-group
    sql = f'select from_word_group_id, to_word_group_id from anagram_groups where from_word_group_id = {word_group_id};'
    child_groups = query_db(sql=sql, db_path=db_path, db_name=db_name)

    # every word, with its columns already named for the "to" side
    sql = 'select word_id as to_word_id, word_group_id as to_word_group_id, lcase as to_word from words;'
    all_words = query_db(sql=sql, db_path=db_path, db_name=db_name)

    # keep the words of the child groups (joins on the shared column,
    # to_word_group_id)
    children = all_words.merge(child_groups)

    # rows of the focal group itself, relabelled as the "from" side so they
    # can be attached to every child row
    is_focal = children['to_word_group_id'] == word_group_id
    focal_cols = ['to_word_group_id', 'to_word_id', 'to_word']
    focal = children.loc[is_focal, focal_cols].rename(columns={
        'to_word_group_id': 'from_word_group_id',
        'to_word_id': 'from_word_id',
        'to_word': 'from_word',
    })

    # joins on the shared column, from_word_group_id
    children = children.merge(focal)

    ordered_cols = ['from_word_id', 'to_word_id',
                    'from_word_group_id', 'to_word_group_id',
                    'from_word', 'to_word']

    # a focal group with several words repeats each child row once per
    # focal word; keep the first occurrence of every child word
    return children[ordered_cols].drop_duplicates(
        subset=['to_word_id', 'to_word_group_id', 'to_word'])


In [None]:
# spot-check word: look up the id of the word group it belongs to
# (.iloc[0] takes the first matching row of word_df)
test_word = 'formaldehydesulphoxylate'
is_test_word = word_df['lcase'] == test_word
wg_id = word_df.loc[is_test_word, 'word_group_id'].iloc[0]

In [None]:
wg_id

In [None]:
# build both word lists for the spot-check group and report their dimensions
pw_df = build_list_of_parent_words(
    word_group_id=wg_id,
    db_path=rc.db_path,
    db_name=rc.db_name,
)
print(pw_df.shape)

cw_df = build_list_of_child_words(
    word_group_id=wg_id,
    db_path=rc.db_path,
    db_name=rc.db_name,
)
print(cw_df.shape)

In [None]:
pw_df.head()

In [None]:
pw_df.shape

In [None]:
cw_df.shape

In [None]:
# For every top-five (word group, direction) row: build the matching word
# list, save it as JSON, and record the ids of the related words that were
# found. The counters feed the "share of all words" statistics printed
# further down, so they must collect the parent / child word ids. The
# focal word's own id (pw_df['to_word_id'] / cw_df['from_word_id']) would
# only count the focal words themselves.
to_word_counter = Counter()
from_word_counter = Counter()
for i_c, row in curr_wg_df.iterrows():

    # the current word: representative word of the row's word group
    curr_word_group_id = row['word_group_id']
    curr_word = word_group_dict[curr_word_group_id]

    if row['direction'] == 'from':
        print('####', curr_word, 'FROM WORDS')

        # parent words: the from_* columns hold the related words,
        # the to_* columns hold the focal word
        pw_df = build_list_of_parent_words(word_group_id=curr_word_group_id,
                                           db_path=rc.db_path, db_name=rc.db_name)

        # save it
        format_and_save_words_json(df=pw_df, r_direction='from',
                                   curr_word=curr_word, output_path=word_list_output_path)

        # distinct from words (the parents of the focal word)
        from_word_counter.update(pw_df['from_word_id'])

    elif row['direction'] == 'to':
        print('####', curr_word, 'TO WORDS')

        # child words: the to_* columns hold the related words,
        # the from_* columns hold the focal word
        cw_df = build_list_of_child_words(word_group_id=curr_word_group_id,
                                          db_path=rc.db_path, db_name=rc.db_name)

        # save it
        format_and_save_words_json(df=cw_df, r_direction='to',
                                   curr_word=curr_word, output_path=word_list_output_path)

        # distinct to words (the children of the focal word)
        to_word_counter.update(cw_df['to_word_id'])

    


In [None]:
wg_df.head()

In [None]:
word_df.head()

In [None]:
# the letters in the top five from words by word length can be found in
# this many words (number of distinct ids collected in from_word_counter):
print(len(from_word_counter)) 
# which is this share of the rows in word_df:
print(len(from_word_counter) / word_df.shape[0])
# of words



In [None]:
# the letters in the top five words by word length can be rearranged to spell
# this many words (number of distinct ids collected in to_word_counter):
print(len(to_word_counter))
# which is this share of the rows in word_df:
print(len(to_word_counter) / word_df.shape[0])
# of words


In [None]:
# what letters are represented?
# one tally of letters per direction, filled from the top-five words below
from_letter_counter = Counter()
to_letter_counter = Counter()


In [None]:
# add the letters of each top-five word to the tally for its direction
letter_counters = {'from': from_letter_counter, 'to': to_letter_counter}
for direction, word in zip(curr_wg_df['direction'], curr_wg_df['lcase']):
    if direction in letter_counters:
        # updating a Counter with a string counts its individual characters
        letter_counters[direction].update(word)

In [None]:
from string import ascii_lowercase

In [None]:
from_letter_counter

In [None]:
set(ascii_lowercase).difference(from_letter_counter.keys())

In [None]:
curr_wg_df.shape

In [None]:
set(ascii_lowercase).difference(to_letter_counter.keys())

In [None]:
to_letter_counter