# Find anagrams - Part 10
# Create the count of parent / child words by word - not just word group
Mike Babb  
babb.mike@outlook.com

In [1]:
# standard
from collections import Counter
import os
import sqlite3

In [2]:
# external
import pandas as pd

In [3]:
# custom
import _run_constants as rc
from part_00_file_db_utils import query_db, write_data_to_sqlite

# Libraries

In [4]:
db_path_name = os.path.join(rc.DB_PATH, rc.DB_NAME)

In [5]:
# break up the sql calls into batches of 1000 word groups
cuts = list(range(0, 217001, 1000))

In [6]:
# the from word id df
sql = 'select word_group_id, word_id from words;'
fw_df = query_db(sql = sql, db_path=rc.DB_PATH, db_name=rc.DB_NAME)
fw_df.columns = ['from_word_group_id', 'from_word_id']

# the to word id df
sql = 'select word_group_id, word_id from words;'
tw_df = query_db(sql = sql, db_path=rc.DB_PATH, db_name=rc.DB_NAME)
tw_df.columns = ['to_word_group_id', 'to_word_id']

...query execution took: 0.94 seconds...
...query execution took: 0.78 seconds...


In [7]:
# counters
to_word_counter = Counter()
from_word_counter = Counter()
# enumerate the cuts, send to the SQLiteDB, get the data, and aggregate
for i_c, ii in enumerate(cuts[:-1]):
    print('#### TO WORDS')
    # to words
    sql = f'select from_word_group_id, to_word_group_id from anagram_groups where to_word_group_id >= {ii} and to_word_group_id < {cuts[i_c+ 1]};'
    
    print(sql)
    wg_df = query_db(sql = sql, db_path=rc.DB_PATH, db_name=rc.DB_NAME)
    print(wg_df.shape)
    # exapnd the from words
    wg_df = pd.merge(left = wg_df, right = fw_df)
    # expand the to words
    wg_df = pd.merge(left = wg_df, right = tw_df)
    
    wg_df = wg_df.drop_duplicates()
    print(wg_df.shape)    

    # count - and this is where it gets counter intuitive.
    # The to word counts are counts of from-to word pairs
    from_word_counter.update(wg_df['to_word_id'])

    print('#### FROM WORDS')
    # from words
    sql = f'select from_word_group_id, to_word_group_id from anagram_groups where from_word_group_id >= {ii} and from_word_group_id < {cuts[i_c+ 1]};'
    print(sql)
    wg_df = query_db(sql = sql, db_path=rc.DB_PATH, db_name=rc.DB_NAME)
    print(wg_df.shape)
    wg_df = pd.merge(left = wg_df, right = fw_df)
    wg_df = pd.merge(left = wg_df, right = tw_df)
    

    wg_df = wg_df.drop_duplicates()
    print(wg_df.shape)

    # count each word id to get th
    # the from word counts is the from-to-wrd pairs
    to_word_counter.update(wg_df['from_word_id'])



#### TO WORDS
select from_word_group_id, to_word_group_id from anagram_groups where to_word_group_id >= 0 and to_word_group_id < 1000;
...query execution took: 2.85 seconds...
(646287, 2)
(1154306, 4)
#### FROM WORDS
select from_word_group_id, to_word_group_id from anagram_groups where from_word_group_id >= 0 and from_word_group_id < 1000;
...query execution took: 8.66 seconds...
(252990, 2)
(454175, 4)
#### TO WORDS
select from_word_group_id, to_word_group_id from anagram_groups where to_word_group_id >= 1000 and to_word_group_id < 2000;
...query execution took: 2.76 seconds...
(665726, 2)
(1588737, 4)
#### FROM WORDS
select from_word_group_id, to_word_group_id from anagram_groups where from_word_group_id >= 1000 and from_word_group_id < 2000;
...query execution took: 6.98 seconds...
(366155, 2)
(661531, 4)
#### TO WORDS
select from_word_group_id, to_word_group_id from anagram_groups where to_word_group_id >= 2000 and to_word_group_id < 3000;
...query execution took: 1.83 seconds...
(

In [8]:
wg_df.head()

Unnamed: 0,from_word_group_id,to_word_group_id,from_word_id,to_word_id


In [9]:
# select the list of words
sql = 'select * from words;'
word_df = query_db(sql = sql, db_path=rc.DB_PATH, db_name=rc.DB_NAME)

...query execution took: 1.19 seconds...


In [10]:
word_df.head()

Unnamed: 0,word,lcase,n_chars,first_letter,word_id,word_group_id,letter_group,letter_group_ranked
0,A,a,1,a,0,0,a,a
1,aa,aa,2,a,1,1,a,a
2,aal,aal,3,a,2,2,al,la
3,aalii,aalii,5,a,3,3,ail,lai
4,aam,aam,3,a,4,4,am,ma


In [11]:
word_df['n_from_words'] = word_df['word_id'].map(from_word_counter)
word_df['n_to_words'] = word_df['word_id'].map(to_word_counter)

In [12]:
word_df.shape

(234370, 10)

In [13]:
# add in the counts of from_word_group_id and to_word_group_id
# the columns in words_me_0* are all the same
sql = 'select word_group_id, n_from_word_groups, n_to_word_groups from words_me_05;'
word_group_df = query_db(sql = sql, db_path=rc.DB_PATH, db_name=rc.DB_NAME)

...query execution took: 0.56 seconds...


In [14]:
output_df = pd.merge(left = word_df, right = word_group_df)

In [15]:
output_df.shape

(234370, 12)

In [16]:
# sum of parent/child word relationships
output_df['n_from_words'].sum()

123798056

In [17]:
# sum of parent/child word relationships
output_df['n_to_words'].sum()

123798056

In [18]:
# sum of parent/child word group relationships: by from word groups
word_group_df['n_from_word_groups'].sum()

73218235

In [19]:
# sum of parent/child word relationships: by to word groups
word_group_df['n_to_word_groups'].sum()

73218235

In [20]:
# this is also the number of rows in the anagram group table

In [21]:
write_data_to_sqlite(df = output_df, table_name='word_counts', db_path=rc.DB_PATH, db_name=rc.DB_NAME)


...now writing: word_counts
