# Mike Babb
# babbm@uw.edu
# Find anagrams
## Part 3: Query the anagram database

In [1]:
# standard libraries - installed by default
import datetime
import csv
import os
import sqlite3

In [2]:
# external libraries - not installed by default
import numpy as np
import pandas as pd

In [3]:
from part_00_process_functions import build_db_conn, query_db

### set names and paths

In [4]:
# database path and name: the directory and the file name of the database.
# Both values are passed to query_db() by every query in this notebook.
db_path = '/project/finding_anagrams/db'
db_name = 'words.db'

In [5]:
# set up paths: the directory that receives the text file of anagrams
# written at the end of this notebook
out_file_path = '/project/finding_anagrams/words'

In [6]:
# Create the output directory, including any missing parent directories.
# exist_ok=True makes this a no-op when the directory is already there and
# removes the gap between "check that it exists" and "create it".
# If the path exists but is a regular file, this raises FileExistsError here
# instead of failing later when the output file is opened.
os.makedirs(out_file_path, exist_ok=True)

### define focal word

In [7]:
# select a focal word for testing: the parent, exact, and child anagrams
# of this word are looked up below and written to disk
focal_word = 'acromioscapular'

### load the list of words

In [8]:
# load every word from the words_v2 table together with its word id and
# its word group id; the result is reused for all lookups below
sql = 'select lcase, word_id, word_group_id from words_v2;'
word_df = query_db(sql=sql, db_path = db_path, db_name = db_name)

...query execution took: 0.289772 seconds...


In [9]:
# build a very fast lookup table: word -> (word_id, word_group_id)
# the inner zip pairs the two id columns, the outer zip attaches each pair
# to its word; if a word appears more than once, the last row wins
word_id_dict = dict(
    zip(
        word_df['lcase'],
        zip(word_df['word_id'], word_df['word_group_id']),
    )
)


In [10]:
# look up the word id and the word group id of the focal word;
# this raises a KeyError if the focal word is not in the word list
focal_word_id, focal_word_group_id = word_id_dict[focal_word]

### load from/parent word group id pairs

In [11]:
# get parent words: select the "from" word group ids of every pair in
# anagram_groups whose "to" word group id matches the (?) placeholder
sql = 'select from_word_group_id from anagram_groups where to_word_group_id = (?);'

In [12]:
# run the query with the focal word's group id as the only parameter
# (a one-element tuple, hence the trailing comma)
pw_df = query_db(sql = sql, db_path = db_path, db_name = db_name, params = (focal_word_group_id,))

...query execution took: 0.000947 seconds...


In [13]:
# parent word dataframe: the rows of word_df whose word group id is one of
# the from word group ids returned above, keeping only the lcase column
in_parent_group = word_df['word_group_id'].isin(pw_df['from_word_group_id'])
parent_word_df = word_df.loc[in_parent_group, ['lcase']]

### load to/child word group id pairs

In [14]:
# get child words: select the "to" word group ids of every pair in
# anagram_groups whose "from" word group id matches the (?) placeholder
sql = 'select to_word_group_id from anagram_groups where from_word_group_id = (?);'

In [15]:
# run the query with the focal word's group id as the only parameter
# (a one-element tuple, hence the trailing comma)
cw_df = query_db(sql = sql, db_path = db_path, db_name = db_name, params = (focal_word_group_id,))

...query execution took: 0.009849 seconds...


In [16]:
# child word dataframe: the rows of word_df whose word group id is one of
# the to word group ids returned above, keeping only the lcase column
in_child_group = word_df['word_group_id'].isin(cw_df['to_word_group_id'])
child_word_df = word_df.loc[in_child_group, ['lcase']]

### determine exact anagrams

In [17]:
# compute exact anagrams, step 1: collect the parent words and the child
# words into two sets so that they can be compared with set operations
pw_set, cw_set = (
    set(frame['lcase'].tolist())
    for frame in (parent_word_df, child_word_df)
)

In [18]:
# words that are both a parent word and a child word of the focal word
# form the exact word set
ew_set = pw_set & cw_set

In [19]:
# now, remove the intersection from the from/parent word set so that it
# no longer contains any of the exact words
pw_set = pw_set - ew_set

In [20]:
# likewise, remove the exact words from the to/child word set
cw_set = cw_set - ew_set

In [21]:
# now, remove the focal word from the exact word set;
# discard() does nothing when the focal word is not in the set
ew_set.discard(focal_word)

In [22]:
## Save the list of anagrams to disk

In [23]:
# so, let's set up writing to disk:
# the output file is named after the focal word
out_file_name = focal_word + '_anagrams.txt'

In [24]:
# ofpn: the output file path and name, i.e. the full path of the text file
ofpn = os.path.join(out_file_path, out_file_name)

In [25]:
# turn each set into a sorted list so the words are written in a
# stable order
pw_list, ew_list, cw_list = map(sorted, (pw_set, ew_set, cw_set))

In [26]:
# write the from, exact, and to anagram pairs to a text file.
# The file has three sections in a fixed order. Each section is a header
# line ending in the focal word, followed by one word per line. Sections
# are separated by a single blank line.
anagram_sections = (
    ('Parent/From anagrams for: ', pw_list),
    ('Exact anagrams for: ', ew_list),
    ('Child/To anagrams for: ', cw_list),
)

# newline='' writes '\n' unchanged on every platform. The explicit encoding
# keeps the output independent of the machine's locale settings.
with open(ofpn, 'w', newline='', encoding='utf-8') as out_file:
    for section_number, (header, section_words) in enumerate(anagram_sections):
        # blank separator line before every section except the first
        if section_number > 0:
            out_file.write('\n')
        # write() emits the header as one string; calling writelines() on a
        # single string would iterate over it one character at a time
        out_file.write(header + focal_word + '\n')
        out_file.writelines(word + '\n' for word in section_words)