# Mike Babb
# babbm@uw.edu
# Find anagrams
## Part 5: Generate a graph of word relationships

In [None]:
# standard libraries - installed by default
import csv
import os
import sqlite3
import time

In [None]:
# external libraries - not installed by default
import networkx as nx
import numpy as np
import pandas as pd

In [None]:
from part_00_process_functions import query_db

### paths

In [None]:
# directory and file name of the words database
db_path, db_name = '/project/finding_anagrams/db', 'words.db'

In [None]:
# directory and file name of the GEXF graph file written at the end
output_file_path, output_file_name = (
    '/project/finding_anagrams/data',
    'sample_graph_v1.gexf',
)

In [None]:
# ofpn: output file path and name, joined into a single path
ofpn = os.path.join(output_file_path, output_file_name)

### load words

In [None]:
# load every lowercase word and its id from the words_v1 table
# NOTE(review): query_db is a project helper; its result is used below
# like a pandas DataFrame with 'lcase' and 'word_id' columns - confirm
sql = 'select lcase, word_id from words_v1;'
word_df = query_db(
    sql=sql,
    db_path=db_path,
    db_name=db_name,
)

In [None]:
# this is now stored in the db table

In [None]:
# the words whose relationships will be drawn in the graph
focal_word_list = [
    'it',
    'tie',
    'item',
    'mite',
    'time',
    'emit',
    'rite',
    'terminator',
]

In [None]:
# build a two-way lookup table in one dictionary:
# word -> word_id and word_id -> word
word_id_dict = {}
for word, word_id in zip(word_df['lcase'], word_df['word_id']):
    # add both directions for this row in a single step
    word_id_dict.update({word: word_id, word_id: word})

In [None]:
# look up the word_id of each focal word
# (raises KeyError if a focal word is missing from the lookup table)
focal_word_id_list = [
    word_id_dict[fw]
    for fw in focal_word_list
]

### load 'from' words

In [None]:
# parent words: rows of the anagrams table whose 'to' word is the focal word
# the (?) placeholder is filled with one focal word_id per query
sql = (
    'select from_word_id, to_word_id '
    'from anagrams '
    'where to_word_id = (?);'
)

In [None]:
# run the parent-word query once per focal word id and
# keep each result in a list for concatenation
pw_df_list = [
    query_db(sql=sql, db_path=db_path, db_name=db_name, params=(focal_word_id,))
    for focal_word_id in focal_word_id_list
]

In [None]:
# stack the per-word parent results into a single DataFrame
pw_df = pd.concat(objs=pw_df_list)

In [None]:
# translate the ids at both ends of each edge back into words
# using the two-way lookup dictionary
for endpoint in ('from_word', 'to_word'):
    pw_df[endpoint] = pw_df[endpoint + '_id'].map(word_id_dict)

### load 'to' words

In [None]:
# child words: rows of the anagrams table whose 'from' word is the focal word
# the (?) placeholder is filled with one focal word_id per query
sql = (
    'select from_word_id, to_word_id '
    'from anagrams '
    'where from_word_id = (?);'
)

In [None]:
# run the child-word query once per focal word id and
# keep each result in a list for concatenation
cw_df_list = [
    query_db(sql=sql, db_path=db_path, db_name=db_name, params=(focal_word_id,))
    for focal_word_id in focal_word_id_list
]

In [None]:
# stack the per-word child results into a single DataFrame
cw_df = pd.concat(objs=cw_df_list)

In [None]:
# preview the first rows of the child-word results
cw_df.head()

In [None]:
# translate the ids at both ends of each edge back into words
# using the two-way lookup dictionary
for endpoint in ('from_word', 'to_word'):
    cw_df[endpoint] = cw_df[endpoint + '_id'].map(word_id_dict)

In [None]:
# concatenate the parent-word and child-word edges into one edge list
edge_frames = [pw_df, cw_df]
anagram_df = pd.concat(edge_frames)

In [None]:
# exploratory: list the names available in the networkx namespace
dir(nx)

In [None]:
# the parent and child queries can return the same edge twice
# (e.g., when both ends are focal words), so keep each row only once
anagram_df = anagram_df.drop_duplicates()

In [None]:
# number of edges: one row per (from_word, to_word) pair
anagram_df.shape[0]

In [None]:
# how many nodes? count the distinct words found at either end of an edge
my_set = set(anagram_df['from_word'].unique().tolist())
my_set |= set(anagram_df['to_word'].unique().tolist())
len(my_set)

### focus only on a few words

In [None]:
# let's focus on just our words of interest:
# keep an edge only when BOTH of its ends are focal words
is_focal_source = anagram_df['from_word_id'].isin(focal_word_id_list)
is_focal_target = anagram_df['to_word_id'].isin(focal_word_id_list)
anagram_df = anagram_df.loc[is_focal_source & is_focal_target]

In [None]:
# order the edges by source id, then by target id
sort_columns = ['from_word_id', 'to_word_id']
anagram_df = anagram_df.sort_values(by=sort_columns)

### create and save graph

In [None]:
# build a directed graph with networkx (https://networkx.org/):
# every row of the edge list becomes an edge from 'from_word' to 'to_word'
my_graph = nx.from_pandas_edgelist(
    df=anagram_df,
    source='from_word',
    target='to_word',
    create_using=nx.DiGraph(),
)

In [None]:
# write the graph to disk in GEXF format so it can be
# opened and visualized in Gephi (https://gephi.org/)
nx.write_gexf(my_graph, ofpn)