# Mike Babb
# babbm@uw.edu
# Find anagrams
## Part 5: Generate a graph of word relationships

In [2]:
# standard libraries - installed by default
import csv
import os
import sqlite3
import time

In [3]:
# external libraries - not installed by default
import networkx as nx
import numpy as np
import pandas as pd

In [4]:
from part_00_process_functions import query_db

### paths

In [5]:
# database path and name
db_path = '/project/finding_anagrams/db'
db_name = 'words.db'

In [6]:
# output path and name of the GEXF graph file written at the end of this notebook
output_file_path = '/project/finding_anagrams/data'
output_file_name = 'sample_graph_v2.gexf'

In [7]:
# ofpn: output file path and name, joined into a single path
ofpn = os.path.join(output_file_path, output_file_name)

### load words

In [8]:
# load every word together with its word id and its word group id
sql = 'select lcase, word_id, word_group_id from words_v2;'
word_df = query_db(sql=sql, db_path = db_path, db_name = db_name)

...query execution took: 0.852352 seconds...


In [9]:
# this is now stored in the db table

In [10]:
# the words whose relationships are graphed below
# a larger alternative selection, kept for reference (swap it in to use it):
# ['it', 'tie', 'item', 'mite', 'time', 'emit', 'rite', 'terminator']
focal_word_list = ['item', 'mite', 'time', 'emit']

In [11]:
# build a very fast two-way lookup table in a single dictionary:
#   word -> word_group_id
#   word_group_id -> word
# several words can share one word_group_id, so the reverse direction
# only keeps the last word seen for each group
word_group_id_dict = dict(zip(word_df['lcase'], word_df['word_group_id']))
word_group_id_dict.update(zip(word_df['word_group_id'], word_df['lcase']))
    
    

In [12]:
# look up the word group id of each focal word
# (raises a KeyError if a focal word is not in the lookup table)
focal_wg_id_list = [word_group_id_dict[focal_word] for focal_word in focal_word_list]

### load 'from' words

In [13]:
focal_wg_id_list

[58098, 58098, 58098, 58098]

In [14]:
# drop the duplicates and sort the remaining word group ids:
# focal words in the same word group share a single id
focal_wg_id_list = sorted(set(focal_wg_id_list))

In [15]:
focal_wg_id_list

[58098]

In [16]:
# get parent words: the rows of anagram_groups whose to_word_group_id
# is the focal word group. The (?) placeholder is filled in per query.
sql = 'select from_word_group_id, to_word_group_id from anagram_groups where to_word_group_id = (?);'

In [17]:
# get the parent words: run the query once per focal word group
# and collect one data frame per group
pw_df_list = [
    query_db(sql=sql, db_path=db_path, db_name=db_name, params=(wg_id,))
    for wg_id in focal_wg_id_list
]

...query execution took: 0.039975 seconds...


In [18]:
pw_df = pd.concat(pw_df_list)

In [19]:
pw_df.head()

Unnamed: 0,from_word_group_id,to_word_group_id
0,131095,58098
1,131112,58098
2,131133,58098
3,131183,58098
4,140,58098


In [20]:
# join to go from word_group_id to focal_word


### load 'to' words

In [21]:
# get child words: the rows of anagram_groups whose from_word_group_id
# is the focal word group. The (?) placeholder is filled in per query.
sql = 'select from_word_group_id, to_word_group_id from anagram_groups where from_word_group_id = (?);'

In [22]:
# get the child words: run the query once per focal word group
# and collect one data frame per group
cw_df_list = [
    query_db(sql=sql, db_path=db_path, db_name=db_name, params=(wg_id,))
    for wg_id in focal_wg_id_list
]

...query execution took: 0.002994 seconds...


In [23]:
cw_df = pd.concat(cw_df_list)

In [24]:
# concatenate the parent and child relationships into a single data frame
anagram_df = pd.concat([pw_df, cw_df])

In [26]:
len(anagram_df)

14639

In [27]:
word_df.head()

Unnamed: 0,lcase,word_id,word_group_id
0,a,0,0
1,aa,1,1
2,aal,2,2
3,ala,4557,2
4,aalii,3,3


In [28]:
p_word_df = word_df.copy()

In [29]:
c_word_df = word_df.copy()

In [30]:
p_word_df.columns = ['from_word', 'from_word_id', 'from_word_group_id']

In [31]:
c_word_df.columns = ['to_word', 'to_word_id', 'to_word_group_id']

In [32]:
# attach the 'from' word and word id to every relationship.
# The join key is named explicitly so the merge does not silently depend on
# whichever column names the two data frames happen to share.
# One row is produced per word in the 'from' word group.
anagram_df = pd.merge(left=anagram_df, right=p_word_df,
                      on='from_word_group_id')

In [33]:
len(anagram_df)

15374

In [34]:
# attach the 'to' word and word id to every relationship.
# The join key is named explicitly so the merge does not silently depend on
# whichever column names the two data frames happen to share.
# One row is produced per word in the 'to' word group.
anagram_df = pd.merge(left=anagram_df, right=c_word_df,
                      on='to_word_group_id')

In [35]:
len(anagram_df)

61360

In [36]:
anagram_df.head()

Unnamed: 0,from_word_group_id,to_word_group_id,from_word,from_word_id,to_word,to_word_id
0,131095,58098,passementerie,139135,emit,60812
1,131095,58098,passementerie,139135,item,98551
2,131095,58098,passementerie,139135,mite,116870
3,131095,58098,passementerie,139135,time,202076
4,131112,58098,passimeter,139152,emit,60812


In [37]:
dir(nx)

['AmbiguousSolution',
 'DiGraph',
 'ExceededMaxIterations',
 'Graph',
 'GraphMLReader',
 'GraphMLWriter',
 'HasACycle',
 'LCF_graph',
 'LFR_benchmark_graph',
 'MultiDiGraph',
 'MultiGraph',
 'NetworkXAlgorithmError',
 'NetworkXError',
 'NetworkXException',
 'NetworkXNoCycle',
 'NetworkXNoPath',
 'NetworkXNotImplemented',
 'NetworkXPointlessConcept',
 'NetworkXTreewidthBoundExceeded',
 'NetworkXUnbounded',
 'NetworkXUnfeasible',
 'NodeNotFound',
 'NotATree',
 'OrderedDiGraph',
 'OrderedGraph',
 'OrderedMultiDiGraph',
 'OrderedMultiGraph',
 'PlanarEmbedding',
 'PowerIterationFailedConvergence',
 '__author__',
 '__bibtex__',
 '__builtins__',
 '__cached__',
 '__date__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '__version__',
 'adamic_adar_index',
 'add_cycle',
 'add_path',
 'add_star',
 'adj_matrix',
 'adjacency',
 'adjacency_data',
 'adjacency_graph',
 'adjacency_matrix',
 'adjacency_spectrum',
 'adjlist',
 'algebraic_connectivity',
 

In [38]:
anagram_df = anagram_df.drop_duplicates()

In [39]:
# number of edges:
len(anagram_df)

61344

In [46]:
len(anagram_df['from_word_id'])

61344

In [47]:
len(anagram_df['to_word_id'])

61344

In [None]:
# how many nodes? count the distinct words found at either end of an edge
my_set = {
    *anagram_df['from_word'].unique().tolist(),
    *anagram_df['to_word'].unique().tolist(),
}
len(my_set)

### focus only on a few words

In [None]:
# let's focus on just our words of interest:
# keep only the edges whose 'from' word and 'to' word are both focal words
anagram_df = anagram_df.loc[
    anagram_df['from_word'].isin(focal_word_list)
    & anagram_df['to_word'].isin(focal_word_list)
]

In [None]:
# sort the edges by from_word_id, then by to_word_id
anagram_df = anagram_df.sort_values(by = ['from_word_id', 'to_word_id'])

### create and save graph

In [None]:
len(anagram_df)

In [None]:
# create a directed graph using networkx
# https://networkx.org/
# each row of anagram_df becomes one edge pointing from from_word to to_word
my_graph = nx.from_pandas_edgelist(df=anagram_df,source = 'from_word',
                                    target = 'to_word', create_using = nx.DiGraph())

In [None]:
# save to disk as a GEXF for visualization in Gephi
# https://gephi.org/
nx.write_gexf(G=my_graph, path = ofpn)