# Mike Babb
# babb.mike@outlook.com
# Find anagrams
## Part 6: Generate a graph of word relationships

In [1]:
# standard libraries - installed by default
import csv
import os
import sqlite3
import time

In [2]:
# external libraries - not installed by default
import networkx as nx
import numpy as np
import pandas as pd

In [3]:
from part_00_process_functions import build_db_conn, query_db
import _run_constants as rc

### paths

In [4]:
# name of the GEXF file that the graph is written to at the end of this notebook
output_file_name = 'sample_graph.gexf'

In [5]:
# output file path and name: the output directory comes from the run constants module
ofpn = os.path.join(rc.data_output_file_path, output_file_name)

In [6]:
# display the full output path to confirm where the graph will be saved
ofpn

'/project/finding_anagrams\\data\\sample_graph.gexf'

### load words

In [7]:
# load every word together with its word id and its word group id
sql = 'select lcase, word_id, word_group_id from words;'
word_df = query_db(
    sql=sql,
    db_path=rc.db_path,
    db_name=rc.db_name,
)

...query execution took: 0.79 seconds...


In [8]:
# The relationship data only holds word group ids. To label both ends of a
# relationship, make two copies of the word list: one whose columns carry a
# 'from_' prefix and one whose columns carry a 'to_' prefix. Each copy then
# joins on its own word group id column.
word_column_stems = {'lcase': 'word', 'word_id': 'word_id', 'word_group_id': 'word_group_id'}

from_word_columns = {col: 'from_' + stem for col, stem in word_column_stems.items()}
from_word_df = word_df.rename(columns=from_word_columns).copy()

to_word_columns = {col: 'to_' + stem for col, stem in word_column_stems.items()}
to_word_df = word_df.rename(columns=to_word_columns).copy()



In [9]:
# preview the renamed 'to_' columns
to_word_df.head()

Unnamed: 0,to_word,to_word_id,to_word_group_id
0,a,0,0
1,aa,1,1
2,aal,2,2
3,aalii,3,3
4,aam,4,4


In [10]:
# the focal word: the graph is built from the relationships that start at this word
focal_word = 'terminator'

In [11]:
# look up the word group id of the focal word; stop with a clear message if the
# focal word is not in the word list instead of failing on an empty selection
focal_word_matches = word_df.loc[word_df['lcase'] == focal_word, 'word_group_id']
if focal_word_matches.empty:
    raise ValueError("focal word '{}' was not found in the words table".format(focal_word))
focal_word_group_id = focal_word_matches.iloc[0]

In [12]:
# display the word group id found for the focal word
focal_word_group_id

183984

In [13]:
# check the type: the id is a numpy integer, and it is cast with int() before
# being used as a query parameter below
type(focal_word_group_id)

numpy.int64

In [14]:
# now, start querying the database to build the graph

In [15]:
# parameterized query: every relationship in anagram_groups whose
# from_word_group_id equals the bound value (the "parent words" query)
sql = 'select * from anagram_groups where from_word_group_id = ?'

In [16]:
# run the query for the focal word group; the numpy integer is cast to a plain
# int before it is bound as the query parameter
pw_df = query_db(
    sql=sql,
    db_path=rc.db_path,
    db_name=rc.db_name,
    params=(int(focal_word_group_id),),
)

...query execution took: 0.1 seconds...


In [17]:
# The relationships that start at the focal word ('terminator') are already in
# pw_df. Now run the same query for each of its to word groups so that the
# relationships among those words are collected as well.
# Each result goes straight into the list: pw_df is not rebound inside the
# loop, so the frame whose column is being iterated stays intact.
pw_df_list = [pw_df]
pw_df_list.extend(
    query_db(sql=sql, db_path=rc.db_path, db_name=rc.db_name, params=(int(to_wg_id),))
    for to_wg_id in pw_df['to_word_group_id']
)

...query execution took: 0.0 seconds...
...query execution took: 0.0 seconds...
...query execution took: 0.0 seconds...
...query execution took: 0.0 seconds...
...query execution took: 0.0 seconds...
...query execution took: 0.0 seconds...
...query execution took: 0.0 seconds...
...query execution took: 0.0 seconds...
...query execution took: 0.0 seconds...
...query execution took: 0.0 seconds...
...query execution took: 0.0 seconds...
...query execution took: 0.0 seconds...
...query execution took: 0.0 seconds...
...query execution took: 0.0 seconds...
...query execution took: 0.01 seconds...
...query execution took: 0.01 seconds...
...query execution took: 0.01 seconds...
...query execution took: 0.0 seconds...
...query execution took: 0.01 seconds...
...query execution took: 0.01 seconds...
...query execution took: 0.01 seconds...
...query execution took: 0.01 seconds...
...query execution took: 0.02 seconds...
...query execution took: 0.0 seconds...
...query execution took: 0.0 sec

In [18]:
# stack all of the query results into one frame of relationships;
# ignore_index gives the combined frame a fresh 0..n-1 index instead of
# repeating the index labels of every individual query result
pw_df = pd.concat(objs=pw_df_list, ignore_index=True)

In [19]:
# number of relationships collected (rows) and the number of columns
pw_df.shape

(8564, 2)

In [20]:
# preview the relationships: each row is a from word group id / to word group id pair
pw_df.head()

Unnamed: 0,from_word_group_id,to_word_group_id
0,183984,114621
1,183984,116882
2,183984,115538
3,183984,116883
4,183984,119843


In [21]:
# join the words onto the relationships, starting with the from words

In [22]:
# preview the 'from_' columns that are about to be joined
from_word_df.head()

Unnamed: 0,from_word,from_word_id,from_word_group_id
0,a,0,0
1,aa,1,1
2,aal,2,2
3,aalii,3,3
4,aam,4,4


In [23]:
# label the from side of each relationship with its word(s);
# the join type and key are stated explicitly so the merge does not depend on
# which column names the two frames happen to share
anagram_df = pd.merge(left=pw_df, right=from_word_df,
                      how='inner', on='from_word_group_id')

In [24]:
# label the to side of each relationship with its word(s);
# the join type and key are stated explicitly so the merge does not depend on
# which column names the two frames happen to share
anagram_df = pd.merge(left=anagram_df, right=to_word_df,
                      how='inner', on='to_word_group_id')

In [25]:
# preview the labeled relationships: group ids plus the from and to words and word ids
anagram_df.head()

Unnamed: 0,from_word_group_id,to_word_group_id,from_word,from_word_id,to_word,to_word_id
0,183984,114621,terminator,198967,n,121240
1,183984,116882,terminator,198967,no,123918
2,183984,116882,terminator,198967,on,129913
3,183984,115538,terminator,198967,nei,122394
4,183984,116883,terminator,198967,noa,123919


In [26]:
# row count before removing duplicates
anagram_df.shape

(37335, 6)

In [27]:
# remove fully duplicated rows
# the duplicates come from the word groups:
# item, mite, etc. share the same word group id
anagram_df = anagram_df.drop_duplicates()

In [28]:
# row count after removing duplicates
anagram_df.shape

(36658, 6)

In [29]:
# the words of interest: only relationships whose from word and to word are
# both in this list are kept for the sample graph
focal_word_list = [
    'it',
    'tie',
    'item',
    'mite',
    'time',
    'emit',
    'rite',
    'terminator',
]

### focus only on a few words

In [30]:
# keep only the relationships where both the from word and the to word are
# among the words of interest
from_is_focal = anagram_df['from_word'].isin(focal_word_list)
to_is_focal = anagram_df['to_word'].isin(focal_word_list)
anagram_df = anagram_df.loc[from_is_focal & to_is_focal]

In [31]:
# order the relationships by from word id, then by to word id
anagram_df = anagram_df.sort_values(by = ['from_word_id', 'to_word_id'])

### create and save graph

In [32]:
# number of from word / to word pairs that remain after filtering
len(anagram_df)

38

In [33]:
# build a directed graph with networkx (https://networkx.org/):
# each row becomes an edge that points from the from word to the to word
my_graph = nx.from_pandas_edgelist(
    anagram_df,
    source='from_word',
    target='to_word',
    create_using=nx.DiGraph,
)

In [34]:
# write the graph to disk in GEXF format so that it can be opened and
# visualized in Gephi (https://gephi.org/)
nx.write_gexf(my_graph, ofpn)