# Mike Babb
# babbm@uw.edu
# Introduction to Python Part 01

In [1]:
# Problem Definition

In [2]:
# standard libraries - installed by default
import csv
from itertools import permutations
import math
import os
import time

In [3]:
# external libraries - not installed by default
import numpy as np
import pandas as pd

In [4]:
# set up our input file path and name
in_file_path = 'H:/git/anagrams'    
in_file_name = 'words.txt'    
in_fpn = os.path.join(in_file_path, in_file_name)    
# NOTE: this reassignment discards the joined path built on the line above,
# so only the bare file name (a relative path) is used from here on
in_fpn = in_file_name


In [5]:
# use pandas to load the data
print('Reading in list of words...')
# keep_default_na=False stops pandas from turning dictionary words that look
# like missing-value markers (e.g. 'nan', 'null', 'NA') into NaN while parsing,
# so every line of the file is kept as the text it actually contains
word_df = pd.read_csv(filepath_or_buffer=in_fpn, sep=',', header=None,
                      keep_default_na=False)

Reading in list of words...


In [6]:
# check the first few records
word_df.head()

Unnamed: 0,0
0,A
1,a
2,aa
3,aal
4,aalii


In [7]:
# let's specify a more appropriate column name
col_names = ['word']
word_df.columns = col_names

In [8]:
# count the rows (one word per row) and report the total with thousands separators
n_words = len(word_df.index)
print('...found', format(n_words, ','), 'words to find anagrams for...')

...found 235,886 words to find anagrams for...


In [9]:
# convert the only column to a string - just to be safe.
# 'nan' is a word in the dictionary, but pandas may have parsed it as the
# floating-point missing value NaN rather than as text.
# np.str was only an alias for the builtin str and was removed in NumPy 1.24,
# so the builtin is used directly.
word_df['word'] = word_df['word'].astype(str)

In [10]:
# create lower case values of the words
word_df['lcase'] = word_df['word'].str.lower()
# and now drop duplicates
word_df = word_df.drop_duplicates('lcase')

In [11]:
# 1. find word length
word_df['n_chars'] = word_df['lcase'].str.len()

In [12]:
# 2. extract the first letter
word_df['first_letter'] = word_df['lcase'].str[:1]

In [13]:
# 3. Let's aggregate the data to create a new dataframe featuring the counts of words by word length
# GroupBy.size() counts the rows in each n_chars group directly; this avoids
# passing a NumPy function (np.size) to agg(), which newer pandas deprecates.
col_names = ['n_words']
agg_word_df = word_df.groupby('n_chars').size().to_frame(name=col_names[0])
# move n_chars out of the index so it becomes a regular column
agg_word_df = agg_word_df.reset_index()

In [14]:
agg_word_df.head()
# there are 26 one letter words... that checks out

Unnamed: 0,n_chars,n_words
0,1,26
1,2,139
2,3,1294
3,4,4993
4,5,9972


In [15]:
# and the tail
agg_word_df.tail()

Unnamed: 0,n_chars,n_words
19,20,198
20,21,82
21,22,41
22,23,17
23,24,5


In [16]:
# cross-tabulate the words: one row per first letter, one column per word length
select_columns = ['first_letter', 'n_chars']
row_key, col_key = select_columns
ct_word_df = pd.crosstab(index=word_df[row_key], columns=word_df[col_key])

In [17]:
# check it
ct_word_df.head()
# looks right

n_chars,1,2,3,4,5,6,7,8,9,10,...,15,16,17,18,19,20,21,22,23,24
first_letter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
a,1,17,100,305,786,1281,1722,2073,2321,2333,...,413,228,125,64,30,19,8,2,1,0
b,1,5,59,332,712,1294,1634,1785,1620,1285,...,136,66,37,20,14,7,1,1,1,0
c,1,2,47,279,732,1408,2053,2434,2780,2639,...,503,305,171,81,43,20,15,4,0,0
d,1,4,59,257,451,807,1089,1396,1525,1504,...,258,129,72,31,13,6,6,4,0,0
e,1,11,40,118,245,538,846,1122,1260,1224,...,238,124,78,40,29,12,3,2,1,0


In [18]:
# reset the index so that first_letter becomes a regular column
# (the write to excel happens in later cells)
ct_word_df = ct_word_df.reset_index()

In [19]:
ct_word_df.head()

n_chars,first_letter,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,a,1,17,100,305,786,1281,1722,2073,2321,...,413,228,125,64,30,19,8,2,1,0
1,b,1,5,59,332,712,1294,1634,1785,1620,...,136,66,37,20,14,7,1,1,1,0
2,c,1,2,47,279,732,1408,2053,2434,2780,...,503,305,171,81,43,20,15,4,0,0
3,d,1,4,59,257,451,807,1089,1396,1525,...,258,129,72,31,13,6,6,4,0,0
4,e,1,11,40,118,245,538,846,1122,1260,...,238,124,78,40,29,12,3,2,1,0


In [20]:
# the amazing thing about pandas is that we can write to disk in a variety of formats.
# Let's pick excel for now.

In [21]:
# setup the output path
e_file_path = 'H:/git/anagrams'
e_file_name = 'words_analysis.xlsx'
e_fpn = os.path.join(e_file_path, e_file_name)
# NOTE: this reassignment discards the joined path built on the line above,
# so only the bare file name (a relative path) is used for the workbook
e_fpn = e_file_name

In [22]:
# create the writer object
e_writer = pd.ExcelWriter(e_fpn)

In [23]:
# let's first write the list of words
word_df.to_excel(excel_writer=e_writer, sheet_name='word_list', index = False)

In [24]:
# let's write the count of words by character length
agg_word_df.to_excel(excel_writer=e_writer, sheet_name='word_count_by_length', index = False)

In [25]:
ct_word_df.to_excel(excel_writer=e_writer, sheet_name='word_count_by_length_by_letter', index = False)

In [26]:
# save and close the excel file
# close() writes the workbook to disk and releases the file handle; the
# separate ExcelWriter.save() call was deprecated in pandas 1.5 and removed
# in pandas 2.0, so it is no longer called here.
e_writer.close()

In [27]:
word_df.head()

Unnamed: 0,word,lcase,n_chars,first_letter
0,A,a,1,a
2,aa,aa,2,a
3,aal,aal,3,a
4,aalii,aalii,5,a
5,aam,aam,3,a


# what if we were trying to find all words by a brute-force technique?

In [28]:
# let's play with pandas to learn some facts about our list of words and showcase how to use pandas
# we're going to make extensive use of the string functions in python and the pandas variants
# pure python: https://docs.python.org/3.6/library/string.html
# pandas versions: https://pandas.pydata.org/pandas-docs/stable/text.html#working-with-text-data

In [29]:
# what is the maximum character length?
max_char_length = agg_word_df['n_chars'].max()

In [30]:
# override the maximum computed from agg_word_df in the previous cell:
# the permutation estimate that follows is worked through for a 20-character word
max_char_length = 20

In [31]:
max_char_length

20

In [32]:
# how many possible permutations (orderings of the letters) are there?

In [33]:
n_permutations = math.factorial(max_char_length)

In [34]:
n_permutations

2432902008176640000

In [35]:
# to check all permutations for one 20-character word
# (max_char_length was set to 20 above, and n_permutations is its factorial)
# assuming our computer can perform x checks per second
checks_per_second = 100000

In [36]:
n_seconds = n_permutations / checks_per_second

In [37]:
print('examining permutations takes', '{:,}'.format(n_seconds), 'seconds...')

examining permutations takes 24,329,020,081,766.4 seconds...


In [38]:
n_minutes = n_seconds / 60 

In [39]:
print('examining permutations takes', '{:,}'.format(n_minutes), 'minutes...')

examining permutations takes 405,483,668,029.44 minutes...


In [40]:
n_hours = n_minutes / 60

In [41]:
print('examining permutations takes', '{:,}'.format(n_hours), 'hours...')

examining permutations takes 6,758,061,133.824 hours...


In [42]:
n_days = n_hours / 24

In [43]:
print('examining permutations takes', '{:,}'.format(n_days), 'days...')

examining permutations takes 281,585,880.57600003 days...


In [44]:
n_years = n_days / 365

In [45]:
print('examining permutations takes', '{:,}'.format(n_years), 'years...')

examining permutations takes 771,468.1659616439 years...


In [46]:
# if we were to check all permutations of all words...
agg_word_df['n_char_factorial'] = agg_word_df['n_chars'].map(math.factorial)

In [47]:
# and do that for each word...
agg_word_df['n_char_checks'] = agg_word_df['n_words'] * agg_word_df['n_char_factorial']

In [48]:
total_checks = agg_word_df['n_char_checks'].sum()

In [49]:
# assuming we can process:
# 1 billion (1,000,000,000) permutations a second
checks_per_second = 1_000_000_000

In [50]:
processing_time = total_checks / checks_per_second

In [51]:
processing_years = processing_time / 60 / 60 / 24 / 365

In [52]:
print('examining all permutations takes', '{:,}'.format(processing_years), 'years...')

examining all permutations takes 113,918,685.75722675 years...


In [53]:
# that's a long time!

# let's re-think our approach by first focusing on one word: time

In [54]:
for p in permutations('time'):
    print(p)

('t', 'i', 'm', 'e')
('t', 'i', 'e', 'm')
('t', 'm', 'i', 'e')
('t', 'm', 'e', 'i')
('t', 'e', 'i', 'm')
('t', 'e', 'm', 'i')
('i', 't', 'm', 'e')
('i', 't', 'e', 'm')
('i', 'm', 't', 'e')
('i', 'm', 'e', 't')
('i', 'e', 't', 'm')
('i', 'e', 'm', 't')
('m', 't', 'i', 'e')
('m', 't', 'e', 'i')
('m', 'i', 't', 'e')
('m', 'i', 'e', 't')
('m', 'e', 't', 'i')
('m', 'e', 'i', 't')
('e', 't', 'i', 'm')
('e', 't', 'm', 'i')
('e', 'i', 't', 'm')
('e', 'i', 'm', 't')
('e', 'm', 't', 'i')
('e', 'm', 'i', 't')


In [55]:
type(p)

tuple

In [56]:
for p in permutations('time'):
    new_word = ''.join(p)
    print(new_word)    

time
tiem
tmie
tmei
teim
temi
itme
item
imte
imet
ietm
iemt
mtie
mtei
mite
miet
meti
meit
etim
etmi
eitm
eimt
emti
emit


In [57]:
for p in permutations('time'):    
    new_word = ''.join(sorted(p))
    print(new_word)

eimt
eimt
eimt
eimt
eimt
eimt
eimt
eimt
eimt
eimt
eimt
eimt
eimt
eimt
eimt
eimt
eimt
eimt
eimt
eimt
eimt
eimt
eimt
eimt


In [58]:
sorted_word_group = 'eimt'

In [59]:
words_to_examine = ['time', 'mite', 'emit', 'item', 'rite']

In [60]:
for word in words_to_examine:    
    sorted_word = sorted(word)
    if ''.join(sorted_word) == sorted_word_group:
        print(word, 'is in the', sorted_word_group, 'group')
    else:
        print(word, 'is NOT in the', sorted_word_group, 'group')

time is in the eimt group
mite is in the eimt group
emit is in the eimt group
item is in the eimt group
rite is NOT in the eimt group


In [61]:
# so we're going to do the same with pandas.
# But we need to write a function to do this. 

def create_sort_word(word):
    """Return the characters of word joined back together in sorted order."""
    return ''.join(sorted(word))

In [62]:
word_df['word_group'] = word_df['lcase'].map(create_sort_word)

In [63]:
word_df['sorted_word'] = word_df['lcase'].map(sorted)

In [64]:
word_df['word_group_2'] = word_df['sorted_word'].map(lambda x: ''.join(x))

In [65]:
word_df['word_group_3'] = word_df['lcase'].map(sorted).map(lambda x: ''.join(x))

In [66]:
word_df['word_group_4'] = word_df['lcase'].map(lambda x: ''.join(sorted(x)))

In [67]:
word_df.head()

Unnamed: 0,word,lcase,n_chars,first_letter,word_group,sorted_word,word_group_2,word_group_3,word_group_4
0,A,a,1,a,a,[a],a,a,a
2,aa,aa,2,a,aa,"[a, a]",aa,aa,aa
3,aal,aal,3,a,aal,"[a, a, l]",aal,aal,aal
4,aalii,aalii,5,a,aaiil,"[a, a, i, i, l]",aaiil,aaiil,aaiil
5,aam,aam,3,a,aam,"[a, a, m]",aam,aam,aam


In [68]:
# let's check our work
word_df.head()

Unnamed: 0,word,lcase,n_chars,first_letter,word_group,sorted_word,word_group_2,word_group_3,word_group_4
0,A,a,1,a,a,[a],a,a,a
2,aa,aa,2,a,aa,"[a, a]",aa,aa,aa
3,aal,aal,3,a,aal,"[a, a, l]",aal,aal,aal
4,aalii,aalii,5,a,aaiil,"[a, a, i, i, l]",aaiil,aaiil,aaiil
5,aam,aam,3,a,aam,"[a, a, m]",aam,aam,aam


In [69]:
# and the tail
word_df.tail()

Unnamed: 0,word,lcase,n_chars,first_letter,word_group,sorted_word,word_group_2,word_group_3,word_group_4
235881,zythem,zythem,6,z,ehmtyz,"[e, h, m, t, y, z]",ehmtyz,ehmtyz,ehmtyz
235882,Zythia,zythia,6,z,ahityz,"[a, h, i, t, y, z]",ahityz,ahityz,ahityz
235883,zythum,zythum,6,z,hmtuyz,"[h, m, t, u, y, z]",hmtuyz,hmtuyz,hmtuyz
235884,Zyzomys,zyzomys,7,z,mosyyzz,"[m, o, s, y, y, z, z]",mosyyzz,mosyyzz,mosyyzz
235885,Zyzzogeton,zyzzogeton,10,z,egnootyzzz,"[e, g, n, o, o, t, y, z, z, z]",egnootyzzz,egnootyzzz,egnootyzzz


In [84]:
# do a simple group by and sort to count our values
# word_group_count is a helper column holding 1 for every word
word_df['word_group_count'] = 1
select_columns = ['n_chars', 'word_group', 'word_group_count']
# count() tallies the helper column within each (n_chars, word_group) pair and
# keeps 'word_group_count' as the name of the resulting column. It replaces
# agg(np.size): passing NumPy functions to agg() is deprecated in newer pandas.
word_group_df = word_df[select_columns].groupby(select_columns[:-1]).count()

In [85]:
# reset the index    
word_group_df = word_group_df.reset_index()  

In [86]:
word_group_df.head()

Unnamed: 0,n_chars,word_group,word_group_count
0,1,a,1
1,1,b,1
2,1,c,1
3,1,d,1
4,1,e,1


In [87]:
word_group_df.tail()

Unnamed: 0,n_chars,word_group,word_group_count
215837,24,aacccgghhiillloooooppsty,1
215838,24,aacdeehhiimoooprrrtttyyz,1
215839,24,aaddeeefhhlllmooprstuxyy,1
215840,24,aadeeehhhiillnnooopprttt,1
215841,24,acccefhhiiiiillnoooppsst,1


In [74]:
# select only values that occur more than once.
word_group_df = word_group_df.loc[word_group_df['word_group_count'] > 1, ]

In [75]:
# very cool.
# So now we pull out the unique word count values, query the df,
# then write to disk.

In [76]:
n_char_list = word_group_df['n_chars'].unique().tolist()
n_char_list.sort()

In [77]:
n_char_list[:5]

[2, 3, 4, 5, 6]

In [78]:
# build the full path of the text file that the anagram groups are written to
output_path = 'H:/git/anagrams'
output_file = 'anagrams_found.txt'
output_fpn = os.path.join(output_path, output_file)

In [79]:
word_df.head()

Unnamed: 0,word,lcase,n_chars,first_letter,word_group,sorted_word,word_group_2,word_group_3,word_group_4,word_group_count
0,A,a,1,a,a,[a],a,a,a,1
2,aa,aa,2,a,aa,"[a, a]",aa,aa,aa,1
3,aal,aal,3,a,aal,"[a, a, l]",aal,aal,aal,1
4,aalii,aalii,5,a,aaiil,"[a, a, i, i, l]",aaiil,aaiil,aaiil,1
5,aam,aam,3,a,aam,"[a, a, m]",aam,aam,aam,1


In [80]:
# use the csv writer to write this to disk

# initialize some counters
n_anagram_groups = 0
n_anagrams = 0

# collect the words of every group listed in word_group_df in a single pass
# over word_df, instead of re-scanning the whole dataframe once per word group.
# groupby keeps the rows of each group in their original order, so every list
# matches what a per-group .loc lookup on word_df would return.
is_group_member = word_df['word_group'].isin(word_group_df['word_group'])
words_by_group = {
    group_key: group_words.tolist()
    for group_key, group_words
    in word_df.loc[is_group_member].groupby('word_group')['lcase']
}

# newline='' leaves the line endings to the csv writer; the with block closes
# the file even if an error is raised part-way through the loop.
with open(output_fpn, 'w', newline='') as output_file:
    cw = csv.writer(output_file)

    # enumerate over the word counts
    for n_char in n_char_list:
        curr_df = word_group_df.loc[word_group_df['n_chars'] == n_char]

        # the distinct word groups for words of this length
        word_group_list = curr_df['word_group'].unique().tolist()
        n_word_groups = len(word_group_list)

        print('...found', '{:,}'.format(n_word_groups), 'unique word groups within', n_char, 'digit words.')
        for wg in word_group_list:
            # the current word group corresponds to a group of anagrams.
            # get that group of words as a list and write it as one row.
            curr_word_list = words_by_group[wg]
            cw.writerow(curr_word_list)

            # increment the counters to find the total number of anagram groups
            # and words
            n_anagram_groups += 1
            n_anagrams += len(curr_word_list)

...found 40 unique word groups within 2 digit words.
...found 356 unique word groups within 3 digit words.
...found 1,055 unique word groups within 4 digit words.
...found 1,783 unique word groups within 5 digit words.
...found 2,607 unique word groups within 6 digit words.
...found 2,523 unique word groups within 7 digit words.
...found 2,214 unique word groups within 8 digit words.
...found 1,679 unique word groups within 9 digit words.
...found 1,001 unique word groups within 10 digit words.
...found 510 unique word groups within 11 digit words.
...found 276 unique word groups within 12 digit words.
...found 121 unique word groups within 13 digit words.
...found 70 unique word groups within 14 digit words.
...found 44 unique word groups within 15 digit words.
...found 35 unique word groups within 16 digit words.
...found 22 unique word groups within 17 digit words.
...found 10 unique word groups within 18 digit words.
...found 7 unique word groups within 19 digit words.
...found 3 u

In [81]:
# how many anagram groups and anagrams did we find?
# format the counters inline so they stay integers: overwriting them with the
# formatted strings makes this cell raise a ValueError if it is run a second
# time, because '{:,}' cannot be applied to a string.
print('...found', '{:,}'.format(n_anagram_groups), 'anagram groups consisting of',
      '{:,}'.format(n_anagrams), 'words...')

...found 14,362 anagram groups consisting of 32,890 words...


In [82]:
# now, what if we wanted to find all words that can be made from a larger word?
# for example, from 'anagram' we can make: gram, rag, nag, ram, ...

In [None]:
# we could do a lot of enumeration. But again, that will take too long.
# so, let's think about this.

In [None]:
# we don't need to examine all words - just the word groups.
# and we're going to create a way to score the words by a letter count

In [83]:
word_group_df.head()

Unnamed: 0,n_chars,word_group,word_group_count
27,2,ab,2
29,2,ad,2
30,2,ae,2
33,2,ah,2
35,2,ak,2


In [129]:
zero_df = np.zeros(shape=(len(word_group_df), 26))

In [130]:
zero_df

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [116]:
word_group_df['row_index'] = range(0, len(word_group_df))

In [117]:
word_group_df.head()

Unnamed: 0,n_chars,word_group,word_group_count,row_index
0,1,a,1,0
1,1,b,1,1
2,1,c,1,2
3,1,d,1,3
4,1,e,1,4


In [118]:
alphabet_letters = word_df.loc[word_df['n_chars']==1, 'lcase'].tolist()

In [119]:
alphabet_letters = sorted(alphabet_letters)

In [120]:
# map each letter to its position in alphabet_letters
letter_dict = {
    letter: letter_index
    for letter_index, letter in enumerate(alphabet_letters)
}

In [121]:
letter_dict

{'a': 0,
 'b': 1,
 'c': 2,
 'd': 3,
 'e': 4,
 'f': 5,
 'g': 6,
 'h': 7,
 'i': 8,
 'j': 9,
 'k': 10,
 'l': 11,
 'm': 12,
 'n': 13,
 'o': 14,
 'p': 15,
 'q': 16,
 'r': 17,
 's': 18,
 't': 19,
 'u': 20,
 'v': 21,
 'w': 22,
 'x': 23,
 'y': 24,
 'z': 25}

In [132]:
def score_word(row):
    """Add the letter counts of one word group to the zero_df count matrix.

    Parameters
    ----------
    row : mapping-like (e.g. a dataframe row)
        Must provide 'word_group' (the characters to count) and 'row_index'
        (the row of zero_df to update).

    Returns
    -------
    None
        The result is the side effect: for every character of the word group
        that is a key of letter_dict, the matching cell of the module-level
        zero_df array is incremented by one.
    """
    row_index = row['row_index']
    for letter in row['word_group']:
        if letter in letter_dict:
            # we're only concerned with the letters, not hyphens
            letter_index = letter_dict[letter]
            # update our matrix with the counts
            zero_df[row_index, letter_index] += 1

    return None

In [133]:
zero_df = np.zeros(shape=(len(word_group_df), 26))

In [134]:
outcome = word_group_df.apply(score_word, 1)

In [136]:
zero_df = pd.DataFrame(data=zero_df, columns=alphabet_letters)

In [147]:
# zero_df carries a fresh 0..n-1 index and its rows were filled by position
# (via row_index), so the labels are copied over by position as well.
# Assigning the Series itself would align on word_group_df's index labels,
# which mismatches the rows whenever that index is not 0..n-1 (for example
# after word_group_df has been filtered).
zero_df['word_group'] = word_group_df['word_group'].tolist()

In [148]:
zero_df.head()

Unnamed: 0,a,b,c,d,e,f,g,h,i,j,...,r,s,t,u,v,w,x,y,z,word_group
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,a
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,b
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,c
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,d
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,e


In [152]:
# attach the per-letter count columns to the word groups.
# no 'on' argument is given, so the join key is whatever columns the two
# frames have in common
# NOTE(review): presumably that is only 'word_group' - confirm before re-running
word_group_df = pd.merge(left=word_group_df, right=zero_df)

In [176]:
def build_search_string(row):
    """Concatenate a row's per-letter counts into a single string.

    Parameters
    ----------
    row : mapping-like (e.g. a dataframe row)
        Must provide a numeric value for every entry of the module-level
        alphabet_letters list.

    Returns
    -------
    str
        The counts, each converted with int() and then str(), joined in the
        order of alphabet_letters with no separator.

    NOTE: a count of 10 or more contributes more than one character, so the
    string only has one character per letter while every count is below 10.
    """
    return ''.join(str(int(row[letter])) for letter in alphabet_letters)
        

In [177]:
word_group_df.head()

Unnamed: 0,n_chars,word_group,word_group_count,row_index,a,b,c,d,e,f,...,r,s,t,u,v,w,x,y,z,search_string
0,1,a,1,0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_...
1,1,b,1,1,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0_1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_...
2,1,c,1,2,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0_0_1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_...
3,1,d,1,3,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0_0_0_1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_...
4,1,e,1,4,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0_0_0_0_1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_...


In [178]:
word_group_df['search_string'] = word_group_df.apply(build_search_string, 1)

In [179]:
word_group_df.head()

Unnamed: 0,n_chars,word_group,word_group_count,row_index,a,b,c,d,e,f,...,r,s,t,u,v,w,x,y,z,search_string
0,1,a,1,0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10000000000000000000000000
1,1,b,1,1,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,01000000000000000000000000
2,1,c,1,2,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,00100000000000000000000000
3,1,d,1,3,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,00010000000000000000000000
4,1,e,1,4,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,00001000000000000000000000


In [180]:
#let's pick three letter words by word group
curr_df = word_group_df.loc[word_group_df['n_chars']==3, ]

In [181]:
other_df = word_group_df.loc[word_group_df['n_chars']>3, ]

In [182]:
curr_df.head()

Unnamed: 0,n_chars,word_group,word_group_count,row_index,a,b,c,d,e,f,...,r,s,t,u,v,w,x,y,z,search_string
125,3,aab,2,125,2.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,21000000000000000000000000
126,3,aad,1,126,2.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20010000000000000000000000
127,3,aag,1,127,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20000010000000000000000000
128,3,aah,1,128,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20000001000000000000000000
129,3,aak,1,129,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20000000001000000000000000


In [183]:
curr_word = 'ram'

In [184]:
# derive the word group from curr_word instead of hard-coding it:
# a word's group is its lower-cased letters in sorted order ('ram' -> 'amr')
curr_word_group = ''.join(sorted(curr_word.lower()))
curr_ss = curr_df.loc[curr_df['word_group'] == curr_word_group, 'search_string'].iloc[0]

In [185]:
curr_ss

'10000000000010000100000000'

In [186]:
# start from every longer word group and keep only those that contain at
# least as many of each letter as the search string asks for
candidate_df = other_df.copy()
for i_pos, pos in enumerate(curr_ss):
    required_count = int(pos)
    if required_count <= 0:
        # this letter is not needed, so it does not narrow the candidates
        continue
    letter = alphabet_letters[i_pos]
    has_enough = candidate_df[letter] >= required_count
    candidate_df = candidate_df.loc[has_enough, :]
    

In [187]:
candidate_df.head()

Unnamed: 0,n_chars,word_group,word_group_count,row_index,a,b,c,d,e,f,...,r,s,t,u,v,w,x,y,z,search_string
1068,4,aamr,4,1068,2.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20000000000010000100000000
1188,4,abmr,2,1188,1.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11000000000010000100000000
1288,4,acmr,2,1288,1.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10100000000010000100000000
1392,4,admr,1,1392,1.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10010000000010000100000000
1497,4,aemr,4,1497,1.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10001000000010000100000000


In [188]:
len(candidate_df)

20986

In [190]:
output_df = word_df.loc[word_df['word_group'].isin(candidate_df['word_group']), :]

In [191]:
len(output_df)

22270

In [192]:
output_df.loc[output_df['lcase']=='anagram', :]

Unnamed: 0,word,lcase,n_chars,first_letter,word_group,sorted_word,word_group_2,word_group_3,word_group_4,word_group_count
7611,anagram,anagram,7,a,aaagmnr,"[a, a, a, g, m, n, r]",aaagmnr,aaagmnr,aaagmnr,1


In [193]:
len(output_df)

22270

In [194]:
output_df.loc[output_df['word_group']=='aaagmnr', ]

Unnamed: 0,word,lcase,n_chars,first_letter,word_group,sorted_word,word_group_2,word_group_3,word_group_4,word_group_count
7611,anagram,anagram,7,a,aaagmnr,"[a, a, a, g, m, n, r]",aaagmnr,aaagmnr,aaagmnr,1


In [None]:
testo = other_df