# Studying Anagrams in the English Language

In [1]:
# Location of the word list (one word per line).
# NOTE(review): this is an absolute, user-specific path, so it will not resolve
# on another machine; point it at your local copy of the `words` file.
address = '/Users/mattwilliams/Documents/LinkedInLEarning/PythonDataAnalysis/Exercise Files/Ch3/03_02/words'

In [2]:
# Load the file as a list of lines; each entry keeps its trailing newline.
with open(address, 'r') as f:
    wordlist = list(f)

In [3]:
wordlist[:10]

['A\n',
 'a\n',
 'aa\n',
 'aal\n',
 'aalii\n',
 'aam\n',
 'Aani\n',
 'aardvark\n',
 'aardwolf\n',
 'Aaron\n']

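The word list has been read in, but each entry still ends with a newline character, which we want to remove. We will also lower-case every word so that capitalised entries such as 'Aaron' are treated like any other word.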

In [4]:
len(wordlist)

235886

In [5]:
# Normalise every entry: drop surrounding whitespace (including the newline)
# and fold to lower case.
wordclean = [line.strip().lower() for line in wordlist]

In [6]:
wordclean[:10]

['a',
 'a',
 'aa',
 'aal',
 'aalii',
 'aam',
 'aani',
 'aardvark',
 'aardwolf',
 'aaron']

In [7]:
# Pass the words through a set to drop duplicates, then unpack back into a
# list. The resulting order is arbitrary.
wordunique = [*set(wordclean)]

In [8]:
wordunique[:10]

['swannecked',
 'tungusian',
 'aggie',
 'periphrastically',
 'ursal',
 'imponderability',
 'adendritic',
 'shrimpfish',
 'facetious',
 'unarticled']

I converted the list to a set, which keeps only one copy of each element, in order to get rid of the duplicates (for example, 'A' and 'a' both became 'a'). A set is unordered, however, so the alphabetical order of the original list was lost and we need to sort again.

In [9]:
# Sort the list in place to restore alphabetical order.
wordunique.sort()

In [10]:
wordunique[:10]

['a',
 'aa',
 'aal',
 'aalii',
 'aam',
 'aani',
 'aardvark',
 'aardwolf',
 'aaron',
 'aaronic']

This pipeline of cleaning operations could have been written more concisely as a single expression that cleans, de-duplicates, and sorts the words.

In [11]:
# Clean, de-duplicate and sort in one expression. The set comprehension removes
# duplicates as the words are cleaned, and sorted() accepts any iterable and
# returns a new list, so no intermediate list or list() call is needed.
wordclean = sorted({word.strip().lower() for word in wordlist})

In [12]:
wordclean[:10]

['a',
 'aa',
 'aal',
 'aalii',
 'aam',
 'aani',
 'aardvark',
 'aardwolf',
 'aaron',
 'aaronic']

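Anagrams are words made from exactly the same letters, each used the same number of times, in a different order. A clever way to find them is to use the built-in `sorted` function: two words are anagrams precisely when their sorted letters are identical.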

In [13]:
sorted('elvis')

['e', 'i', 'l', 's', 'v']

In [14]:
sorted('lives') == sorted('elvis')

True

In [15]:
def signature(word):
    """Return the anagram signature of ``word``: its characters in sorted order.

    Two words are anagrams of each other exactly when their signatures are
    equal. The input is normalised the same way the word list is cleaned
    (surrounding whitespace stripped, folded to lower case), so a query such
    as ``'Elvis'`` or ``'elvis\\n'`` matches the lower-cased entries.

    Parameters
    ----------
    word : str
        The word to compute a signature for.

    Returns
    -------
    str
        The normalised characters of ``word`` joined in sorted order.
    """
    normalised = word.strip().lower()
    return ''.join(sorted(normalised))

In [16]:
signature('elvis')

'eilsv'

In [17]:
def anagram(myword):
    """Return every word in ``wordclean`` that is an anagram of ``myword``.

    This is a linear scan: each call computes the signature of every word in
    the list, so it is slow when many lookups are needed. Matches are returned
    in the order they appear in ``wordclean``, and ``myword`` itself is
    included when it is in the list.
    """
    # The query's signature does not change during the scan, so compute it
    # once instead of once per candidate word.
    target = signature(myword)
    return [word for word in wordclean if signature(word) == target]

In [18]:
anagram('dictionary')

['dictionary', 'indicatory']

This function works, but a single call takes a considerable amount of time, because it re-sorts the letters of every word in the list each time it is called.

In [22]:
%timeit -n 10 anagram('dictionary')

278 ms ± 1.48 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


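278 ms per lookup is not good. Since the word list never changes, we can compute every word's signature once, store the words in a dictionary keyed by signature, and then answer each query with a single dictionary lookup.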

In [24]:
import collections

In [25]:
# Index mapping signature -> list of words with that signature.
# defaultdict(list) supplies an empty list the first time a key is accessed,
# so words can be appended without checking whether the key exists.
words_bysig = collections.defaultdict(list)

In [27]:
words_bysig

defaultdict(list, {})

In [28]:
# Rebuild the index from scratch each time this cell runs. Appending into an
# index left over from a previous run would add every word a second time.
words_bysig = collections.defaultdict(list)
for word in wordclean:
    # Words that share a signature are anagrams of one another.
    words_bysig[signature(word)].append(word)

In [33]:
# Preview the first 100 (signature, words) entries of the index, one per line.
for x in list(words_bysig.items())[:100]:
    print(x)

('a', ['a'])
('aa', ['aa'])
('aal', ['aal', 'ala'])
('aaiil', ['aalii'])
('aam', ['aam', 'ama'])
('aain', ['aani'])
('aaadkrrv', ['aardvark'])
('aadflorw', ['aardwolf'])
('aanor', ['aaron'])
('aacinor', ['aaronic', 'nicarao', 'ocarina'])
('aaacilnor', ['aaronical'])
('aaeinort', ['aaronite', 'aeration'])
('aaciinort', ['aaronitic'])
('aaru', ['aaru', 'aura'])
('ab', ['ab', 'ba'])
('aab', ['aba', 'baa'])
('aabbdeh', ['ababdeh'])
('aaabbu', ['ababua'])
('aabc', ['abac', 'caba'])
('aaabc', ['abaca'])
('aaabcet', ['abacate'])
('aaabcy', ['abacay'])
('aaabceint', ['abacinate'])
('aaabciinnot', ['abacination'])
('aabccissu', ['abaciscus'])
('aabcist', ['abacist'])
('aabck', ['aback'])
('aaabcilnt', ['abactinal'])
('aaabcillnty', ['abactinally'])
('aabcinot', ['abaction'])
('aabcort', ['abactor', 'acrobat'])
('aabclsuu', ['abaculus'])
('aabcsu', ['abacus'])
('aabdeit', ['abadite'])
('aabff', ['abaff'])
('aabft', ['abaft', 'bafta'])
('aaabceins', ['abaisance'])
('aabeirs', ['abaiser'])
('aabde

In [34]:
def anagram_fast(myword):
    """Return the anagrams of ``myword`` using the precomputed ``words_bysig`` index.

    The lookup is a single dictionary access on the word's signature. An empty
    list is returned when no word in the index has that signature.
    """
    # Use .get() rather than indexing: words_bysig is a defaultdict, and
    # indexing it with an unknown signature would insert an empty list, so
    # every failed lookup would grow the index.
    matches = words_bysig.get(signature(myword), [])
    # Return a copy so callers cannot modify the lists stored in the index.
    return list(matches)