-
Notifications
You must be signed in to change notification settings - Fork 1
/
pseudo_diceware.py
64 lines (54 loc) · 2.39 KB
/
pseudo_diceware.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Attempt to fix an encoding issue (the declaration above must sit on line 1 or 2 to take effect); see https://stackoverflow.com/questions/10561923/unicodedecodeerror-ascii-codec-cant-decode-byte-0xef-in-position-1
import sys
#reload(sys)
#sys.setdefaultencoding('utf-8')
import random
import re
"""
Some code snippets taken from:
https://stackoverflow.com/questions/6255641/counting-the-number-of-unique-words-in-a-document-with-python
Dicts are implemented as hash tables and are therefore orders of magnitude faster than lists (i.e. arrays) for lookups.
See here:
https://stackoverflow.com/questions/38927794/python-dictionary-vs-list-which-is-faster/38927968
---> Will use the dict method below instead of a list.
Random numbers generated by the SystemRandom class in the random library.
https://pynative.com/cryptographically-secure-random-data-in-python/
I chose to use this instead of the secrets library (available from Python 3.6) for backwards compatibility.
The documentation for random recommends using the SystemRandom class for crypto applications. See here:
https://docs.python.org/2/library/random.html
"""
def string_cleaner(path_to_book):
    """Read a text file and reduce it to lowercase words separated by spaces.

    Every character other than an ASCII letter or a space (digits,
    punctuation, newlines, tabs, accented letters, ...) is replaced by a
    space, the text is lower-cased, and runs of spaces are collapsed into a
    single space. The result may still begin or end with one space.

    Args:
        path_to_book: Path to a UTF-8 encoded text file.

    Returns:
        The cleaned text as a single string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being left open until garbage collection.
    with open(path_to_book, encoding='utf8') as book_file:
        raw_text = book_file.read()
    # Replace everything that is not a-z, A-Z or a space with a space.
    clean_string = re.sub('[^a-zA-Z ]', ' ', raw_text)
    # Convert to lower case.
    clean_string = clean_string.lower()
    # Condense runs of spaces (and any tab) to single spaces.
    clean_string = re.sub(' +|\t', ' ', clean_string)
    return clean_string
def create_wordcount_dict(clean_book_string):
    """Build a dictionary whose keys are the unique words of the cleaned text.

    The dictionary serves as a de-duplicated word list: every value is 1
    (it is not an occurrence count). Keys appear in order of first
    occurrence. The number of unique words is printed to stdout.

    Args:
        clean_book_string: Text consisting of words separated by whitespace.

    Returns:
        A dict mapping each unique word to 1. Empty if the text has no words.
    """
    # split() with no argument ignores leading, trailing and repeated
    # whitespace, so the empty string is never registered as a "word"
    # (split(' ') would yield '' for text that starts or ends with a space).
    count = dict.fromkeys(clean_book_string.split(), 1)
    print("There are "+str(len(count))+" unique words in the file.")
    return count
def diceware_calculator(count_dictionary, n_words):
    """Pick words at random (with replacement) from the dictionary's keys.

    Randomness comes from random.SystemRandom, i.e. the operating system's
    entropy source, rather than the default pseudo-random generator.

    Args:
        count_dictionary: Dict whose keys are the candidate words.
        n_words: Number of words to pick. Zero or a negative number yields
            an empty list.

    Returns:
        A list of n_words keys; the same key may appear more than once.

    Raises:
        ValueError: If words are requested but the dictionary is empty.
    """
    secure_rng = random.SystemRandom()
    # Materialise the key list once instead of rebuilding it for every pick.
    words = list(count_dictionary)
    if n_words > 0 and not words:
        raise ValueError("cannot pick words from an empty dictionary")
    return [secure_rng.choice(words) for _ in range(n_words)]
def main(path_to_book, n_random_words):
    """Generate a list of random words drawn from the vocabulary of a book.

    The file is cleaned with string_cleaner, its unique words are collected
    with create_wordcount_dict, and n_random_words of them are selected with
    diceware_calculator.

    Args:
        path_to_book: Path to the text file supplying the vocabulary.
        n_random_words: How many words to select.

    Returns:
        The list of selected words.
    """
    unique_words = create_wordcount_dict(string_cleaner(path_to_book))
    return diceware_calculator(unique_words, n_random_words)
if __name__ == '__main__':
    # Command line: <script> <path_to_book> <n_random_words>
    # Exit with a usage message rather than an IndexError traceback when
    # arguments are missing; extra arguments are ignored.
    if len(sys.argv) < 3:
        sys.exit("usage: " + sys.argv[0] + " <path_to_book> <n_random_words>")
    path_to_book = sys.argv[1]
    n_random_words = int(sys.argv[2])
    word_list = main(path_to_book, n_random_words)
    print(word_list)