-
Notifications
You must be signed in to change notification settings - Fork 5
/
five_cliques.py
154 lines (126 loc) · 4.33 KB
/
five_cliques.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
"""
# Copyright (C) 2022 - Benjamin Paassen
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
from tqdm import tqdm
import csv
# prepare a data structure for all five-letter words in string and set representation
anagrams = {}
graph = {}
# This is a compact representation of a "char set" that fits in just 26 bits
# each bit represents a letter in the alphabet
# we can quickly check for an intersection between two words with the bitwise AND operator
# or union them with bitwise OR
def alphaBit(word):
ret = 0
for char in word:
alphaIndex = (ord(char) | 0x20) - ord('a')
ret |= 1 << alphaIndex
return ret
def bitCount(num):
#return num.bit_count() # faster, but requires python 3.10+
count = 0
while num:
num &= num - 1
count += 1
return count
print('--- reading words file ---')
# words_alpha.txt from https://github.com/dwyl/english-words
with open('words_alpha.txt') as f:
for word in tqdm(f):
word = word[:-1]
if len(word) != 5:
continue
# compute set representation of the word
char_set = alphaBit(word)
if bitCount(char_set) != 5:
continue
if char_set in anagrams:
anagrams[char_set].append(word)
continue
anagrams[char_set] = [word]
graph[char_set] = set()
print('--- building neighborhoods ---')
# compute the 'neighbors' for each word, i.e. other words which have entirely
# distinct letters
words = sorted(list(anagrams.keys()))
for i, char_set in tqdm(enumerate(words), total=len(words)):
neighbors = graph[char_set]
# We only need one-direction of each relationship
for j, other_set in enumerate(words[i+1:], i+1):
if char_set & other_set == 0:
neighbors.add(j)
print('--- finding cliques ---')
# We maintain a set of tree branches that we have already visited and know
# don't contain any clique subsets
#
# For performance reasons, it's only worth pruning layers 2 and 3
# layer 1 is already cached with the neighbor calculation above, and if you
# get to layer 4, it's quicker to just complete the search
prune = set()
# start clique finding
graphs = [(graph[char_set], char_set) for char_set in words]
Cliques = []
for i in tqdm(range(len(words))):
Ni, i_set = graphs[i]
for j in Ni:
Nj, j_set = graphs[j]
if (i_set | j_set) in prune:
continue
# the remaining candidates are only the words in the intersection
# of the neighborhood sets of i and j
Nij = Ni & Nj
if len(Nij) < 3:
prune.add(i_set | j_set)
continue
have_ij = False
for k in Nij:
Nk, k_set = graphs[k]
if (i_set | j_set | k_set) in prune:
continue
# intersect with neighbors of k
Nijk = Nij & Nk
if len(Nijk) < 2:
prune.add(i_set | j_set | k_set)
continue
have_ijk = False
for l in Nijk:
# intersect with neighbors of l
Nijkl = Nijk & graphs[l][0]
# all remaining neighbors form a 5-clique with i, j, k, and l
for r in Nijkl:
Cliques.append([i, j, k, l, r])
have_ij = True
have_ijk = True
if not have_ijk:
# we didn't find anything on this branch, prune it
prune.add(i_set | j_set | k_set)
if not have_ij:
# we didn't find anything on this branch, prune it
prune.add(i_set | j_set)
print('completed! Found %d cliques' % len(Cliques))
print(f'{len(prune)} branches of the tree were pruned')
def RecursiveExpand(lst):
head, *tail = lst
if not tail:
return [[w] for w in anagrams[words[head]]]
tail = RecursiveExpand(tail)
return [[w, *t] for w in anagrams[words[head]] for t in tail]
ExpandedCliques = []
for cliq in Cliques:
ExpandedCliques += [sorted(c) for c in RecursiveExpand(cliq)]
print(f'expanded to {len(ExpandedCliques)} cliques with anagrams')
print('--- write to output ---')
with open('cliques.csv', 'w', newline='', encoding='utf-8') as f:
writer = csv.writer(f, delimiter = '\t')
for cliq_words in sorted(ExpandedCliques):
writer.writerow(cliq_words)