### Trie-based Keyword Search

In [1]:
class TrieNode:
    """A single node of the keyword trie."""

    def __init__(self):
        # Outgoing edges: one entry per next character, mapping to the child node.
        self.children = dict()
        # Set to True on the node where an inserted word ends.
        self.is_end = False

In [2]:
class Trie:
    """Character trie over a set of keywords.

    Keywords are added with :meth:`insert`; :meth:`find` reports whether any
    of them occurs as a substring of a given text.
    """

    def __init__(self):
        # The root stands for the empty string; every keyword is a path from it.
        self.root = TrieNode()

    def insert(self, word):
        """Add ``word`` to the trie.

        Nodes are created only for characters not already on the path, so
        inserting a word that is already present leaves the trie unchanged.
        """
        node = self.root

        for char in word:
            child = node.children.get(char)
            if child is None:
                child = node.children[char] = TrieNode()
            node = child
        node.is_end = True

    def find(self, prefix):
        """Return True if any inserted keyword occurs anywhere in ``prefix``.

        Despite its name (kept so existing callers keep working), ``prefix``
        is the full text to scan. The trie is walked from every start offset
        of the text, so a match is never lost when a partial match fails
        midway (e.g. keyword "aab" inside "aaab"). The work is bounded by
        len(text) times the length of the longest keyword.

        An empty keyword, if one was inserted, matches every text, mirroring
        ``"" in text``.
        """
        root = self.root
        if root.is_end:
            return True

        text_len = len(prefix)
        for start in range(text_len):
            node = root
            for pos in range(start, text_len):
                node = node.children.get(prefix[pos])
                if node is None:
                    # No keyword continues with this character from `start`;
                    # try the next start offset.
                    break
                if node.is_end:
                    return True

        return False

In [3]:
# Keywords to look for in the news titles.
# Note that "台積" is a prefix of "台積電", so the two share a trie path.
keywords = [
    "台積電",
    "晶圓",
    "電子",
    "電路",
    "半導體",
    "韌體",
    "台積",
]

In [4]:
with open('./news_titles.txt', 'r') as f:
    titles = f.readlines()

In [5]:
import time
# Benchmark: trie lookup vs. brute-force substring search over a grid of
# keyword-list sizes (rows) and title counts (columns).
# Each cost list ends up as one row per keyword-list size, one entry per title count.
num_keywords = []
trie_costs = []
bf_costs = []
for num_padding in range(1, 1002, 250):
    # Pad the keyword list with identical "A" entries. Re-inserting the same
    # word does not grow the trie, whereas brute force tests every copy.
    tmp_keywords = ["A"] * num_padding + keywords
    tmp_trie_costs = []
    tmp_bf_costs = []
    # Rebuilt on every outer pass with the same values; the last copy is what
    # later cells use as column labels.
    num_titles = []
    num_keywords.append(len(tmp_keywords))
    # A distinct name for the inner loop variable so it no longer clobbers
    # the outer one.
    for repeat in range(1, 251, 50):
        tmp_titles = titles * repeat
        num_titles.append(len(tmp_titles))

        # Trie approach; the measured time includes building the trie.
        # perf_counter() is monotonic and high-resolution, unlike time().
        start_time = time.perf_counter()
        trie_tree = Trie()
        is_matches = []
        for keyword in tmp_keywords:
            trie_tree.insert(keyword)
        for title in tmp_titles:
            is_matches.append(trie_tree.find(title))
        tmp_trie_costs.append(time.perf_counter() - start_time)

        # Brute force: test each keyword against each title, stopping at the
        # first hit per title.
        start_time = time.perf_counter()
        bf_results = []
        for title in tmp_titles:
            for keyword in tmp_keywords:
                if keyword in title:
                    bf_results.append(title)
                    break
        tmp_bf_costs.append(time.perf_counter() - start_time)
    trie_costs.append(tmp_trie_costs)
    bf_costs.append(tmp_bf_costs)

In [6]:
import pandas as pd
# Trie timings in seconds: one row per keyword-list size, one column per title count.
trie_df = pd.DataFrame(trie_costs, index=num_keywords, columns=num_titles)
trie_df

Unnamed: 0,340,17340,34340,51340,68340
8,0.001413,0.073018,0.147575,0.210869,0.280949
258,0.00167,0.07177,0.141315,0.21743,0.280499
508,0.001683,0.073545,0.141412,0.212318,0.289525
758,0.001854,0.071312,0.145125,0.210744,0.280661
1008,0.001924,0.071115,0.140125,0.215817,0.278619


In [7]:
# Brute-force timings in seconds: one row per keyword-list size, one column per title count.
bf_df = pd.DataFrame(bf_costs, index=num_keywords, columns=num_titles)
bf_df

Unnamed: 0,340,17340,34340,51340,68340
8,0.000216,0.010739,0.021196,0.031629,0.042025
258,0.005972,0.304542,0.602538,0.901527,1.199524
508,0.011721,0.599167,1.183635,1.773852,2.356044
758,0.017437,0.891555,1.766044,2.640474,3.51375
1008,0.023233,1.184505,2.346336,3.507877,4.67035


In [8]:
# tabulate is needed by DataFrame.to_markdown(). %pip installs into the
# environment of the running kernel, which a bare `!pip` does not guarantee.
%pip install tabulate

You should consider upgrading via the '/home/ubuntu/anaconda3/bin/python -m pip install --upgrade pip' command.[0m


In [9]:
# Emit the trie timings as a Markdown table (to_markdown relies on the tabulate package).
print(trie_df.to_markdown())

|      |        340 |     17340 |    34340 |    51340 |    68340 |
|-----:|-----------:|----------:|---------:|---------:|---------:|
|    8 | 0.00141263 | 0.0730178 | 0.147575 | 0.210869 | 0.280949 |
|  258 | 0.00167012 | 0.0717704 | 0.141315 | 0.21743  | 0.280499 |
|  508 | 0.00168252 | 0.0735452 | 0.141412 | 0.212318 | 0.289525 |
|  758 | 0.00185394 | 0.0713117 | 0.145125 | 0.210744 | 0.280661 |
| 1008 | 0.00192428 | 0.071115  | 0.140125 | 0.215817 | 0.278619 |


In [10]:
# Emit the brute-force timings as a Markdown table (to_markdown relies on the tabulate package).
print(bf_df.to_markdown())

|      |         340 |     17340 |     34340 |     51340 |     68340 |
|-----:|------------:|----------:|----------:|----------:|----------:|
|    8 | 0.000215769 | 0.0107393 | 0.0211964 | 0.0316288 | 0.0420249 |
|  258 | 0.00597215  | 0.304542  | 0.602538  | 0.901527  | 1.19952   |
|  508 | 0.0117214   | 0.599167  | 1.18363   | 1.77385   | 2.35604   |
|  758 | 0.017437    | 0.891555  | 1.76604   | 2.64047   | 3.51375   |
| 1008 | 0.0232332   | 1.1845    | 2.34634   | 3.50788   | 4.67035   |


In [11]:
# Build a trie from the base keyword list only (no padding entries) for the
# single-run timing comparison in the following cells.
trie_tree = Trie()
for keyword in keywords:
    trie_tree.insert(keyword)

In [12]:
# Number of base keywords held in the trie built from `keywords`.
len(keywords)

7

In [13]:
%%time
# One boolean per title, in title order: did the trie report a keyword hit?
is_matches = [trie_tree.find(title) for title in titles]

CPU times: user 1.63 ms, sys: 3 µs, total: 1.63 ms
Wall time: 1.63 ms


In [14]:
from itertools import compress
# Keep only the titles whose corresponding entry in is_matches is truthy.
matched_titles = compress(titles, is_matches)
results = list(matched_titles)

In [15]:
%%time
# Brute-force baseline: keep each title that contains at least one keyword.
# any() stops at the first matching keyword, like the explicit break it replaces.
bf_results = [
    title
    for title in titles
    if any(keyword in title for keyword in keywords)
]

CPU times: user 214 µs, sys: 0 ns, total: 214 µs
Wall time: 216 µs
