### TireTree-based Keyword Search

In [36]:
class TrieNode:
    def __init__(self):
        self.children = {}
        self.is_end = False

In [46]:
class Trie:
    def __init__(self):
        self.root = TrieNode()
        
    def insert(self, word):
        node = self.root
        
        for char in word:
            if char not in node.children:
                node.children[char] = TrieNode()
            node = node.children[char]
        node.is_end = True
        
    def find(self, prefix):
        node = self.root
        
        for char in prefix:
            node = node.children.get(char)
            if not node and char in self.root.children:
                node = self.root.children.get(char)
            elif not node:
                node = self.root
            if node.is_end:
                return True
           

        return False

In [47]:
keywords = ["台積電", "晶圓", "電子", "電路", "半導體", "韌體", "台積"]

In [48]:
with open('./news_titles.txt', 'r') as f:
    titles = f.readlines()

In [97]:
import time
num_keywords = []
trie_costs = []
bf_costs = []
for i in range(1,1002,250):
    tmp_keywords = ["A"] * i + keywords
    tmp_trie_costs = []
    tmp_bf_costs = []
    num_titles = []
    num_keywords.append(len(tmp_keywords))
    for i in range(1, 251, 50):
        tmp_titles = titles * i
        num_titles.append(len(tmp_titles))
        
        start_time = time.time()
        trie_tree = Trie()
        is_matches = []
        for keyword in tmp_keywords:
            trie_tree.insert(keyword)
        for title in tmp_titles:
            is_matches.append(trie_tree.find(title))
        tmp_trie_costs.append(time.time() - start_time)
        
        start_time = time.time()
        bf_results = []
        for title in tmp_titles:
            for keyword in tmp_keywords:
                if keyword in title:
                    bf_results.append(title)
                    break
        tmp_bf_costs.append(time.time() - start_time)
    trie_costs.append(tmp_trie_costs)
    bf_costs.append(tmp_bf_costs)

In [98]:
import pandas as pd
trie_df = pd.DataFrame(data=trie_costs,columns=num_titles,index=num_keywords)
trie_df

Unnamed: 0,340,17340,34340,51340,68340
8,0.001858,0.073991,0.146461,0.222226,0.289016
258,0.001791,0.076102,0.150156,0.229883,0.309911
508,0.001784,0.078693,0.150274,0.229878,0.312178
758,0.001829,0.076468,0.156772,0.220338,0.304582
1008,0.00201,0.074446,0.147618,0.227561,0.294658


In [99]:
bf_df = pd.DataFrame(data=bf_costs,columns=num_titles,index=num_keywords)
bf_df

Unnamed: 0,340,17340,34340,51340,68340
8,0.000246,0.011542,0.022832,0.034205,0.045335
258,0.00661,0.332057,0.657595,0.983928,1.307396
508,0.012816,0.652283,1.291974,1.931831,2.570547
758,0.019108,0.972633,1.925985,2.879633,3.832785
1008,0.025403,1.292857,2.560573,3.828218,5.095213


In [38]:
keywords = ["A"]*1000 + 
trie_tree = Trie()
for keyword in keywords:
    trie_tree.insert(keyword)

In [39]:
len(keywords)

1007

In [41]:
titles = org_titles * 1000
len(titles)

340000

In [42]:
%%time
is_matches = []
for title in titles:
    is_matches.append(trie_tree.find(title))

CPU times: user 1.43 s, sys: 0 ns, total: 1.43 s
Wall time: 1.43 s


In [43]:
from itertools import compress
results = list(compress(titles, is_matches))

In [44]:
results

['Google App抽佣砍半 國內業者：有利開發環境\n',
 '創辦人鄭崇華堅持的事 成企業DNA\n',
 '以國電子交易平台 將赴美上市\n',
 '半導體三雄 319外資槍擊\n',
 '法國19日恢復施打AZ疫苗 總理也將接種\n',
 '全球晶圓廠設備支出 SEMI：將連三年創新高\n',
 'AFMA線上培訓 金融競爭力up\n',
 '面試MA 中信三總座直播談要訣\n',
 '針對AZ疫苗注射 世衛組織專家再度表態支持\n',
 '台積電 談水情 上半年營運不受影響\n',
 '台積籌錢擴廠 發債211億\n',
 'AFMA 推動全方位金融發展\n',
 '台積、世界獲利 外資唱高\n',
 '中國電子科技集團大突破 晶片製造設備國產化\n',
 '電子廠大舉增持 2月人民幣存款回升\n',
 '官方力挺 中芯深圳擴產12吋晶圓 斥資23.5億美元\n',
 '陳時中掛保證 AZ疫苗沒問題\n',
 '國泰航空測試電子健康護照 可核實乘客疫苗接種紀錄\n',
 '歐盟藥品管理局：AZ疫苗安全有效 好處大於風險\n',
 '金融座談／許永欽：融入DNA 發展軟實力\n',
 '除息成買點！ 單日盤中交易逾152萬股 台積 重登零股交易人氣王\n',
 '台積拉尾盤 除息行情點火\n',
 '歐洲多國停止施打AZ疫苗 國際油價收低\n',
 'AZ疫苗預防效力達 79%\n',
 'AES申購超熱 中籤率僅2.1％\n',
 '超微追單台積７奈米 新EPYC 3處理器強勢登場\n',
 '台積電ADR隨費半上漲0.2% 費半漲1.3%\n',
 '台積訂單滿到明年底\n',
 '義大利總理：19日起恢復施打AZ疫苗\n',
 'IEA：油市未現新超級周期\n',
 'IC Insights：追趕台積 五年最少砸1,500億美元\n',
 'Google App抽佣砍半 國內業者：有利開發環境\n',
 '創辦人鄭崇華堅持的事 成企業DNA\n',
 '以國電子交易平台 將赴美上市\n',
 '半導體三雄 319外資槍擊\n',
 '法國19日恢復施打AZ疫苗 總理也將接種\n',
 '全球晶圓廠設備支出 SEMI：將連三年創新高\n',
 'AFMA線上培訓 金融競爭力up\n',
 '面試MA 中信三總座直播談要訣\n',
 '針對AZ疫苗注射

In [45]:
%%time
bf_results = []
for title in titles:
    for keyword in keywords:
        if keyword in title:
            bf_results.append(title)
            break

CPU times: user 25.8 s, sys: 0 ns, total: 25.8 s
Wall time: 25.8 s


In [19]:
bf_results

['以國電子交易平台 將赴美上市\n',
 '半導體三雄 319外資槍擊\n',
 '全球晶圓廠設備支出 SEMI：將連三年創新高\n',
 '台積電 談水情 上半年營運不受影響\n',
 '台積籌錢擴廠 發債211億\n',
 '台積、世界獲利 外資唱高\n',
 '中國電子科技集團大突破 晶片製造設備國產化\n',
 '電子廠大舉增持 2月人民幣存款回升\n',
 '官方力挺 中芯深圳擴產12吋晶圓 斥資23.5億美元\n',
 '國泰航空測試電子健康護照 可核實乘客疫苗接種紀錄\n',
 '除息成買點！ 單日盤中交易逾152萬股 台積 重登零股交易人氣王\n',
 '台積拉尾盤 除息行情點火\n',
 '超微追單台積７奈米 新EPYC 3處理器強勢登場\n',
 '台積電ADR隨費半上漲0.2% 費半漲1.3%\n',
 '台積訂單滿到明年底\n',
 'IC Insights：追趕台積 五年最少砸1,500億美元\n',
 '以國電子交易平台 將赴美上市\n',
 '半導體三雄 319外資槍擊\n',
 '全球晶圓廠設備支出 SEMI：將連三年創新高\n',
 '台積電 談水情 上半年營運不受影響\n',
 '台積籌錢擴廠 發債211億\n',
 '台積、世界獲利 外資唱高\n',
 '中國電子科技集團大突破 晶片製造設備國產化\n',
 '電子廠大舉增持 2月人民幣存款回升\n',
 '官方力挺 中芯深圳擴產12吋晶圓 斥資23.5億美元\n',
 '國泰航空測試電子健康護照 可核實乘客疫苗接種紀錄\n',
 '除息成買點！ 單日盤中交易逾152萬股 台積 重登零股交易人氣王\n',
 '台積拉尾盤 除息行情點火\n',
 '超微追單台積７奈米 新EPYC 3處理器強勢登場\n',
 '台積電ADR隨費半上漲0.2% 費半漲1.3%\n',
 '台積訂單滿到明年底\n',
 'IC Insights：追趕台積 五年最少砸1,500億美元\n',
 '以國電子交易平台 將赴美上市\n',
 '半導體三雄 319外資槍擊\n',
 '全球晶圓廠設備支出 SEMI：將連三年創新高\n',
 '台積電 談水情 上半年營運不受影響\n',
 '台積籌錢擴廠 發債211億\n',
 '台積、世界獲利 外資唱高\n',
 '中國電子科技集團大突破 晶片製造設備國產化\n