# 主題模型

In [1]:
import pandas as pd
import pickle
from gensim import matutils, models
import scipy.sparse

data = pd.read_pickle('data_pickle/dtm_stop.pkl')
data

Unnamed: 0,一一,一上,一上車,一下,一下子,一下子把,一下肚子,一不小心,一中,一串,...,龐大金額,龐然,龐皮,龐皮歐,龔明鑫,龜山島,龜山間,龜苓,龜裂,龜速
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
998,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
999,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [2]:
tdm = data.transpose()
tdm.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,991,992,993,994,995,996,997,998,999,1000
一一,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
一上,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
一上車,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
一下,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
一下子,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# 我們將把 turm-doc 矩陣放入新的 gensim 格式，從 df --> 稀疏矩陣 --> gensim 語料庫
sparse_counts = scipy.sparse.csr_matrix(tdm)
corpus = matutils.Sparse2Corpus(sparse_counts)

In [5]:
# Gensim 還需要所有 turms 及其在術語文檔矩陣中各自位置的字典
cv = pickle.load(open("data_pickle/cv_stop.pkl", "rb"))
id2word = dict((v, k) for k, v in cv.vocabulary_.items())

現在我們有了語料庫（term-doc mxtrix）和 id2word（位置詞典：術語），我們需要指定另外兩個參數 - 主題數量和傳遞次數。 讓我們從 2 開始主題數量，看看結果是否有意義，然後從那裡增加主題數量。

In [6]:
# 現在我們有了語料庫（term-doc mxtrix）和 id2word（位置詞典：術語）
# 我們還需要指定另外兩個參數 - 主題數量和傳遞次數
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=5, passes=10)
# LDA for num_topics = 3
lda.print_topics(3)

[(1,
  '0.005*"疫情" + 0.005*"病毒" + 0.004*"台灣" + 0.003*"一個" + 0.003*"大利" + 0.002*"表示" + 0.002*"民眾" + 0.002*"防疫" + 0.002*"發生" + 0.002*"已經"'),
 (4,
  '0.005*"台灣" + 0.003*"疫情" + 0.003*"一個" + 0.002*"民眾" + 0.002*"總統" + 0.002*"香港" + 0.002*"韓國瑜" + 0.002*"政府" + 0.002*"已經" + 0.002*"口罩"'),
 (0,
  '0.007*"口罩" + 0.005*"地區" + 0.004*"北部" + 0.003*"台灣" + 0.003*"影響" + 0.003*"天氣" + 0.002*"一個" + 0.002*"各地" + 0.002*"半部" + 0.002*"局部"')]

In [7]:
# LDA for num_topics = 5
lda.print_topics(5)

[(0,
  '0.007*"口罩" + 0.005*"地區" + 0.004*"北部" + 0.003*"台灣" + 0.003*"影響" + 0.003*"天氣" + 0.002*"一個" + 0.002*"各地" + 0.002*"半部" + 0.002*"局部"'),
 (1,
  '0.005*"疫情" + 0.005*"病毒" + 0.004*"台灣" + 0.003*"一個" + 0.003*"大利" + 0.002*"表示" + 0.002*"民眾" + 0.002*"防疫" + 0.002*"發生" + 0.002*"已經"'),
 (2,
  '0.006*"疫情" + 0.004*"指揮" + 0.004*"感染" + 0.003*"中心" + 0.003*"台灣" + 0.003*"一個" + 0.003*"武漢" + 0.003*"中央" + 0.003*"確診" + 0.003*"病毒"'),
 (3,
  '0.005*"台灣" + 0.004*"總統" + 0.004*"疫情" + 0.002*"一個" + 0.002*"英文" + 0.002*"表示" + 0.002*"民眾" + 0.002*"時間" + 0.002*"目前" + 0.002*"台北"'),
 (4,
  '0.005*"台灣" + 0.003*"疫情" + 0.003*"一個" + 0.002*"民眾" + 0.002*"總統" + 0.002*"香港" + 0.002*"韓國瑜" + 0.002*"政府" + 0.002*"已經" + 0.002*"口罩"')]

## 測試一下結果

In [8]:
# 輸入文句，看看是屬于哪一個 topic?
other_text = ["焦耳 國際 單位 制 下 的 能量 單位 焦耳 相當 於 牛頓 的 力 作用 在 物體 上"]
other_data = cv.fit_transform(other_text)
other_dtm = pd.DataFrame(other_data.toarray(), columns=cv.get_feature_names_out())
test_dtm = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(other_dtm.transpose()))
test_topics = lda[test_dtm]

In [9]:
test_topics[0]

[(0, 0.01847245),
 (1, 0.6733303),
 (2, 0.2712806),
 (3, 0.018537836),
 (4, 0.018378811)]