-
Notifications
You must be signed in to change notification settings - Fork 0
/
topic_modeling.py
62 lines (52 loc) · 1.9 KB
/
topic_modeling.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np
from sklearn import decomposition
from scipy import linalg
import matplotlib.pyplot as plt
def data_processing(data):
    """
    Build a bag-of-words count matrix from the messages in ``data.text``.

    :param data: object exposing a ``text`` attribute with the raw documents
        (presumably a pandas DataFrame - confirm against the caller).
    :return: tuple ``(vectors, words_vocab)``; ``vectors`` is the dense
        (documents, vocab) count matrix and ``words_vocab`` is a numpy array
        mapping each column index of ``vectors`` to its word.
    """
    cv = CountVectorizer()
    vectors = cv.fit_transform(data.text).todense()  # (documents, vocab)
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on releases that predate it.
    feature_names = getattr(cv, "get_feature_names_out", None)
    if feature_names is None:
        feature_names = cv.get_feature_names
    # dtype=str keeps the same unicode array the old list-based call produced
    # (get_feature_names_out() returns an object array).
    words_vocab = np.array(feature_names(), dtype=str)  # mapping numbers to words
    return vectors, words_vocab
def get_topics(vh, words_vocab, num_top_words=8):
    """
    Summarise every topic by its highest-weighted words.

    Each row of ``vh`` is treated as one topic; the entries of the row are
    sorted and the words at the largest positions are looked up in
    ``words_vocab``.

    :param vh: iterable of per-topic weight rows (topic to words).
    :param words_vocab: array mapping a column index to its word.
    :param num_top_words: how many words to keep per topic.
    :return: list with one space-separated string of words per topic,
        strongest word first.
    """
    # Walking an ascending argsort backwards yields the largest weights first.
    strongest_first = slice(None, -num_top_words - 1, -1)
    summaries = []
    for topic in vh:
        ranked = np.argsort(topic)[strongest_first]
        summaries.append(' '.join(words_vocab[idx] for idx in ranked))
    return summaries
def print_messages_by_topic(data, u, num_of_massages=8, num_of_topics=6):
    """
    Print, for each topic, the messages that weigh most on that topic.

    :param data: object whose ``text`` and ``label`` attributes support
        positional ``.iloc`` lookup (presumably a pandas DataFrame - confirm
        against the caller).
    :param u: 2-D array indexed as ``u[:, topic]``; each column holds the
        per-message weights of one topic.
    :param num_of_massages: how many top messages to print per topic.
    :param num_of_topics: maximum number of topics (columns of ``u``) to
        print. Defaults to 6, the value that used to be hard-coded, and is
        clamped to the number of columns actually present so a narrower
        ``u`` no longer raises IndexError.
    :return: None; everything is written to stdout.
    """
    topics_to_show = min(num_of_topics, u.shape[1])
    for topic in range(topics_to_show):
        print("------------------------")
        print("topic number " + str(topic))
        cur_column = u[:, topic]
        # Reversed tail of the ascending argsort: heaviest messages first.
        topic_messages = np.argsort(cur_column)[:-num_of_massages - 1:-1]
        for j in topic_messages:
            print("message ID " + str(j))
            print(data.text.iloc[j])
            print("label: " + str(data.label.iloc[j]))
def svd(vectors):
    """
    Compute the reduced singular value decomposition of ``vectors``.

    :param vectors: 2-D matrix to factorise, e.g. the (documents, vocab)
        count matrix.
    :return: tuple ``(u, s, vh)``:
        u  - maps messages to topics
        s  - singular values, i.e. the importance of each topic
        vh - maps topics to words
    """
    # full_matrices=False asks scipy for the reduced (economy-size) factors.
    factors = linalg.svd(vectors, full_matrices=False)
    left, singular_values, right = factors
    return left, singular_values, right
def nmf(vectors, num_of_topics=8):
    """
    Factorise ``vectors`` with non-negative matrix factorisation (NMF).

    :param vectors: non-negative (documents, terms) matrix to factorise.
    :param num_of_topics: number of components (topics) to extract.
    :return: tuple ``(w1, h1)``:
        w1 - connects topics to documents (result of ``fit_transform``)
        h1 - connects topics to terms (the fitted ``components_``)
    """
    # scikit-learn >= 1.2 raises TypeError for np.matrix input, which is what
    # a sparse matrix's .todense() returns; hand it a plain ndarray instead.
    # Other input types (ndarray, sparse) are passed through untouched.
    if isinstance(vectors, np.matrix):
        vectors = np.asarray(vectors)
    # random_state is fixed so repeated runs give the same factorisation.
    clf = decomposition.NMF(n_components=num_of_topics, random_state=1)
    w1 = clf.fit_transform(vectors)
    h1 = clf.components_
    return w1, h1