# 使用gensim来计算TF-IDF

In [38]:
from gensim import corpora

from gensim import models
# 建立一个语料库
corpus = [
    "what is the weather like today",
    "what is for dinner tonight",
    "this is a question worth pondering",
    "it is a beautiful day today"
]

#进行分词
words = []
for i in corpus:
    words.append(i.split(" "))
print(words)

[['what', 'is', 'the', 'weather', 'like', 'today'], ['what', 'is', 'for', 'dinner', 'tonight'], ['this', 'is', 'a', 'question', 'worth', 'pondering'], ['it', 'is', 'a', 'beautiful', 'day', 'today']]


In [39]:
#给每一个词一个ID并统计每个词在当前文档中出现的次数
"""
doc2bow函数主要用于让dic中的内用变为bow词袋模型，
其中每个括号中的第一个数代表词的ID第二个数代表在当前文档中出现的次数。
"""
dic = corpora.Dictionary(words)
new_corpus = [dic.doc2bow(text) for text in words]
print(new_corpus)
print(dic.token2id)

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1)], [(0, 1), (5, 1), (6, 1), (7, 1), (8, 1)], [(0, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1)], [(0, 1), (3, 1), (9, 1), (14, 1), (15, 1), (16, 1)]]
{'is': 0, 'like': 1, 'the': 2, 'today': 3, 'weather': 4, 'what': 5, 'dinner': 6, 'for': 7, 'tonight': 8, 'a': 9, 'pondering': 10, 'question': 11, 'this': 12, 'worth': 13, 'beautiful': 14, 'day': 15, 'it': 16}


In [31]:
#训练模型并保存
tfidf = models.TfidfModel(new_corpus)
tfidf.save("my_model.tfidf")

#载入模型
tfidf = models.TfidfModel.load("my_model.tfidf")

#使用训练好的模型计算TF-IDF值
string = "i like the weather today"
string_bow = dic.doc2bow(string.lower().split())
string_tfidf = tfidf[string_bow]
print(string_tfidf)

[(1, 0.5547001962252291), (2, 0.5547001962252291), (3, 0.2773500981126146), (4, 0.5547001962252291)]


# sklearn来计算TF-IDF

In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = [
    "what is the weather like today",
    "what is for dinner tonight",
    "this is a question worth pondering",
    "it is a beautiful day today"
]

tfidf_vec = TfidfVectorizer()
#利用fit_transform得到TF-IDF矩阵
tfidf_matrix = tfidf_vec.fit_transform(corpus)
print("tfidf_matrix:\n",tfidf_matrix)

#利用get_feature_names得到不重复的单词
print(tfidf_vec.vocabulary_)


tfidf_matrix:
   (0, 11)	0.3710221459250386
  (0, 6)	0.47059454669821993
  (0, 13)	0.47059454669821993
  (0, 9)	0.47059454669821993
  (0, 4)	0.24557575678403082
  (0, 14)	0.3710221459250386
  (1, 12)	0.506765426545092
  (1, 2)	0.506765426545092
  (1, 3)	0.506765426545092
  (1, 4)	0.2644512224141842
  (1, 14)	0.3995396830595886
  (2, 7)	0.4838025881780501
  (2, 15)	0.4838025881780501
  (2, 8)	0.4838025881780501
  (2, 10)	0.4838025881780501
  (2, 4)	0.25246826075544676
  (3, 1)	0.506765426545092
  (3, 0)	0.506765426545092
  (3, 5)	0.506765426545092
  (3, 11)	0.3995396830595886
  (3, 4)	0.2644512224141842
{'what': 14, 'is': 4, 'the': 9, 'weather': 13, 'like': 6, 'today': 11, 'for': 3, 'dinner': 2, 'tonight': 12, 'this': 10, 'question': 8, 'worth': 15, 'pondering': 7, 'it': 5, 'beautiful': 0, 'day': 1}


# 手动实现TF-IDF

In [44]:
import math
 
corpus = [
    "what is the weather like today",
    "what is for dinner tonight",
    "this is a question worth pondering",
    "it is a beautiful day today"
]
words = []
# 对corpus分词
for i in corpus:
    words.append(i.split())

#可以先取出停用词

#进行词频统计
def Counter(word_list):
    wordcount = []
    for i in word_list:
        count = {}
        for j in i:
            if not count.get(j):
                count.update({j:1})
            elif count.get(j):
                count[j] += 1
        wordcount.append(count)
    return wordcount

wordcount = Counter(words)
print(wordcount)

[{'what': 1, 'is': 1, 'the': 1, 'weather': 1, 'like': 1, 'today': 1}, {'what': 1, 'is': 1, 'for': 1, 'dinner': 1, 'tonight': 1}, {'this': 1, 'is': 1, 'a': 1, 'question': 1, 'worth': 1, 'pondering': 1}, {'it': 1, 'is': 1, 'a': 1, 'beautiful': 1, 'day': 1, 'today': 1}]


In [67]:
words

[['what', 'is', 'the', 'weather', 'like', 'today'],
 ['what', 'is', 'for', 'dinner', 'tonight'],
 ['this', 'is', 'a', 'question', 'worth', 'pondering'],
 ['it', 'is', 'a', 'beautiful', 'day', 'today']]

In [68]:
#计算TF (word代表被计算的单词，word_list是被计算单词所在文档分词后的字典)
def tf(word, word_list):
    return word_list.get(word) / 23) #这儿应该是总的词数

#统计含有该单词的句子数
def count_sentence(word, wordcount):
    return sum(1 for i in wordcount if i.get(word))

#计算IDF
def idf(word, wordcount):
#     print('len(wordcount):',len(wordcount))
#     print("(count_sentence(word , wordcount)):",(count_sentence(word , wordcount)))
#     print("log:",math.log(len(wordcount) / (count_sentence(word , wordcount))))
    return math.log(len(wordcount) / (count_sentence(word , wordcount)))

#计算TF-IDF
def tfidf(word, word_list, wordcount):
    return tf(word, word_list) * idf(word, wordcount)

SyntaxError: invalid syntax (<ipython-input-68-dc1626cdca47>, line 3)

In [66]:
math.log(2)

0.6931471805599453

In [64]:
for i in wordcount:
    for j,k in i.items():
        print(tf(j, i))

0.16666666666666666
0.16666666666666666
0.16666666666666666
0.16666666666666666
0.16666666666666666
0.16666666666666666
0.2
0.2
0.2
0.2
0.2
0.16666666666666666
0.16666666666666666
0.16666666666666666
0.16666666666666666
0.16666666666666666
0.16666666666666666
0.16666666666666666
0.16666666666666666
0.16666666666666666
0.16666666666666666
0.16666666666666666
0.16666666666666666


In [65]:
for i in wordcount:
    for j,k in i.items():
        print(idf(j, wordcount))
    print()

len(wordcount): 4
(count_sentence(word , wordcount)): 2
log: 0.6931471805599453
0.6931471805599453
len(wordcount): 4
(count_sentence(word , wordcount)): 4
log: 0.0
0.0
len(wordcount): 4
(count_sentence(word , wordcount)): 1
log: 1.3862943611198906
1.3862943611198906
len(wordcount): 4
(count_sentence(word , wordcount)): 1
log: 1.3862943611198906
1.3862943611198906
len(wordcount): 4
(count_sentence(word , wordcount)): 1
log: 1.3862943611198906
1.3862943611198906
len(wordcount): 4
(count_sentence(word , wordcount)): 2
log: 0.6931471805599453
0.6931471805599453

len(wordcount): 4
(count_sentence(word , wordcount)): 2
log: 0.6931471805599453
0.6931471805599453
len(wordcount): 4
(count_sentence(word , wordcount)): 4
log: 0.0
0.0
len(wordcount): 4
(count_sentence(word , wordcount)): 1
log: 1.3862943611198906
1.3862943611198906
len(wordcount): 4
(count_sentence(word , wordcount)): 1
log: 1.3862943611198906
1.3862943611198906
len(wordcount): 4
(count_sentence(word , wordcount)): 1
log: 1.386294

In [53]:
p = 1
for i in wordcount:
    print("part:{}".format(p))
    p = p+1
    for j,k in i.items():
        print("word: {} ---- TF-IDF:{}".format(j, tfidf(j, i, wordcount)))

part:1
word: what ---- TF-IDF:0.11552453009332421
word: is ---- TF-IDF:0.0
word: the ---- TF-IDF:0.23104906018664842
word: weather ---- TF-IDF:0.23104906018664842
word: like ---- TF-IDF:0.23104906018664842
word: today ---- TF-IDF:0.11552453009332421
part:2
word: what ---- TF-IDF:0.13862943611198905
word: is ---- TF-IDF:0.0
word: for ---- TF-IDF:0.2772588722239781
word: dinner ---- TF-IDF:0.2772588722239781
word: tonight ---- TF-IDF:0.2772588722239781
part:3
word: this ---- TF-IDF:0.23104906018664842
word: is ---- TF-IDF:0.0
word: a ---- TF-IDF:0.11552453009332421
word: question ---- TF-IDF:0.23104906018664842
word: worth ---- TF-IDF:0.23104906018664842
word: pondering ---- TF-IDF:0.23104906018664842
part:4
word: it ---- TF-IDF:0.23104906018664842
word: is ---- TF-IDF:0.0
word: a ---- TF-IDF:0.11552453009332421
word: beautiful ---- TF-IDF:0.23104906018664842
word: day ---- TF-IDF:0.23104906018664842
word: today ---- TF-IDF:0.11552453009332421


In [50]:
for i in wordcount:
    for j,k in i.items():
        print(j,k)

what 1
is 1
the 1
weather 1
like 1
today 1
what 1
is 1
for 1
dinner 1
tonight 1
this 1
is 1
a 1
question 1
worth 1
pondering 1
it 1
is 1
a 1
beautiful 1
day 1
today 1
