/
q3.py
106 lines (87 loc) · 3.03 KB
/
q3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
from __future__ import division
import math
def log10(x):
    """Return the base-10 logarithm of x.

    Delegates to math.log10 rather than math.log(x, 10): the two-argument
    form divides two natural logarithms and can be off by one ulp even for
    exact powers of ten (e.g. 1000 -> 2.9999999999999996).

    Raises ValueError (from math.log10) when x is zero or negative.
    """
    return math.log10(x)
def norm(lst):
    """Return the Euclidean (L2) length of the numbers in lst.

    lst may be any iterable of numbers; an empty iterable gives 0.0.
    """
    sum_of_squares = 0
    for value in lst:
        sum_of_squares += value ** 2
    return math.sqrt(sum_of_squares)
# Given data
# Term-by-document count table. The first row is the header (a 'term' label
# followed by the document names); every other row is a term followed by its
# count in each document. Cells are separated by tabs, written as explicit
# \t escapes so the delimiter cannot be silently turned into spaces by an
# editor or a copy/paste.
text = (
    '\n'
    'term\tDawn\tBeatrice\tShe\tRegeneration\n'
    'happiness\t37\t30\t0\t3\n'
    'surprise\t40\t10\t6\t0\n'
    'family\t31\t0\t12\t17\n'
    'adventure\t0\t5\t13\t0\n'
)
# Build data structures
#   docs  = list of document names (header row without its first cell)
#   terms = list of terms, in the order they appear in the table
#   counts[doc][term] = count of term in doc
lines = [ln.strip() for ln in text.split('\n') if ln.strip()]
# Rows are tab-separated. A row that contains no tab at all falls back to
# splitting on runs of whitespace, so a table whose tabs were converted to
# spaces still parses instead of collapsing into a single cell.
words = [ln.split('\t') if '\t' in ln else ln.split() for ln in lines]
docs = words[0][1:]
terms = []
counts = dict((k, {}) for k in docs)
for wrd in words[1:]:
    term, vals = wrd[0], [int(w) for w in wrd[1:]]
    for i, doc in enumerate(docs):
        counts[doc][term] = vals[i]
    # Record each term once per row, not once per document.
    terms.append(term)
# doc_total_words[doc] = total number of counted words in doc
doc_total_words = dict((doc, sum(counts[doc].values())) for doc in docs)
# doc_counts[term] = number of docs that term occurs in (non-zero count)
doc_counts = dict(
    (term, sum(1 for doc in docs if counts[doc][term]))
    for term in terms)
# N = total number of docs
N = len(docs)
# Inverse doc frequency: log10(N) - log10(doc_counts[term]).
# A term that occurs in no document has no defined idf (log10(0) raises);
# it gets 0.0, which is harmless because its tf is 0 in every document.
idf = dict(
    (term, log10(N) - log10(doc_counts[term]) if doc_counts[term] else 0.0)
    for term in terms)
# print(...) with one parenthesised argument produces the same output as a
# Python 2 print statement and is also a valid Python 3 function call.
print('N = %d' % N)
print('doc_counts = %s' % doc_counts)
print('idf = %s' % idf)
def get_tf(count):
    """Return the log-scaled term-frequency weight for a raw count.

    A falsy count (zero) gives 0.0; any other count gives 1.0 + log10(count).
    """
    if not count:
        return 0.0
    return 1.0 + log10(count)
# Compute tf-idf
# tfidf[doc][term] = get_tf(count of term in doc) * idf[term]
tfidf = dict(
    (doc, dict(
        (term, get_tf(counts[doc][term]) * idf[term])
        for term in terms))
    for doc in docs)
# per-document l2 norms
l2norm = dict((doc, norm(tfidf[doc].values())) for doc in docs)
# tf-idf normalized to unit document length.
# A document whose tf-idf weights are all zero has norm 0 and cannot be
# scaled; it keeps an all-zero vector instead of raising ZeroDivisionError.
tfidf_norm = dict(
    (doc, dict(
        (term, tfidf[doc][term] / l2norm[doc] if l2norm[doc] else 0.0)
        for term in terms))
    for doc in docs)
def cosine(doc1, doc2):
    """Return the dot product of two documents' rows in tfidf_norm.

    doc1 and doc2 are keys of tfidf_norm. The products are summed over
    every entry of terms, in order.
    """
    dot = 0
    for t in terms:
        dot += tfidf_norm[doc1][t] * tfidf_norm[doc2][t]
    return dot
#
# Print out results
#
# Every print below is written as print(<one expression>), which produces the
# same output as a Python 2 print statement and is also a valid Python 3 call.
def _print_table(title, cell_fmt, table):
    """Print a rule, a title, a header of document names and one row per term.

    title    -- heading printed under the 80-dash rule
    cell_fmt -- %-format applied to each cell, e.g. '%10d' or '%10.4f'
    table    -- mapping such that table[doc][term] is the cell value
    """
    print('-' * 80)
    print(title)
    print('%20s : %s' % ('term', ' '.join(['%10s' % k for k in docs])))
    for term in terms:
        cells = [cell_fmt % table[doc][term] for doc in docs]
        print('%20s : %s' % (term, ' '.join(cells)))


_print_table('Original table', '%10d', counts)
# The raw-count table additionally gets a per-document total row.
print('%20s : %s' % (
    'Total', ' '.join(['%10d' % doc_total_words[doc] for doc in docs])))
_print_table('tfidf table', '%10.4f', tfidf)
_print_table('normalized tfidf table', '%10.4f', tfidf_norm)
print('-' * 80)
doc1 = 'Beatrice'
doc2 = 'Regeneration'
print('cosine("%s", "%s") = %.2f' % (doc1, doc2, cosine(doc1, doc2)))