Skip to content

Commit

Permalink
submitting quiz7
Browse files Browse the repository at this point in the history
  • Loading branch information
peterwilliams97 committed May 14, 2012
1 parent d57d634 commit 8ba2acf
Show file tree
Hide file tree
Showing 4 changed files with 135 additions and 33 deletions.
18 changes: 12 additions & 6 deletions quiz7/q1.py
@@ -1,13 +1,19 @@
# First pair of sample sentences.  s1 and s2 are rebound a few lines further
# down, so these values only feed the w1/w2 sets built immediately below.
s1 = 'We all live in that yellow submarine'
s2 = 'The yellow mustard in that submarine sandwich was not yellow'

# Case-sensitive word sets; both names are overwritten by the lower-cased
# sets computed after s1/s2 are rebound.
w1 = set(s1.split())
w2 = set(s2.split())
# Second pair of sample sentences; these are the values used from here on.
s1 = 'To be or not to be'
s2 = 'To think and therefore to be'

# Lower-case before splitting so that 'To' and 'to' count as the same word.
w1 = set(s1.lower().split())
w2 = set(s2.lower().split())

# Words common to both sentences, and words appearing in either sentence.
intersection = w1 & w2
union = w1 | w2

# Overlap printed as an unreduced fraction len(intersection)/len(union),
# followed by the two raw sets (Python 2 print statements).
print 'Jacard =%d/%d' % (len(intersection), len(union))
print intersection
print union

# The same information again in labelled form; each set is sorted so the
# printed order does not depend on set iteration order.
print s1
print s2
print 'w1 =', sorted(w1)
print 'w2 =', sorted(w2)
print 'intersection =', sorted(intersection)
print 'union =', sorted(union)
print 'Jacard = %d/%d' % (len(intersection), len(union))
35 changes: 9 additions & 26 deletions quiz7/q2.py
@@ -1,32 +1,15 @@
from __future__ import division
from math import *
import math

# N is the first operand of the idf expression further down
# (log N - log df); presumably the collection size -- confirm.
N = 806791
# Table of counts: a header row, then one row per term.
# NOTE(review): the parsing below splits each row on '\t', so the columns
# were presumably tab-separated -- confirm the literal still contains tabs.
text = '''
term document frequency Doc 1 Doc 2 Doc 3
car 18165 27 4 24
auto 6723 3 33 0
insurance 19241 0 39 29
best 25235 14 0 17
'''
# Drop blank lines and surrounding whitespace, then split rows into columns.
lines = text.split('\n')
lines = [ln.strip() for ln in lines if ln.strip()]
words = [ln.split('\t') for ln in lines]
# Column names come from the header row, minus its first cell.
keys = words[0][1:]
W = len(keys)
# data[first cell of row][column name] = integer value from that row
data = {}
for wrd in words[1:]:
# NOTE(review): the next two statements read as the body of the loop above
# but carry no indentation here -- confirm against the real file.
row = [int(w) for w in wrd[1:]]
data[wrd[0]] = dict([(keys[i],row[i]) for i in range(W)])
def log10(x):
    """Return the base-10 logarithm of x.

    Delegates to math.log10 rather than math.log(x, 10): the two-argument
    form divides two natural logarithms and is inexact even for exact
    powers of ten (math.log(1000, 10) is 2.9999999999999996).

    Raises ValueError if x is not positive.
    """
    return math.log10(x)

# Echo the parsed table: a header line of column names, then one line per row.
print '%20s : %s' % ('', '\t'.join(keys) )
for k,v in data.items():
# NOTE(review): the print below reads as the loop body but carries no
# indentation here -- confirm against the real file.
print '%20s : %s' % (k, '\t'.join([str(v[x]) for x in keys]))
# Hard-coded inputs for the second tf-idf calculation below.
N = 806791
doc_count = 25235
count = 17


# First calculation, driven by the parsed table.
# NOTE(review): df here is the sum of the 'Doc 3' column over every row,
# while the second calculation uses doc_count = 25235 (the value shown in the
# 'document frequency' column for 'best') -- confirm which one is intended.
# `log` is not defined in this section; presumably it comes from the
# `from math import *` line above.
df = sum(v['Doc 3'] for v in data.values())
idf = log(N, 10) - log(df, 10)
tf = 1.0 + log(data['best']['Doc 3'], 10)
tfidf = tf * idf
# Second calculation, from the constants above.  It rebinds idf, tf and
# tfidf, so this is the result that gets printed.
idf = log10(N) - log10(doc_count)
tf = 1.0 + log10(count)
tfidf = tf * idf

print 'tfidf = %.3f' % tfidf

97 changes: 96 additions & 1 deletion quiz7/q3.py
@@ -1,6 +1,13 @@
from __future__ import division
from math import *
import math

def L(x):
    """Return the base-10 logarithm of x.

    Uses math.log10 instead of math.log(x, 10); the latter is computed as a
    quotient of natural logarithms and is inexact even for exact powers of
    ten (math.log(1000, 10) is 2.9999999999999996).

    Raises ValueError if x is not positive.
    """
    return math.log10(x)

def norm(lst):
    """Return the Euclidean (L2) norm of the numbers in lst.

    lst may be any iterable of numbers; an empty iterable gives 0.0.
    """
    return math.sqrt(sum(x ** 2 for x in lst))

# Given data: raw term counts, one row per term and one column per document.
# The first row holds the document names.
text = '''
term Dawn Beatrice She Regeneration
happiness 37 30 0 3
surprise 40 10 6 0
family 31 0 12 17
adventure 0 5 13 0
'''

# Build data structures
#   docs = list of docs
#   terms = list of terms
#   counts[doc][term] = count of term in doc
lines = [ln.strip() for ln in text.split('\n') if ln.strip()]
# Split on any run of whitespace instead of on a literal tab: no term or
# document name contains a space, and this keeps the parser working when the
# tabs in the table have been turned into spaces by an editor or copy/paste.
words = [ln.split() for ln in lines]
docs = words[0][1:]
terms = []
counts = dict((k, {}) for k in docs)
for wrd in words[1:]:
    term, vals = wrd[0], [int(w) for w in wrd[1:]]
    for i, doc in enumerate(docs):
        counts[doc][term] = vals[i]
    terms.append(term)

# doc_total_words[doc] = sum of all term counts in doc
doc_total_words = dict((doc, sum(counts[doc].values())) for doc in docs)

# doc_counts[term] = # of docs in which term has a non-zero count
doc_counts = dict(
    (term, sum(1 for doc in docs if counts[doc][term]))
    for term in terms)

# N = total number of docs
N = len(docs)
# Inverse doc frequency: idf[term] = log10(N) - log10(# docs containing term)
idf = {}
for term in terms:
    idf[term] = L(N) - L(doc_counts[term])

# Single-argument print(...) behaves the same under Python 2 and Python 3.
print('N = %d' % N)
print('doc_counts = %s' % doc_counts)
print('idf = %s' % idf)

def get_tf(count):
    """Return the log-scaled term-frequency weight for a raw count.

    A zero (falsy) count maps to 0.0; any other count maps to
    1.0 + L(count), where L is the file's base-10 logarithm helper.
    """
    if not count:
        return 0.0
    return 1.0 + L(count)

# Compute tf-idf: tfidf[doc][term] = tf weight of the raw count * idf of term
tfidf = {}
for doc in docs:
    tfidf[doc] = {}
    for term in terms:
        tfidf[doc][term] = get_tf(counts[doc][term]) * idf[term]

# per-document l2 norms of the tf-idf vectors
l2norm = {}
for doc in docs:
    l2norm[doc] = norm(tfidf[doc].values())

# tf-idf normalized to unit document length
tfidf_norm = {}
for doc in docs:
    tfidf_norm[doc] = {}
    for term in terms:
        tfidf_norm[doc][term] = tfidf[doc][term] / l2norm[doc]

def cosine(doc1, doc2):
    """Return the cosine similarity between two documents.

    doc1 and doc2 are document names (keys of tfidf_norm).  The vectors in
    tfidf_norm have already been divided by their L2 norm, so the cosine is
    just their dot product over all terms.

    Raises KeyError if either name is not a known document.
    """
    return sum(tfidf_norm[doc1][term] * tfidf_norm[doc2][term]
               for term in terms)

#
# Print out results
#
# Single-argument print(...) behaves the same under Python 2 and Python 3.
# The column-heading row is identical for all three tables, so build it once.
header_row = '%20s : %s' % ('term', ' '.join(['%10s' % k for k in docs]))

print('-' * 80)
print('Original table')
print(header_row)
for term in terms:
    print('%20s : %s' % (term, ' '.join(['%10d' % counts[doc][term] for doc in docs])))
print('%20s : %s' % ('Total', ' '.join(['%10d' % doc_total_words[doc] for doc in docs])))

print('-' * 80)
print('tfidf table')
print(header_row)
for term in terms:
    print('%20s : %s' % (term, ' '.join(['%10.4f' % tfidf[doc][term] for doc in docs])))

print('-' * 80)
print('normalized tfidf table')
print(header_row)
for term in terms:
    print('%20s : %s' % (term, ' '.join(['%10.4f' % tfidf_norm[doc][term] for doc in docs])))

# Similarity of the two documents asked about in the quiz.
print('-' * 80)
doc1 = 'Beatrice'
doc2 = 'Regeneration'
print('cosine("%s", "%s") = %.2f' % (doc1, doc2, cosine(doc1, doc2)))

18 changes: 18 additions & 0 deletions quiz7/q4.py
@@ -0,0 +1,18 @@
from __future__ import division

# Relevance judgements for a ranked result list, best rank first:
# 'R' marks a relevant result, anything else a non-relevant one.
# (An earlier input, 'N N N R R N R R N R R N R', was assigned here and then
# immediately overwritten; only the list below was ever used.)
text = 'N R R N N N R R N N N N N N R'
results = [p == 'R' for p in text.split()]
N = len(results)


def get_precision(n):
    """Return precision at rank n: the fraction of the top n results that are relevant.

    n is a 1-based cut-off.  The booleans in results sum as 0/1, and dividing
    by float(n) gives true division whether or not `division` has been
    imported from __future__.

    Raises ZeroDivisionError if n is 0.
    """
    return sum(results[:n]) / float(n)


# precisions[i] is the precision at rank i + 1.
precisions = [get_precision(n) for n in range(1, N + 1)]

# Average precision only averages the precision at ranks that hold a
# relevant result.
relevant_precisions = [p for p, hit in zip(precisions, results) if hit]
if relevant_precisions:
    average_precision = sum(relevant_precisions) / len(relevant_precisions)
else:
    # No relevant result at all: report 0.0 instead of dividing by zero.
    average_precision = 0.0

# Single-argument print(...) behaves the same under Python 2 and Python 3.
print('Average precision = %.3f' % average_precision)


0 comments on commit 8ba2acf

Please sign in to comment.