submitting quiz7

peterwilliams97 · May 14, 2012 · 8ba2acf · 8ba2acf
1 parent d57d634
commit 8ba2acf
Show file tree

Hide file tree

Showing 4 changed files with 135 additions and 33 deletions.
diff --git a/quiz7/q1.py b/quiz7/q1.py
@@ -1,13 +1,19 @@
 s1 = 'We all live in that yellow submarine'
 s2 = 'The yellow mustard in that submarine sandwich was not yellow'
 
-w1 = set(s1.split())
-w2 = set(s2.split())
+s1 = 'To be or not to be'
+s2 = 'To think and therefore to be'
+
+w1 = set(s1.lower().split())
+w2 = set(s2.lower().split())
 
 intersection = w1 & w2
 union = w1 | w2
 
-print 'Jacard =%d/%d' % (len(intersection), len(union))
-print intersection
-print union
-
+print s1
+print s2
+print 'w1 =', sorted(w1)
+print 'w2 =', sorted(w2)
+print 'intersection =', sorted(intersection)
+print 'union =', sorted(union)
+print 'Jacard = %d/%d' % (len(intersection), len(union))
diff --git a/quiz7/q2.py b/quiz7/q2.py
@@ -1,32 +1,15 @@
 from __future__ import division
-from math import *
+import math
 
-N = 806791
-text = '''
-term	document frequency	Doc 1	Doc 2	Doc 3
-car	18165	27	4	24
-auto	6723	3	33	0
-insurance	19241	0	39	29
-best	25235	14	0	17
-'''
-lines = text.split('\n')
-lines = [ln.strip() for ln in lines if ln.strip()]
-words = [ln.split('\t') for ln in lines]
-keys = words[0][1:]
-W = len(keys)
-data = {}
-for wrd in words[1:]:
-    row = [int(w) for w in wrd[1:]]
-    data[wrd[0]] = dict([(keys[i],row[i]) for i in range(W)])
+def log10(x): return math.log(x,10)
 
-print '%20s : %s' % ('', '\t'.join(keys)  )  
-for k,v in data.items():
-    print '%20s : %s' % (k, '\t'.join([str(v[x]) for x in keys]))    
+N = 806791 
+doc_count = 25235   
+count = 17
 
-
-df = sum(v['Doc 3'] for v in data.values())   
-idf = log(N, 10) - log(df, 10)
-tf = 1.0 + log(data['best']['Doc 3'], 10)
-tfidf = tf * idf  
+idf = log10(N) - log10(doc_count)
+tf = 1.0 + log10(count)
+tfidf = tf * idf
 
 print 'tfidf = %.3f' % tfidf
+
diff --git a/quiz7/q3.py b/quiz7/q3.py
@@ -1,6 +1,13 @@
 from __future__ import division
-from math import *
+import math 
 
+def L(x): 
+    return math.log(x, 10)
+
+def norm(lst):
+    return math.sqrt(sum(x**2 for x in lst))
+
+# Given data
 text = '''
 term	Dawn	Beatrice	She	Regeneration
 happiness	37	30	0	3
@@ -9,3 +16,91 @@
 adventure	0	5	13	07
 '''
 
+text = '''
+term	Dawn	Beatrice	She	Regeneration
+happiness	37	30	0	3
+surprise	40	10	6	0
+family	31	0	12	17
+adventure	0	5	13	0
+'''
+
+# Build data structures
+#  docs = list of docs
+#  terms = list of terms
+#  counts[doc][term] = count of term in doc 
+lines = text.split('\n')
+lines = [ln.strip() for ln in lines if ln.strip()]
+words = [ln.split('\t') for ln in lines]
+docs = words[0][1:]
+terms = []
+counts = dict((k,{}) for k in docs)
+for wrd in words[1:]:
+    term, vals = wrd[0], [int(w) for w in wrd[1:]]
+    for i,doc in enumerate(docs):
+        counts[doc][term] = vals[i]
+    terms.append(term)    
+
+# doc_total_words[doc] = total number of words in doc
+doc_total_words = dict((doc,sum(counts[doc].values())) for doc in docs)
+
+# doc_counts[term] = number of docs in which term occurs
+doc_counts = dict((term, sum(1 if counts[doc][term] else 0 for doc in docs)) for term in terms)
+
+# N = total number of docs
+N = len(docs)  
+# Inverse doc frequency
+idf = dict((term, L(N) - L(doc_counts[term])) for term in terms)
+
+print 'N = %d' % N
+print 'doc_counts = %s' % doc_counts
+print 'idf = %s' % idf
+
+def get_tf(count): return 1.0 + L(count) if count else 0.0
+
+# Compute tf-idf        
+tfidf = dict(
+        (doc, dict(
+                (term, get_tf(counts[doc][term]) * idf[term]) 
+              for term in terms)) 
+        for doc in docs)
+
+# per-document l2 norms
+l2norm = dict((doc, norm(tfidf[doc].values())) for doc in docs)         
+
+# tf-idf normalized to unit document length        
+tfidf_norm = dict(
+        (doc, dict(
+                (term, tfidf[doc][term] /l2norm[doc]) 
+              for term in terms)) 
+        for doc in docs)
+
+def cosine(doc1, doc2):
+    return sum(tfidf_norm[doc1][term] * tfidf_norm[doc2][term] for term in terms)
+
+#
+# Print out results    
+#    
+print '-' * 80 
+print 'Original table'   
+print '%20s : %s' % ('term', ' '.join(['%10s' % k for k in docs]))  
+for term in terms:
+    print '%20s : %s' % (term, ' '.join(['%10d' % counts[doc][term] for doc in docs])) 
+print '%20s : %s' % ('Total', ' '.join(['%10d' % doc_total_words[doc] for doc in docs]))    
+
+print '-' * 80 
+print 'tfidf table'   
+print '%20s : %s' % ('term', ' '.join(['%10s' % k for k in docs]))  
+for term in terms:
+    print '%20s : %s' % (term, ' '.join(['%10.4f' % tfidf[doc][term] for doc in docs]))   
+
+print '-' * 80 
+print 'normalized tfidf table'   
+print '%20s : %s' % ('term', ' '.join(['%10s' % k for k in docs]))  
+for term in terms:
+    print '%20s : %s' % (term, ' '.join(['%10.4f' % tfidf_norm[doc][term] for doc in docs]))      
+
+print '-' * 80 
+doc1 = 'Beatrice' 
+doc2 = 'Regeneration' 
+print 'cosine("%s", "%s") = %.2f' % (doc1,doc2, cosine(doc1, doc2))
+
diff --git a/quiz7/q4.py b/quiz7/q4.py
@@ -0,0 +1,18 @@
+from __future__ import division
+
+text = 'N N N R R N R R N R R N R'
+text = 'N R R N N N R R N N N N N N R'
+results = [p == 'R' for p in text.split()]
+N = len(results)
+
+def get_precision(n):
+    return sum(results[:n])/n
+
+precisions = [get_precision(n) for n in range(1,N+1)]
+
+relevant_precisions = [precisions[i] for i in range(N) if results[i]]
+average_precision = sum(relevant_precisions)/len(relevant_precisions)
+
+print 'Average precsion = %.3f' % average_precision
+
+