Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
peterwilliams97
committed
May 14, 2012
1 parent
d57d634
commit 8ba2acf
Showing
4 changed files
with
135 additions
and
33 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,13 +1,19 @@ | ||
s1 = 'We all live in that yellow submarine' | ||
s2 = 'The yellow mustard in that submarine sandwich was not yellow' | ||
|
||
w1 = set(s1.split()) | ||
w2 = set(s2.split()) | ||
s1 = 'To be or not to be' | ||
s2 = 'To think and therefore to be' | ||
|
||
w1 = set(s1.lower().split()) | ||
w2 = set(s2.lower().split()) | ||
|
||
intersection = w1 & w2 | ||
union = w1 | w2 | ||
|
||
print 'Jacard =%d/%d' % (len(intersection), len(union)) | ||
print intersection | ||
print union | ||
|
||
# Report the two input sentences, their lower-cased word sets, and the
# Jaccard similarity expressed as |intersection| / |union|.
# Each print takes a single pre-formatted string so the output is the same
# whether this runs under the Python 2 print statement or the print function.
print(s1)
print(s2)
print('w1 = %s' % sorted(w1))
print('w2 = %s' % sorted(w2))
print('intersection = %s' % sorted(intersection))
print('union = %s' % sorted(union))
print('Jaccard = %d/%d' % (len(intersection), len(union)))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,32 +1,15 @@ | ||
from __future__ import division | ||
from math import * | ||
import math | ||
|
||
N = 806791 | ||
text = ''' | ||
term document frequency Doc 1 Doc 2 Doc 3 | ||
car 18165 27 4 24 | ||
auto 6723 3 33 0 | ||
insurance 19241 0 39 29 | ||
best 25235 14 0 17 | ||
''' | ||
lines = text.split('\n') | ||
lines = [ln.strip() for ln in lines if ln.strip()] | ||
words = [ln.split('\t') for ln in lines] | ||
keys = words[0][1:] | ||
W = len(keys) | ||
data = {} | ||
for wrd in words[1:]: | ||
row = [int(w) for w in wrd[1:]] | ||
data[wrd[0]] = dict([(keys[i],row[i]) for i in range(W)]) | ||
def log10(x):
    """Return the base-10 logarithm of x.

    Uses math.log10 rather than math.log(x, 10): the two-argument form
    divides two natural logarithms and picks up rounding error, so exact
    powers of ten can come out slightly off (1000 -> 2.9999999999999996).
    """
    return math.log10(x)
|
||
print '%20s : %s' % ('', '\t'.join(keys) ) | ||
for k,v in data.items(): | ||
print '%20s : %s' % (k, '\t'.join([str(v[x]) for x in keys])) | ||
N = 806791 | ||
doc_count = 25235 | ||
count = 17 | ||
|
||
|
||
df = sum(v['Doc 3'] for v in data.values()) | ||
idf = log(N, 10) - log(df, 10) | ||
tf = 1.0 + log(data['best']['Doc 3'], 10) | ||
tfidf = tf * idf | ||
idf = log10(N) - log10(doc_count) | ||
tf = 1.0 + log10(count) | ||
tfidf = tf * idf | ||
|
||
print 'tfidf = %.3f' % tfidf | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
from __future__ import division | ||
|
||
# Relevance judgements for a ranked result list, best rank first:
# 'R' marks a relevant result, 'N' a non-relevant one.
# Another ranking to try: 'N N N R R N R R N R R N R'
text = 'N R R N N N R R N N N N N N R'
results = [p == 'R' for p in text.split()]
N = len(results)


def get_precision(n):
    """Return precision at rank n: the fraction of the top n results that are relevant.

    n is a 1-based rank and must be at least 1 (n == 0 divides by zero).
    """
    # float() keeps this a true division even without `from __future__ import division`.
    return sum(results[:n]) / float(n)


# Precision at every rank 1..N; precisions[i] is the precision at rank i + 1.
precisions = [get_precision(n) for n in range(1, N + 1)]

# Average precision is the mean of precision@k over the ranks k that hold a
# relevant result.
relevant_precisions = [p for p, is_relevant in zip(precisions, results) if is_relevant]
if relevant_precisions:
    average_precision = sum(relevant_precisions) / len(relevant_precisions)
else:
    # No relevant results at all: the mean is undefined, so report 0.0
    # instead of failing with ZeroDivisionError.
    average_precision = 0.0

print('Average precision = %.3f' % average_precision)
|
||
|