Skip to content

Commit

Permalink
Create tf-idf.py
Browse files Browse the repository at this point in the history
  • Loading branch information
ludmal committed Nov 27, 2013
1 parent 48def9d commit aaba6e8
Showing 1 changed file with 26 additions and 0 deletions.
26 changes: 26 additions & 0 deletions tf-idf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#!/usr/bin/python
# -*- coding: latin-1 -*-

import math
from operator import itemgetter

def tfidf(word, doc, docList):
    """Return the TF-IDF weight of ``word`` in ``doc`` relative to ``docList``.

    Documents are plain strings tokenized on runs of whitespace; matching is
    exact and case-sensitive (no normalization or stemming is applied).

    tf  = occurrences of ``word`` in ``doc`` / number of tokens in ``doc``
    idf = log(number of documents / number of documents containing ``word``)

    Args:
        word: The token to score.
        doc: The document (string) in which the term frequency is measured.
        docList: Sized collection of document strings forming the corpus.

    Returns:
        ``tf * idf`` as a float. ``0.0`` is returned when the score is
        undefined: ``doc`` has no tokens, or no document in ``docList``
        contains ``word`` (which includes an empty ``docList``).
    """
    tokens = doc.split()
    if not tokens:
        # An empty document has no term frequencies; avoid dividing by zero.
        return 0.0
    tf = tokens.count(word) / float(len(tokens))

    docs_with_word = sum(1 for other in docList if word in other.split())
    if docs_with_word == 0:
        # The corpus never mentions the word, so the IDF ratio is undefined.
        return 0.0

    # float() forces true division; under Python 2 the plain `/` on two ints
    # would truncate the ratio before the logarithm is taken.
    idf = math.log(len(docList) / float(docs_with_word))

    return tf * idf

def top_keywords(doc, docList, n=5):
    """Return up to ``n`` distinct words of ``doc`` with the highest TF-IDF.

    Args:
        doc: The document (string) whose keywords are wanted; it is
            tokenized on runs of whitespace.
        docList: Corpus of document strings passed through to ``tfidf``.
        n: Maximum number of keywords to return (default 5).

    Returns:
        A list of words ordered by descending ``tfidf`` score. Words with
        equal scores are ordered alphabetically so the result is the same
        on every run.
    """
    scores = {}
    for word in set(doc.split()):
        scores[word] = tfidf(word, doc, docList)

    # Sorting on the score alone would leave ties in set-iteration order,
    # which changes between processes when string hashing is randomized.
    # The word itself is used as a secondary key to make the ranking stable.
    ranked = sorted(scores.items(), key=lambda item: (-item[1], item[0]))
    return [word for word, _score in ranked[:n]]

0 comments on commit aaba6e8

Please sign in to comment.