In [60]:
import re
from porter import PorterStemmer
import math


def get_documents(filename):
    """Yield the documents stored in a blank-line-separated text file.

    The first line of each document is its header; every following line up
    to the next blank line (a line that is exactly ``'\\n'``) is appended to
    the body. Line terminators are kept in both parts.

    Args:
        filename: Path of the file to read.

    Yields:
        A two-element list ``[header, body]`` for each document. The
        document that is in progress when the file ends is always yielded,
        so an empty file (or one ending in a blank line) produces a final
        ``['', '']`` entry.
    """
    # Read everything up front inside a context manager so the handle is
    # closed immediately, even if the caller abandons the generator early.
    with open(filename) as file:
        lines = file.readlines()

    document = ['', '']  # [header, body]
    header = True
    for line in lines:
        if line == '\n':
            # Blank separator: emit the finished document and start afresh.
            yield document
            document = ['', '']
            header = True
            continue
        if header:
            document[0] = line
            header = False
        else:
            document[1] += line
    yield document


def load_keywords(filename):
    """Read a keyword file and return its lines as a list of strings.

    Args:
        filename: Path of the file to read.

    Returns:
        One list entry per line of the file, with line terminators removed.
    """
    # The context manager guarantees the handle is closed after reading.
    with open(filename) as file:
        return file.read().splitlines()


def to_words(text):
    """Split ``text`` into lower-case alphanumeric tokens.

    The text is lower-cased, every character outside ``A-Z``, ``a-z`` and
    ``0-9`` is replaced by a space, and the result is split on whitespace.

    Args:
        text: The string to tokenise.

    Returns:
        A list of tokens; empty when ``text`` has no alphanumeric characters.
    """
    lowered = text.lower()
    return re.sub(r"[^A-Za-z0-9]", " ", lowered).split()


def to_stemmed_words(text, stemmer):
    """Tokenise ``text`` with ``to_words`` and stem every token.

    Args:
        text: The string to process.
        stemmer: Object providing ``stem(word, start, end)``; it is called
            with ``start=0`` and ``end`` set to the index of the word's last
            character.

    Returns:
        A list with the stemmer's result for each token, in token order.
    """
    stems = []
    for word in to_words(text):
        last_index = len(word) - 1
        stems.append(stemmer.stem(word, 0, last_index))
    return stems


In [61]:
class VectorRepresentation:
    """TF-IDF vectors over a fixed keyword list and a stemmed document set.

    Attributes are plain instance fields:
      * ``stemmer`` - the ``PorterStemmer`` used to stem the documents.
      * ``keywords`` - the keyword list; one vector component per keyword.
      * ``documents`` - each input document as a list of stemmed words.
      * ``no_of_documents`` - number of stored documents.
    """

    def __init__(self, keywords, documents):
        """Stem and store the documents.

        Args:
            keywords: List of keywords. They are compared as-is against
                stemmed terms - presumably already stemmed; verify against
                the keyword file.
            documents: Iterable of ``[header, body]`` pairs; header and body
                are concatenated before stemming. May be a generator.
        """
        self.stemmer = PorterStemmer()
        self.keywords = keywords
        self.documents = [
            to_stemmed_words(doc[0] + doc[1], self.stemmer) for doc in documents
        ]
        # Count what was actually stored rather than the argument, so that
        # generators (which have no len()) are accepted as input.
        self.no_of_documents = len(self.documents)

    def _tfidf(self, term, text):
        """Return the TF-IDF weight of ``term`` within ``text``.

        TF is the count of ``term`` in ``text`` divided by the count of the
        most frequent term in ``text``. IDF is the natural log of the number
        of stored documents divided by the number of stored documents that
        contain ``term``.

        Args:
            term: The term to weight.
            text: List of terms making up the text being vectorised.

        Returns:
            The weight as a float; ``0.0`` when ``term`` does not occur in
            ``text`` or occurs in none of the stored documents.
        """
        occurrences = text.count(term)
        if occurrences == 0:
            # Covers empty text too, where max() below would have nothing
            # to work on.
            return 0.0

        n = sum(1 for doc in self.documents if term in doc)
        if n == 0:
            # The term is unknown to the collection: avoid dividing by zero.
            return 0.0

        # Distinct terms are enough to find the highest frequency.
        tf = occurrences / max(text.count(x) for x in set(text))
        idf = math.log(self.no_of_documents / n)
        return tf * idf

    def get_tfidf_vector(self, text):
        """Build the TF-IDF vector of ``text`` over ``self.keywords``.

        Args:
            text: List of terms making up the text being vectorised.

        Returns:
            A list of floats with one entry per keyword. A term of ``text``
            that is a keyword sets the entry at that keyword's first index
            in ``self.keywords``; all other entries stay ``0.0``.
        """
        vector = [0.0] * len(self.keywords)
        # Repeated terms would just recompute the same weight, so visit
        # each distinct term once.
        for term in set(text):
            if term in self.keywords:
                vector[self.keywords.index(term)] = self._tfidf(term, text)
        return vector


In [64]:
# Smoke test: tokenise and stem every document; because `text` is overwritten
# on each pass, only the last document's stemmed words are printed.
stemmer = PorterStemmer()
for d in get_documents('documents.txt'):
    words = to_words(d[0] + d[1])
    text = list(map(lambda w: stemmer.stem(w, 0, len(w) - 1), words))
print(text)

['appli', 'a', 'machin', 'learn', 'workbench', 'experi', 'with', 'appli', 'a', 'machin', 'learn', 'workbench', 'experi', 'with', 'agricultur', 'databas', 'stephen', 'r', 'garner', 'salli', 'jo', 'cunningham', 'geoffrei', 'holm', 'craig', 'g', 'nevil', 'man']


In [63]:
# Materialise the corpus and keyword list, then build the TF-IDF helper.
documents = list(get_documents('documents.txt'))
keywords = load_keywords('keywords.txt')
helper = VectorRepresentation(keywords, documents)