In [92]:
import re
from porter import PorterStemmer
import math
import numpy as np
from scipy.spatial import distance


def get_documents(filename):
    file = open(filename)
    document = ['', ''] # [header, body]
    header = True
    for line in file.readlines():
        if line == '\n':
            yield document
            document = ['', '']
            header = True
            continue
        if header:
            document[0] = line
            header = False
        else:
            document[1] += line
    yield document


def load_keywords(filename):
    file = open(filename)
    keywords = file.read().splitlines()
    return keywords

In [93]:
class TfIdfCalculator:

    def __init__(self, keywords, documents):
        self.stemmer = PorterStemmer()
        self.keywords = self._stemming(" ".join(keywords), self.stemmer)
        self.stemmed_documents = []
        for doc in documents:
            self.stemmed_documents.append(self._stemming(doc[0] + doc[1], self.stemmer))
        self.no_of_documents = len(self.stemmed_documents)

    def _tokenize(self, text):
        text = text.lower()
        text = re.sub(r"[^A-Za-z0-9]", " ", text)
        tokens = text.split()
        return tokens


    def _stemming(self, text, stemmer):
        words = self._tokenize(text)
        stemmed_words = [stemmer.stem(x, 0, len(x) - 1) for x in words]
        return stemmed_words
        

    def _tfidf(self, term, text):
        tf = text.count(term) / max([text.count(x) for x in text])
        n = 0
        for doc in self.stemmed_documents:
            if term in doc:
                n += 1
        idf = math.log(self.no_of_documents / n)  # div by zero possible
        return tf * idf

    def calculate_tfidf(self, text):
        stemmed_text = self._stemming(text, self.stemmer)
        tfidf_vector = np.zeros(len(self.keywords))
        for term in stemmed_text:
            if term in self.keywords:
                tfidf_vector[self.keywords.index(term)] = self._tfidf(term, text)
        return tfidf_vector


In [148]:
class SearchEngine:
    
    def __init__(self, keywords, documents):
        self.tfIdfCalculator = TfIdfCalculator(keywords, documents)
        self.tfidf_by_doc_title = {}
        for doc in documents:
            self.tfidf_by_doc_title[doc[0]] = self.tfIdfCalculator.calculate_tfidf(doc[0] + doc[1])
    
    def query_documents(self, query):
        query_tfidf = self.tfIdfCalculator.calculate_tfidf(query)
        result = []
        for title, doc_tfidf in self.tfidf_by_doc_title.items():
            dist = 1 - distance.cosine(query_tfidf, doc_tfidf)
            result.append((title,  0 if math.isnan(dist) else dist))
        #return sorted(result, key=lambda title_and_distance: title_and_distance[1], reverse=True)
        return sorted(result, key=lambda title_and_distance: title_and_distance[1], reverse=True)

In [149]:
documents = list(get_documents('documents.txt'))
keywords = load_keywords('keywords.txt')
helper = TfIdfCalculator(keywords, documents)

In [150]:
engine = SearchEngine(keywords,documents)

In [151]:
engine.query_documents('The Machine Learning Network Online Information Service provides information')

  dist = 1.0 - np.dot(u, v) / (norm(u) * norm(v))


[('Kernel Machines\n', 0.58757906529148574),
 ('Home Page of the UW-Madison Machine Learning Research Group\n',
  0.57995941209826196),
 ('MLnet OiS - Find information and resources on Machine Learning, ... \n',
  0.54039529106007722),
 ('Simple Machines\n', 0.42996280012981281),
 ('The Machine Learning Dictionary\n', 0.38263897647908118),
 ('The NN learning algorithm benchmarking page\n', 0.20871510232887736),
 ('Machine Learning in Games\n', 0.0),
 ('Teaching, Machine Learning\n', 0.0),
 ('ICML 2003\n', 0),
 ('Machine Learning and Applied Statistics - Home\n', 0),
 ('Machine Learning Courses\n', 0),
 ('Yahoo! Groups : machine-learning\n', 0.0),
 ('Machine Learning at the University of Toronto\n', 0.0),
 ('IJCAI 99 Workshop: Machine Learning for Information Filtering. Workshop\n',
  0),
 ('Machine Learning Research Software\n', 0.0),
 ('IDIAP : > MachineLearning > Introduction\n', 0.0),
 ('Weka 3 - Data Mining with Open Source Machine Learning Software ... \n',
  0.0),
 ('Home Pages o