In [42]:
import math
import re
from collections import Counter

import nltk
import numpy as np
from nltk.corpus import wordnet as wn
from scipy.spatial import distance

from porter import PorterStemmer


def get_documents(filename):
    """Lazily read blank-line-separated documents from a text file.

    The first line of each record is its header; every following line up to
    the next empty line is appended to its body. Lines keep their trailing
    newline characters.

    Args:
        filename: Path of the text file to read.

    Yields:
        list: A two-element list ``[header, body]`` per record. One record is
        emitted for every empty line encountered and one more at end of file,
        so consecutive empty lines (or a trailing one) produce ``['', '']``.
    """
    document = ['', '']  # [header, body]
    header = True
    # The context manager guarantees the handle is released once the file is
    # exhausted or the generator is closed early.
    with open(filename) as file:
        for line in file:
            if line == '\n':
                yield document
                document = ['', '']
                header = True
                continue
            if header:
                document[0] = line
                header = False
            else:
                document[1] += line
    yield document


def load_keywords(filename):
    """Read a keyword list from a text file, one keyword per line.

    Args:
        filename: Path of the text file to read.

    Returns:
        list: The file's lines in order, without their line terminators.
        Empty lines are kept as empty strings.
    """
    # Use a context manager so the file handle is always closed.
    with open(filename) as file:
        return file.read().splitlines()

In [43]:
# Text utilities: tokenization, cleaning, stemming and POS-tag mapping.
def tokenize(text):
    """Split ``text`` into lower-case tokens.

    The text is lower-cased, passed through ``clear`` and then split on
    whitespace.

    Args:
        text: The string to tokenize.

    Returns:
        list: The resulting tokens, in order of appearance.
    """
    return clear(text.lower()).split()


def clear(text):
    """Blank out everything except ASCII letters and digits.

    Each character that is not ``A-Z``, ``a-z`` or ``0-9`` is replaced by a
    single space, then leading and trailing whitespace is stripped.

    Args:
        text: The string to clean.

    Returns:
        str: The cleaned string (interior runs of spaces are not collapsed).
    """
    blanked = re.sub(r"[^A-Za-z0-9]", " ", text)
    return blanked.strip()


def stemming(text, stemmer):
    """Tokenize ``text`` and stem every token.

    Args:
        text: The string to process; it is split with ``tokenize``.
        stemmer: Object exposing ``stem(word, start, end)``; it is called with
            the inclusive index range covering the whole word.

    Returns:
        list: One stemmed value per token, in token order.
    """
    stems = []
    for word in tokenize(text):
        last_index = len(word) - 1
        stems.append(stemmer.stem(word, 0, last_index))
    return stems


def get_wordnet_pos(tag):
    """Map a part-of-speech tag to a WordNet POS constant by its prefix.

    Tags starting with ``J``, ``V``, ``N`` and ``R`` map to ``wn.ADJ``,
    ``wn.VERB``, ``wn.NOUN`` and ``wn.ADV`` respectively.

    Args:
        tag: The part-of-speech tag string.

    Returns:
        The matching ``wn`` constant, or ``''`` when no prefix matches.
    """
    # Checked in order; the attribute is only looked up on a match.
    prefix_to_attribute = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, attribute in prefix_to_attribute:
        if tag.startswith(prefix):
            return getattr(wn, attribute)
    return ''

In [44]:
class Document:
    """Plain record for one document and its pre-computed stemmed content."""

    def __init__(self, id, title, body, stemmed_content):
        """Store the given values unchanged on the instance.

        Args:
            id: Identifier of the document.
            title: Title text.
            body: Body text.
            stemmed_content: Stemmed representation of the document, kept
                as given.
        """
        self.id, self.title, self.body = id, title, body
        self.stemmed_content = stemmed_content

    def content(self):
        """Return the title immediately followed by the body."""
        full_text = self.title + self.body
        return full_text

In [45]:
class TfIdfCalculator:
    """Compute TF-IDF vectors over a fixed keyword vocabulary.

    The keyword list defines the vector dimensions; the document collection
    supplies the document frequencies. Both are expected to stay unchanged
    after construction, because lookup tables derived from them are cached.
    """

    def __init__(self, stemmed_keywords, stemmed_documents):
        """Set up the calculator.

        Args:
            stemmed_keywords: List of stemmed keywords; position ``i`` of
                every produced vector corresponds to ``stemmed_keywords[i]``.
            stemmed_documents: Collection of documents, each a list of
                stemmed tokens.
        """
        self.keywords = stemmed_keywords
        self.stemmed_documents = stemmed_documents
        self.no_of_documents = len(self.stemmed_documents)
        # First position of every keyword. Using the first occurrence keeps
        # the same slot that ``list.index`` would pick for duplicates.
        self._keyword_index = {}
        for position, keyword in enumerate(self.keywords):
            self._keyword_index.setdefault(keyword, position)
        # term -> number of documents containing it, filled on demand.
        self._document_frequency_cache = {}

    def _document_frequency(self, term):
        """Return in how many documents ``term`` occurs (memoized)."""
        frequency = self._document_frequency_cache.get(term)
        if frequency is None:
            frequency = sum(1 for doc in self.stemmed_documents if term in doc)
            self._document_frequency_cache[term] = frequency
        return frequency

    def _tfidf(self, term, text, counts=None):
        """Return the TF-IDF weight of ``term`` within ``text``.

        Term frequency is the count of ``term`` divided by the count of the
        most frequent token in ``text``; inverse document frequency is the
        natural log of ``no_of_documents / document_frequency``.

        Args:
            term: The stemmed term to weigh.
            text: List of stemmed tokens the term is weighed in.
            counts: Optional pre-computed ``Counter(text)`` to avoid
                recounting when weighing many terms of the same text.

        Returns:
            float: The weight, or ``0.0`` when ``text`` is empty or the term
            occurs in no document (previously a ``ZeroDivisionError``).
        """
        if counts is None:
            counts = Counter(text)
        if not counts:
            return 0.0
        containing_documents = self._document_frequency(term)
        if containing_documents == 0:
            # A term unseen in the collection carries no usable weight.
            return 0.0
        tf = counts[term] / max(counts.values())
        idf = math.log(self.no_of_documents / containing_documents)
        return tf * idf

    def calculate_tfidf(self, stemmed_text):
        """Build the TF-IDF vector of a stemmed text.

        Args:
            stemmed_text: List of stemmed tokens.

        Returns:
            numpy.ndarray: Vector of length ``len(self.keywords)``. Entries
            for keywords absent from ``stemmed_text`` are zero; a keyword
            listed more than once only fills its first position.
        """
        tfidf_vector = np.zeros(len(self.keywords))
        counts = Counter(stemmed_text)
        # Each distinct term is weighed once; the token counts are shared.
        for term in counts:
            position = self._keyword_index.get(term)
            if position is not None:
                tfidf_vector[position] = self._tfidf(term, stemmed_text, counts)
        return tfidf_vector


In [46]:
class SearchEngine:
    
    def __init__(self, keywords, documents):
        self.stemmer = PorterStemmer()
        self.stemmed_keywords = stemming(" ".join(keywords), self.stemmer)
        self.documents_by_id = self._stemmed_docs_by_id(documents)
        self.tfIdfCalculator = TfIdfCalculator(self.stemmed_keywords, [doc.stemmed_content for doc in self.documents_by_id.values()])
        self._tfidf_by_id = self._tfidf_by_id()

    
    def _stemmed_docs_by_id(self, documents):
        documents_by_id = {}
        for idx, doc in enumerate(documents):
            stemmed_doc = Document(idx, clear(doc[0]), doc[1],  stemming(doc[0] + doc[1], self.stemmer))
            documents_by_id[stemmed_doc.id] = stemmed_doc
        return documents_by_id
    
    def _tfidf_by_id(self):
        tfidf_by_id = {}
        for id, doc in self.documents_by_id.items():
            tfidf_by_id[id] = self.tfIdfCalculator.calculate_tfidf(doc.stemmed_content) 
        return tfidf_by_id
    
    
    def get_query_expansions(self, query):
        tokenized_query = tokenize(query)
        pos_tags = [get_wordnet_pos(t[1]) for t in nltk.pos_tag(tokenized_query)]
        synonyms_by_token = {}
        for token, pos_tag in zip(tokenized_query, pos_tags):
            synonyms = [l.name() for s in wn.synsets(token, pos=pos_tag)[:3] for l in s.lemmas() if l.name() != token]
            synonyms = list(set(synonyms))
            synonyms = [s.lower().replace('_', ' ') for s in synonyms]
            synonyms_by_token[token] = synonyms
        extended_queries = []
        extended_queries.append(query)
        for idx, token in enumerate(tokenized_query):
            for synonym in synonyms_by_token[token]:
                added_synonym = tokenized_query.copy() + [synonym]
                replaced_by_synonym = tokenized_query.copy()
                replaced_by_synonym[idx] = synonym
                extended_queries.append(' '.join(added_synonym))
                extended_queries.append(' '.join(replaced_by_synonym))
        return extended_queries
    
    
    def print_stemmed_doc(self, title):
        for id, doc in self.documents_by_id.items():
            if doc.title == title:
                print("Title:{}\nContent:{}".format(doc.title,doc.stemmed_content))
    
    def query_documents(self, query):
        query_tfidf = self.tfIdfCalculator.calculate_tfidf(stemming(query, self.stemmer))
        result = []
        for id, doc_tfidf in self._tfidf_by_id.items():
            similarity = 1 - distance.cosine(query_tfidf, doc_tfidf)
            result.append((self.documents_by_id[id].title,  0 if math.isnan(similarity) else similarity))
        for title, score in sorted(result, key=lambda title_and_similarity: title_and_similarity[1], reverse=True):
            print("Title: {}\nScore: {}\n".format(title,score))

In [47]:
# Load the corpus and the keyword list from files given by relative paths,
# then build the search engine (which stems and indexes every document).
documents = list(get_documents('documents.txt'))
keywords = load_keywords('keywords.txt')
search = SearchEngine(keywords,documents)

In [48]:
# Print the stemmed tokens stored for the document titled 'Kernel Machines'.
search.print_stemmed_doc('Kernel Machines')

Title:Kernel Machines
Content:['kernel', 'machin', 'descript', 'a', 'central', 'sourc', 'of', 'inform', 'on', 'kernel', 'base', 'method', 'includ', 'support', 'vector', 'machin', 'gaussian']


In [49]:
# Print every indexed document's title with its similarity score for this
# query, highest score first.
search.query_documents('The Machine Learning Network Online Information Service provides information')

Title: Training   Machine Learning network Online Information Service
Score: 0.7372282736923004

Title: MLnet OiS   Find information and resources on Machine Learning
Score: 0.5580235548154249

Title: Machine Learning for Information Retrieval  Neural Networks
Score: 0.43154992335958675

Title: Kernel Machines
Score: 0.396182503101545

Title: Machine Learning for Information Extraction
Score: 0.3622035054837187

Title: Machine Learning and Information Retrieval  Belew Shavlik
Score: 0.34126844538586076

Title: IJCAI 99 Workshop  Machine Learning for Information Filtering  Workshop
Score: 0.30642135295122996

Title: Machine Learning
Score: 0.2853461306482754

Title: Machine Learning and Neural Networks Group   Universities of
Score: 0.2725552926732029

Title: The Machine Learning Dictionary
Score: 0.2683705107604477

Title: Simple Machines
Score: 0.2553517300539845

Title: BYU Neural Networks and Machine Learning Lab
Score: 0.25382319565733646

Title: AAAI Spring Symposium  Machine Lear

In [50]:
# NOTE(review): this rebinds `search` to an engine with no keywords and no
# documents, replacing the engine built from documents.txt / keywords.txt.
# Re-running the earlier query cells after this one will use the empty engine.
search = SearchEngine([], [])

In [51]:
# Generate synonym-based variants of the query; this uses only the query text
# and WordNet, not the engine's keywords or documents.
search.get_query_expansions('Call the dog')

['Call the dog',
 'call the dog telephone',
 'telephone the dog',
 'call the dog phone',
 'phone the dog',
 'call the dog call up',
 'call up the dog',
 'call the dog ring',
 'ring the dog',
 'call the dog name',
 'name the dog',
 'call the dog canis familiaris',
 'call the canis familiaris',
 'call the dog frump',
 'call the frump',
 'call the dog domestic dog',
 'call the domestic dog']