# Corpus

In [13]:
doc0 = "Big cats are nice and funny."
doc1 = "Small dogs are better than big dogs."
doc2 = "Small cats are afraid of small dogs."
doc3 = "Big cats are not afraid of small dogs"
doc4 = "Funny cats are not afraid of small dogs"
corpus = [doc0, doc1, doc2, doc3, doc4]

# Preprocessing 

In [14]:
from string import punctuation 
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from nltk.tag import pos_tag

#nltk.download('wordnet')
#nltk.download('averaged_perceptron_tagger')

lemmatizer = WordNetLemmatizer() 

stopwords = stopwords.words('english')

punctuation = "["+punctuation+"]"

#Remove punctuation 
def remove_punctuation(string) :
    return(re.sub(punctuation, "", string))

#Remove stopwords
def remove_stopwords(tokens) :
    without_stopwords = []
    for token in tokens :
        if token not in stopwords :
            without_stopwords.append(token.lower())
    return without_stopwords

#Lemmatize
def lemmatize(tokens) :
    lemmas = []
    pos = pos_tag(tokens)
    for word, tag in pos :
        tag_starting = tag[0].lower()
        tag_starting = tag_starting if tag_starting in ['a', 'r', 'n', 'v'] else None
        if not tag_starting :
            lemma = word
        else :
            lemma = lemmatizer.lemmatize(word, tag_starting)
        lemmas.append(lemma)
    return lemmas

In [15]:
#Dictionary of document:[words]
doc_terms = {}
#Getting terms in each document(Removing spacing)
doc_no = 0
for doc in corpus :
    doc_terms[doc_no] = []
    doc = remove_punctuation(doc.lower())
    doc_terms[doc_no]= doc.split()
    print(doc_terms[doc_no])
    doc_no += 1

['big', 'cats', 'are', 'nice', 'and', 'funny']
['small', 'dogs', 'are', 'better', 'than', 'big', 'dogs']
['small', 'cats', 'are', 'afraid', 'of', 'small', 'dogs']
['big', 'cats', 'are', 'not', 'afraid', 'of', 'small', 'dogs']
['funny', 'cats', 'are', 'not', 'afraid', 'of', 'small', 'dogs']


In [16]:
indexes = []
for doc, terms in doc_terms.items() :
    terms = remove_stopwords(terms)
    #print(doc_terms[doc])
    terms = lemmatize(terms)
    doc_terms[doc] = terms
    #print(doc_terms[doc])
    indexes.extend(terms) 
indexes = list(set(indexes))

print(doc_terms)
print(indexes)

{0: ['big', 'cat', 'nice', 'funny'], 1: ['small', 'dog', 'well', 'big', 'dog'], 2: ['small', 'cat', 'afraid', 'small', 'dog'], 3: ['big', 'cat', 'afraid', 'small', 'dog'], 4: ['funny', 'cat', 'afraid', 'small', 'dog']}
['funny', 'dog', 'cat', 'big', 'afraid', 'well', 'nice', 'small']


# Term Incidence Matrix

In [17]:
import numpy as np
import pandas as pd

term_incidence = {}
for term in indexes :
    term_incidence[term] = np.array([0]*len(corpus))
    for i in range(len(corpus)) :
        if term in doc_terms[i] :
            term_incidence[term][i] = 1

for term, incidence_vector in term_incidence.items() :
    print(term,":", incidence_vector)

funny : [1 0 0 0 1]
dog : [0 1 1 1 1]
cat : [1 0 1 1 1]
big : [1 1 0 1 0]
afraid : [0 0 1 1 1]
well : [0 1 0 0 0]
nice : [1 0 0 0 0]
small : [0 1 1 1 1]


In [18]:
def And(top1, top2) :
    res = []
    for i in range(len(top1)):
        if(top1[i] == 0 or top2[i] == 0) :
            res.append(0)
        else :
            res.append(1)
    return res

def Or(top1, top2) :
    res = []
    for i in range(len(top1)):
        if(top1[i] == 1 or top2[i] == 1) :
            res.append(1)
        else :
            res.append(0)
    return res

In [19]:
OPERATORS = set(["and", "or", "not", "(", ")"])
PRIORITY = {"and":1, "or":1, "not":2}

def infix_to_postfix(query):
    output = []
    stack = []
    
    for term in query :
        if term not in OPERATORS :
            output.append(term)
        elif(term == "(") :
            stack.append(term)
        elif(term == ")") :
            while stack and stack[-1]!= '(':
                output.append(stack.pop())
            stack.pop()
        else :
            while stack and stack[-1]!='(' and PRIORITY[term]<=PRIORITY[stack[-1]]:
                output.append(stack.pop())
            stack.append(term)
    while stack: 
        output.append(stack.pop())
    return output

In [20]:
infix_to_postfix("( not cat and dog ) and dog".split())

['cat', 'not', 'dog', 'and', 'dog', 'and']

In [25]:
from nltk.corpus import wordnet as wn

def synonyms(word) :
    synonyms = []
    for ss in wn.synsets(word) :
        for lemma in ss.lemmas() :
            synonyms.append(lemma.name())
    return list(set(synonyms))

In [29]:
operations = ["not", "and", "or"]
def query_processing(query) :
    stack = []
    postfix = infix_to_postfix(query)
    if(not set(postfix).intersection(operations)) :
        postfix.extend(["and"]*(len(postfix)-1))
    print(postfix)
    for term in postfix :
        #print(stack)
        if term not in operations :
            if term in indexes :
                stack.append(list(term_incidence[term]))
            elif set(lemmatize(synonyms(term))).intersection(indexes):
                term = list(set(lemmatize(synonyms(term))).intersection(indexes))[0]
                stack.append(list(term_incidence[term]))
            else :
                stack.append([0]*len(corpus))
        else :
            if(term == "not") :
                if(len(stack) >= 1) :
                    top = stack.pop()
                    top = [not i for i in top]
                    #print("not", top)
                    stack.append([1 if i else 0 for i in top])
                else :
                    return []
            elif(term == "and") :
                if(len(stack) >= 2) :
                    top1 = stack.pop()
                    top2 = stack.pop()
                    #print("and", top1, top2)
                    stack.append(And(top1, top2))
                else :
                    return []
            elif(term == "or") :
                if(len(stack) >= 2) :
                    top1 = stack.pop()
                    top2 = stack.pop()
                    #print("or", top1, top2)
                    stack.append(Or(top1, top2))
                else :
                    return []
    if(len(stack) != 1) :
        return []
    return stack[-1]

In [23]:
def get_query() :
    query = input()
    query = query.lower()
    #query = remove_stopwords(query.split())
    query = lemmatize(query.split())
    #print(query)
    common_terms = list(set(query).intersection(indexes))
    for term in query :
        common_terms.extend(set(lemmatize(synonyms(term))).intersection(indexes))
    if(common_terms) :
        if(len(query) == 1) :
            relevance_vector = term_incidence[query[0]]
        else :
            relevance_vector = query_processing(query)
        print(relevance_vector)
        if(len(relevance_vector)) :
            print("Relevant documents :")
            i = 0
            for relevance in relevance_vector :
                if(relevance) :
                    print(corpus[i])
                i += 1
        else :
            print("Either type a valid boolean query, or without any of the operators")
            get_query()
    else :
        print("None of the query terms match the corpus")

In [27]:
syns = synonyms('funny')
print(syns)
syns = synonyms('small')
print(syns)

['shady', 'amusing', 'laughable', 'funny_story', 'fishy', 'comical', 'funny', 'singular', 'suspicious', 'curious', 'risible', 'rum', 'peculiar', 'suspect', 'queer', 'funny_remark', 'rummy', 'mirthful', 'good_story', 'comic', 'odd']
['diminished', 'modest', 'minor', 'lowly', 'minuscule', 'small-scale', 'little', 'pocket-sized', 'low', 'humble', 'pocket-size', 'belittled', 'small']


In [31]:
get_query()

funny and small
['funny', 'small', 'and']
[0, 0, 0, 0, 1]
Relevant documents :
Funny cats are not afraid of small dogs


In [32]:
get_query()

amusing and minuscule
['amusing', 'minuscule', 'and']
[0, 0, 0, 0, 1]
Relevant documents :
Funny cats are not afraid of small dogs


In [34]:
get_query()

( not cat and dog ) and dog
['cat', 'not', 'dog', 'and', 'dog', 'and']
[0, 1, 0, 0, 0]
Relevant documents :
Small dogs are better than big dogs.
