Read Files

This part opens a directory, reads every file in it, and saves each file's content as a string in a list.
It also saves the file names so they can be shown later in the results.

In [2]:
import os 

# Folder that holds every document of the collection.
path = "C:\\Users\\ASC\\OneDrive\\Desktop\\temp\\documents"
os.chdir(path)
name_of_docs = []  # document names (file name without its extension)
list_of_docs = []  # text of each document, in the same order as name_of_docs

# os.listdir() returns entries in arbitrary order; sorting keeps the document
# IDs (list positions) stable from one run to the next.
for file in sorted(os.listdir(path)):
    file_path = os.path.join(path, file)
    # Sub-directories cannot be opened as text files, so skip them.
    if not os.path.isfile(file_path):
        continue
    # splitext drops the extension whatever its length (".txt", ".md", none).
    name_of_docs.append(os.path.splitext(file)[0])
    # Read the whole file and keep its text.
    with open(file_path, 'r') as f:
        list_of_docs.append(f.read())

print("All of the documents are: ")
for name in name_of_docs:
    print(name, end=" - ")

All of the documents are: 
A Festival of Books - A Murder-Suicide - Better To Be Unlucky - Cloning Pets - Crazy Housing Prices - Food Fight Erupted in Prison - Freeway Chase Ends at Newsstand - Gasoline Prices Hit Record High - Happy and Unhappy Renters - Jerry Decided To Buy a Gun - Man Injured at Fast Food Place - Pulling Out Nine Tons of Trash - Rentals at the Oceanside Community - Sara Went Shopping - Trees Are a Threat - 

Tokenizing

In this part we go through all documents, tokenize their text,
and save the tokens of each document in a 2D list (one list of tokens per document).
The tokens can optionally be stemmed as well; stemming is skipped in this project.

In [3]:
import re
import nltk
import string

# Lower-case every document in place (the list object itself is kept).
list_of_docs[:] = [text.lower() for text in list_of_docs]

# One token list per document. The regex keeps numbers such as "1,000.5"
# together and otherwise splits on runs of word characters.
# Stemming (e.g. nltk.stem.PorterStemmer) is intentionally not applied in this project.
all_tokens = [
    re.findall(r'\d+(?:,\d+)*(?:\.\d+)?|\w+', text)
    for text in list_of_docs
]

# Report how many tokens were produced over the whole collection.
sum_of_tokens = sum(len(doc_tokens) for doc_tokens in all_tokens)
print("Total number of tokens: ", sum_of_tokens)

Total number of tokens:  4114


Delete Unnecessary Tokens

In this part we delete useless tokens, such as single-character tokens and stop words.
Here only the single-character tokens are removed; the stop-word list is loaded now and the stop words are skipped later, while the indexes are built.

In [4]:
# Drop one-letter alphabetic tokens; single digits (and anything longer) are kept.
all_tokens[:] = [
    [token for token in doc_tokens if len(token) >= 2 or not token.isalpha()]
    for doc_tokens in all_tokens
]

# Report how many tokens survive the filter.
sum_of_tokens1 = sum(len(doc_tokens) for doc_tokens in all_tokens)
print("Total number of tokens after removing single chars: ", sum_of_tokens1)

# Fetch the NLTK resources (each call reports when the package is already up to date).
nltk.download('stopwords')
nltk.download('punkt')

# English stop words as a set, so membership tests are cheap when the indexes are built.
stop_words = set(nltk.corpus.stopwords.words('english'))

Total number of tokens after removing single chars:  3887


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Data Structure for Inverted Index

Build a trie to store all the terms; for each term, its posting list is stored at the node where the term ends.

In [5]:
class TrieNode:
    """One node of the inverted-index trie."""

    def __init__(self):
        # True only when the path from the root to this node spells a stored term.
        self.is_word = False
        # Document IDs recorded for the term that ends at this node.
        self.posting_list = []
        # Outgoing edges: next character -> child TrieNode.
        self.children = {}

class Trie:
    """Character trie used as the inverted index: term -> list of document IDs."""

    def __init__(self):
        self.root = TrieNode()

    def _find_node(self, word):
        """Follow `word` from the root; return the node reached, or None if the path breaks."""
        node = self.root
        for char in word:
            node = node.children.get(char)
            if node is None:
                return None
        return node

    def insert(self, word, docID):
        """Store `word` in the trie and record `docID` in its posting list.

        A document ID that is already present is not added a second time, so
        the posting list never contains duplicates.
        """
        current_node = self.root
        for char in word:
            # Create the child for this character the first time it is needed.
            if char not in current_node.children:
                current_node.children[char] = TrieNode()
            current_node = current_node.children[char]
        # Mark the end of the word.
        current_node.is_word = True
        if docID not in current_node.posting_list:
            current_node.posting_list.append(docID)

    def search(self, word):
        """Return True if `word` was inserted as a complete term, else False."""
        node = self._find_node(word)
        return node is not None and node.is_word

    def get(self, word):
        """Return the node at the end of `word`, or None when the path does not exist.

        The node is returned even if it is only a prefix of stored terms
        (`is_word` False); use `search` to test for a complete term.
        """
        return self._find_node(word)

    def print_words(self, node=None, prefix=""):
        """Print every stored term below `node` (default: the root) with its posting list."""
        if node is None:
            node = self.root
        # A word ends here: `prefix` is the term spelled so far.
        if node.is_word:
            print(prefix, node.posting_list)
        # Visit the children, extending the prefix by the edge character.
        for char, child in node.children.items():
            self.print_words(child, prefix + char)


Data Structure for Positional Index and Permuterm Index

Build a trie that stores all terms (and their permuterm rotations); each term's posting list maps a document ID to the positions of the term in that document.

In [6]:
class TrieNode2:
    """One node of the positional / permuterm trie."""

    def __init__(self):
        # True only when the path from the root to this node spells a stored key.
        self.is_word = False
        # Positional postings of the key ending here: document ID -> list of positions.
        self.posting_list = {}
        # Outgoing edges: next character -> child TrieNode2.
        self.children = {}

class Trie2:
    """Character trie with positional postings: key -> {document ID: [positions]}."""

    def __init__(self):
        self.root = TrieNode2()

    def insert2(self, word, docID, position):
        """Store `word` and record that it occurs in `docID` at `position`."""
        current_node = self.root
        for char in word:
            # Create the child for this character the first time it is needed.
            if char not in current_node.children:
                current_node.children[char] = TrieNode2()
            current_node = current_node.children[char]
        # Mark the end of the word.
        current_node.is_word = True
        # Start a position list for a new document, or extend the existing one.
        current_node.posting_list.setdefault(docID, []).append(position)

    def search2(self, word):
        """Return True if `word` was inserted as a complete key, else False."""
        node = self.get2(word)
        return node is not None and node.is_word

    def get2(self, word):
        """Return the node at the end of `word`, or None when the path does not exist.

        The node is returned even when it is only a prefix of stored keys.
        """
        current_node = self.root
        for char in word:
            if char not in current_node.children:
                return None
            current_node = current_node.children[char]
        return current_node

    def print_words2(self, node=None, prefix=""):
        """Print every stored key below `node` (default: the root) with its postings."""
        if node is None:
            node = self.root
        # A key ends here: `prefix` is the key spelled so far.
        if node.is_word:
            print(prefix, node.posting_list)
        # Visit the children, extending the prefix by the edge character.
        for char, child in node.children.items():
            self.print_words2(child, prefix + char)

    def find_words(self, prefix):
        """Return all stored keys that start with `prefix` (empty list if none)."""
        words = []
        node = self.get2(prefix)
        # Only descend when the prefix path exists in the trie.
        if node is not None:
            collect_words(node, prefix, words)
        return words

# Gather every stored key found at or below a trie node.
def collect_words(node, prefix, words):
    """Append to `words` each key in the subtree of `node`.

    `prefix` is the string spelled by the path leading to `node`. Keys are
    appended in depth-first pre-order, children visited in insertion order.
    The list is modified in place; nothing is returned.
    """
    pending = [(node, prefix)]
    while pending:
        current, spelled = pending.pop()
        if current.is_word:
            words.append(spelled)
        # Push children reversed so the first child is popped (visited) first.
        for char, child in reversed(list(current.children.items())):
            pending.append((child, spelled + char))
        

Build Inverted Index and Positional Index

For every token, fill the inverted-index trie with the term and its document ID, and the positional-index trie with the term, its document ID and its position in the document.
Stop words are skipped in this step.

In [7]:
inverted_index = Trie()       # inverted index: term -> document IDs
positional_index = Trie2()    # positional index: term -> {document ID: positions}

sum_of_tokens2 = 0
for doc_id, doc_terms in enumerate(all_tokens):
    # Positions are 1-based and still count the stop words that get skipped.
    for position, term in enumerate(doc_terms, start=1):
        if term in stop_words:
            continue
        sum_of_tokens2 += 1

        if inverted_index.search(term):
            # Known term: add the document only if it is not listed yet.
            postings = inverted_index.get(term).posting_list
            if doc_id not in postings:
                postings.append(doc_id)
        else:
            # First occurrence of the term in the whole collection.
            inverted_index.insert(term, doc_id)

        # Every occurrence is recorded in the positional index.
        positional_index.insert2(term, doc_id, position)


print("Total number of tokens after removing stop words: ", sum_of_tokens2)

Total number of tokens after removing stop words:  2226


Boolean Query

Get the query, then lower-case and tokenize it the same way as the documents.
A two-token query is treated as NOT: the result is every document in which the given term does not appear.
For AND, intersect the posting lists of the first and second terms; for OR, take their union.

In [53]:
def boolean_query(query):
    """Evaluate a simple boolean query against the inverted index and print the hits.

    Supported shapes after tokenizing: ``not TERM`` (exactly two tokens) and
    ``TERM1 and TERM2`` / ``TERM1 or TERM2`` (the terms are the first and third
    tokens). Each hit is printed as "<doc id> <doc name>" in ascending ID order.
    Malformed queries print a message and yield an empty result instead of
    raising.
    """
    query = query.lower()                    # documents were indexed in lower case
    query_list = re.findall(r'\d+(?:,\d+)*(?:\.\d+)?|\w+', query)   # same tokenizer as the documents
    result = []                              # stays empty when nothing can be evaluated

    if len(query_list) == 2:  # not
        term = query_list[1]
        if inverted_index.search(term):
            excluded = set(inverted_index.get(term).posting_list)
            # Every document that does NOT contain the term.
            result = [i for i in range(len(name_of_docs)) if i not in excluded]
        else:
            print("Term not found.")
    elif len(query_list) >= 3:  # or / and
        list1, list2 = [], []
        # A term that is missing from the index simply has an empty posting list.
        if inverted_index.search(query_list[0]):
            list1 = inverted_index.get(query_list[0]).posting_list
        if inverted_index.search(query_list[2]):
            list2 = inverted_index.get(query_list[2]).posting_list

        # sorted() makes the output order deterministic (set order is not guaranteed).
        if "and" in query_list:
            result = sorted(set(list1) & set(list2))
        elif "or" in query_list:
            result = sorted(set(list1) | set(list2))
        else:
            print("Unknown boolean operator.")
    else:
        print("Invalid boolean query.")

    print("The result is: ")
    for r in result:
        print(r, name_of_docs[r])

# Read one boolean query from the user and print the matching documents.
query = input("Enter a boolean query: ")
boolean_query(query)



The result is: 
2 Better To Be Unlucky
4 Crazy Housing Prices


get proximity query

Get the query, tokenize it, and extract the number in the middle of it.
Then find the documents that contain both terms and check whether the number of tokens between the two terms is at most the given number.

In [54]:
def proximity_query(query):
    """Print the documents in which two terms occur close to each other.

    After tokenizing, the query must have the terms as its first and fourth
    tokens, the word "near" second and the maximum gap (an integer) third.
    A document matches when, for some pair of occurrences, the number of
    positions strictly between the two terms is at most that gap.
    """
    tokens = re.findall(r'\d+(?:,\d+)*(?:\.\d+)?|\w+', query.lower())
    max_gap = int(tokens[2])                       # the proximity limit
    first_term, second_term = tokens[0], tokens[3]   # tokens[1] is the word "near"

    def postings_of(term):
        # Positional postings of a term, or an empty mapping when it is not indexed.
        if positional_index.search2(term):
            return positional_index.get2(term).posting_list
        return {}

    first_postings = postings_of(first_term)
    second_postings = postings_of(second_term)

    # Only documents that contain both terms can match.
    shared_docs = list(set(first_postings) & set(second_postings))

    result = [
        doc_id
        for doc_id in shared_docs
        if any(
            abs(pos_a - pos_b) - 1 <= max_gap
            for pos_a in first_postings[doc_id]
            for pos_b in second_postings[doc_id]
        )
    ]

    print("the result is: ")
    for doc_id in result:
        print(doc_id, name_of_docs[doc_id])

# Read one proximity query from the user and print the matching documents.
query = input("Enter a proximity query: ")
proximity_query(query)

the result is: 
8 Happy and Unhappy Renters


Find permuterms of tokens

In [10]:
def rotate(word):
    """Return `word` with the end-of-word marker "$" appended."""
    end_marker = "$"
    marked = word + end_marker
    return marked

# Build the permuterm keys of a word
def permuterm(word):
    """Return every rotation of `word` followed by the "$" end marker.

    The first element is the marked word itself; each following element moves
    one more leading character to the end.
    """
    marked = rotate(word)   # word plus the "$" marker
    return [marked[start:] + marked[:start] for start in range(len(marked))]

Compute soundex to restrict domain of calculating distance for spell checking

In [11]:
# Consonant -> Soundex digit, applied in one pass with str.translate.
_SOUNDEX_DIGITS = str.maketrans("bfpvcgjkqsxzdtlmnr", "111122222222334556")


def compute_soundex(word):
    """Return a four-character Soundex-style code for `word`.

    The code is the upper-cased first character followed by up to three digits
    for the remaining consonants, padded with "0". Input is handled
    case-insensitively, and an empty string yields "0000" instead of raising.
    """
    if not word:
        return "0000"
    # Normalize the case so upper-case letters are filtered and mapped too.
    word = word.lower()
    # Retain the first letter and convert it to uppercase
    code = word[0].upper()
    # Remove any vowels, H, W, or Y, unless they are the first letter
    word = word[0] + "".join(char for char in word[1:] if char not in "aehiouwy")
    # Replace the remaining consonants with their corresponding digits
    word = word.translate(_SOUNDEX_DIGITS)
    # Collapse adjacent identical digits.
    # NOTE: a single replace pass turns a run of three equal digits into two;
    # this is kept as-is so codes of already indexed terms do not change.
    for digit in "123456":
        word = word.replace(digit + digit, digit)
    # The first character was already emitted as a letter, so skip its digit.
    code += word[1:]
    # Pad with zeros, then truncate, so the code is always four characters long.
    code = code.ljust(4, "0")
    code = code[:4]

    return code

Build permuterm index 

Insert each term and all of its rotations into the trie, and build a 2D list that stores each distinct term together with its soundex code.

In [12]:
permuterm_index = Trie2()   # every rotation of "term$" -> positional postings of the term
soundex_map = [[], []]      # soundex_map[0][k] is a term, soundex_map[1][k] its soundex code
_seen_terms = set()         # terms already in soundex_map; O(1) membership instead of scanning the list

for i, doc in enumerate(all_tokens):
    # Positions are 1-based and still count the stop words that get skipped.
    for count, term in enumerate(doc, start=1):
        if term in stop_words:
            continue
        if term not in _seen_terms:
            # First time this term is seen: record it once with its soundex code.
            _seen_terms.add(term)
            soundex_map[0].append(term)
            soundex_map[1].append(compute_soundex(term))
        # Insert every rotation of the term, one by one.
        for perm in permuterm(term):
            permuterm_index.insert2(perm, i, count)

Check whether two soundex codes are close to each other (they differ in at most one position)

In [13]:
def and_soundexes(code1, code2):
    """Return True when the two 4-character codes differ in at most one position."""
    mismatches = 0
    for position in range(4):
        if code1[position] != code2[position]:
            mismatches += 1
    return mismatches <= 1

Calculate the Levenshtein distance between two words

In [14]:
def levenshtein_distance(word1, word2):
    """Return the Levenshtein (edit) distance between `word1` and `word2`.

    Insertions, deletions and substitutions each cost 1. The distance is
    computed row by row with dynamic programming, so the running time is
    proportional to len(word1) * len(word2) rather than exponential.
    """
    # If one of the words is empty, the distance is the length of the other word
    if not word1:
        return len(word2)
    if not word2:
        return len(word1)

    # previous[j] = distance between the processed prefix of word1 and word2[:j]
    previous = list(range(len(word2) + 1))
    for i, char1 in enumerate(word1, start=1):
        current = [i]   # turning word1[:i] into "" takes i deletions
        for j, char2 in enumerate(word2, start=1):
            substitution = previous[j - 1] + (char1 != char2)
            current.append(min(previous[j] + 1,       # delete char1
                               current[j - 1] + 1,    # insert char2
                               substitution))         # substitute, or match for free
        previous = current
    return previous[-1]

Get the misspelled query and find the expected query using the Levenshtein distance

In [55]:
def misspelled_query(query):
    """Return the spell-corrected tokens of `query` as a list.

    A token that is already an indexed term is kept unchanged. Otherwise the
    candidates are the indexed terms whose soundex code is close to the
    token's code, and the candidate with the smallest Levenshtein distance
    wins (the earliest one on ties). A token with no candidate is returned
    unchanged.
    """
    query = query.lower()   # str.lower() returns a new string; keep the result
    query_list = re.findall(r'\d+(?:,\d+)*(?:\.\d+)?|\w+', query)
    expected_query = []
    for token in query_list:
        # Terms are stored in the permuterm trie with the "$" end marker.
        if permuterm_index.search2(token + "$"):
            expected_query.append(token)
            continue

        # Misspelled token: restrict the search to terms with a near soundex code.
        code = compute_soundex(token)
        candidates = [
            term
            for term, term_code in zip(soundex_map[0], soundex_map[1])
            if and_soundexes(code, term_code)
        ]
        if candidates:
            spell_res = min(candidates, key=lambda term: levenshtein_distance(term, token))
        else:
            spell_res = token   # nothing better to suggest
        expected_query.append(spell_res)
    return expected_query

# Read a (possibly misspelled) query and compute the suggested token for each word.
spell_query = input("Enter misspelled query: ")
expected_query = misspelled_query(spell_query)

# Print the suggested tokens on one line, separated by spaces.
for w in expected_query:
    print(w, end=" ")

festival founders 

Get wildcard query

In [56]:
def wildcard_query(query):
    """Return the indexed terms matching a wildcard pattern with one or two "*".

    ``A*B`` is answered by looking up the permuterm prefix ``B$A``.
    ``A*B*C`` looks up ``C$A`` and then keeps the terms whose part between
    ``A`` and ``C`` contains ``B``. Each matching term and its positional
    posting list are printed. Any other number of "*" gives an empty list.
    """
    query = query.lower()
    parts = query.split("*")
    star_count = len(parts) - 1
    result = []
    if star_count not in (1, 2):
        return result

    head, tail = parts[0], parts[-1]
    middle = parts[1] if star_count == 2 else ""   # "" is contained in every string

    # Stored rotations that start with "tail$head" belong to terms of the form head...tail.
    for rotation in permuterm_index.find_words(tail + "$" + head):
        after_marker_end, _, word_start = rotation.partition("$")
        word = word_start + after_marker_end        # undo the rotation
        # The part covered by the wildcard(s). An explicit end index is used
        # because a negative slice bound of -0 would always select nothing.
        covered = word[len(head):len(word) - len(tail)]
        if middle not in covered:
            continue
        print(word)
        result.append(word)
        doc_list = permuterm_index.get2(word + "$").posting_list
        print(doc_list)
    return result

# Read one wildcard query from the user, then print the list of matching terms.
query = input("Enter the wildcard query: ")
result = wildcard_query(query)
print(result)

nobody
{0: [237]}
nearby
{14: [148]}
['nobody', 'nearby']


Enhanced query

In [57]:
def enhanced_query(query):
    """Run a boolean or proximity query whose terms may be wildcards or misspelled.

    The query is split on single spaces. A term containing "*" is expanded
    with `wildcard_query`; any other term is corrected with `misspelled_query`.
    The underlying query is then run once for every combination of the
    resulting terms:

    * ``TERM1 and TERM2`` / ``TERM1 or TERM2`` -> `boolean_query`
    * ``not TERM``                              -> `boolean_query`
    * ``TERM1 <near-op> TERM2``                 -> `proximity_query`; the second
      piece is passed through unchanged and must contain "near"
      (presumably with the distance attached, e.g. ``near/3`` -- confirm
      against `proximity_query`).

    Anything else prints "Invalid enhanced query.".
    """
    query = query.lower()
    query_list = query.split(" ")

    def expand(term):
        # Candidate index terms for one query term.
        if "*" in term:
            return wildcard_query(term)
        return misspelled_query(term)

    def run_binary(run):
        # Expand both operands once, then run the query for every pair.
        result1 = expand(query_list[0])
        result2 = expand(query_list[2])
        for w1 in result1:
            for w2 in result2:
                run(w1 + " " + query_list[1] + " " + w2)

    if "and" in query_list or "or" in query_list:
        if len(query_list) < 3:
            print("Invalid enhanced query.")
            return
        run_binary(boolean_query)

    elif "not" in query_list:
        if len(query_list) < 2:
            print("Invalid enhanced query.")
            return
        for w1 in expand(query_list[1]):
            boolean_query(query_list[0] + " " + w1)

    elif len(query_list) >= 3 and "near" in query_list[1]:
        run_binary(proximity_query)

    else:
        print("Invalid enhanced query.")

# Read one enhanced query (wildcards / misspellings allowed) and run it.
query = input("Enter enhanced query: ")
enhanced_query(query)

found
{11: [191, 201]}
founders
{0: [268]}
The result is: 
0 A Festival of Books
11 Pulling Out Nine Tons of Trash
The result is: 
0 A Festival of Books
