In [None]:
"""
Indexer

CS 6200 Information Retrieval
Homework 2
Melanie Platt

Implementation of a document indexer: parses, tokenizes, and indexes documents so that ranking models can be run over them for query searching.
                                                                    
"""

In [1]:
import pickle
from collections import OrderedDict 
from itertools import islice
import os
import re

# Pick the doc-hashes file that matches the stemming setting.
STEMMING = True
DOC_HASHES_PATH = (
    "C:/6200-IR/doc_hashes.txt"
    if STEMMING
    else "C:/6200-IR/doc_hashes_nostemming.txt"
)

In [None]:
""" Part 2: Indexing (Partial Indices)"""

# 1. Read in pickle files of doc_dict with tokens for each doc
# 2. In groups of 1000 docs at a time, write two parallel files to create the inverted index: 
#    a. A catalog file with one line per term: term, offset in the parallel file, length in the parallel file
#    b. An index file listing each document that the term is in : positions in that doc
#        * Positions are separated by commas if there is more than 1
#        * Each doc is followed by a pipe


In [None]:
"""
The Indexer class collects tokens from the set of input documents and maps them to the documents that they exist in,
and the positions that they occur in that document. It also writes this token dict as two files to create an 
inverted index. This can be used with any size set of documents, but is best with 1000 or less at a time. 
"""
class Indexer:
    """
    Builds an in-memory inverted index for one group of documents and writes it to disk
    as two parallel files: a catalog (term, offset, length) and an index (the postings).
    """

    def __init__(self):
        # a dict of tokens mapped to a dict (doc --> list of positions that the term occurs in that doc)
        self.tokens = OrderedDict()

    def index_docs(self, d):
        """
        Function: index_docs
        Input: d: a dictionary mapping each document to an iterable of tokens, where every
               token is indexable as (term, position)
        Output: None
        Description: For the set of input documents, fills the inverted index (self.tokens),
        mapping each term to a dictionary of documents mapped to the list of positions recorded
        for that term in that document. Positions are appended in the order they are encountered.
        """
        for doc, tokens in d.items():
            for token in tokens:  # token is (term, position)
                term, position = token[0], token[1]
                # setdefault creates the nested containers the first time a term/doc is seen
                self.tokens.setdefault(term, {}).setdefault(doc, []).append(position)

    def sort_tokens_dict(self):
        """
        Function: sort_tokens_dict
        Input: None
        Output: None
        Description: Rebuilds self.tokens so that its terms are in ascending sorted order
        (Python's default string ordering).
        """
        self.tokens = OrderedDict(sorted(self.tokens.items(), key=lambda item: item[0]))

    def write_index_to_files(self, save_path, group_no):
        """
        Function: write_index_to_files
        Input: save_path: prefix that the file names are appended to (it is concatenated as-is,
               so a directory must end with a path separator), group_no: number added to the file names
        Output: None
        Description: Saves two parallel files, "catalog<group_no>.txt" and "inverted_index<group_no>.txt".
        The index file holds, for every term in self.tokens order, the string
        "doc:pos,pos,...|doc:pos,...|" with no separator between terms. The catalog file holds one
        line per term, "term:offset,length", where offset and length locate that term's string inside
        the index file, counted in characters.
        NOTE: a term that itself contains ':' or ',' makes its catalog line ambiguous for any reader
        that splits on every delimiter.
        """
        catalog_name = "catalog" + str(group_no) + ".txt"
        inverted_index_name = "inverted_index" + str(group_no) + ".txt"

        # `with` closes both files even if a write fails part-way through
        with open(save_path + catalog_name, "w") as catalog, \
                open(save_path + inverted_index_name, "w") as inverted_index:
            offset = 0
            for token, docs in self.tokens.items():
                # one "doc:p1,p2,...|" entry per document, joined without any extra separator
                index_string = "".join(
                    str(doc).strip() + ":" + ",".join(str(pos).strip() for pos in pos_list) + "|"
                    for doc, pos_list in docs.items()
                )

                inverted_index.write(index_string)
                catalog.write("{}:{},{}\n".format(token.strip(), offset, len(index_string)))
                offset += len(index_string)
        

In [None]:
"""
Function: split_dicts
Input: dictionary, number of keys we want in each dict (keys = doc)
Output: A generator of dicts (each sub-dict contains n keys; the last one may contain fewer)
"""
def split_dicts(d, n):
    """
    Yield successive OrderedDicts that each hold at most n items of d, in d's iteration order.

    Input: d: a dictionary, n: the number of keys wanted in each sub-dict (a positive integer)
    Output: a generator of OrderedDicts; the last one may hold fewer than n items, and an
            empty d yields nothing
    Raises: ValueError if n is less than 1. Because this is a generator, the error is raised
            when iteration starts, not when split_dicts is called.
    """
    if n < 1:
        raise ValueError("n must be a positive integer, got {!r}".format(n))

    # Walk the items once with a single shared iterator; each islice call picks up where the
    # previous chunk stopped instead of re-scanning from the start of the dictionary.
    remaining_items = iter(d.items())
    while True:
        chunk = OrderedDict(islice(remaining_items, n))
        if not chunk:
            return
        yield chunk

In [None]:
""" Main code """

# Read in doc_dict from the pickle file that matches the stemming setting.
# NOTE: pickle.load can execute arbitrary code, so only load pickle files you created yourself.
if STEMMING:
    doc_dict_path = 'C:/6200-IR/doc_dict.pickle'
else:
    doc_dict_path = 'C:/6200-IR/doc_dict_nostemming.pickle'

# `with` closes the file even if unpickling fails
with open(doc_dict_path, 'rb') as handle:
    doc_dict = pickle.load(handle)

# Create list that holds all dicts of documents with tokens (each sub-dict has up to 1000 docs in it)
split_dicts_list = list(split_dicts(doc_dict, 1000))

# Check that the list split correctly: every document must land in exactly one group
total = sum(len(doc_group) for doc_group in split_dicts_list)
assert total == len(doc_dict), "split lost or duplicated documents: {} != {}".format(total, len(doc_dict))

print("We have {} documents to index".format(total))


In [None]:
# Index every group of documents on its own:
#   * a fresh Indexer builds the dict of tokens --> {doc --> [positions of token]} for the group
#   * the dict is sorted by token
#   * the group's catalog and index files are written, numbered starting from 1
group_count = len(split_dicts_list)
print("There are {} groups of 1000 documents to index".format(group_count))

for group_no, doc_group in enumerate(split_dicts_list, start=1):
    indexer = Indexer()
    indexer.index_docs(doc_group)
    indexer.sort_tokens_dict()
    indexer.write_index_to_files("C:/6200-IR/", group_no)

print("Index files have been generated!")
    

In [None]:
""" Part 3: Merging the partial indices """

# 1. Create a list of the file names for the partial catalogs and indices
# 2. Create one larger catalog and index by cycling through the partial files, each time merging
#    another file into the larger one. Use a merging algorithm to accomplish this.
# 3. 

In [2]:
# Functions for completing part 3

"""
Function: get_files_in_dir()
Input: folder_path: a path to a folder of files
Output: file_path_list: a list of paths to each file in the folder
Description: Gets the names of all files in the folder then appends each file's path name to a list to return
"""
def get_files_in_dir(folder_path):
    """
    Return the paths of all entries in a folder, sorted by entry name.

    Input: folder_path: a path to a folder
    Output: a list with folder_path joined onto each entry name that os.listdir reports
            (sub-directories, if any, are included), in ascending name order
    """
    # os.listdir returns names in arbitrary order. Sorting makes the result deterministic, so
    # callers that pair two listings by position get the same pairing on every run.
    entry_names = sorted(os.listdir(folder_path))
    return [os.path.join(folder_path, name) for name in entry_names]


"""
Function: read_catalog()
Input: file path to the catalog file
Output: the catalog as a dictionary (term --> (offset, length))
"""
def read_catalog(file_path):
    """
    Read a catalog file into an ordered dictionary.

    Input: file_path: path to a catalog file whose lines look like "term:offset,length"
    Output: an OrderedDict mapping term --> (offset, length), in file order. Offset and length
            are returned as strings, exactly as they appear in the file (whitespace stripped).
    Raises: ValueError if a non-blank line does not have the "term:offset,length" shape.
    """
    cat = OrderedDict()
    with open(file_path, encoding="ISO-8859-1", errors='ignore') as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines, e.g. a trailing newline at the end of the file

            # Split from the right: offset and length never contain ':' or ',', but the term
            # itself might, so only the last ':' separates the term from its location.
            term, location = line.rsplit(":", 1)
            offset, length = location.split(",")

            # term --> (offset, length)
            cat[term.strip()] = (offset.strip(), length.strip())

    return cat


"""
Function: merge_indicies()
Input: merge_path: path where merged files will be saved, m_no: the number of the merge we are on, for file naming,
       cat1_list: the catalog we are adding to in the form of a list of tuples (term, (offset, len)), 
       ind1_path: the path of the index we are adding to, cat2_list: the catalog we are adding, ind2_path: the path 
       of the index we are adding
Output: None
Description: Merges two catalog and index files together and writes two new files for each in the provided directory
"""
def merge_indicies(merge_path, m_no, cat1_list, ind1_path, cat2_list, ind2_path):
    """
    Merge two catalog/index pairs into a new catalog file and a new index file.

    Input: merge_path: prefix the output file names are appended to (concatenated as-is, so a
               directory must end with a path separator)
           m_no: the merge number; the outputs are named "merged<m_no>_catalog.txt" and
               "merged<m_no>_index.txt"
           cat1_list: the catalog being added to, as a list of (term, (offset, length)) tuples
           ind1_path: path of the index file that cat1_list points into
           cat2_list: the catalog being added, in the same form as cat1_list
           ind2_path: path of the index file that cat2_list points into
    Output: None
    Description: Walks both catalogs in step, always taking the smaller term next, so both lists
    must already be sorted by term for the result to be sorted. A term present in both catalogs
    gets the postings from index 1 followed by the postings from index 2. Each output catalog
    line is "term:offset,length", locating the term's postings in the output index file.
    The output files are opened in exclusive-create mode, so FileExistsError is raised if
    either of them already exists.
    """

    def read_postings(index_file, entry):
        """Return the slice of index_file that the catalog entry (term, (offset, length)) points at."""
        start, length = entry[1]
        index_file.seek(int(start))
        return index_file.read(int(length))

    def append_entry(term, postings):
        """Write one term's postings to the merged index and its location to the merged catalog."""
        nonlocal new_offset
        merged_cat.write(term + ":" + str(new_offset) + "," + str(len(postings)) + "\n")
        merged_ind.write(postings)
        new_offset += len(postings)

    # Name the outputs from the m_no argument, so the function does not depend on a global counter
    merged_cat_path = merge_path + "merged" + str(m_no) + "_catalog.txt"
    merged_ind_path = merge_path + "merged" + str(m_no) + "_index.txt"

    # Open every file once and let `with` close all of them, even if the merge fails part-way.
    # NOTE(review): offsets are used as character positions with seek() on text-mode files, which
    # presumably relies on the index files being single-byte text without newlines - confirm.
    with open(merged_cat_path, "x") as merged_cat, \
            open(merged_ind_path, "x") as merged_ind, \
            open(ind1_path, "r") as f1, \
            open(ind2_path, "r") as f2:

        # pointers to the term we are on in each partial catalog
        p1 = 0
        p2 = 0

        # offset of the next write into the merged index file
        new_offset = 0

        # loop until we have finished the terms in one of the catalogs
        while p1 < len(cat1_list) and p2 < len(cat2_list):
            entry1, entry2 = cat1_list[p1], cat2_list[p2]
            if entry1[0] == entry2[0]:
                # same term in both: index 1's postings first, then index 2's
                append_entry(entry1[0].strip(), read_postings(f1, entry1) + read_postings(f2, entry2))
                p1 += 1
                p2 += 1
            elif entry1[0] > entry2[0]:
                # the term from catalog 2 sorts first
                append_entry(entry2[0].strip(), read_postings(f2, entry2))
                p2 += 1
            else:
                # the term from catalog 1 sorts first
                append_entry(entry1[0].strip(), read_postings(f1, entry1))
                p1 += 1

        # finish appending whatever is left over in either catalog (at most one has leftovers)
        for entry1 in cat1_list[p1:]:
            append_entry(entry1[0].strip(), read_postings(f1, entry1))
        for entry2 in cat2_list[p2:]:
            append_entry(entry2[0].strip(), read_postings(f2, entry2))

    print("Merging complete! The documents for merge {} have been saved.".format(m_no))


In [3]:
# Folders that hold the partial catalogs and the partial inverted indicies
partial_catalogs_path = 'C:/6200-IR/homework-2-mplatt27/inverted-index-files/catalogs/'
partial_indicies_path = 'C:/6200-IR/homework-2-mplatt27/inverted-index-files/indicies/'

# Folder that receives the intermediate files produced by each merge
merging_file_path = 'C:/6200-IR/homework-2-mplatt27/merging/'

# List the files found in each folder
catalogs = get_files_in_dir(partial_catalogs_path)
indicies = get_files_in_dir(partial_indicies_path)

# Report how many of each were found
catalog_count, index_count = len(catalogs), len(indicies)
print("We have {} catalogs".format(catalog_count))
print("We have {} inverted indicies".format(index_count))


We have 85 catalogs
We have 85 inverted indicies


In [4]:
# Merge all files together

# The merge pairs catalogs[i] with indicies[i], so both listings must be non-empty and equally long
if not catalogs or len(catalogs) != len(indicies):
    raise ValueError(
        "Expected matching, non-empty catalog and index listings; got {} catalogs and {} indicies".format(
            len(catalogs), len(indicies)
        )
    )

# get first catalog and read in as dict and list; get corresponding index
# we will iteratively merge every other partial catalog/index into this one
full_cat_file = catalogs[0]  # the file name of the first partial catalog
full_cat_dict = read_catalog(full_cat_file)  # a dictionary of the first catalog
full_cat_list = list(full_cat_dict.items())  # (term, (offset, length))
full_index_file = indicies[0]  # the file name of the corresponding first index

# merge_no is the number of the merge being written; it stays 0 when there is nothing to merge,
# and after the loop it holds the number of the last merge that was actually written
merge_no = 0
for merge_no, (par_cat2_file, par_ind2_file) in enumerate(zip(catalogs[1:], indicies[1:]), start=1):

    # read the catalog to add in as dict and list
    par_cat2_dict = read_catalog(par_cat2_file)
    par_cat2_list = list(par_cat2_dict.items())

    # merge together and write to file (merged catalog and index)
    merge_indicies(merging_file_path, merge_no, full_cat_list, full_index_file, par_cat2_list, par_ind2_file)

    # the files we just wrote become the full catalog/index for the next round
    full_cat_file = merging_file_path + "merged" + str(merge_no) + "_catalog.txt"
    full_cat_dict = read_catalog(full_cat_file)
    full_cat_list = list(full_cat_dict.items())
    full_index_file = merging_file_path + "merged" + str(merge_no) + "_index.txt"

if merge_no:
    print("All docs have been merged. File number {} has the full merged index.".format(merge_no))
else:
    print("Only one partial index was found, so there was nothing to merge.")

Merging complete! The documents for merge 1 have been saved.
Merging complete! The documents for merge 2 have been saved.
Merging complete! The documents for merge 3 have been saved.
Merging complete! The documents for merge 4 have been saved.
Merging complete! The documents for merge 5 have been saved.
Merging complete! The documents for merge 6 have been saved.
Merging complete! The documents for merge 7 have been saved.
Merging complete! The documents for merge 8 have been saved.
Merging complete! The documents for merge 9 have been saved.
Merging complete! The documents for merge 10 have been saved.
Merging complete! The documents for merge 11 have been saved.
Merging complete! The documents for merge 12 have been saved.
Merging complete! The documents for merge 13 have been saved.
Merging complete! The documents for merge 14 have been saved.
Merging complete! The documents for merge 15 have been saved.
Merging complete! The documents for merge 16 have been saved.
Merging complete!