In [None]:
"""
Indexer - Compressed

CS 6200 Information Retreival 
Homework 2
Melanie Platt

Implementation of a document indexer with compression using zlib. 
Parse, tokenize, and index documents to use ranking models on for query searching.

This will use the catalog and inverted index files generated to create the stemmed index, before merging. Compression
will be done in parallel with the merging step. 
                                                                    
"""

In [1]:
import zlib 
from collections import OrderedDict 
from itertools import islice
import os
import re

STEMMING = True

# these should actually be the same
if STEMMING:
    DOC_HASHES_PATH = "C:/6200-IR/doc_hashes.txt"
else:
    DOC_HASHES_PATH = "C:/6200-IR/doc_hashes_nostemming.txt"

In [None]:
""" Part 4: Merging the partial indicies with compression """

# 1. Create a list of the file names for the partial catalog and indicies
# 2. Create one larger catalog and index by cycling through partial files, each time appending
#    another file to the larger one. Use merging algorithm to accomplish this. Add compressed rather than regular string
#    of data as we go.

In [2]:
# Functions for completing part 3

"""
Function: get_files_in_dir()
Input: folder_path: a path to a folder of files
Output: file_path_list: a list of paths to each file in the folder
Description: Gets the names of all files in the folder then appends each file's path name to a list to return
"""
def get_files_in_dir(folder_path):
    # gets all names of files in directory
    file_list = os.listdir(folder_path)

    # append them to list with their full paths
    file_path_list = []
    for file in file_list:
        file_path_list.append(os.path.join(folder_path, file))

    return file_path_list


"""
Function: read_catalog()
Input: file path to the catalog file
Output: the catalog as a dictionary (term --> (offset, length))
"""
def read_catalog(file_path):
    cat = OrderedDict()
    with open(file_path, encoding="ISO-8859-1", errors='ignore') as f:
        for line in f:
            line_list = re.split(r'[:,]', line)
            # term --> (offset, length)
            cat[line_list[0].strip()] = (line_list[1].strip(), line_list[2].strip())
            
    f.close()
        
    return cat


"""
Function: merge_indicies()
Input: merging_file_path: path where merged files will be saved,  m_no: the number merge we are on for file naming,
       cat1_list: the catalog we are adding to in the form of a list of tuples (term, offset, len), 
       ind1_path: the path of the index we are adding to, cat2_listL the catalog we are adding, ind2_path: the path 
       of the index we are adding
Output: None
Description: Merges two catalog and index files together and writes two new files for each in the provided directory
"""
def merge_indicies(t, merge_path, m_no, cat1_list, ind1_path, cat2_list, ind2_path):
    
    # initialize files that we will write the merged catalog and index to
    merged_cat = open(merge_path + "merged" + str(m_no) + "_catalog.txt", "x")
    merged_ind = open(merge_path + "merged" + str(m_no) + "_index.txt", "wb") # index file will be compressed

    # pointers to the term we are on in each partial catalog
    p1 = 0 
    p2 = 0
    
    # offset for saving the index file
    new_offset = 0
    
    # loop until we have finished the terms in the shorter catalog
    while (p1 < len(cat1_list) and p2 < len(cat2_list)):
        if cat1_list[p1][0] == cat2_list[p2][0]:

            # get the start and offsets for each
            term = cat1_list[p1][0].strip()
            cat1_start = cat1_list[p1][1][0]
            cat1_len = cat1_list[p1][1][1]
            cat2_start = cat2_list[p2][1][0]
            cat2_len = cat2_list[p2][1][1]

            # get the doc information for each
            if t == 1:
                f1 = open(ind1_path, "r")
            else:
                f1 = open(ind1_path, "rb")
                
            f2 = open(ind2_path, "r")
            f1.seek(int(cat1_start))
            f2.seek(int(cat2_start))
            p1_doc_info = f1.read(int(cat1_len))
            p2_doc_info = f2.read(int(cat2_len))
            f1.close()
            f2.close()

            # decompress info if after first file
            if t != 1:
                p1_doc_info = zlib.decompress(p1_doc_info)
                p1_doc_info = str(p1_doc_info, 'utf-8')

            # combine it
            new_info = p1_doc_info + p2_doc_info

            # compress nad write to files
            new_info_c = zlib.compress(new_info.encode('utf-8'),6)
            merged_cat.write(term + ":" + str(new_offset).strip() + "," + str(len(new_info_c)) + "\n")
            merged_ind.write(new_info_c)

            p1 += 1
            p2 += 1
            new_offset += len(new_info_c)

                
        elif cat1_list[p1][0] > cat2_list[p2][0]:
                      
            # just append from cat2 (will always be txt file)
            term = cat2_list[p2][0].strip()
            cat2_start = cat2_list[p2][1][0]
            cat2_len = cat2_list[p2][1][1]
            f2 = open(ind2_path, "r")
            f2.seek(int(cat2_start))
            p2_doc_info = f2.read(int(cat2_len))
            f2.close()

            # compress and write to files
            p2_doc_info_c = zlib.compress(p2_doc_info.encode('utf-8'),6)
            merged_cat.write(term + ":" + str(new_offset).strip() + "," + str(len(p2_doc_info_c)) + "\n")
            merged_ind.write(p2_doc_info_c)
            p2 += 1
            new_offset += len(p2_doc_info_c)
        else:
                      
            # just append from cat1
            term = cat1_list[p1][0].strip()
            cat1_start = cat1_list[p1][1][0]
            cat1_len = cat1_list[p1][1][1]
            
            if t == 1:
                f1 = open(ind1_path, "r")
            else:
                f1 = open(ind1_path, "rb")
            f1.seek(int(cat1_start))
            p1_doc_info = f1.read(int(cat1_len))
            f1.close()
            
            # copmress if from first file
            if t == 1:
                p1_doc_info = zlib.compress(p1_doc_info.encode('utf-8'),6)

            # write to files
            merged_cat.write(term + ":" + str(new_offset).strip() + "," + str(len(p1_doc_info)) + "\n")
            merged_ind.write(p1_doc_info)
            p1 += 1
            new_offset += len(p1_doc_info)


    # finish appending whatever is left over in each file
    while p1 < len(cat1_list):
        term = cat1_list[p1][0].strip()
        cat1_start = cat1_list[p1][1][0]
        cat1_len = cat1_list[p1][1][1]
        
        if t == 1:
            f1 = open(ind1_path, "r")
        else:
            f1 = open(ind1_path, "rb")
        f1.seek(int(cat1_start))
        p1_doc_info = f1.read(int(cat1_len))
        f1.close()
        
        # copmress if from first file
        if t == 1:
            p1_doc_info = zlib.compress(p1_doc_info.encode('utf-8'),6)

        merged_cat.write(term + ":" + str(new_offset).strip() + "," + str(len(p1_doc_info)) + "\n")
        merged_ind.write(p1_doc_info)
        p1 += 1
        new_offset += len(p1_doc_info)

    while p2 < len(cat2_list):
        term = cat2_list[p2][0].strip()
        cat2_start = cat2_list[p2][1][0]
        cat2_len = cat2_list[p2][1][1]
        f2 = open(ind2_path, "r")
        f2.seek(int(cat2_start))
        p2_doc_info = f2.read(int(cat2_len))
        f2.close()
        
        # compress and write to files
        p2_doc_info_c = zlib.compress(p2_doc_info.encode('utf-8'),6)
        merged_cat.write(term + ":" + str(new_offset).strip() + "," + str(len(p2_doc_info_c)) + "\n")
        merged_ind.write(p2_doc_info_c)
        p2 += 1
        new_offset += len(p2_doc_info_c)

    merged_cat.close()
    merged_ind.close()
    print("Merging complete! The documents for merge {} have been saved.".format(m_no))


In [3]:
# file paths for partial catalog and indicies
partial_catalogs_path = 'C:/6200-IR/homework-2-mplatt27/inverted-index-files/stemmed/catalogs/'
partial_indicies_path = 'C:/6200-IR/homework-2-mplatt27/inverted-index-files/stemmed/indicies/'

# file path for saving files as we merge them
merging_file_path = 'C:/6200-IR/homework-2-mplatt27/merging/'

# collect list of file names
catalogs = get_files_in_dir(partial_catalogs_path)
indicies = get_files_in_dir(partial_indicies_path)

print("We have {} catalogs".format(len(catalogs)))
print("We have {} inverted indicies".format(len(indicies)))

We have 85 catalogs
We have 85 inverted indicies


In [5]:
# Merge all files together

# get first catalog and read in as dict and list; get corresponding index
# we will iterativly add to this one
full_cat_file = catalogs[0] # the file name of the first partial catalog
full_cat_dict = read_catalog(full_cat_file) # a dictionary of the first catalog
full_cat_list = list(full_cat_dict.items()) # (term, (start, offset))
full_index_file = indicies[0] # the file name of the corresponding first index

# iterate 84 times to merge each catalog and index to the larger one
merge_no = 1
t = 1
for i in range(1,len(catalogs)): 
    
    # get catalog to add in read in as dict and list; get corresponding index
    par_cat2_file = catalogs[i] # file name of the second catalog (10)
    par_cat2_dict = read_catalog(par_cat2_file) # a dict of the second catalog
    par_cat2_list = list(par_cat2_dict.items()) # a list of the second catalog
    par_ind2_file = indicies[i] # the file name of the corresponding second index (10)
    
    # merge together and write to file (merged catalog and index)
    merge_indicies(t, merging_file_path, merge_no, full_cat_list, full_index_file, par_cat2_list, par_ind2_file)
    
    # set new full list to the one we just created
    full_cat_file = merging_file_path + "merged" + str(merge_no) + "_catalog.txt"
    full_cat_dict = read_catalog(full_cat_file)
    full_cat_list = list(full_cat_dict.items())
    full_index_file = merging_file_path + "merged" + str(merge_no) + "_index.txt"

    merge_no += 1
    t += 1
       
print("All docs have been merged. File number {} has the full merged index.".format(merge_no))

Merging complete! The documents for merge 1 have been saved.
Merging complete! The documents for merge 2 have been saved.
Merging complete! The documents for merge 3 have been saved.
Merging complete! The documents for merge 4 have been saved.
Merging complete! The documents for merge 5 have been saved.
Merging complete! The documents for merge 6 have been saved.
Merging complete! The documents for merge 7 have been saved.
Merging complete! The documents for merge 8 have been saved.
Merging complete! The documents for merge 9 have been saved.
Merging complete! The documents for merge 10 have been saved.
Merging complete! The documents for merge 11 have been saved.
Merging complete! The documents for merge 12 have been saved.
Merging complete! The documents for merge 13 have been saved.
Merging complete! The documents for merge 14 have been saved.
Merging complete! The documents for merge 15 have been saved.
Merging complete! The documents for merge 16 have been saved.
Merging complete!

In [6]:
""" Testing compression before using all files """


# file paths for partial catalog and indicies
partial_catalogs_path_t = 'C:/6200-IR/homework-2-mplatt27/inverted-index-files/testing/catalogs/'
partial_indicies_path_t = 'C:/6200-IR/homework-2-mplatt27/inverted-index-files/testing/indicies/'

# file path for saving files as we merge them
merging_file_path_t = 'C:/6200-IR/homework-2-mplatt27/merging/testing'

# collect list of file names
catalogs_t = get_files_in_dir(partial_catalogs_path_t)
indicies_t = get_files_in_dir(partial_indicies_path_t)

print("We have {} catalogs".format(len(catalogs_t)))
print("We have {} inverted indicies".format(len(indicies_t)))

We have 2 catalogs
We have 2 inverted indicies


In [7]:
# Merge all files together testing

# get first catalog and read in as dict and list; get corresponding index
# we will iterativly add to this one
full_cat_file_t = catalogs_t[0] # the file name of the first partial catalog
full_cat_dict_t = read_catalog(full_cat_file_t) # a dictionary of the first catalog
full_cat_list_t = list(full_cat_dict_t.items()) # (term, (start, offset))
full_index_file_t = indicies_t[0] # the file name of the corresponding first index

# iterate 84 times to merge each catalog and index to the larger one
merge_no_t = 1
t = 1 # the round we are on, if greater than 1, we are merging a text file with a bytes file (first time is two txt files)
for i in range(1,len(catalogs_t)): 
    
    # get catalog to add in read in as dict and list; get corresponding index
    par_cat2_file_t = catalogs_t[i] # file name of the second catalog (10)
    par_cat2_dict_t = read_catalog(par_cat2_file_t) # a dict of the second catalog
    par_cat2_list_t = list(par_cat2_dict_t.items()) # a list of the second catalog
    par_ind2_file_t = indicies_t[i] # the file name of the corresponding second index (10)
    
    # merge together and write to file (merged catalog and index)
    merge_indicies(t, merging_file_path_t, merge_no_t, full_cat_list_t, full_index_file_t, par_cat2_list_t, par_ind2_file_t)
    
    # set new full list to the one we just created
    full_cat_file_t = merging_file_path_t + "merged" + str(merge_no_t) + "_catalog.txt"
    full_cat_dict_t = read_catalog(full_cat_file_t)
    full_cat_list_t = list(full_cat_dict_t.items())
    full_index_file_t = merging_file_path_t + "merged" + str(merge_no_t) + "_index.txt"

    merge_no_t += 1

    
print("All docs have been merged. File number {} has the full merged index.".format(merge_no_t))

Merging complete! The documents for merge 1 have been saved.
All docs have been merged. File number 2 has the full merged index.


In [10]:
# test opening the merged file that was created

# file path for merged catalog and index
cat_m = 'C:/6200-IR/homework-2-mplatt27/merging/testingmerged1_catalog.txt'
ind_m = 'C:/6200-IR/homework-2-mplatt27/merging/testingmerged1_index.txt'

t2 = open(ind_m, "rb")
t2.seek(45)
details = t2.read(46)
t2.close()

# |30:1,45|46:23,24,25| + 30:71|50:3,47|48:2,4,45|

print(details)
details_c = zlib.decompress(details)
print(details_c)
print(str(details_c, 'utf-8'))

b'x\x9c\x15\xca\xb1\r\x000\x08\x03\xb0\x872@\x08\xa5\xcam\x1c_\xd5\xb3+\x9cP\xaf\x8eY\xa0\xc0\xde\nOn\x87\x0b\x9a\xd55\xa1\x7f\x1e\xe0Y\n9'
b'30:1,45|46:23,24,25|30:71|50:3,47|48:2,4,45|'
30:1,45|46:23,24,25|30:71|50:3,47|48:2,4,45|


In [34]:
# Testing zlib and writing bytes to file

# get strings
string_1 = "58:57,68,84,144|"
string_2 = "148:72|30:1,45|46:23,24,25|"

# merge strings and compress
string_3 = string_1 + string_2
string_3_c = zlib.compress(string_3.encode('utf-8'),6)

# see what they look like
print(string_3)
print(string_3_c)

# write to files as string
c = open("C:/6200-IR/homework-2-mplatt27/merging/testing/string_catalog.txt", "x")
c.write("apple"+":"+"0"+","+str(len(string_3)))
c.close()
t = open("C:/6200-IR/homework-2-mplatt27/merging/testing/string_index.txt", "x")
t.write(string_3)
t.close()

# write to files as bytes string
c = open("C:/6200-IR/homework-2-mplatt27/merging/testing/bytes_catalog.txt", "x")
c.write("apple"+":"+"0"+","+str(len(string_3_c)))
c.close()
t = open("C:/6200-IR/homework-2-mplatt27/merging/testing/bytes_index.txt", "wb")
t.write(string_3_c)
t.close()




58:57,68,84,144|148:72|30:1,45|46:23,24,25|
b'x\x9c\x05\xc1\xc1\x11\x000\x08\x02\xb0\x85xTD\xe5\x9c\xcd\xe1\x9b\x94\xb7\x06mX\x08\xe9B\xde\xe1\xe5\xdb\x80\xea\xd4\xcb\x04\x05\xd6}\xd2!\t\xcc'


In [38]:
# open from files
t2 = open("C:/6200-IR/homework-2-mplatt27/merging/testing/bytes_index.txt", "rb")
t2.seek(0)
details = t2.read(44)
t2.close()

print(details)
string_3_d = zlib.decompress(details)
print(string_3_d)
print(str(string_3_d, 'utf-8'))

b'x\x9c\x05\xc1\xc1\x11\x000\x08\x02\xb0\x85xTD\xe5\x9c\xcd\xe1\x9b\x94\xb7\x06mX\x08\xe9B\xde\xe1\xe5\xdb\x80\xea\xd4\xcb\x04\x05\xd6}\xd2!\t\xcc'
b'58:57,68,84,144|148:72|30:1,45|46:23,24,25|'
58:57,68,84,144|148:72|30:1,45|46:23,24,25|


In [16]:
""" Compress the stemmed index that we made previously without merging """

# file paths for partial catalog and indicies
catalog_path = 'C:/6200-IR/homework-2-mplatt27/index-stemmed/full_catalog.txt'
index_path = 'C:/6200-IR/homework-2-mplatt27/index-stemmed/full_index.txt'
# catalog_path = 'C:/6200-IR/homework-2-mplatt27/inverted-index-files/testing/catalogs/cat1_tester.txt'
# index_path = 'C:/6200-IR/homework-2-mplatt27/inverted-index-files/testing/indicies/ind1_tester.txt'

catalog_dict = read_catalog(catalog_path) # a dictionary of the first catalog (term --> (start, offset))

offset = 0
# compressed_catalog = open("C:/6200-IR/homework-2-mplatt27/merging/testing/bytes_catalog.txt", "x")
# compressed_index = open("C:/6200-IR/homework-2-mplatt27/merging/testing/bytes_index.txt", "wb")
compressed_catalog = open("C:/6200-IR/homework-2-mplatt27/merging/compressed_catalog_9.txt", "x")
compressed_index = open("C:/6200-IR/homework-2-mplatt27/merging/compressed_index_9.txt", "wb")

for term, pos in catalog_dict.items():
    index_f = open(index_path, "r")
    index_f.seek(int(pos[0]))
    details = index_f.read(int(pos[1]))
    index_f.close()
    
    details_c = zlib.compress(details.encode('utf-8'),9)
    
    compressed_catalog.write(term + ":" + str(offset).strip() + "," + str(len(details_c)) + "\n")
    compressed_index.write(details_c)
    offset += len(details_c)

compressed_catalog.close()
compressed_index.close()
    
    


In [12]:
compressed_catalog.close()
compressed_index.close()