In [40]:
import re
from nltk.stem import PorterStemmer 
from collections import OrderedDict 
import os
import operator
from collections import Counter
import math
import zlib 
import time

# Input locations: the stop-word list, the two query files, and the index
# artefacts (catalog, inverted index, document hashes). These are absolute,
# machine-specific paths; edit them before running the notebook elsewhere.
STOP_PATH = "C:/6200-IR/homework-2-mplatt27/stop_words_list.txt"
QUERIES_PATH = "C:/6200-IR/homework-2-mplatt27/queries_modified_x.txt"
QUERIES_PATH_UNMODIFIED = "C:/6200-IR/homework-2-mplatt27/queries_unmodified.txt"
CATALOG_PATH = "C:/6200-IR/homework-2-mplatt27/index-not-stemmed/full_catalog_nostemming.txt"
INVERTED_INDEX_PATH = "C:/6200-IR/homework-2-mplatt27/index-not-stemmed/full_index_nostemming.txt"
DOC_HASHES_PATH = "C:/6200-IR/homework-2-mplatt27/index-not-stemmed/doc_hashes_nostemming.txt"
# When True, query_analyzer() passes every kept query term through the Porter stemmer.
STEMMING = False
# `ps` is only defined when STEMMING is True; query_analyzer() only touches it in that case.
if STEMMING:
    ps = PorterStemmer() 

In [None]:
""" Part 1: Read in modified queries, catalog, doc_hashes """

In [41]:
# Functions for part 1

"""
Function: get_stop_words()
Input: File path to stop words list
Output: A list of stop words to leave out (of queries, index)
"""
def get_stop_words(file_path):
    """
    Load the stop-word list from a text file.

    Input: file_path - path to a text file read line by line (ISO-8859-1,
           undecodable bytes ignored)
    Output: A list with one entry per line of the file, in file order, with
            surrounding whitespace stripped (a blank line yields an "" entry)
    """
    # The context manager closes the handle even if reading fails part-way;
    # the previous bare open() left the file open until garbage collection.
    with open(file_path, encoding="ISO-8859-1", errors='ignore') as stop_file:
        return [line.strip() for line in stop_file]


"""
Function: query_analyzer()
Input: The full query as a string (one or more words)
Output: A list of strings where each string is one word (token) of the query
"""
def query_analyzer(query, stop):
    """
    Split a query string into tokens and keep only the content words.

    Input: query - the full query text; stop - collection of stop words to drop
    Output: A list of the alphanumeric tokens that are not in `stop`, in query
            order (stemmed with `ps` when the module-level STEMMING flag is on)
    """
    tokens = re.findall(r"\w+|[^\w\s]|[\n\r]+", query, re.UNICODE)
    kept_terms = []
    for token in tokens:
        # punctuation / newline tokens fail isalnum(); stop words are dropped as-is
        if not token.isalnum() or token in stop:
            continue
        kept_terms.append(ps.stem(token) if STEMMING else token)
    return kept_terms

"""
Function: read_queries()
Input: The folder path to the queries file as a string
Output: A dictionary mapping each query ID to a list of terms in that query (as str)
"""
def read_queries(folder_path, stop):
    """
    Read the queries file and tokenise every query.

    Input: folder_path - path to the queries file (one query per line, with the
           query id in front of the first "."); stop - stop words handed to
           query_analyzer()
    Output: A dictionary mapping each query id (str) to the list of cleaned
            query terms, in file order

    Lines are not validated: the id is whatever precedes the first "." and the
    query text starts three characters after it. A blank line therefore ends up
    under the key "" with an empty term list (the notebook pops that key after
    loading the unmodified queries).
    """
    cleaned_queries = {}
    # `with` closes the file handle; the previous bare open() never closed it.
    with open(folder_path, encoding="ISO-8859-1", errors='ignore') as query_file:
        for line in query_file:
            id_end = line.find(".")
            q_id = line[:id_end].strip()
            # skip the "." and the two characters after it, then trim whitespace
            query_text = line[id_end + 3:].strip()
            cleaned_queries[q_id] = query_analyzer(query_text, stop)
    return cleaned_queries


"""
Function: read_catalog()
Input: file path to the catalog file
Output: the catalog as a dictionary (term --> (offset, length))
"""
def read_catalog(file_path):
    """
    Load the index catalog into memory.

    Input: file_path - path to the catalog file; each line is split on ":" and
           "," into term, offset and length
    Output: OrderedDict mapping term --> (offset, length), both kept as the
            stripped strings read from the file (callers convert with int())

    Blank lines (for example a trailing newline at the end of the file) are
    skipped instead of raising IndexError.
    """
    cat = OrderedDict()
    # the `with` block closes the file, so no explicit close() is needed
    with open(file_path, encoding="ISO-8859-1", errors='ignore') as catalog_file:
        for line in catalog_file:
            if not line.strip():
                continue
            fields = re.split(r'[:,]', line)
            # term --> (offset, length)
            cat[fields[0].strip()] = (fields[1].strip(), fields[2].strip())
    return cat


"""
Function: parse_doc_details
Input: String from index to parse of the form: doc:positions|doc:positions (e.g., "58:57,68,84,144|148:72|")
Output: A dictionary of docs mapped to a list of the term positions in that doc
"""
def parse_doc_details(info_string):
    """
    Turn one inverted-index entry into a doc --> positions mapping.

    Input: info_string - text of the form "doc:positions|doc:positions|"
           (e.g. "58:57,68,84,144|148:72|")
    Output: dict mapping each doc id (str) to the list of its position strings
    """
    postings = {}
    for entry in info_string.split("|"):
        # the trailing "|" leaves an empty final piece; ignore empty pieces
        if entry != "":
            fields = entry.split(":")
            postings[fields[0]] = fields[1].split(",")
    return postings


"""
Function: read_doc_hashes
Input: File path to doc_hashes; Lines are of the form "AP890101-0001 1|571" (doc_name doc_hash|len(d))
Output: Dictionary that maps doc hash --> (doc_name, len(d))
"""
def read_doc_hashes(file_path):
    """
    Load the document table.

    Input: file_path - path to the doc_hashes file; lines look like
           "AP890101-0001 1|571" (doc_name doc_hash|doc_length)
    Output: OrderedDict mapping doc hash --> (doc_name, doc_length), all kept
            as strings (callers convert the length with int())

    Blank lines are skipped instead of raising IndexError.
    """
    docs = OrderedDict()
    # the `with` block closes the file, so no explicit close() is needed
    with open(file_path, encoding="ISO-8859-1", errors='ignore') as hashes_file:
        for line in hashes_file:
            line_list = line.split()
            if not line_list:
                continue
            # second field is "hash|length"
            info = line_list[1].split("|")
            # hash --> (name, length)
            docs[info[0].strip()] = (line_list[0].strip(), info[1].strip())
    return docs


"""
Function: calc_avg_len_d
Input: doc_hashes dict and length of dict (number of documents)
Output: average length of all documents in corpus
"""
def calc_avg_len_d(docs, n):
    """
    Average document length over the corpus.

    Input: docs - dict of doc hash --> (name, length); n - number of documents
           to divide by
    Output: sum of all document lengths divided by n
    """
    total_length = sum(int(details[1]) for details in docs.values())
    return total_length / n


"""
Function: sort_scores_dict()
Input: scores, a dictionary that maps query # to a dictionary (docno --> score)
Output: The same dictionary, but the value (dict that each key maps to) is now sorted by the scores
"""
def sort_scores_dict(scores):
    """
    Order every query's documents from highest to lowest score.

    Input: scores - dict of query id --> dict (docno --> score)
    Output: the same `scores` object, with each inner dict replaced by a new
            dict whose insertion order is descending by score
    """
    for q_id in scores:
        ranked_pairs = sorted(scores[q_id].items(), key=lambda pair: pair[1], reverse=True)
        scores[q_id] = dict(ranked_pairs)
    return scores


"""
# Function: write_scores_to_file()
# Input: A dictionary of scored documents for each query and a name for the file
# Output: None
# Does: Writes a file for the output. Assumes scores are already sorted. For each query response, writes a line
# for each document that was returned that includes the query number, doc number, rank, and score.
# Each line should be of the form: <query-number> Q0 <docno> <rank> <score> Exp
# This is for all models, except the ES built in, due to the differing format of results.
"""
def write_scores_to_file(scores, name, max_rank=1000):
    """
    Write ranked results to "<name>.txt", one line per document.

    Input: scores - dict of query id --> dict (doc_id --> score); each inner
           dict is assumed to be already sorted best-first
           name - output path without the ".txt" extension
           max_rank - maximum number of documents written per query
    Output: None. Any existing file of that name is overwritten.

    Each line has the form: <query-number> Q0 <docno> <rank> <score> Exp
    """
    file_name = name + ".txt"
    # Mode "w" truncates an existing file, and `with` guarantees the handle is
    # closed even when a write fails.
    with open(file_name, "w") as output:
        for q_id, doc_scores in scores.items():
            for rank, (doc_id, score) in enumerate(doc_scores.items(), start=1):
                if rank > max_rank:
                    break
                fields = [str(q_id), "Q0", doc_id, str(rank), str(score), "Exp"]
                output.write(" ".join(fields) + "\n")
    


In [5]:
# Load the stop words and the modified queries, then print each query id with
# its cleaned term list so the tokenisation can be checked by eye.
stop_words = get_stop_words(STOP_PATH)
queries = read_queries(QUERIES_PATH, stop_words)
for key, value in queries.items():
    print(key,value)

85 ['alleg', 'corrupt', 'public', 'offici', 'govern']
59 ['weather', 'caus', 'fatal']
56 ['predict', 'prime', 'lend', 'rate', 'prime', 'rate', 'move']
71 ['incurs', 'border', 'militari', 'forc', 'guerrilla']
64 ['polit', 'hostag']
62 ['militari', 'coup', 'd', 'etat']
93 ['support', 'nation', 'rifl', 'associ', 'nra']
99 ['iran', 'contra', 'affair']
58 ['rail', 'strike']
77 ['poach', 'wildlif']
54 ['contract', 'preliminari', 'agreement', 'tent', 'reserv', 'launch', 'commerci', 'satellit']
87 ['crimin', 'offic', 'fail', 'U', 'S', 'financi', 'institut']
94 ['crime', 'aid', 'comput']
100 ['non', 'communist', 'industri', 'state', 'regul', 'transfer', 'high', 'tech', 'good', 'technolog', 'undesir', 'nation']
89 ['exist', 'pend', 'invest', 'opec', 'member', 'state', 'downstream', 'oper']
61 ['israel', 'iran', 'contra', 'affair']
95 ['comput', 'crime', 'solv']
68 ['studi', 'safeti', 'manufactur', 'employe', 'instal', 'worker', 'fine', 'diamet', 'fiber', 'insul']
57 ['mci', 'bell', 'system', 'br

In [42]:
# Read in the catalog (term --> (offset, length) in the inverted index).
# Fewer terms than with Elasticsearch: nothing containing punctuation was included.
catalog = read_catalog(CATALOG_PATH)
print("We have {} terms in the catalog".format(len(catalog)))

# Read in doc_hashes as a dictionary (doc hash --> (name, length)) so they are easy to access.
doc_hashes = read_doc_hashes(DOC_HASHES_PATH)
print("We have {} docs".format(len(doc_hashes)))

# Average document length; read as a module-level global by the retrieval models below.
AVG_LEN_D = calc_avg_len_d(doc_hashes, len(doc_hashes))

We have 169974 terms in the catalog
We have 84678 docs


In [None]:
"""

Part 2: Retrieval models
    
"""


In [43]:
# ****************************************************************************************************************** #
# Model 1: Okapi TF


"""
Function: okapi_tf_calc, a helper function to calculate score
"""
def okapi_tf_calc(tf, doc_len, avg_corp_len):
    """
    Okapi TF score of one term in one document.

    Input: tf - term frequency in the document; doc_len - document length;
           avg_corp_len - average document length in the corpus
    Output: tf / (tf + 0.5 + 1.5 * (doc_len / avg_corp_len))
    """
    length_ratio = doc_len / avg_corp_len
    return tf / (tf + 0.5 + (1.5 * length_ratio))

"""
Function: okapi_tf
Input: Catalog dict (term --> (offset, length) in index); doc_names dict (doc_hash --> (name, len)),
       query_dict (query id --> list of query terms)
Output: A scores dictionary (query id --> dictionary (doc name --> score))
Does: Iterates through each query term, and through each doc that the term appears in, and calculates okapi-tf
for the document-word combination. Sums the scores and returns them as a dict.
"""
def okapi_tf(catalog_dict, doc_names, query_dict):
    """
    Score documents with Okapi TF using the plain-text inverted index.

    Input: catalog_dict - term --> (offset, length) of the term's entry in the
           file at INVERTED_INDEX_PATH
           doc_names - doc hash --> (doc name, doc length)
           query_dict - query id --> list of query terms
    Output: dict of query id --> dict (doc name --> summed Okapi TF score)

    Every occurrence of a term in the query contributes, so a repeated query
    term is scored once per repetition. Terms missing from the catalog are
    skipped. The average document length is read from the global AVG_LEN_D.
    """
    scores = {}
    for q_id, query in query_dict.items():
        query_scores = scores[q_id] = {}
        for word in query:
            index_placement = catalog_dict.get(word)
            if index_placement is None:
                continue  # word does not occur in the corpus
            offset = index_placement[0]
            length = index_placement[1]

            # `with` closes the index handle even if the read fails
            with open(INVERTED_INDEX_PATH, "r") as index_file:
                index_file.seek(int(offset))
                doc_details = index_file.read(int(length))
            doc_details_dict = parse_doc_details(doc_details)

            for doc, positions in doc_details_dict.items():
                tf = len(positions)  # one position per occurrence
                len_d = int(doc_names[doc][1])
                temp_score = okapi_tf_calc(tf, len_d, AVG_LEN_D)
                doc_name = doc_names[doc][0]
                query_scores[doc_name] = query_scores.get(doc_name, 0) + temp_score
    return scores


# ****************************************************************************************************************** #
# Model 4: Okapi BM25


"""
Function for calculating BM25
"""
def okapi_BM25_calc(total_docs, df, tf, tf_query, doc_len, avg_doc_len, k_1, k_2, b):
    calc1_num = total_docs + 0.5
    calc1_den = df + 0.5
    calc1 = math.log((calc1_num/calc1_den), 2)

    calc2_num = tf + (k_1 * tf)
    cal2_den = tf + k_1 * ((1-b) + (b * (doc_len/avg_doc_len)))
    calc2 = calc2_num/cal2_den

    calc3_num = tf_query + (k_2 * tf_query)
    calc3_den = tf_query + k_2
    calc3 = calc3_num/calc3_den

    return calc1 * calc2 * calc3


"""
Function: okapi_BM25
Input: Catalog dict (term --> (offset, length) in index); doc_names dict (doc_hash --> (name, len)),
       query_dict (query id --> list of query terms)
Output: A scores dictionary (query id --> dictionary (doc name --> score))
Does: Iterates through each query term, and through each doc that the term appears in, and calculates okapi_BM25
for the document-word combination. Sums the scores and returns them as a dict.
"""
def okapi_BM25(catalog_dict, doc_names, query_dict, k_1=1.5, k_2=1.2, b=0.5):
    """
    Score documents with Okapi BM25 using the plain-text inverted index.

    Input: catalog_dict - term --> (offset, length) of the term's entry in the
           file at INVERTED_INDEX_PATH
           doc_names - doc hash --> (doc name, doc length)
           query_dict - query id --> list of query terms
           k_1, k_2, b - BM25 constants; the defaults are the values that were
           previously hard-coded (k_1=1.5 and b=0.5 were noted as working
           better together with proximity search than 1.2 / 0.75)
    Output: dict of query id --> dict (doc name --> summed BM25 score)

    Terms missing from the catalog are skipped. The average document length is
    read from the global AVG_LEN_D.
    """
    scores = {}
    total_docs = len(doc_names)  # same for every term, so computed once

    for q_id, query in query_dict.items():
        query_scores = scores[q_id] = {}
        # number of times each word occurs in this query (tf_query)
        query_term_counts = Counter(query)

        # NOTE(review): the loop visits every token, so a word repeated in the
        # query is scored once per repetition *and* weighted by tf_query each
        # time. Kept as-is to preserve existing rankings - confirm this is the
        # intended formula.
        for word in query:
            index_placement = catalog_dict.get(word)
            if index_placement is None:
                continue  # word does not occur in the corpus
            offset = index_placement[0]
            length = index_placement[1]

            # `with` closes the index handle even if the read fails
            with open(INVERTED_INDEX_PATH, "r") as index_file:
                index_file.seek(int(offset))
                doc_details = index_file.read(int(length))
            doc_details_dict = parse_doc_details(doc_details)

            df = len(doc_details_dict)  # number of docs this word occurs in
            tf_query = query_term_counts[word]

            for doc, positions in doc_details_dict.items():
                tf = len(positions)
                len_d = int(doc_names[doc][1])
                temp_score = okapi_BM25_calc(total_docs, df, tf, tf_query, len_d, AVG_LEN_D, k_1, k_2, b)

                doc_name = doc_names[doc][0]
                query_scores[doc_name] = query_scores.get(doc_name, 0) + temp_score

    return scores


# ****************************************************************************************************************** #
# Model 5: Unigram LM with Laplace smoothing

"""
Function for laplace calculation
"""
def p_laplace_calc(tf, doc_len, v):
    """
    Laplace-smoothed probability of a term in a document.

    Input: tf - term frequency in the document; doc_len - document length;
           v - vocabulary size
    Output: (tf + 1) / (doc_len + v)
    """
    smoothed_count = tf + 1
    smoothed_total = doc_len + v
    return smoothed_count / smoothed_total


"""
Function: unigram_lm_laplace
Input: Catalog dict (term --> (offset, length) in index); doc_names dict (doc_hash --> (name, len)),
       query_dict (query id --> list of query terms)
Output: A scores dictionary (query id --> dictionary (doc name --> score))
Does: Iterates through each query term and through every doc in the corpus (whether or not the term appears in it)
and calculates the log of the Laplace-smoothed probability. Sums the scores and returns them as a dict.
"""
def unigram_lm_laplace(catalog_dict, doc_names, query_dict):
    """
    Score documents with a unigram language model and Laplace smoothing.

    Input: catalog_dict - term --> (offset, length) of the term's entry in the
           file at INVERTED_INDEX_PATH; its size is used as the vocabulary size
           doc_names - doc hash --> (doc name, doc length)
           query_dict - query id --> list of query terms
    Output: dict of query id --> dict (doc name --> sum of log probabilities)

    Every document in doc_names is scored for every query term; documents that
    do not contain the term (including every document when the term is missing
    from the catalog) are scored with tf = 0.
    """
    scores = {}
    v = len(catalog_dict)  # vocabulary size

    for q_id, query in query_dict.items():
        query_scores = scores[q_id] = {}
        for word in query:
            # docs this word appears in; stays empty for out-of-vocabulary words
            doc_details_dict = {}
            index_placement = catalog_dict.get(word)
            if index_placement is not None:
                offset = index_placement[0]
                length = index_placement[1]
                # `with` closes the index handle even if the read fails
                with open(INVERTED_INDEX_PATH, "r") as index_file:
                    index_file.seek(int(offset))
                    doc_details = index_file.read(int(length))
                doc_details_dict = parse_doc_details(doc_details)

            for hash_val, info in doc_names.items():
                # tf is 0 when the doc has no postings for this word
                tf = len(doc_details_dict.get(hash_val, ()))
                len_d = int(info[1])
                score = math.log(p_laplace_calc(tf, len_d, v))

                doc_name = info[0]
                query_scores[doc_name] = query_scores.get(doc_name, 0) + score

    return scores


In [7]:
"""
Okapi TF model for running with compressed index
"""


"""
Function: okapi_tf_calc, a helper function to calculate score
"""
def okapi_tf_calc(tf, doc_len, avg_corp_len):
    """
    Helper that computes the Okapi TF score for one term/document pair.

    Input: tf - term frequency in the document; doc_len - document length;
           avg_corp_len - average document length in the corpus
    Output: tf / (tf + 0.5 + 1.5 * (doc_len / avg_corp_len))

    Same formula as the okapi_tf_calc defined with the uncompressed models.
    """
    denominator = tf + 0.5 + (1.5 * (doc_len / avg_corp_len))
    return tf / denominator


"""
Function: okapi_tf
Input: Catalog dict (term --> (offset, length) in index); doc_names dict (doc_hash --> (name, len)),
       query_dict (query id --> list of query terms)
Output: A scores dictionary (query id --> dictionary (doc name --> score))
Does: Iterates through each query term, and through each doc that the term appears in, and calculates okapi-tf
for the document-word combination. Sums the scores and returns them as a dict.
"""
def okapi_tf(catalog_dict, doc_names, query_dict):
    """
    Score documents with Okapi TF using a zlib-compressed inverted index.

    Input: catalog_dict - term --> (offset, length) of the term's compressed
           entry in the file at INVERTED_INDEX_PATH (read in binary mode)
           doc_names - doc hash --> (doc name, doc length)
           query_dict - query id --> list of query terms
    Output: dict of query id --> dict (doc name --> summed Okapi TF score)

    NOTE: this definition has the same name as the plain-text okapi_tf defined
    earlier in the notebook, so whichever of the two cells ran last decides
    which variant a call to okapi_tf() uses.
    """
    scores = {}
    for q_id, query in query_dict.items():
        query_scores = scores[q_id] = {}
        for word in query:
            index_placement = catalog_dict.get(word)
            if index_placement is None:
                continue  # word does not occur in the corpus
            offset = index_placement[0]
            length = index_placement[1]

            # `with` closes the index handle even if reading or decompression fails
            with open(INVERTED_INDEX_PATH, "rb") as index_file:
                index_file.seek(int(offset))
                compressed_entry = index_file.read(int(length))
            doc_details_string = zlib.decompress(compressed_entry).decode('utf-8')
            doc_details_dict = parse_doc_details(doc_details_string)

            for doc, positions in doc_details_dict.items():
                tf = len(positions)  # one position per occurrence
                len_d = int(doc_names[doc][1])
                temp_score = okapi_tf_calc(tf, len_d, AVG_LEN_D)
                doc_name = doc_names[doc][0]
                query_scores[doc_name] = query_scores.get(doc_name, 0) + temp_score
    return scores

In [None]:
""" Main code for running retrieval models """

In [7]:
# Run Okapi-TF: score every query, sort each query's documents by score, and
# write the ranking to "okapi_tf_results.txt" (write_scores_to_file adds ".txt").
doc_scores = okapi_tf(catalog, doc_hashes, queries)
doc_scores_sorted = sort_scores_dict(doc_scores)
write_scores_to_file(doc_scores_sorted, "okapi_tf_results")
print("Okapi-TF finished running!")

Okapi-TF finished running!


In [8]:
# Run Okapi-BM25: score every query, sort each query's documents by score, and
# write the ranking to "okapi_BM25_results.txt".
doc_scores = okapi_BM25(catalog, doc_hashes, queries)
doc_scores_sorted = sort_scores_dict(doc_scores)
write_scores_to_file(doc_scores_sorted, "okapi_BM25_results")
print("Okapi-BM25 finished running!")

Okapi-BM25 finished running!


In [9]:
# Run the unigram LM with Laplace smoothing: score every query, sort each
# query's documents by score, and write the ranking to "unigram_laplace_results.txt".
doc_scores = unigram_lm_laplace(catalog, doc_hashes, queries)
doc_scores_sorted = sort_scores_dict(doc_scores)
write_scores_to_file(doc_scores_sorted, "unigram_laplace_results")
print("Unigram LM with Lapalace Smoothing finished running!")

Unigram LM with Lapalace Smoothing finished running!


In [8]:
# Run Okapi TF on the compressed index and write "okapi_tf_compressed_results.txt".
# This call resolves to whichever okapi_tf definition was executed last, so the
# cell defining the zlib-based okapi_tf must have been run before this one.
# NOTE(review): presumably INVERTED_INDEX_PATH and CATALOG_PATH must also point at
# the compressed index files for this run - confirm before re-running.
doc_scores = okapi_tf(catalog, doc_hashes, queries)
doc_scores_sorted = sort_scores_dict(doc_scores)
write_scores_to_file(doc_scores_sorted, "okapi_tf_compressed_results")
print("Okapi-TF finished running!")

Okapi-TF finished running!


In [44]:
"""
Proximity search model:
- Code derived from "Experiments with Proximity-Aware Scoring for XML Retrieval at INEX 2008"
  by Broschart, Schenkel, and Theobald
"""


"""
Function: calc_ief
Input: Number of docs in corpus (n) and the frequency of the term (term_freq)
Output: IEF calculation
"""
def calc_ief(n, term_freq):
    """
    Function: calc_ief
    Input: n - total count (callers pass the number of docs in the corpus);
           term_freq - frequency of the term
    Output: log2((n - term_freq + 1) / (term_freq + 0.5))
    """
    return math.log((n - (term_freq) + 1) / (term_freq + 0.5), 2)


def _count_close_pairs(first, second, d):
    """Count the position pairs met on a merge-style walk of two non-empty
    position lists that lie at most d apart; each pair is counted once."""
    i = j = hits = 0
    # walk both lists together, always advancing the side with the smaller position
    while i < len(first) and j < len(second):
        if abs(first[i] - second[j]) <= d:
            hits += 1
        if first[i] < second[j]:
            i += 1
        else:
            j += 1
    if j == len(second):
        # `second` ran out: the pair (first[i], second[-1]) was already examined
        # in the loop above, so only the positions after first[i] remain.
        hits += sum(1 for pos in first[i + 1:] if abs(pos - second[-1]) <= d)
    else:
        # `first` ran out: (first[-1], second[j]) was already examined.
        hits += sum(1 for pos in second[j + 1:] if abs(first[-1] - pos) <= d)
    return hits


def calc_acc(td, d, wrd, w_positions_dict):
    """
    Function: calc_acc
    Input: td - total docs; d - maximum distance between two term positions;
           wrd - the query word being scored; w_positions_dict - dict of
           query word --> list of its positions in the current document
    Output: acc score for `wrd`, after Broschart et al (2008)
    Does: Compares the positions of `wrd` with the positions of every other
    query word in the document. Each pair of positions that falls within
    distance d adds calc_ief(td, <number of positions of the other word>) / d**2.

    Fix: the previous tail loops re-checked the last pair already compared by
    the main loop, so a close pair could be added twice (positions [5] and [6]
    scored two hits instead of one). Empty position lists now contribute
    nothing instead of raising IndexError.

    NOTE(review): position lists are presumably in ascending order, as the
    merge-style walk relies on it - confirm against the index writer.
    NOTE(review): the weight divides by d**2 (the window size) rather than the
    actual distance of the pair, and passes the within-document count of the
    other word to calc_ief - confirm both against the paper.
    """
    curr_positions = [int(pos) for pos in (w_positions_dict.get(wrd) or [])]
    acc = 0
    if not curr_positions:
        return acc

    for other_word, raw_positions in w_positions_dict.items():
        if other_word == wrd:
            continue
        other_positions = [int(pos) for pos in raw_positions]
        if not other_positions:
            continue
        hits = _count_close_pairs(curr_positions, other_positions, d)
        if hits:
            # weight is only computed when needed, as before
            tf = len(other_positions)
            acc += hits * (calc_ief(td, tf) / d**2)
    return acc


"""
Function: calc_proximity
Input: total docs, tf, acc for term, doc len, avg doc len, k1, k2, b values
Output: proximity score as defined by Broschart et al (2008)
"""
def calc_proximity(num_docs, tf, acc_t, len_d, avg_doc_len, k1, k2, b):
    """
    Proximity score of one term in one document, after Broschart et al (2008).

    Input: num_docs - total docs; tf - term frequency; acc_t - acc value of the
           term; len_d / avg_doc_len - document length and corpus average;
           k1, b - constants of the saturation term; k2 - accepted but not used
    Output: min(ief, 1) multiplied by the length-normalised, saturated acc value
    """
    ief = calc_ief(num_docs, tf)
    length_norm = (1 - b) + (b * (len_d / avg_doc_len))
    saturated_acc = (acc_t * (k1 + 1)) / (acc_t + (k1 * length_norm))
    return min(ief, 1) * saturated_acc
    
        
    
"""
Function: proximity_search
Input: catalog, dict mapping doc names to a number, query dict of terms
Output: Dict of scores for each document in each query
Does: First gets okapi BM25 scores for all docs relating to the queries. Then calculates a proximity score to
append to the current score of each doc, using the proximity of the query terms in the doc. 
"""
def proximity_search(catalog_dict, doc_names, query_dict):
    """
    BM25 scores augmented with a term-proximity score.

    Input: catalog_dict - term --> (offset, length) of the term's entry in the
           file at INVERTED_INDEX_PATH
           doc_names - doc hash --> (doc name, doc length)
           query_dict - query id --> list of query terms
    Output: dict of query id --> dict (doc name --> BM25 score + proximity score)
    Does: Gets the okapi_BM25 scores for every query, then for each document
    that contains at least one query word adds calc_proximity() for every query
    word found in that document.

    The constants below are only used for the proximity part; okapi_BM25 is
    called without them and therefore uses its own values.
    """
    # constants
    k_1 = 1.5
    k_2 = 1.2
    b = 0.4
    d = 2  # maximum distance between two positions to count as "close"

    # maps the query # --> dictionary (doc name : score)
    scores = okapi_BM25(catalog_dict, doc_names, query_dict)
    total_docs = len(doc_names)

    for q_id, query in query_dict.items():

        # doc --> {word: positions}, restricted to the words of this query
        query_info = {}
        for word in query:
            index_placement = catalog_dict.get(word)
            if index_placement is None:
                continue  # word does not occur in the corpus
            offset = index_placement[0]
            length = index_placement[1]

            # `with` closes the index handle even if the read fails
            with open(INVERTED_INDEX_PATH, "r") as index_file:
                index_file.seek(int(offset))
                doc_details = index_file.read(int(length))

            # record this word's positions under every doc it appears in
            for doc, pos in parse_doc_details(doc_details).items():
                query_info.setdefault(doc, {})[word] = pos

        # add the proximity score of every query word found in each doc
        for doc, words in query_info.items():
            doc_name = doc_names[doc][0]
            len_d = int(doc_names[doc][1])
            for word, pos in words.items():
                tf = len(pos)
                curr_acc = calc_acc(total_docs, d, word, words)
                prox_score = calc_proximity(total_docs, tf, curr_acc, len_d, AVG_LEN_D, k_1, k_2, b)
                scores[q_id][doc_name] += prox_score

    return scores

In [45]:
# read in queries without modification (except removing stop words/stemming)
stop_words = get_stop_words(STOP_PATH)
queries = read_queries(QUERIES_PATH_UNMODIFIED, stop_words)
# read_queries() files a blank line under the key ""; drop that entry if it exists.
# The default argument keeps this cell from raising KeyError when the queries
# file contains no blank line.
queries.pop("", None)
for key, value in queries.items():
    print(key, value)



85 ['Document', 'discuss', 'allegations', 'measures', 'taken', 'corrupt', 'public', 'officials', 'governmental', 'jurisdiction', 'worldwide']
59 ['Document', 'report', 'type', 'weather', 'event', 'directly', 'caused', 'least', 'fatality', 'location']
56 ['Document', 'prediction', 'prime', 'lending', 'rate', 'report', 'actual', 'prime', 'rate', 'move']
71 ['Document', 'report', 'incursions', 'land', 'air', 'water', 'border', 'area', 'country', 'military', 'forces', 'second', 'country', 'guerrilla', 'group', 'based', 'second', 'country']
64 ['Document', 'report', 'event', 'result', 'politically', 'motivated', 'hostage', 'taking']
62 ['Document', 'report', 'military', 'coup', 'd', 'etat', 'attempted', 'successful', 'country']
93 ['Document', 'describe', 'identify', 'supporters', 'National', 'Rifle', 'Association', 'NRA', 'assets']
99 ['Document', 'identify', 'development', 'Iran', 'Contra', 'Affair']
58 ['Document', 'predict', 'anticipate', 'rail', 'strike', 'report', 'ongoing', 'rail', '

In [46]:
# Run proximity search on the queries loaded above, write the ranking to
# "proximity_results_unstemmed.txt", and report the wall-clock time of the
# whole scoring + sorting + writing step.
start_time = time.time()
doc_scores = proximity_search(catalog, doc_hashes, queries)
doc_scores_sorted = sort_scores_dict(doc_scores)
write_scores_to_file(doc_scores_sorted, "proximity_results_unstemmed")
print("Proximity search model finished running!")
print("--- %s seconds ---" % (time.time() - start_time))

Proximity search model finished running!
--- 6.93445611000061 seconds ---
