In [6]:
from elasticsearch import Elasticsearch, helpers
from elasticsearch.client import IndicesClient
import os
from collections import OrderedDict
import pickle
import operator
import math
from collections import Counter

In [17]:
# Elasticsearch client shared by every search/termvectors call below. No connection
# arguments are given, so the library defaults apply (presumably localhost:9200 — confirm).
es = Elasticsearch()
# Index-level client on the same connection; used by query_analyzer() for the analyze API.
ic = IndicesClient(es)

In [None]:
"""

Part 1: Read in queries

    
"""

In [8]:
def query_analyzer(query):
    """Tokenise a raw query string with the analysis chain of the ap_dataset index.

    Input: the full query as a string (one or more words).
    Output: a list of strings, one per token reported by the analyze API.

    The request uses the "standard" tokenizer followed by the filters
    "english_stemmer", "lowercase" and "english_stop"; it is sent with
    index="ap_dataset" (presumably because the two english_* filters are
    defined in that index's settings — confirm).
    """
    analysis = ic.analyze(
        index="ap_dataset",
        body={
            "tokenizer": "standard",
            "filter": ["english_stemmer", "lowercase", "english_stop"],
            "text": query,
        },
    )

    # keep only the token text of every entry in the response
    tokens = []
    for entry in analysis["tokens"]:
        tokens.append(entry["token"])
    return tokens


def read_queries(folder_path, analyzer=None):
    """Read the query file and return the analysed terms of every query.

    Each usable line has the form "<id>. <query text>": everything before the
    first "." is the query id, and the query text starts three characters after
    the start of that "." (the "." plus two following characters are skipped,
    then surrounding whitespace is stripped).

    Input:
        folder_path: path of the query file (despite the name, a file path,
            since it is passed straight to open()).
        analyzer: optional callable mapping the query text to a list of terms.
            Defaults to query_analyzer(); mainly useful to run without an
            Elasticsearch connection.
    Output:
        dict mapping query id (str) --> list of query terms, in file order.
        Queries whose analysed term list is empty are left out.

    Lines that contain no "." (blank or malformed lines) are skipped instead of
    being sliced with the -1 that str.find() returns for "not found".
    """
    if analyzer is None:
        analyzer = query_analyzer

    # First pass: parse (id, text) pairs. The with-block guarantees the file is
    # closed, and it is closed before any (possibly slow) analyzer call is made.
    parsed = []
    with open(folder_path, encoding="ISO-8859-1", errors='ignore') as query_file:
        for line in query_file:
            id_end = line.find(".")
            if id_end == -1:
                continue
            query_id = line[:id_end].strip()
            query_text = line[id_end + 3:].strip()
            parsed.append((query_id, query_text))

    # Second pass: put every query through the analyzer
    cleaned_queries = {}
    for query_id, query_text in parsed:
        cleaned_query = analyzer(query_text)
        if cleaned_query:
            cleaned_queries[query_id] = cleaned_query
    return cleaned_queries

In [9]:
# Main code to get the queries and print them out
# NOTE(review): absolute, machine-specific path — this cell only runs where that file exists.
queries_path = "C:/6200-IR/homework1-mplatt27/IR_data/AP_DATA/queries_modified_x.txt"
# queries maps query id (str) --> list of analysed query terms; every model below reads it
queries = read_queries(queries_path)
for key, value in queries.items():
    print(key,value)

85 ['alleg', 'corrupt', 'public', 'offici', 'govern']
59 ['weather', 'caus', 'fatal']
56 ['predict', 'prime', 'lend', 'rate', 'prime', 'rate', 'move']
71 ['incurs', 'border', 'militari', 'forc', 'guerrilla']
64 ['polit', 'hostag']
62 ['militari', 'coup', "d'etat"]
93 ['support', 'nation', 'rifl', 'associat', 'nra']
99 ['iran', 'contra', 'affair']
58 ['rail', 'strike']
77 ['poach', 'wildlif']
54 ['contract', 'preliminari', 'agreement', 'tent', 'reserv', 'launch', 'commerci', 'satellit']
87 ['crimin', 'offic', 'fail', 'u.s', 'financi', 'institut']
94 ['crime', 'aid', 'comput']
100 ['non', 'communist', 'industri', 'state', 'regul', 'transfer', 'high', 'tech', 'good', 'technolog', 'undesir', 'nation']
89 ['exist', 'pend', 'invest', 'opec', 'member', 'state', 'ani', 'downstream', 'oper']
61 ['israel', 'iran', 'contra', 'affair']
95 ['comput', 'crime', 'solv']
68 ['studi', 'safeti', 'manufactur', 'employe', 'instal', 'worker', 'fine', 'diamet', 'fiber', 'insul']
57 ['mci', 'bell', 'system', 

In [None]:
"""

Part 2: Retrieval models
    
"""

In [18]:
"""
Utility functions
"""

def get_all_docs():
    """Return the ids of all documents in the ap_dataset index.

    Input: None
    Output: a list with the `_id` of every hit returned by a match_all search.

    Does: runs a match_all search with a page size of 10000 and a scroll
    keep-alive of '3m', then keeps requesting the next page with the scroll API
    until a page comes back with no hits. The scroll context is released
    explicitly when the loop ends (or fails), instead of being left open on the
    cluster until the keep-alive expires.
    """
    doc_ids = []

    query_body = {
        "size": 10000,
        "query": {
            "match_all": {}
        }
    }
    response = es.search(index="ap_dataset", body=query_body, scroll='3m')
    scroll_id = response['_scroll_id']

    try:
        # collect the current page, then fetch the next one; an empty page ends the scroll
        while response['hits']['hits']:
            doc_ids.extend(doc['_id'] for doc in response['hits']['hits'])
            response = es.scroll(scroll_id=scroll_id, scroll='3m')
            # the scroll id may change between pages, so always carry the latest one
            scroll_id = response['_scroll_id']
    finally:
        # free the server-side search context as soon as we are done with it
        es.clear_scroll(scroll_id=scroll_id)

    return doc_ids


def get_term_vectors(doc_ids):
    """Fetch the term vector of every document id in `doc_ids`.

    Input: a list of document ids.
    Output: dictionary mapping doc id --> the termvectors response for that id.

    Does: issues one es.termvectors() request per id against the ap_dataset
    index, restricted to the "text" field and with term_statistics=True.
    This is slow for the whole collection, so the notebook stores the result in
    a pickle file and reloads it before running the models.
    """
    return {
        doc_id: es.termvectors(index="ap_dataset", id=doc_id, fields="text", term_statistics=True)
        for doc_id in doc_ids
    }


def sort_scores_dict(scores):
    """Order each query's documents from highest to lowest score.

    Input: scores, a dictionary mapping query id --> dictionary (docno --> score).
    Output: the same dictionary object; every inner dictionary has been replaced
    by a new one whose insertion order is by descending score. Documents with
    equal scores keep their original relative order.
    """
    by_score = operator.itemgetter(1)
    # snapshot the keys: only values of existing keys are reassigned below
    for q_id in list(scores):
        ranked = sorted(scores[q_id].items(), key=by_score, reverse=True)
        scores[q_id] = dict(ranked)

    return scores


def write_scores_to_file_es(response_dict, name):
    """Write the results of the ES built-in model to "<name>.txt".

    Input:
        response_dict: dictionary mapping query id --> search response. Only
            response["hits"]["hits"] is read; each hit must provide "_id" and
            "_score". Hits are written in the order they appear, so they are
            assumed to be sorted already.
        name: output path without the ".txt" extension.
    Output: None

    Does: writes one line per hit of the form
        <query-number> Q0 <docno> <rank> <score> Exp
    where rank starts at 1 for every query. An existing file of the same name
    is overwritten (mode "w" truncates, so no separate delete is needed), and
    the file is closed even if writing fails part-way.
    """
    file_name = name + ".txt"
    with open(file_name, "w") as output:
        for q_id, response in response_dict.items():
            for rank, doc in enumerate(response["hits"]["hits"], start=1):
                # format() stringifies every field, so non-str query ids also work
                output.write("{} Q0 {} {} {} Exp\n".format(q_id, doc["_id"], rank, doc["_score"]))


def write_scores_to_file(scores, name, max_rank=None):
    """Write the scored documents of every query to "<name>.txt".

    Used for all models except the ES built-in one, whose results have a
    different shape.

    Input:
        scores: dictionary mapping query id --> dictionary (doc id --> score).
            Documents are written in dictionary order, so each inner dictionary
            is assumed to be sorted already.
        name: output path without the ".txt" extension.
        max_rank: optional cut-off; when given, at most this many documents are
            written per query. The default (None) writes every document.
    Output: None

    Does: writes one line per document of the form
        <query-number> Q0 <docno> <rank> <score> Exp
    where rank starts at 1 for every query. An existing file of the same name
    is overwritten (mode "w" truncates, so no separate delete is needed), and
    the file is closed even if writing fails part-way.
    """
    file_name = name + ".txt"
    with open(file_name, "w") as output:
        for q_id, doc_scores in scores.items():
            for rank, (doc_id, score) in enumerate(doc_scores.items(), start=1):
                if max_rank is not None and rank > max_rank:
                    break
                # format() stringifies every field, so non-str ids also work
                output.write("{} Q0 {} {} {} Exp\n".format(q_id, doc_id, rank, score))
    

def vocab_size():
    """Return the vocabulary size reported by Elasticsearch for the index.

    Input: None
    Output: the value of a cardinality aggregation over the "text" field of the
    ap_dataset index (no hits are requested, only the aggregation).

    NOTE(review): cardinality aggregations are typically approximate — confirm
    the precision is acceptable for the smoothing formulas that use this value.
    """
    response = es.search(
        index="ap_dataset",
        body={
            "aggs": {"vocabSize": {"cardinality": {"field": "text"}}},
            "size": 0,
        },
    )
    return response["aggregations"]["vocabSize"]["value"]


def create_terms_dict(tvs):
    """Collect the document-independent statistics of every term seen in `tvs`.

    Input: term vectors dictionary mapping doc id --> term vector response; the
           terms are read from tv["term_vectors"]["text"]["terms"].
    Output: dictionary mapping term (str, whitespace-stripped) --> dictionary
            with the keys "doc_freq" and "ttf".

    The statistics of a term are taken from the first document in which it is
    encountered; later occurrences of the same term are ignored.
    """
    terms_dict = {}

    for tv in tvs.values():
        for raw_term, info in tv["term_vectors"]["text"]["terms"].items():
            term = raw_term.strip()
            if term in terms_dict:
                # already recorded from an earlier document
                continue
            terms_dict[term] = {"doc_freq": info["doc_freq"], "ttf": info["ttf"]}

    return terms_dict
        

In [19]:
"""
Retrieval Models
"""

# Model 1: ES Built-in
def es_built_in(query_dict):
    """Run every query through Elasticsearch's own scoring.

    Input: dictionary mapping query id --> list of query terms (str).
    Returns: dictionary mapping query id --> the raw search response for it.

    Does: joins the terms of each query with single spaces and sends them as a
    "match" query on the "text" field of the ap_dataset index, requesting up to
    2000 hits per query.
    """
    responses = {}
    for query_id, terms in query_dict.items():
        responses[query_id] = es.search(
            index="ap_dataset",
            body={
                "size": 2000,
                "query": {"match": {"text": " ".join(terms)}},
            },
        )
    return responses


# ****************************************************************************************************************** #
# Model 2; Okapi TF

def okapi_tf_calc(tf, doc_len, avg_corp_len):
    """Helper: Okapi TF of one term in one document.

    Computes tf / (tf + 0.5 + 1.5 * (doc_len / avg_corp_len)), where tf is the
    term's frequency in the document, doc_len the document length and
    avg_corp_len the average document length of the corpus.
    """
    length_ratio = doc_len / avg_corp_len
    denominator = tf + 0.5 + (1.5 * length_ratio)
    return tf / denominator


def okapi_tf(tvs, query_dict):
    """Score every document against every query with Okapi TF.

    Input:
        tvs: dictionary mapping doc id --> term vector response. Read from it:
            tv["term_vectors"]["text"]["terms"][word]["term_freq"] and the
            "sum_ttf" / "doc_count" entries of
            tv["term_vectors"]["text"]["field_statistics"].
        query_dict: dictionary mapping query id --> list of query words (str).
    Returns: dictionary mapping query id --> dictionary (doc id --> score).
        A document only appears under a query if it contains at least one of
        the query's words; the score is the sum of okapi_tf_calc() over the
        query words found in the document (a repeated query word is counted
        once per occurrence).
    """
    # one (initially empty) result dictionary per query
    scores = {q_id: {} for q_id in query_dict}

    for doc_id, tv in tvs.items():
        field = tv["term_vectors"]["text"]
        terms = field["terms"]

        # Both values depend only on the document, so compute them once here
        # rather than again for every matching query word.
        doc_len = sum(info["term_freq"] for info in terms.values())
        stats = field["field_statistics"]
        avg_doc_len = stats["sum_ttf"] / stats["doc_count"]

        for q_id, query in query_dict.items():
            doc_scores = scores[q_id]
            for word in query:
                info = terms.get(word)
                if info is None:
                    # word does not occur in this document: contributes nothing
                    continue
                temp_score = okapi_tf_calc(info["term_freq"], doc_len, avg_doc_len)
                doc_scores[doc_id] = doc_scores.get(doc_id, 0) + temp_score

    return scores


# ****************************************************************************************************************** #
# Model 3: TF-IDF


def tf_idf_calc(okapi_tf_score, total_docs, df):
    """Helper: weight an Okapi TF score by the term's inverse document frequency.

    Returns okapi_tf_score * log2(total_docs / df), where total_docs is the
    number of documents and df the number of documents containing the term.
    """
    idf = math.log((total_docs / df), 2)
    return okapi_tf_score * idf


def tf_idf(tvs, query_dict):
    """Score every document against every query with TF-IDF (Okapi TF * IDF).

    Input:
        tvs: dictionary mapping doc id --> term vector response. Read from it:
            "term_freq" and "doc_freq" of each matching term under
            tv["term_vectors"]["text"]["terms"], and the "sum_ttf" /
            "doc_count" entries of
            tv["term_vectors"]["text"]["field_statistics"].
        query_dict: dictionary mapping query id --> list of query words (str).
    Returns: dictionary mapping query id --> dictionary (doc id --> score).
        A document only appears under a query if it contains at least one of
        the query's words; the score is the sum of tf_idf_calc() over the query
        words found in the document (a repeated query word is counted once per
        occurrence).
    """
    # one (initially empty) result dictionary per query
    scores = {q_id: {} for q_id in query_dict}

    # the number of documents is the size of the term vector dictionary
    total_docs = len(tvs)

    for doc_id, tv in tvs.items():
        field = tv["term_vectors"]["text"]
        terms = field["terms"]

        # Both values depend only on the document, so compute them once here
        # rather than again for every matching query word.
        doc_len = sum(info["term_freq"] for info in terms.values())
        stats = field["field_statistics"]
        avg_doc_len = stats["sum_ttf"] / stats["doc_count"]

        for q_id, query in query_dict.items():
            doc_scores = scores[q_id]
            for word in query:
                info = terms.get(word)
                if info is None:
                    # word does not occur in this document: contributes nothing
                    continue
                okapi_tf_score = okapi_tf_calc(info["term_freq"], doc_len, avg_doc_len)
                tf_idf_score = tf_idf_calc(okapi_tf_score, total_docs, info["doc_freq"])
                doc_scores[doc_id] = doc_scores.get(doc_id, 0) + tf_idf_score

    return scores


# ****************************************************************************************************************** #
# Model 4: Okapi BM25


def okapi_BM25_calc(total_docs, df, tf_value, tf_query, doc_len, avg_doc_len, k_1, k_2, b):
    """Helper: BM25 contribution of one query term for one document.

    The result is the product of three factors:
      * log2((total_docs + 0.5) / (df + 0.5))
      * (tf_value + k_1 * tf_value) / (tf_value + k_1 * ((1 - b) + b * doc_len / avg_doc_len))
      * (tf_query + k_2 * tf_query) / (tf_query + k_2)
    where tf_value is the term frequency in the document and tf_query the term
    frequency in the query.
    """
    idf_part = math.log(((total_docs + 0.5) / (df + 0.5)), 2)

    length_norm = (1 - b) + (b * (doc_len / avg_doc_len))
    doc_part = (tf_value + (k_1 * tf_value)) / (tf_value + k_1 * length_norm)

    query_part = (tf_query + (k_2 * tf_query)) / (tf_query + k_2)

    return idf_part * doc_part * query_part


def okapi_BM25(tvs, query_dict, k_1=1.2, k_2=1.2, b=0.75):
    """Score every document against every query with Okapi BM25.

    Input:
        tvs: dictionary mapping doc id --> term vector response. Read from it:
            "term_freq" and "doc_freq" of each matching term under
            tv["term_vectors"]["text"]["terms"], and the "sum_ttf" /
            "doc_count" entries of
            tv["term_vectors"]["text"]["field_statistics"].
        query_dict: dictionary mapping query id --> list of query words (str).
        k_1, k_2, b: BM25 constants passed through to okapi_BM25_calc(); the
            defaults are the values that were previously hard-coded.
    Returns: dictionary mapping query id --> dictionary (doc id --> score).
        A document only appears under a query if it contains at least one of
        the query's words.

    NOTE(review): a word repeated in a query is visited once per occurrence and
    each visit is also weighted by the word's query frequency — confirm that
    this double weighting is intended.
    """
    # one (initially empty) result dictionary per query
    scores = {q_id: {} for q_id in query_dict}
    # query id --> Counter of how often each word occurs in that query
    queries_counter = {q_id: Counter(query) for q_id, query in query_dict.items()}

    # the number of documents is the size of the term vector dictionary
    total_docs = len(tvs)

    for doc_id, tv in tvs.items():
        field = tv["term_vectors"]["text"]
        terms = field["terms"]

        # Both values depend only on the document, so compute them once here
        # rather than again for every matching query word.
        doc_len = sum(info["term_freq"] for info in terms.values())
        stats = field["field_statistics"]
        avg_doc_len = stats["sum_ttf"] / stats["doc_count"]

        for q_id, query in query_dict.items():
            doc_scores = scores[q_id]
            word_counts = queries_counter[q_id]
            for word in query:
                info = terms.get(word)
                if info is None:
                    # word does not occur in this document: contributes nothing
                    continue
                okapi_BM25_score = okapi_BM25_calc(
                    total_docs, info["doc_freq"], info["term_freq"], word_counts[word],
                    doc_len, avg_doc_len, k_1, k_2, b)
                doc_scores[doc_id] = doc_scores.get(doc_id, 0) + okapi_BM25_score

    return scores


# ****************************************************************************************************************** #
# Model 5: Unigram LM with Laplace smoothing


def p_laplace_calc(tf_value, doc_len, v):
    """Helper: Laplace (add-one) smoothed probability of a term in a document.

    Returns (tf_value + 1) / (doc_len + v), where tf_value is the term's
    frequency in the document, doc_len the document length and v the
    vocabulary size.
    """
    smoothed_count = tf_value + 1
    smoothed_length = doc_len + v
    return smoothed_count / smoothed_length


def unigram_lm_laplace(tvs, query_dict):
    """Score every document against every query with a Laplace-smoothed unigram LM.

    Input:
        tvs: dictionary mapping doc id --> term vector response; the
            "term_freq" of every term under
            tv["term_vectors"]["text"]["terms"] is read.
        query_dict: dictionary mapping query id --> list of query words (str).
    Returns: dictionary mapping query id --> dictionary (doc id --> score).
        The score is the sum over the query's words of
        log(p_laplace_calc(tf, doc_len, v)); a word missing from the document
        is scored with tf = 0, so every document gets a score for every
        non-empty query.
    """
    # one (initially empty) result dictionary per query
    scores = {q_id: {} for q_id in query_dict}

    # vocabulary size from the es search API, used by the smoothing formula
    v = vocab_size()

    for doc_id, tv in tvs.items():
        terms = tv["term_vectors"]["text"]["terms"]
        # document length = sum of all term frequencies; depends only on the
        # document, so compute it once instead of once per query word
        doc_len = sum(info["term_freq"] for info in terms.values())

        for q_id, query in query_dict.items():
            doc_scores = scores[q_id]
            for word in query:
                info = terms.get(word)
                # a word absent from the document simply has term frequency 0
                tf_value = info["term_freq"] if info is not None else 0
                score = math.log(p_laplace_calc(tf_value, doc_len, v))
                doc_scores[doc_id] = doc_scores.get(doc_id, 0) + score

    return scores


# ****************************************************************************************************************** #
# Model 6: Unigram LM with Jelinek-Mercer smoothing


def p_jm_calc(tf_value, doc_len, ttf, sum_ttf, v=None):
    """Jelinek-Mercer probability of a term that occurs in the current document.

    Returns lambda * (tf_value / doc_len) + (1 - lambda) * (ttf / sum_ttf) with
    lambda = 0.9, i.e. the document model interpolated with the collection
    model. The collection model is the term's total frequency (ttf) divided by
    the total number of tokens in the collection (sum_ttf); dividing by the
    vocabulary size instead does not give a probability and over-weights the
    background term.

    v is no longer used; it is still accepted so that existing five-argument
    calls keep working.
    """
    lambda_val = 0.9
    foreground = lambda_val * (tf_value / doc_len)
    background = (1 - lambda_val) * (ttf / sum_ttf)
    return foreground + background


def p_jm_calc_term_not_present(ttf, v):
    """Jelinek-Mercer probability of a term that is absent from the current document.

    Only the background part remains: (1 - lambda) * (ttf / v) with
    lambda = 0.9. `v` is the normaliser of the background model; pass the
    collection length (sum_ttf) so the result is consistent with p_jm_calc().
    The parameter keeps its original name for backward compatibility.
    """
    lambda_val = 0.9
    return (1 - lambda_val) * (ttf / v)


def unigram_lm_jm(tvs, query_dict, terms_dict):
    """Score every document against every query with a Jelinek-Mercer smoothed unigram LM.

    Input:
        tvs: dictionary mapping doc id --> term vector response. Read from it:
            "term_freq" and "ttf" of the terms under
            tv["term_vectors"]["text"]["terms"], and "sum_ttf" from
            tv["term_vectors"]["text"]["field_statistics"].
        query_dict: dictionary mapping query id --> list of query words (str).
        terms_dict: dictionary mapping term --> {"doc_freq", "ttf"}; supplies
            the ttf of query words that do not occur in the document at hand.
    Returns: dictionary mapping query id --> dictionary (doc id --> score).
        The score is the sum over the query's words of log(p_jm), so every
        document gets a score for every non-empty query.
    """
    # ttf assumed for a query word that has no entry in terms_dict at all. Such
    # a word always takes the "not present" branch, so this value only keeps
    # log() away from 0.
    unseen_term_ttf = 100

    # one (initially empty) result dictionary per query
    scores = {q_id: {} for q_id in query_dict}

    for doc_id, tv in tvs.items():
        field = tv["term_vectors"]["text"]
        terms = field["terms"]
        # total number of tokens in the collection, as reported with the term vector
        sum_ttf = field["field_statistics"]["sum_ttf"]
        # document length = sum of all term frequencies; depends only on the
        # document, so compute it once instead of once per query word
        doc_len = sum(info["term_freq"] for info in terms.values())

        for q_id, query in query_dict.items():
            doc_scores = scores[q_id]
            for word in query:
                info = terms.get(word)
                if info is not None:
                    p_jm_score = p_jm_calc(info["term_freq"], doc_len, info["ttf"], sum_ttf)
                else:
                    # word is not in the doc: only the collection model contributes
                    term_info = terms_dict.get(word)
                    ttf = int(term_info["ttf"]) if term_info is not None else unseen_term_ttf
                    p_jm_score = p_jm_calc_term_not_present(ttf, sum_ttf)

                doc_scores[doc_id] = doc_scores.get(doc_id, 0) + math.log(p_jm_score)

    return scores

In [None]:
# Main code for running the models and saving output to file
# ONLY RUN IF FIRST TIME USING THIS: Create pickle file with all termvectors for each doc


# 1. Use search API to get ids of all docs, this will be used to get term vectors
# return_ids = get_all_docs()
# print("We got responses for this many docs: ", len(return_ids))

# 2. Create dictionary that will store term vectors
# return_term_vectors = get_term_vectors(return_ids)
# print("We got term vectors for this many docs: ", len(return_term_vectors))

# 3. Save term vectors in pickle file
# with open('termvectorswstats.pickle', 'wb') as handle:
#     pickle.dump(return_term_vectors, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [14]:
# IF RUNNING AFTER CREATING PICKLE FILE:


# Load pickle file that contains term vectors for each doc
# NOTE(review): pickle.load can execute arbitrary code from the file — only load a
# pickle you produced yourself. The path is absolute and machine-specific.
with open('C:/6200-IR/homework1-mplatt27/config/termvectorswstats.pickle', 'rb') as handle:
    # the with-block closes the file even if unpickling fails
    return_term_vectors = pickle.load(handle)
print("Pickle file has opened")
print("There are this many documents to search: ", len(return_term_vectors))

Pickle file has opened
There are this many documents to search:  84678


In [15]:
# Create terms dictionary from the term vectors. This maps the terms to info that does not change (ttf and doc_freq)
# This is needed for the Unigram LM with Jelinek-Mercer Smoothing model, which looks up
# the ttf of query terms that do not occur in the document being scored.
terms_stats_dict = create_terms_dict(return_term_vectors)
print("There are this many terms: ", len(terms_stats_dict))

There are this many terms:  180393


In [16]:
# run model 1:
# Let Elasticsearch score every query itself and write the returned hits to
# es_built_in_results.txt (the writer appends the ".txt" extension).
hits = es_built_in(queries)
write_scores_to_file_es(hits, "es_built_in_results")
print("ES-Built in finished running!")

ES-Built in finished running!


In [20]:
# run model 2:
# Score the documents with Okapi TF, order each query's documents by descending score
# and write the ranking to okapi_tf_results.txt.
# sort_scores_dict() updates doc_scores in place and returns that same dictionary.
doc_scores = okapi_tf(return_term_vectors, queries)
doc_scores_sorted = sort_scores_dict(doc_scores)
write_scores_to_file(doc_scores_sorted, "okapi_tf_results")
print("Okapi-TF finished running!")

Okapi-TF finished running!


In [21]:
# run model 3:
# Score the documents with TF-IDF, order each query's documents by descending score
# and write the ranking to tfidf_results.txt.
doc_scores = tf_idf(return_term_vectors, queries)
doc_scores_sorted = sort_scores_dict(doc_scores)
write_scores_to_file(doc_scores_sorted, "tfidf_results")
print("TF-IDF finished running!")

TF-IDF finished running!


In [22]:
# run model 4:
# Score the documents with Okapi BM25, order each query's documents by descending score
# and write the ranking to okapi_bm25_results.txt.
doc_scores = okapi_BM25(return_term_vectors, queries)
doc_scores_sorted = sort_scores_dict(doc_scores)
write_scores_to_file(doc_scores_sorted, "okapi_bm25_results")
print("Okapi BM25 finished running!")

Okapi BM25 finished running!


In [23]:
# run model 5:
# Score the documents with the Laplace-smoothed unigram LM, order each query's documents
# by descending score and write the ranking to unigram_lm_laplace_results.txt.
doc_scores = unigram_lm_laplace(return_term_vectors, queries)
doc_scores_sorted = sort_scores_dict(doc_scores)
write_scores_to_file(doc_scores_sorted, "unigram_lm_laplace_results")
print("Unigram LM with Laplace Smoothing finished running!")

Unigram LM with Laplace Smoothing finished running!


In [24]:
# run model 6:
# Score the documents with the Jelinek-Mercer smoothed unigram LM (needs terms_stats_dict
# from the cell above), order each query's documents by descending score and write the
# ranking to unigram_lm_jm_results.txt.
doc_scores = unigram_lm_jm(return_term_vectors, queries, terms_stats_dict)
doc_scores_sorted = sort_scores_dict(doc_scores)
write_scores_to_file(doc_scores_sorted, "unigram_lm_jm_results")
print("Unigram LM with Jelinek-Mercer Smoothing finished running!")

Unigram LM with Jelinek-Mercer Smoothing finished running!
