In [None]:
"""
 Part 1: This code retrieves the search results from Elasticsearch (ES) that are needed for ranking.
"""

In [1]:
import pickle
from elasticsearch import Elasticsearch, helpers
from elasticsearch.client import IndicesClient
import operator
import os
import pandas as pd


In [2]:
# The cluster URL (including any user/password) is read from the environment so that no
# credentials are stored in the notebook, e.g. ES_HOST="https://<user>:<password>@<host>:9243".
# NOTE(review): a password used to be embedded in this cell -- it should be rotated.
host = os.environ.get("ES_HOST")
if not host:
    raise RuntimeError("Set the ES_HOST environment variable to the Elasticsearch URL")

es = Elasticsearch([host], timeout=3000)
print(es.ping())
# client for index-level APIs (used for the analyze call)
ic = IndicesClient(es)

True


In [17]:
# get the query ready for elasticsearch

"""
Function: query_analyzer()
Input: The full query as a string (one or more words)
Output: A list of strings where each string is one word (token) of the query
"""
def query_analyzer(query, index="corpus_wwii"):
    """Tokenize a query string with the analyze API of an ES index.

    Args:
        query: The full query as a string (one or more words).
        index: Name of the index passed to the analyze call; defaults to
            "corpus_wwii", the index this function always used before.

    Returns:
        A list of strings, one per token in the analyze response.

    Relies on the notebook-level IndicesClient `ic`.
    """
    body = {
        "tokenizer": "standard",
        # NOTE(review): "english_stemmer" and "english_stop" are presumably custom filters
        # defined in the index settings -- confirm; the filter order is left untouched.
        "filter": ["english_stemmer", "lowercase", "english_stop"],
        "text": query
    }
    response = ic.analyze(body=body, index=index)
    # named token_info so the comprehension no longer shadows the built-in `list`
    return [token_info["token"] for token_info in response["tokens"]]

# Example query: analyze it and print the resulting tokens; query_clean is the token list
# that the search cell of this notebook passes on to ES.
q = 'United States battles won in WWII'
query_clean = query_analyzer(q)
print(query_clean)

['unite', 'state', 'battl', 'won', 'wwii']


In [19]:
# search elastic search for the documents, sort them, and save them to a file

"""
Function: write_scores_to_file_es()
Input: A dictionary of query responses (documents returned for each query) and a name for the file
Output: None
Does: Writes a file with the output of the ES built-in model. Scores will already be sorted.
For each query response, writes a line for each document that was returned that includes the query number,
doc number, rank, and score. Each line should be of the form: <query-number> Q0 <docno> <rank> <score> Exp
"""
def write_scores_to_file_es(response_dict, name):
    """Write the hits of every query response to "<name>.txt", one line per hit.

    Args:
        response_dict: Maps a query number to the ES response for that query. Only
            response["hits"]["hits"] is read, and from each hit only "_id" and "_score".
        name: File name without extension; ".txt" is appended.

    Returns:
        None. Each line has the form: <query-number> Q0 <docno> <rank> <score> Exp

    Hits are written in the order they appear in the response (assumed to be sorted by
    score already), and the rank restarts at 1 for every query.
    """
    file_name = name + ".txt"
    # "w" truncates an existing file, so no separate removal is needed; the with-block
    # closes the file even when a response is missing an expected key.
    with open(file_name, "w") as output:
        for q_id, response in response_dict.items():
            for rank, doc in enumerate(response["hits"]["hits"], start=1):
                # the query number comes from the dict key (it used to be a fixed "2")
                output.write("{} Q0 {} {} {} Exp\n".format(q_id, doc["_id"], rank, doc["_score"]))

"""
Model: ES Built-in
Input: A dictionary of queries where each query ID is mapped to the list of tokens of that query
(the tokens are joined with a single whitespace before searching)
Returns: A dictionary of the responses provided by ES for each query
Does: Iterates through each query and saves the full ES response in a response dictionary. Requests at most 200 hits per query (the "size" value in the query body)
"""
def es_built_in(query_dict, size=200, index="corpus_wwii"):
    """Run a match query on the "text" field for every query and collect the responses.

    Args:
        query_dict: Maps a query ID to its query, given either as a list of tokens or as a
            single string with the tokens separated by whitespace.
        size: Maximum number of hits requested per query (default 200).
        index: Name of the index to search (default "corpus_wwii").

    Returns:
        A dictionary mapping each query ID to the full response returned by es.search.

    Relies on the notebook-level Elasticsearch client `es`.
    """
    responses = {}
    for query_id, query in query_dict.items():
        # joining a plain string would put a space between every character,
        # so only token lists are joined
        query_text = query if isinstance(query, str) else " ".join(query)
        query_body = {
            "size": size,
            "query": {
                "match": {
                    "text": query_text
                }
            }
        }
        responses[query_id] = es.search(index=index, body=query_body)
    return responses

# run model and write to file
# The dict key is the query number of this query. The results file written for it lists the
# query as number 2, so the key is kept in agreement with that number.
q_dict = {"2" : query_clean}
r = es_built_in(q_dict)
write_scores_to_file_es(r, "es_results_200_us")
print("ES-Built in finished running!")

ES-Built in finished running!


In [2]:
"""
  Part 2: If we already have the ES results (saved as a csv), create a results file from them to use with trec.
"""

''

In [2]:
# load the previously saved ES scores from csv into a dataframe and preview the first rows
es_results = pd.read_csv(
    filepath_or_buffer="C:/6200-IR/hw5/scoring/es_scores.csv",
)
es_results.head(5)

Unnamed: 0,query_no,author,doc_no,score
0,151801,Melanie,https://en.wikipedia.org/wiki/Centre_for_the_S...,6.800807
1,151801,Melanie,https://en.wikipedia.org/wiki/Germany%27s_Aims...,6.764579
2,151801,Melanie,http://en.wikipedia.org/wiki/List_of_recession...,6.638603
3,151801,Melanie,https://en.wikipedia.org/wiki/Chemical_weapons...,6.61629
4,151801,Melanie,https://en.wikipedia.org/wiki/Use_of_poison_ga...,6.61629


In [5]:
# es file needs to be of the form: <query-number> Q0 <docno> <rank> <score> Exp

def write_es_results_from_df(file_name, results=None):
    """Write a results file with one line per row of an ES scores dataframe.

    Args:
        file_name: Path of the file to write; an existing file is overwritten.
        results: Dataframe with the columns "query_no", "doc_no" and "score". When omitted,
            the notebook-level `es_results` dataframe is used.

    Returns:
        None. Each line has the form: <query_no> Q0 <doc_no> <rank> <score> Exp

    Rows are written in dataframe order. The rank starts at 1 and restarts whenever the
    query number differs from the previous row, so the rows are assumed to be grouped by
    query and already sorted by score within each query.
    """
    if results is None:
        results = es_results

    rank = 0
    previous_q_no = None  # no query seen yet, so the first row always starts a new ranking
    # "w" truncates an existing file; the with-block closes it even if a column is missing
    with open(file_name, "w") as output:
        # iterate over the column values directly so the index labels of the frame do not matter
        for q_no, doc_no, score in zip(results["query_no"], results["doc_no"], results["score"]):
            if q_no != previous_q_no:
                rank = 0
                previous_q_no = q_no
            rank += 1
            output.write("{} Q0 {} {} {} Exp\n".format(q_no, doc_no, rank, score))
    
# Write the results file from the es_results dataframe.
# NOTE(review): absolute Windows path, presumably machine-specific -- confirm before re-running elsewhere.
write_es_results_from_df('C:/6200-IR/hw5/scoring/es_results.txt')
