In [8]:
import pysolr
import re
import pandas as pd
from pprint import pprint
from tabulate import tabulate
from IPython.display import display


In [9]:
# Base URL of the Solr collection that the queries are run against
SOLR_URL = 'http://solr:8983/solr/cmp269'

# Text file holding the queries; it is parsed with parse_file(..., 'top')
QUERIES_FILE_PATH = 'data/Consultas_UTF8.txt'

# Folder where generate_report writes the respostas_campo_<field>.txt files
OUTPUT_REPORTS_FOLDER = 'data'

In [10]:
def parse_file(fpath, doc_tag, encoding="utf-8"):
    '''Read a tagged text file and return one parsed dict per document.

    Lines are accumulated until a line holding only the closing tag
    ``</doc_tag>`` is seen; the accumulated text (closing line included) is
    then handed to ``parse_doc`` and a new document is started. Any text
    after the last closing tag is discarded.

    Args:
        fpath: path of the file to read.
        doc_tag: tag name that delimits a document (e.g. ``'top'``).
        encoding: text encoding used to decode the file.

    Returns:
        A list with the value returned by ``parse_doc`` for each document,
        in file order.
    '''

    docs = []
    closing_tag = "</%s>" % doc_tag

    # Explicit encoding: do not depend on the platform default when decoding.
    with open(fpath, encoding=encoding) as file:
        doc_lines = []

        for line in file:
            doc_lines.append(line)

            # Compare without surrounding whitespace so the last document is
            # still emitted when the file does not end with a newline.
            if line.strip() == closing_tag:
                docs.append(parse_doc("".join(doc_lines)))
                doc_lines = []

    return docs

In [11]:
def parse_doc(doc_as_string):
    '''Extract the known fields from the text of a single document.

    Args:
        doc_as_string: full text of one document, tags included.

    Returns:
        A dict with the keys ``num``, ``title``, ``desc`` and ``narr``; each
        value is the whitespace-stripped text found between the matching
        opening and closing tag.

    Raises:
        KeyError: if one of the expected tags is not present in the text.
    '''

    # tag name in the source text -> key used in the returned dict
    doc_tpl = {
        'num': 'num',
        'PT-title': 'title',
        'PT-desc': 'desc',
        'PT-narr': 'narr',
    }

    doc = {}

    for tag, solr_key in doc_tpl.items():
        # generates something like:
        # str_pattern = r".*<DOCID>(.*)</DOCID>.*"
        # Raw string: "<", ">" and "/" need no escaping in a regex, and the
        # tag name is escaped so it is always matched literally.
        escaped_tag = re.escape(tag)
        str_pattern = r".*<%s>(.*)</%s>.*" % (escaped_tag, escaped_tag)

        # DOTALL lets a field value span several lines
        match = re.match(str_pattern, doc_as_string, re.DOTALL)

        # Explicit check instead of an assert, so a missing tag is reported
        # by name and the validation survives `python -O`.
        if match is None:
            raise KeyError("tag <%s> not found in document" % tag)

        doc[solr_key] = match.group(1).strip()

    return doc


In [12]:
# Solr client for the configured collection
sq = pysolr.Solr(SOLR_URL)

# Parse every <top>...</top> entry of the queries file into a dict
queries = parse_file(QUERIES_FILE_PATH, 'top')

# Sanity check: show the first parsed query
pprint(queries[0])

{'desc': 'Encontrar documentos sobre tratamentos que empreguem medicina '
         'natural ou alternativa. Aqui são incluídas terapias como a '
         'acupuntura, a hemopatia, a quiroprática, entre outras.',
 'narr': 'Documentos relevantes devem fornecer informação, específica ou '
         'genérica, sobre o uso de tratamentos ou técnicas de medicina '
         'natural ou alternativa.',
 'num': '251',
 'title': 'Medicina alternativa'}


In [13]:
# Module-level Solr client; query_solr() reads this global when searching
sq = pysolr.Solr(SOLR_URL)


def query_solr(query_field):
    """Run every query from the queries file against Solr and collect the hits.

    For each parsed query the title is sent as the query string, with
    ``query_field`` as the default search field (``df``), OR as the default
    operator, and at most 100 hits sorted by descending score.

    Args:
        query_field: name of the Solr field to search in.

    Returns:
        A DataFrame with one row per hit and integer column labels 0..5:
        query number, the literal "Q0", the hit's ``docno_s`` value, its
        zero-based position in the result list, its score, and the literal
        "Brenda_Piter".
    """
    result_lines = []
    queries = parse_file(QUERIES_FILE_PATH, 'top')
    for q in queries:
        params = {
            "q": q["title"],
            "q.op": "OR",
            "df": query_field,
            "rows": 100,
            "fl": "*,score",
            "sort": "score desc",
        }

        result = sq.search(**params)

        for ranking, doc in enumerate(result.docs):
            result_lines.append([q["num"], "Q0", doc["docno_s"], ranking, doc["score"], "Brenda_Piter"])

    # Fix the column labels explicitly: without them an empty result set would
    # produce a frame with no columns, and grouping by column 0 would fail.
    df = pd.DataFrame(data=result_lines, columns=range(6))

    return df
    

def normalize_results(df):
    """Scale the scores of each query so that its best score becomes 1.

    Rows are grouped by column 0 (the query number) and every value in
    column 4 (the score) is divided by the largest score of its group.

    Args:
        df: DataFrame with the query number in column 0 and a numeric score
            in column 4. It is not modified.

    Returns:
        A new DataFrame whose rows are ordered by query number (rows of the
        same query keep their relative order and their original index
        labels), with column 4 replaced by the normalized score. Rows whose
        query number is missing are left out, as a group-by would do.
    """
    if df.empty:
        # Nothing to normalize; hand back an independent empty frame.
        return df.copy()

    # A stable sort reproduces the "one group after another" row order while
    # keeping each query's internal ranking untouched. dropna/sort_values
    # return new objects, so the caller's frame is never written to.
    ordered = df.dropna(subset=[0]).sort_values(by=0, kind="mergesort")

    # Largest score of each query, broadcast back to every row of that query,
    # so the division is a single vectorized operation.
    group_max = ordered.groupby(0)[4].transform("max")
    ordered[4] = ordered[4] / group_max

    return ordered


def to_fwf(df, fname):
    """Write the rows of a DataFrame to a text file with aligned columns.

    The values (no header, no index) are rendered with tabulate's "plain"
    table format and written to ``fname``, replacing any existing file.

    Args:
        df: DataFrame whose values are exported.
        fname: path of the text file to create.
    """
    # Export a DataFrame to a txt file with fixed-width columns
    # https://stackoverflow.com/a/35974742/3284017
    content = tabulate(df.values.tolist(), tablefmt="plain")

    # Context manager guarantees the file is flushed and closed.
    with open(fname, "w", encoding="utf-8") as out_file:
        out_file.write(content)


# Expose the helper as a DataFrame method so callers can write df.to_fwf(path)
pd.DataFrame.to_fwf = to_fwf


def generate_report(query_field):
    """Query Solr on one field, normalize the scores and save the report.

    The report is written to
    ``<OUTPUT_REPORTS_FOLDER>/respostas_campo_<query_field>.txt``; the first
    rows of the result are displayed and the created path is printed.

    Args:
        query_field: name of the Solr field to search in.
    """
    normalized = normalize_results(query_solr(query_field))

    # Save the file
    report_path = "%s/respostas_campo_%s.txt" % (OUTPUT_REPORTS_FOLDER, query_field)
    normalized.to_fwf(report_path)

    display(normalized.head())
    print("Criado arquivo %s" % report_path)


In [14]:
# Build one report per field name; the argument is sent to Solr as the default search field (df)
generate_report("text_txt_c1")
generate_report("text_txt_c4")

Unnamed: 0,0,1,2,3,4,5
0,251,Q0,FSP950315-114,0,1.0,Brenda_Piter
1,251,Q0,FSP950426-099,1,0.804281,Brenda_Piter
2,251,Q0,FSP950720-079,2,0.798148,Brenda_Piter
3,251,Q0,FSP950527-009,3,0.751702,Brenda_Piter
4,251,Q0,FSP951121-104,4,0.750709,Brenda_Piter


Criado arquivo data/respostas_campo_text_txt_c1.txt


Unnamed: 0,0,1,2,3,4,5
0,251,Q0,FSP950315-114,0,1.0,Brenda_Piter
1,251,Q0,FSP950409-099,1,0.824287,Brenda_Piter
2,251,Q0,FSP951111-086,2,0.778781,Brenda_Piter
3,251,Q0,FSP950828-098,3,0.746205,Brenda_Piter
4,251,Q0,FSP950303-004,4,0.726719,Brenda_Piter


Criado arquivo data/respostas_campo_text_txt_c4.txt
