In [None]:
import pandas as pd
import json
import re

from metadata import NewsPassagePaths
from retrieval.tools import SearchTools


# Markup removed from article paragraphs before they are joined into `words`.
# Attribute-bearing tags use `[^>]*>` so a match stops at the tag's own
# closing bracket; a greedy `.*>` would run to the last `>` on the line and
# swallow the text in between.
_MARKUP_RE = re.compile(
    r'<a href=[^>]*>|<a class=[^>]*>|<span class=[^>]*>|</a>'
    r'|</?span>|</?i>|</?strong>|</?b>|<br />'
)


def _load_eval_frame(path, task, fold):
    """Parse a per-query evaluation file into a DataFrame.

    Each non-blank line is split on whitespace and its first three fields are
    read as ``metric``, ``query`` and ``score``. The result has one column per
    ``<task>_<metric>`` and one row per query. The aggregate ``all`` row is
    renamed to ``all_<fold>`` so rows from different folds stay distinct.
    """
    # A fresh dict per call: sharing one dict between the passage and entity
    # loads would leak the passage columns into the entity frame.
    scores = {}
    with open(path, 'r') as f:
        for line in f:
            row = line.split()
            if not row:
                continue  # tolerate blank lines
            metric, query, score = row[0], row[1], row[2]
            if query == 'all':
                query = 'all_' + str(fold)
            scores.setdefault(task + '_' + metric, {})[query] = float(score)
    return pd.DataFrame(scores)


def _article_words(doc_dict):
    """Return the title followed by every non-dict paragraph, markup removed."""
    pieces = []
    for content in doc_dict['contents']:
        # Skip entries that are not dicts or carry no 'content' field.
        # NOTE(review): presumably some entries can be null placeholders;
        # confirm against the indexed collection.
        if not isinstance(content, dict) or 'content' not in content:
            continue
        if isinstance(content['content'], dict):
            continue
        pieces.append(" " + _MARKUP_RE.sub('', str(content['content'])))
    return doc_dict['title'] + "".join(pieces)


def get_df_from_file(passage_path, entity_path, entity_qrels_path, 
                     fold=0, index_path=NewsPassagePaths.index):
    """Build a per-query DataFrame of evaluation scores and document text.

    Args:
        passage_path: per-query evaluation file for the passage run; each line
            is whitespace-separated ``metric query score``.
        entity_path: per-query evaluation file for the entity run, same format.
        entity_qrels_path: qrels file handed to
            ``retrieval_utils.get_qrels_scaled_dict``.
        fold: fold identifier, stored in the ``fold`` column and appended to
            the aggregate row's index label (``all_<fold>``).
        index_path: index location handed to ``SearchTools``.

    Returns:
        A DataFrame indexed by query id with ``passage_<metric>`` and
        ``entity_<metric>`` columns plus ``fold``, ``doc`` (raw document
        JSON), ``title``, ``words`` (title and cleaned paragraph text) and
        ``entity_qrels`` (``(key, value)`` pairs sorted by value, descending,
        or ``[""]`` when the query has no qrels). The aggregate row gets
        ``" "`` placeholders for the text columns.
    """
    search_tools = SearchTools(index_path=index_path)
    qrels = search_tools.retrieval_utils.get_qrels_scaled_dict(qrels_path=entity_qrels_path)

    df_passage = _load_eval_frame(path=passage_path, task='passage', fold=fold)
    df_entity = _load_eval_frame(path=entity_path, task='entity', fold=fold)
    print(len(df_passage.index), len(df_entity.index))
    df = pd.concat([df_passage, df_entity], axis=1)

    df['fold'] = fold

    title_list = []
    doc_list = []
    words_list = []
    qrels_list = []

    for query in df.index:
        if query.startswith('all_'):
            # Aggregate row: there is no document behind it.
            title_list.append(" ")
            doc_list.append(" ")
            words_list.append(" ")
            qrels_list.append([""])
            continue

        # The query id is used as the doc id for the index lookup.
        doc = search_tools.get_contents_from_docid(query)
        doc_dict = json.loads(doc)

        doc_list.append(doc)
        title_list.append(doc_dict['title'])
        words_list.append(_article_words(doc_dict))

        if query in qrels:
            qrels_list.append(sorted(qrels[query].items(), key=lambda x: x[1], reverse=True))
        else:
            qrels_list.append([""])

    df['doc'] = doc_list
    df['title'] = title_list
    df['words'] = words_list
    print(len(qrels_list))
    df['entity_qrels'] = qrels_list

    return df

In [None]:
# Build the per-query DataFrame for each of the 5 folds and stack them into
# one frame; the `fold` column records which fold a row came from.
df_list = []
model = 'bm25'
for k in range(5):
    passage_path = '/nfs/trec_news_track/runs/anserini/folds/anserini_{}_default_passage_scaled_fold_{}.run.eval.by_query'.format(model, k)
    # The fold index fills the `fold_{}` slot and the model name the suffix.
    # With a single placeholder, `.format(model, k)` put the model name in
    # the fold slot and silently dropped `k`, so every fold read the same path.
    entity_path = '/nfs/trec_news_track/runs/anserini/folds/entity_ranking_fold_{}_{}.run.eval.by_query'.format(k, model)
    entity_qrels_path = '/nfs/trec_news_track/data/5_fold/scaled_5fold_{}_entity.qrels'.format(k)
    df_list.append(get_df_from_file(passage_path=passage_path, entity_path=entity_path, entity_qrels_path=entity_qrels_path, fold=k))

df = pd.concat(df_list)