## Statistics of IR evaluation on BioASQ with 10 abstracts (182 documents)

In [1]:
import json
import pandas as pd
#!pip install xlsxwriter
import xlsxwriter

In [2]:
# Path to the retrieval results file. It is read as JSON Lines:
# one JSON object per line, one line per query.
file_path = 'hybrid_results_pmid_recore_07-03.json'

# Parse every non-blank line into a Python object.
# - encoding='utf-8' is explicit so the read does not depend on the platform's
#   default locale encoding.
# - Blank lines (e.g. a trailing newline at the end of the file) are skipped
#   instead of making json.loads raise JSONDecodeError.
with open(file_path, 'r', encoding='utf-8') as file:
    data = [json.loads(line) for line in file if line.strip()]

In [3]:
# Build one row per entry in `data`:
#   [question type, true document ids, retrieved document ids, overlap size]
extracted_data = []
for record in data:
    gold_ids = record.get('true_documents', [])
    # Each retrieved item is a sub-list; only its first element (the document id) is kept.
    hit_ids = [item[0] for item in record.get('retrieved_documents', [])]
    # Number of distinct ids that appear in both the true and the retrieved lists.
    overlap = set(gold_ids) & set(hit_ids)
    extracted_data.append([record.get('question_type'), gold_ids, hit_ids, len(overlap)])

In [4]:
# Turn the extracted rows into a table; the column order mirrors the order of
# the values inside each row of `extracted_data`.
column_names = ['question_type', 'true_documents', 'retrieved_documents', 'matching_documents']
df = pd.DataFrame(data=extracted_data, columns=column_names)
df

Unnamed: 0,question_type,true_documents,retrieved_documents,matching_documents
0,yesno,"[3320045, 7515725, 20805556, 19297413, 1972424...","[11076767, 3320045, 15094122, 12666201, 303886...",6
1,yesno,"[22948539, 23698708, 23827649, 22901753, 21618...","[23698708, 33765924, 15238139, 33114380, 22428...",1
2,factoid,"[24642372, 25056878, 24694658, 25698922, 17064...","[16531318, 27826632, 35628472, 16690210, 29246...",0
3,yesno,"[24037088, 22094949, 20202189, 22617881, 24384...","[20202189, 35626721, 25618141, 32701835, 18533...",1
4,factoid,"[22166853, 20020530, 25759798, 26196025, 24129...","[24453141, 28954305, 15517377, 21982616, 35545...",1
...,...,...,...,...
177,summary,"[35576529, 36260990, 35988546, 34889443, 34987...","[36124781, 37596712, 37676741, 37991635, 35366...",4
178,factoid,"[34938127, 32519222, 33285037, 36401027, 32453...","[34938127, 37005891, 31720560, 32453377, 33285...",8
179,yesno,"[32127123, 33135801, 36119462, 32881022, 34484...","[33947345, 32832306, 36948964, 35603380, 32577...",0
180,summary,"[31670406, 28862809, 30152529, 26733686, 35836...","[31670406, 26733686, 30243851, 28862809, 28923...",6


### Matching documents at the dataset level

In [9]:
# Histogram over the whole dataset: for every observed value of
# 'matching_documents', count how many rows of `df` have that value.
rows_per_match_count = df.groupby('matching_documents').size()
matching_all = rows_per_match_count.reset_index(name='count')
matching_all

Unnamed: 0,matching_documents,count
0,0,45
1,1,41
2,2,23
3,3,16
4,4,22
5,5,16
6,6,12
7,7,1
8,8,5
9,10,1


### Matching documents by question type

In [10]:
# Same histogram, split by question type: count the rows of `df` for each
# observed (question_type, matching_documents) pair.
group_keys = ['question_type', 'matching_documents']
rows_per_type_and_match = df.groupby(group_keys).size()
matching_all_by_query_type = rows_per_type_and_match.reset_index(name='count')
matching_all_by_query_type

Unnamed: 0,question_type,matching_documents,count
0,factoid,0,11
1,factoid,1,13
2,factoid,2,6
3,factoid,3,2
4,factoid,4,6
5,factoid,5,4
6,factoid,6,5
7,factoid,8,2
8,list,0,16
9,list,1,9


In [11]:
# Export both summary tables to a single Excel workbook, one sheet per table.
output_path = 'matching_documents_statistic.xlsx'

# Sheet name -> table; dict insertion order determines the sheet order.
sheets = {
    'Matching_All': matching_all,
    'Matching_By_Query_Type': matching_all_by_query_type,
}

with pd.ExcelWriter(output_path, engine='xlsxwriter') as writer:
    for sheet_name, frame in sheets.items():
        # index=False keeps the DataFrame's row index out of the sheet.
        frame.to_excel(writer, sheet_name=sheet_name, index=False)