# Group questions by retrieved-document similarity entropy

In [59]:
import os
import sys

# Make the project root importable. The root is resolved as the parent of the
# current working directory; adjust the relative path if the notebook is run
# from a different location.
project_root = os.path.abspath("..")
# Only append when missing, so re-running this cell does not add duplicates.
if project_root not in sys.path:
    sys.path.append(project_root)


In [60]:
import json
def load_calibration(filename="claims.jsonl"):
    """
    Read back a file written by dump_claims.

    The file is parsed as one JSON document and the value stored under its
    top-level "data" key is returned.
    """
    with open(filename, "r") as handle:
        payload = json.load(handle)
    return payload["data"]
    
def dump_claims(output_list, filename="claims.jsonl"):
    """
    Write output_list to filename as a single JSON document {"data": output_list}.

    Example of output_list:
    [{"prompt": "Who is Tatsu?", "claims": [{"subclaim": "Tatsu is Japanese person", 'correct': 1.0}, {"subclaim": "Tatsu was born in 1988", 'correct': 0.0} ..]}]
    """
    wrapped = {"data": output_list}
    with open(filename, "w") as sink:
        json.dump(wrapped, sink, indent=4)

In [22]:
import numpy as np
from collections import Counter

def entropy_histogram(data, bins=10):
    """
    Estimate the Shannon entropy (in bits) of `data` from a histogram.

    The values are binned with np.histogram, the bin counts are turned into
    empirical probabilities (count / total), and -sum(p * log2(p)) is taken
    over the non-empty bins.

    Args:
        data: array-like of numbers.
        bins: forwarded to np.histogram (default 10).

    Returns:
        The entropy in bits, never negative. 0.0 is returned when no value
        lands in any bin (for example when `data` is empty).
    """
    # Raw counts are enough: normalising a density estimate gives the same
    # probabilities for equal-width bins, but divides 0 by 0 on empty input.
    counts, _ = np.histogram(data, bins=bins)
    total = counts.sum()
    if total == 0:
        return np.float64(0.0)

    probs = counts[counts > 0] / total  # drop empty bins: 0 * log2(0) is taken as 0
    # Adding 0.0 turns the negative zero obtained for a single occupied bin
    # (-(1 * log2(1)) == -0.0) into a plain 0.0.
    return -np.sum(probs * np.log2(probs)) + 0.0

# Sanity checks for entropy_histogram: first on uniform random samples
# (100 values / 10 bins, then 10 values / 3 bins), then on small hand-made lists.
for sample_size, n_bins in ((100, 10), (10, 3)):
    data = np.random.uniform(0, 1, sample_size)  # random float values in [0, 1)
    print("Entropy:", entropy_histogram(data, bins=n_bins))

for numbers in (
    [3, 3, 3, 7, 7, 7, 7, 1, 1, 9],
    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
    [1, 1, 1, 1, 1, 1, 1, 1, 1, 9],
    [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
):
    print("Entropy:", entropy_histogram(numbers, 3))


Entropy: 3.3031630545974275
Entropy: 1.1567796494470395
Entropy: 1.0
Entropy: -0.0
Entropy: 0.4689955935892811
Entropy: 1.5709505944546687


In [70]:
import json
from rag.scorer.wikitexts_embedding import WikitextsDocumentScorer

def group_by_entropy(input_file):
    """
    Split the prompts of a claims file into easy / medium / hard thirds by
    the entropy of their retrieval scores.

    Every item of `input_file` (read with load_calibration) has its prompt
    searched in the FAISS index of a WikitextsDocumentScorer. The "score" of
    each returned document is collected and the histogram entropy of those
    scores (10 bins) is computed. Prompts are ranked by ascending entropy:
    the lowest third is "easy", the next third "medium", the rest "hard".

    Files written:
        * input_file with '.json' replaced by '_easy.json', '_medium.json'
          and '_hard.json': {prompt: {"entropy": ..., "scores": [...]}} for
          the prompts of that group.
        * 'easy_', 'medium_' and 'hard_' + the file name of input_file, in
          the same directory as input_file: the original items of each
          group, written with dump_claims.

    Args:
        input_file: path (str) of a claims file readable by load_calibration.
    """
    wiki_embedding = WikitextsDocumentScorer()
    data = load_calibration(input_file)

    entropy_dict = {}
    for item in data:
        prompt = item["prompt"]
        # NOTE(review): 20 and 0.4 are presumably the number of documents to
        # retrieve and a minimum similarity - confirm against search_faiss_index.
        retrived_docs = wiki_embedding.faiss_manager.search_faiss_index(prompt, 20, 0.4)
        scores = [
            wiki_embedding.faiss_manager.parse_result(doc)["score"]
            for doc in retrived_docs
        ]
        # store both entropy and scores in the dict
        entropy_dict[prompt] = {
            # Nothing retrieved means nothing to histogram; use 0.0 directly.
            "entropy": entropy_histogram(scores, 10) if scores else 0.0,
            "scores": scores,
        }

    # Rank by entropy and cut into thirds: low entropy is easy, high is hard.
    ranked = sorted(entropy_dict.items(), key=lambda entry: entry[1]["entropy"])
    total = len(ranked)
    groups = {"easy": {}, "medium": {}, "hard": {}}
    for i, (prompt, stats) in enumerate(ranked):
        if i < total / 3:
            label = "easy"
        elif i < 2 * total / 3:
            label = "medium"
        else:
            label = "hard"
        groups[label][prompt] = stats

    # Save the per-group entropy/score dictionaries.
    for label, members in groups.items():
        with open(input_file.replace('.json', f'_{label}.json'), "w") as fopen:
            json.dump(members, fopen)

    # Split the original items into the same three groups. Every prompt was
    # ranked above, so each one belongs to exactly one group.
    label_of = {
        prompt: label
        for label, members in groups.items()
        for prompt in members
    }
    items_by_group = {label: [] for label in groups}
    for item in data:
        items_by_group[label_of[item["prompt"]]].append(item)

    # Prefix the file name rather than substituting a hard-coded 'data/out/'
    # directory: for inputs outside that directory the substitution did nothing
    # and the three dumps overwrote input_file itself.
    out_dir, base_name = os.path.split(input_file)
    for label, items in items_by_group.items():
        dump_claims(items, os.path.join(out_dir, f"{label}_{base_name}"))




In [39]:
# Group the medlfqav2 claims file into easy / medium / hard by retrieval entropy.
input_file = "data/out/medlfqav2_subclaims_with_scores.jsonl"
group_by_entropy(input_file)

Loaded texts from file: data/ref/medication_qa_ref_docs_texts.json
Loaded texts from file: data/ref/healthsearch_qa_ref_docs_texts.json
Loaded texts from file: data/ref/kqa_golden_ref_docs_texts.json
Loaded texts from file: data/ref/kqa_silver_wogold_ref_docs_texts.json
Loaded texts from file: data/ref/live_qa_ref_docs_texts.json


KeyboardInterrupt: 

In [34]:
# Helper to get the mean and standard deviation of the entropy values stored in a group file:

def get_avg_std(input_file):
    """
    Return (mean, standard deviation) of the "entropy" values in a JSON file.

    The file must hold a JSON object; the "entropy" entry of each of its
    values is collected and summarised with np.mean and np.std.
    """
    with open(input_file, "r") as handle:
        stats_by_key = json.load(handle)
    entropies = [entry["entropy"] for entry in stats_by_key.values()]
    return np.mean(entropies), np.std(entropies)

In [35]:
# Print (mean, std) of the entropy for the easy, medium and hard group files.
for group_suffix in ('_easy.json', '_medium.json', '_hard.json'):
    print(get_avg_std(input_file.replace('.json', group_suffix)))

(1.7762106821708217, 0.3262044016181052)
(2.3742451217227503, 0.12068827128075368)
(2.8024560906705043, 0.15425720827204462)


In [69]:
import json
def group_into_3(input_file):
    """
    Split a claims file into easy / medium / hard files using existing
    per-group JSON files.

    The group files are input_file with '.json' replaced by '_easy.json',
    '_medium.json' and '_hard.json'; each is loaded as JSON and prompts are
    looked up in it with `in`. Items of input_file (read with
    load_calibration) whose prompt is in the easy file go to the easy output,
    those in the medium file to the medium output, and all remaining items to
    the hard output.

    The outputs are written with dump_claims to 'easy_', 'medium_' and
    'hard_' + the file name of input_file, in the same directory as
    input_file.

    Args:
        input_file: path (str) of a claims file readable by load_calibration.
    """
    data = load_calibration(input_file)

    # All three files are read, so a missing group file still fails here.
    groups = {}
    for label in ("easy", "medium", "hard"):
        with open(input_file.replace('.json', f'_{label}.json'), "r") as fopen:
            groups[label] = json.load(fopen)

    # split original file into 3 lists based on easy, medium and hard
    items_by_group = {label: [] for label in groups}
    for item in data:
        prompt = item["prompt"]
        if prompt in groups["easy"]:
            label = "easy"
        elif prompt in groups["medium"]:
            label = "medium"
        else:
            # Anything not listed as easy or medium is treated as hard.
            label = "hard"
        items_by_group[label].append(item)

    # Prefix the file name rather than substituting a hard-coded 'data/out/'
    # directory: for inputs outside that directory the substitution did nothing
    # and the three dumps overwrote input_file itself.
    out_dir, base_name = os.path.split(input_file)
    for label, items in items_by_group.items():
        dump_claims(items, os.path.join(out_dir, f"{label}_{base_name}"))


In [41]:
# Rebuild the easy_/medium_/hard_ claim files for `input_file` from its existing group files.
group_into_3(input_file)

In [61]:

def add_group(input_file, groupnames, group_files):
    """
    Tag the items of input_file with the names of the groups containing
    their prompt, and rewrite input_file in place.

    groupnames and group_files are paired positionally. Each group file is
    loaded as JSON and an item belongs to the group when its "prompt" is
    `in` the loaded object (presumably a JSON object keyed by prompt - verify
    against the files passed in). Matching items get the group name added to
    their "groups" list, which is created when missing.

    Input file and group files should all identify entries by "prompt".
    Ideally input_file is the union of all group files and no two group
    files share a prompt.

    Args:
        input_file: claims file readable by load_calibration; overwritten
            with dump_claims.
        groupnames: labels to add, one per group file.
        group_files: paths of the JSON group files.
    """
    data = load_calibration(input_file)
    for groupname, group_file in zip(groupnames, group_files):
        with open(group_file, "r") as fopen:
            group = json.load(fopen)
        for item in data:
            if item["prompt"] in group:
                labels = item.setdefault("groups", [])
                # input_file is overwritten below, so without this check a
                # second run would append the same label again.
                if groupname not in labels:
                    labels.append(groupname)

    dump_claims(data, input_file)

In [50]:
# Label every item of `input_file` with its entropy group.
add_group(
    input_file,
    ["entropy_easy", "entropy_medium", "entropy_hard"],
    [
        "data/out/medlfqav2_subclaims_with_scores_easy.jsonl",
        "data/out/medlfqav2_subclaims_with_scores_medium.jsonl",
        "data/out/medlfqav2_subclaims_with_scores_hard.jsonl",
    ],
)

In [62]:

def add_group_by_prompt(input_file, groupnames, group_files):
    """
    Tag the items of input_file with the names of the groups containing
    their prompt, and rewrite input_file in place.

    groupnames and group_files are paired positionally. Each group file is a
    claims file read with load_calibration; the group consists of the
    "prompt" values of its items. Matching items of input_file get the group
    name added to their "groups" list, which is created when missing.

    Input file and group files should all identify entries by "prompt".
    Ideally input_file is the union of all group files and no two group
    files share a prompt.

    Args:
        input_file: claims file readable by load_calibration; overwritten
            with dump_claims.
        groupnames: labels to add, one per group file.
        group_files: paths of the claims files defining each group.
    """
    data = load_calibration(input_file)

    # Only the prompts are needed for the membership test, so keep a set per
    # group. load_calibration opens the file itself; no extra open() is needed.
    prompts_by_group = {}
    for groupname, group_file in zip(groupnames, group_files):
        prompts_by_group[groupname] = {
            item["prompt"] for item in load_calibration(group_file)
        }

    for item in data:
        prompt = item["prompt"]
        for groupname, prompts in prompts_by_group.items():
            if prompt in prompts:
                labels = item.setdefault("groups", [])
                # input_file is overwritten below, so without this check a
                # second run would append the same label again.
                if groupname not in labels:
                    labels.append(groupname)

    dump_claims(data, input_file)

In [73]:
# Concatenate the per-dataset claim files into one combined claims file.
input_file = "data/out/wiki_rag_subclaims_with_scores.jsonl"
# other options: medication_qa, healthsearch_qa, kqa_golden, kqa_silver_wogold, live_qa
dataset_prefixs = ['hotpotqa', 'popqa']
medlfqav2_data = []
for dataset_prefix in dataset_prefixs:
    data = load_calibration(f"data/out/{dataset_prefix}_subclaims_with_scores.jsonl")
    medlfqav2_data += data
dump_claims(medlfqav2_data, input_file)


In [74]:
# Group the combined wiki_rag claims file into easy / medium / hard by retrieval entropy.
input_file = "data/out/wiki_rag_subclaims_with_scores.jsonl"
group_by_entropy(input_file)

Loaded texts from file: data/ref/medication_qa_ref_docs_texts.json
Loaded texts from file: data/ref/healthsearch_qa_ref_docs_texts.json
Loaded texts from file: data/ref/kqa_golden_ref_docs_texts.json
Loaded texts from file: data/ref/kqa_silver_wogold_ref_docs_texts.json
Loaded texts from file: data/ref/live_qa_ref_docs_texts.json


In [68]:
# Label every item of `input_file` with the dataset file(s) that contain its prompt.
add_group_by_prompt(
    input_file,
    ["popqa", "hotpotqa"],
    [
        "data/out/popqa_subclaims_with_scores.jsonl",
        "data/out/hotpotqa_subclaims_with_scores.jsonl",
    ],
)