# Evaluation for Fine Tuned Model ExpertQA Responses

In [1]:
# read in csv of fine-tuned model generated responses
# compare to revised_answer_string from ExpertQA dataset
# evaluation metrics: cosine and QAFactEval

In [22]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import json
import csv

In [2]:
import nltk
from nltk.translate.bleu_score import sentence_bleu
from nltk.tokenize import word_tokenize
from nltk.translate.bleu_score import SmoothingFunction
import bert_score

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [6]:
def parse_jsonl_for_fields(file_path, fields):
    """
    Parses a JSONL file and returns dictionaries for specified fields.

    :param file_path: Path to the JSONL file.
    :param fields: List of fields to extract data for.
    :return: A dictionary where each key is a field and the value is another dictionary
             of question-answer pairs for that field.
    """
    data = {field: {} for field in fields}
    
    with open(file_path, 'r') as file:
        for line in file:
            line_data = json.loads(line)
            field = line_data.get('metadata', {}).get('field')
            question = line_data.get('question')

            for answer_key in line_data.get('answers', {}):
                revised_answer = line_data['answers'][answer_key].get('revised_answer_string')
                if field in fields and question and revised_answer:
                    data[field][question] = revised_answer
                    break # There is only one revised answer per question

            if field in fields and question and revised_answer:
                data[field][question] = revised_answer

    return data

In [31]:
def parse_jsonl_for_fields_with_question_type(file_path, fields):
    """
    Parses a JSONL file and returns dictionaries for specified fields.

    :param file_path: Path to the JSONL file.
    :param fields: List of fields to extract data for.
    :return: A dictionary where each key is a field and the value is another dictionary
             containing question-answer pairs for that field, including question_type.
    """
    data = {field: {} for field in fields}
    
    with open(file_path, 'r') as file:
        for line in file:
            line_data = json.loads(line)
            field = line_data.get('metadata', {}).get('field')
            question_type = line_data.get('metadata', {}).get('question_type')
            if question_type:
                question_types = question_type.split('| ')
            else:
                question_types = []
            question = line_data.get('question')
            specific_field = line_data.get('metadata', {}).get('specific_field')

            for answer_key in line_data.get('answers', {}):
                revised_answer = line_data['answers'][answer_key].get('revised_answer_string')
                if field in fields and question and revised_answer:
                    if question not in data[field]:  # Check if question already exists in data
                        data[field][question] = {'revised_answer_string': revised_answer,
                                                 'question_type': question_types,
                                                 'specific_field': specific_field}
                    break  # There is only one revised answer per question

    return data

In [32]:
fields_of_interest = ["Healthcare / Medicine", "Law"]
parsed_data = parse_jsonl_for_fields('expertqa.jsonl', fields_of_interest)

# Display a small part of the data to verify
for field in parsed_data:
    print(f"Field: {field}, Number of entries: {len(parsed_data[field])}")
    for question in list(parsed_data[field].keys())[:2]:  # Displaying the first two entries for brevity
        print(f"  Question: {question}")
        print(f"  Answer: {parsed_data[field][question][:100]}...")  # Displaying first 100 characters of answer

Field: Healthcare / Medicine, Number of entries: 504
  Question: What are signs and study findings that would indicate follicular lymphoma has transformed to diffuse large B-cell lymphoma?
  Answer: Signs that might indicate a transformation of follicular lymphoma (FL) to diffuse large B-cell lymph...
  Question: A patient with a history of heart failure now presents with newly diagnosed metastatic HER2+ breast cancer. What is her recommended first line of treatment and what additional information should be discussed with the patient given her history of heart failure?
  Answer: According to the web search results, the recommended first line of treatment for HER2+ metastatic br...
Field: Law, Number of entries: 103
  Question: How will the estate of an individual who dies without a will be distributed?
  Answer: When an individual dies without a will, their estate is distributed according to the intestacy rules...
  Question: What are the requirements for claiming inheritance as per th

In [34]:
fields_of_interest = ["Healthcare / Medicine"]
parsed_data_with_question_type = parse_jsonl_for_fields_with_question_type('expertqa.jsonl', fields_of_interest)

# Display a small part of the data to verify
for field in parsed_data_with_question_type:
    print(f"Field: {field}, Number of entries: {len(parsed_data_with_question_type[field])}")
    for question, data in parsed_data_with_question_type[field].items():  
        print(f"  Question: {question}")
        print(f"  Question Type: {data['question_type']}")
        print(f"  Specific Field: {data['specific_field']}")
        print(f"  Answer: {data['revised_answer_string'][:100]}...")  # Displaying first 100 characters of answer

Field: Healthcare / Medicine, Number of entries: 504
  Question: What are signs and study findings that would indicate follicular lymphoma has transformed to diffuse large B-cell lymphoma?
  Question Type: ['Directed question that has a single unambiguous answer']
  Specific Field: Oncology
  Answer: Signs that might indicate a transformation of follicular lymphoma (FL) to diffuse large B-cell lymph...
  Question: A patient with a history of heart failure now presents with newly diagnosed metastatic HER2+ breast cancer. What is her recommended first line of treatment and what additional information should be discussed with the patient given her history of heart failure?
  Question Type: ['Advice or suggestions on how to approach a problem', 'Question that describes a hypothetical scenario and asks a question based on this scenario', 'Request for opinion on a topic']
  Specific Field: Oncology
  Answer: According to the web search results, the recommended first line of treatment for HER

  Specific Field: Emergency Planning
  Answer: EPRR stands for Emergency Preparedness, Resilience and Response [2]....
  Question: What at the benefits and consequences of the rise of at-home, or direct-to-patient testing kits in patient care?
  Question Type: ['Open-ended question that is potentially ambiguous', 'Question that describes a hypothetical scenario and asks a question based on this scenario']
  Specific Field: Biochemistry
  Answer: The rise of at-home, or direct-to-patient testing kits in patient care offers various benefits and c...
  Question: What are the ethical implications of increased non-invasive pre-natal testing?
  Question Type: ['Open-ended question that is potentially ambiguous', 'Summarization of information on a topic', 'Request for opinion on a topic']
  Specific Field: Biochemistry
  Answer: The ethical implications of increased non-invasive prenatal testing (NIPT) include the routinization...
  Question: Why do people develop resistance to monoclonal ant

# Structure of parsed data
 
 
{
    {'Healthcare / Medicine': {'Question': 'Answer', 'Question': 'Answer'},
    {'Law': {'Question': 'Answer', 'Question': 'Answer'},
}

In [5]:
def read_json_to_dict(file_path):
    """
    read json file and returns a dictionary where each question is a key and its corresponding answer is the value
    :param file_path: Path to the json file.
    :return: dictionary of questions and answers.
    """
    data = {}
    with open(file_path, 'r') as file:
        for line in file:
            question_answer = json.loads(line)
            for key, value in question_answer.items():
                data[key] = value
    return data

In [7]:
# Read the JSONL file and create the big dictionary
medical_chatbot_data = read_json_to_dict('medical_chatbot.json')

In [8]:
# verify count of medical chatbot questions and parsed data for medicine/healthcare are the same

print (len(parsed_data['Healthcare / Medicine']) == len(medical_chatbot_data))

True


# Structure of Medical Chatbot

{'Question': 'Answer', 'Question': 'Answer', 'Question': 'Answer'}

# Cosine Similarity

In [6]:
# Function to compute cosine similarity between two strings
def compute_cosine_similarity(string1, string2):
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform([string1, string2])
    cosine_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
    return cosine_sim

In [7]:
# Function to compute cosine similarity for each question and write results to CSV
def compute_and_write_cosine_similarity(expert_data, model_data, output_file):
    with open(output_file, 'w', newline='') as csvfile:
        fieldnames = ['Key', 'Cosine Similarity']  # Define field names
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        
        for question, answer_expert in expert_data.items():
            answer_model = model_data.get(question)
            if answer_model:
                similarity = compute_cosine_similarity(answer_expert, answer_model)
                writer.writerow({'Key': question, 'Cosine Similarity': similarity})

In [11]:
# Call the function to compute cosine similarity and write results to CSV
compute_and_write_cosine_similarity(parsed_data['Healthcare / Medicine'], medical_chatbot_data, 'medical_chatbot_cosine_similarity.csv')

# BERTScore

In [8]:
def compute_bert_score(reference_sentence, candidate_sentence):
    _, _, bert_score_f1 = bert_score.score([reference_sentence], [candidate_sentence], lang='en', model_type='bert-base-uncased')
    return bert_score_f1.item()  # Convert tensor to Python float

In [9]:
def compute_and_write_bert_score(expert_data, model_data, output_file):
    with open(output_file, 'w', newline='') as csvfile:
        fieldnames = ['Key', 'BERTScore']  # Define field names
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        
        for question, answer_expert in expert_data.items():
            answer_model = model_data.get(question)
            if answer_model:
                bert_score = compute_bert_score(answer_expert, answer_model)
                writer.writerow({'Key': question, 'BERTScore': bert_score})

In [17]:
compute_and_write_bert_score(parsed_data['Healthcare / Medicine'], medical_chatbot_data, 'medical_chatbot_bert_score.csv')

Downloading:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

# Biomistral

In [10]:
biomistral_data = read_json_to_dict('biomistral.json')

In [11]:
compute_and_write_cosine_similarity(parsed_data['Healthcare / Medicine'], biomistral_data, 'biomistral_cosine_similarity.csv')

In [12]:
compute_and_write_bert_score(parsed_data['Healthcare / Medicine'], biomistral_data, 'biomistral_bert_score.csv')

# Join Data Results with Question_Type

- 'medical_chatbot_cosine_similarity.csv'
- 'medical_chatbot_bert_score.csv'
- 'biomistral_cosine_similarity.csv'
- 'biomistral_bert_score.csv'

Read all of the csvs and then join with parsed_data_with_question_type by question and then write to new csv

In [35]:
def add_question_types_to_csv(input_csv_path, output_csv_path, parsed_data):
    """
    Adds question types to a CSV file based on the 'Key' column and writes the updated data to a new CSV file.

    :param input_csv_path: Path to the input CSV file.
    :param output_csv_path: Path to the output CSV file.
    :param parsed_data: Parsed data containing question-answer pairs with question types.
    """
    with open(input_csv_path, 'r', newline='') as infile, open(output_csv_path, 'w', newline='') as outfile:
        reader = csv.DictReader(infile)
        fieldnames = reader.fieldnames + ['Question Type', 'Specific Field']
        writer = csv.DictWriter(outfile, fieldnames=fieldnames)
        writer.writeheader()

        for row in reader:
            key = row['Key']
            question_types = []
            specific_field = None
            for field, data in parsed_data.items():
                for question, q_data in data.items():
                    if question == key:
                        question_types.extend(q_data['question_type'])
                        specific_field = q_data['specific_field']
                        break  # No need to continue once we found the specific field

            row['Question Type'] = question_types
            row['Specific Field'] = specific_field
            writer.writerow(row)

In [36]:
# Example usage
input_csv_path = 'medical_chatbot_cosine_similarity.csv'
output_csv_path = 'medical_chatbot_cosine_similarity_types.csv'
add_question_types_to_csv(input_csv_path, output_csv_path, parsed_data_with_question_type)

In [37]:
# Example usage
input_csv_path = 'medical_chatbot_bert_score.csv'
output_csv_path = 'medical_chatbot_bert_score_types.csv'
add_question_types_to_csv(input_csv_path, output_csv_path, parsed_data_with_question_type)

In [38]:
# Example usage
input_csv_path = 'biomistral_cosine_similarity.csv'
output_csv_path = 'biomistral_cosine_similarity_types.csv'
add_question_types_to_csv(input_csv_path, output_csv_path, parsed_data_with_question_type)

In [39]:
# Example usage
input_csv_path = 'biomistral_bert_score.csv'
output_csv_path = 'biomistral_bert_score_types.csv'
add_question_types_to_csv(input_csv_path, output_csv_path, parsed_data_with_question_type)