In [42]:
import pandas as pd
import numpy as np

In [43]:
# Load the question bank. With na_filter=False pandas does not convert blank
# cells to NaN, so they arrive as empty strings.
medicine = pd.read_csv(
    filepath_or_buffer="csv_output/agent1.csv",
    na_filter=False,
)

In [47]:
# Validate the loaded question bank before computing any statistics.
# Every failed check raises ValueError so a bad CSV stops the run early.

# DataFrame.empty is True when either axis has length 0 (no rows or no columns).
if medicine.empty:
    raise ValueError("The dataframe is empty. Please check the CSV file.")

# Check the schema before inspecting values, so a CSV with the wrong header is
# reported as a schema problem, and name the columns that are actually absent.
expected_columns = ["question", "option_A", "option_B", "option_C", "option_D", "answer", "explanation"]
missing_columns = [col for col in expected_columns if col not in medicine.columns]
if missing_columns:
    raise ValueError(f"The dataframe is missing one or more expected columns: {missing_columns}")

# Reject missing data in any column: NaN values as well as empty strings
# (the CSV is read with na_filter=False, so blank cells show up as "").
if medicine.isnull().values.any() or (medicine == "").any().any():
    raise ValueError("The dataframe contains NaN or empty string values. Please clean the data.")

# Every answer must be one of the four option letters.
valid_options = ["A", "B", "C", "D"]
if not medicine["answer"].isin(valid_options).all():
    raise ValueError("The 'answer' column contains invalid options. It should only contain 'A', 'B', 'C', or 'D'.")

# Each question text may appear only once.
if medicine["question"].duplicated().any():
    raise ValueError("The dataframe contains duplicate questions. Please ensure all questions are unique.")


# Compute the answer percentage for each option
def calculate_answer_percentage(df, options=("A", "B", "C", "D")):
    """Return the percentage of answers that fall on each option.

    Args:
        df: DataFrame with an ``answer`` column.
        options: Answer labels to report, in output order. Defaults to the
            four option letters, so the function no longer depends on a
            module-level variable being defined first.

    Returns:
        dict mapping each label in ``options`` to its share of the ``answer``
        column in percent (0-100) as a float. Labels that never occur map to
        0.0. Shares are relative to the non-null answers, because
        ``value_counts`` ignores NaN by default.
    """
    shares = df["answer"].value_counts(normalize=True) * 100
    # float() keeps the value type uniform, including for absent labels.
    return {option: float(shares.get(option, 0.0)) for option in options}


answer_percentages = calculate_answer_percentage(medicine)

# Build the report as a header plus one line per option (two decimals),
# then emit it with a single print call.
report_lines = ["Answer Percentages:"]
report_lines.extend(
    f"Option {option}: {percentage:.2f}%"
    for option, percentage in answer_percentages.items()
)
print("\n".join(report_lines))

Answer Percentages:
Option A: 28.82%
Option B: 28.50%
Option C: 22.52%
Option D: 20.16%
