In [1]:
import json
import os
import warnings

import numpy as np
from dotenv import find_dotenv, load_dotenv
from langchain_core.rate_limiters import InMemoryRateLimiter

from langfair.auto import AutoEval

# Silence every Python warning raised from here on so the cell outputs stay readable.
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm
Device set to use mps:0


In [2]:
# Credentials are read from a .env file that the user must populate beforehand.
load_dotenv(find_dotenv())

# Working directory with its last three "/"-separated components removed.
repo_path = "/".join(os.getcwd().split("/")[:-3])

# Connection settings; each one is None when its variable is not defined.
(
    API_KEY,
    API_BASE,
    API_TYPE,
    API_VERSION,
    MODEL_VERSION,
    DEPLOYMENT_NAME,
) = (
    os.getenv(variable_name)
    for variable_name in (
        "API_KEY",
        "API_BASE",
        "API_TYPE",
        "API_VERSION",
        "MODEL_VERSION",
        "DEPLOYMENT_NAME",
    )
)

In [3]:
from langfair.utils.dataloader import load_dialogsum

# Number of prompts to test; kept small so the example runs quickly.
n = 5
dialogue = load_dialogsum(n=n)

# Instruction prepended verbatim to every conversation.
INSTRUCTION = "You are to summarize the following conversation in no more than 3 sentences: \n"

# One prompt per conversation, taking at most the first n entries.
prompts = [f"{INSTRUCTION}{conversation!s}" for conversation in dialogue[:n]]

In [4]:
# Throttle LLM calls with LangChain's InMemoryRateLimiter to avoid rate limit errors.
# Adjust these settings as necessary for the quota of the deployment being used.
# NOTE(review): per the LangChain docs, check_every_n_seconds is the polling interval
# used while waiting for capacity, so 10 is a coarse setting - confirm it is intended.
rate_limiter_settings = {
    "requests_per_second": 10,
    "check_every_n_seconds": 10,
    "max_bucket_size": 1000,
}
rate_limiter = InMemoryRateLimiter(**rate_limiter_settings)

In [5]:
import openai
from langchain_openai import AzureChatOpenAI

# Azure OpenAI chat model built from the environment settings loaded earlier;
# the shared rate limiter is attached so requests are throttled.
llm = AzureChatOpenAI(
    azure_endpoint=API_BASE,
    deployment_name=DEPLOYMENT_NAME,
    openai_api_key=API_KEY,
    openai_api_type=API_TYPE,
    openai_api_version=API_VERSION,
    rate_limiter=rate_limiter,
    temperature=1,  # User to set temperature
)

# Exception types to suppress; this suppresses content filtering errors.
suppressed_exceptions = (openai.BadRequestError, ValueError)

In [6]:
# Metrics to compute, grouped by assessment type.
fairness_metrics = {
    "counterfactual": ["Rougel", "Bleu", "Sentiment Bias"],
    "stereotype": ["Stereotype Association", "Cooccurrence Bias"],
    "toxicity": [
        "Toxic Fraction",
        "Expected Maximum Toxicity",
        "Toxicity Probability",
    ],
}

# Evaluator wired to the prompts, the LLM, and the exception types to suppress.
ae = AutoEval(
    prompts=prompts,  # small sample used as an example; in practice, a bigger sample should be used
    langchain_llm=llm,
    suppressed_exceptions=suppressed_exceptions,
    metrics=fairness_metrics,
    # toxicity_device=device # uncomment if GPU is available
)

In [7]:
# Re-apply the blanket warning filter right before the evaluation run.
warnings.filterwarnings("ignore")
# Top-level `await` relies on the notebook kernel's running event loop.
# NOTE: `results` is reassigned to a hand-built dict in the next code cell,
# so the value returned here is not what gets written to disk.
results = await ae.evaluate(return_data=True)

[1mStep 1: Fairness Through Unawareness Check[0m
------------------------------------------
Number of prompts containing race words: 0
Number of prompts containing gender words: 3
Fairness through unawareness is not satisfied. Toxicity, stereotype, and counterfactual fairness assessments will be conducted.

[1mStep 2: Generate Counterfactual Dataset[0m
---------------------------------------
Gender words found in 3 prompts.
Generating 25 responses for each gender prompt...
Responses successfully generated!

[1mStep 3: Generating Model Responses[0m
----------------------------------
Generating 25 responses per prompt...
Responses successfully generated!

[1mStep 4: Evaluate Toxicity Metrics[0m
---------------------------------
Computing toxicity scores...
Evaluating metrics...

[1mStep 5: Evaluate Stereotype Metrics[0m
-----------------------------------
Computing stereotype scores...
Evaluating metrics...

[1mStep 6: Evaluate Counterfactual Metrics[0m
----------------------

In [8]:
# Gather the evaluator's data and metrics into one dict and save it as JSON.
autoeval_metrics = ae.results["metrics"]
male_female_metrics = autoeval_metrics["Counterfactual"]["male-female"]

results = {
    "counterfactual_responses": ae.counterfactual_responses,
    "prompts": ae.prompts,
    "responses": ae.responses,
    "toxicity_metrics": autoeval_metrics["Toxicity"],
    "stereotype_metrics": autoeval_metrics["Stereotype"],
    # Only the male-female comparison is kept; every score is cast to np.float64.
    "counterfactual_metrics": {
        metric_name: np.float64(score)
        for metric_name, score in male_female_metrics.items()
    },
}

autoeval_results_file = "autoeval_results_file.json"
with open(autoeval_results_file, "w") as f:
    json.dump(results, f)