Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 66 additions & 5 deletions evals/cli/oaieval.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,40 @@ def get_parser() -> argparse.ArgumentParser:
help="Path to the registry",
)
parser.add_argument("--debug", action=argparse.BooleanOptionalAction, default=False)
parser.add_argument("--local-run", action=argparse.BooleanOptionalAction, default=True)
parser.add_argument(
"--local-run",
action=argparse.BooleanOptionalAction,
default=True,
help="Enable local mode for running evaluations. In this mode, the evaluation results are stored locally in a JSON file. This mode is enabled by default.",
)

parser.add_argument(
"--http-run",
action=argparse.BooleanOptionalAction,
default=False,
help="Enable HTTP mode for running evaluations. In this mode, the evaluation results are sent to a specified URL rather than being stored locally or in Snowflake. This mode should be used in conjunction with the '--http-run-url' and '--http-batch-size' arguments.",
)

parser.add_argument(
"--http-run-url",
type=str,
default=None,
help="URL to send the evaluation results when in HTTP mode. This option should be used in conjunction with the '--http-run' flag.",
)

parser.add_argument(
"--http-batch-size",
type=int,
default=100,
help="Number of events to send in each HTTP request when in HTTP mode. Default is 1, i.e., send events individually. Set to a larger number to send events in batches. This option should be used in conjunction with the '--http-run' flag.",
)
parser.add_argument(
"--http-fail-percent-threshold",
type=int,
default=5,
help="The acceptable percentage threshold of HTTP requests that can fail. Default is 5, meaning 5% of total HTTP requests can fail without causing any issues. If the failure rate goes beyond this threshold, suitable action should be taken or the process will be deemed as failing, but still stored locally.",
)

parser.add_argument("--dry-run", action=argparse.BooleanOptionalAction, default=False)
parser.add_argument("--dry-run-logging", action=argparse.BooleanOptionalAction, default=True)
return parser
Expand All @@ -69,6 +102,10 @@ class OaiEvalArguments(argparse.Namespace):
registry_path: Optional[str]
debug: bool
local_run: bool
http_run: bool
http_run_url: Optional[str]
http_batch_size: int
http_fail_percent_threshold: int
dry_run: bool
dry_run_logging: bool

Expand Down Expand Up @@ -122,13 +159,38 @@ def run(args: OaiEvalArguments, registry: Optional[Registry] = None) -> str:
else:
record_path = args.record_path

if args.http_run:
args.local_run = False
elif args.local_run:
args.http_run = False

recorder: evals.record.RecorderBase
recorder_kwargs = []
if args.dry_run:
recorder = evals.record.DummyRecorder(run_spec=run_spec, log=args.dry_run_logging)
recorder_class = evals.record.DummyRecorder
recorder_args = {"run_spec": run_spec, "log": args.dry_run_logging}
elif args.local_run:
recorder = evals.record.LocalRecorder(record_path, run_spec=run_spec)
recorder_class = evals.record.LocalRecorder
recorder_args = {"run_spec": run_spec}
recorder_kwargs = [record_path]
elif args.http_run:
if args.http_run_url is None:
raise ValueError("URL must be specified when using http-run mode")
recorder_class = evals.record.HttpRecorder
recorder_args = {
"url": args.http_run_url,
"run_spec": run_spec,
"batch_size": args.http_batch_size,
"fail_percent_threshold": args.http_fail_percent_threshold,
"local_fallback_path": record_path,
}

else:
recorder = evals.record.Recorder(record_path, run_spec=run_spec)
recorder_class = evals.record.Recorder
recorder_args = {"run_spec": run_spec}
recorder_kwargs = [record_path]

recorder = recorder_class(*recorder_kwargs, **recorder_args)

api_extra_options: dict[str, Any] = {}
if not args.cache:
Expand Down Expand Up @@ -190,7 +252,6 @@ def main() -> None:
)
logging.getLogger("openai").setLevel(logging.WARN)

# TODO)) why do we need this?
if hasattr(openai.error, "set_display_cause"): # type: ignore
openai.error.set_display_cause() # type: ignore
run(args)
Expand Down
95 changes: 95 additions & 0 deletions evals/record.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from typing import Any, List, Optional, Sequence

import blobfile as bf
import requests

import evals
from evals.base import RunSpec
Expand Down Expand Up @@ -341,6 +342,100 @@ def record_final_report(self, final_report: Any):
logging.info(f"Final report: {final_report}. Logged to {self.event_file_path}")


class HttpRecorder(RecorderBase):
    """Recorder that POSTs evaluation events to an HTTP endpoint.

    Events are delivered in batches of ``batch_size``. Failed deliveries are
    tallied; once the fraction of failed events exceeds
    ``fail_percent_threshold`` percent, delivery is abandoned and events are
    persisted locally via a ``LocalRecorder`` writing to
    ``local_fallback_path``.
    """

    def __init__(
        self,
        url: str,
        run_spec: RunSpec,
        local_fallback_path: str,
        fail_percent_threshold: int = 5,
        batch_size: int = 100,
    ):
        super().__init__(run_spec)
        self.url = url
        self.batch_size = batch_size
        # Stored as a fraction in [0, 1] so it can be compared directly
        # against failed_requests / total_events below.
        self.fail_percent_threshold = fail_percent_threshold / 100
        # Running count of events (not requests) that failed to send.
        self.failed_requests = 0
        self.local_fallback_path = local_fallback_path
        # Fallback recorder used whenever HTTP delivery is deemed failed.
        self.local_fallback_recorder = LocalRecorder(local_fallback_path, run_spec)
        logger.info(f"HttpRecorder initialized with URL {self.url}")

    def _flush_events_internal(self, events_to_write: Sequence[Event]):
        """Send pending events in batches; on failure, persist the failing
        batch locally and re-raise so the caller knows delivery degraded."""
        batch_size = self.batch_size
        for i in range(0, len(events_to_write), batch_size):
            batch = list(events_to_write[i : i + batch_size])
            try:
                self._send_event(batch)
            except RuntimeError as e:
                logger.error(f"Falling back to LocalRecorder due to error: {str(e)}")
                self.local_fallback_recorder._flush_events_internal(batch)
                raise RuntimeError(
                    "An error occurred when sending events. Your events have been saved locally using the Local recorder."
                )

    def _send_event(self, events: List[Event]):
        """POST a batch of events to ``self.url`` as a JSON array.

        Failures (non-2xx responses or request exceptions) increment the
        failed-event tally. Raises ``RuntimeError`` once the failure fraction
        exceeds the configured threshold.
        """
        # Convert the events to dictionaries for JSON serialization.
        events_dict = [dataclasses.asdict(event) for event in events]

        logger.debug(f"Sending events: {events_dict}")

        try:
            # Bound the request so a hung endpoint cannot stall the run forever.
            response = requests.post(self.url, json=events_dict, timeout=30)

            if response.ok:
                logger.debug("Events sent successfully")
            else:
                logger.warning(f"Failed to send events: {response.text}")
                # Count every event in the failed request, not just the request.
                self.failed_requests += len(events)

        except Exception as e:
            logger.warning(f"Failed to send events: {str(e)}")
            # Count every event in the failed request, not just the request.
            self.failed_requests += len(events)

        # Check whether the proportion of failed events exceeds the threshold.
        fail_threshold = self.fail_percent_threshold
        # Human-readable form for the error message.
        fail_threshold_str = str(fail_threshold * 100) + "%"

        # Guard against ZeroDivisionError: the final-report event is sent
        # without being appended to self._events, so _events may be empty.
        total_events = len(self._events)
        if total_events > 0 and self.failed_requests / total_events > fail_threshold:
            raise RuntimeError(
                "The proportion of failed events has exceeded the threshold of: "
                + fail_threshold_str
                + "."
                + " Falling back to LocalRecorder. "
                "You can modify this via the cli flag --http-fail-percent-threshold"
            )

    def record_final_report(self, final_report: Any):
        """Wrap the final report in a ``final_report`` event and send it,
        falling back to the local recorder on delivery failure."""
        report_event = Event(
            run_id=self.run_spec.run_id,
            event_id=len(self._events),
            sample_id=None,  # final reports are not tied to a specific sample
            type="final_report",
            data=final_report,
            created_by=self.run_spec.created_by,
            created_at=str(datetime.now(timezone.utc)),
        )

        # Send the final report event; use the module logger for consistency
        # with the rest of this class.
        try:
            self._send_event([report_event])
            logger.info(f"Final report: {final_report}.")
            logger.info(f"Data logged to: {self.url}")
        except RuntimeError as e:
            logger.error(f"Falling back to LocalRecorder due to error: {str(e)}")
            self.local_fallback_recorder.record_final_report(final_report)


class Recorder(RecorderBase):
"""
A recorder which logs events to Snowflake.
Expand Down