# Export LangSmith Dataset tests and aggregate by metadata

The goal is to export the test runs recorded against a specific dataset and plot their results over time, grouped by a pre-defined project metadata key.

In [None]:
!pip install langsmith matplotlib

In [None]:
from getpass import getpass
import os
# Gather the interactive inputs for the rest of the notebook.
# getpass() is used for the API key so that it is not echoed while typing; the
# value is stored in the process environment under LANGCHAIN_API_KEY.
os.environ.update({"LANGCHAIN_API_KEY": getpass("LangSmith API Key: ")})
# Name of the dataset whose examples (and their runs) will be exported.
dataset_name = input("Dataset name: ")
# Key looked up in each project's metadata; its value is used to group runs.
metadata_key = input("Project metadata key: ")

In [None]:
import os
import dataclasses
import datetime

from dataclasses import dataclass
from typing import List
from langsmith import Client

@dataclass
class RunStats:
    """Statistics recorded for one run of a dataset example."""

    # End time of the run; used as the x-axis value when plotting.
    # The file does `import datetime` (the module), so the class has to be
    # spelled `datetime.datetime` for the annotation to name a type.
    ts: datetime.datetime
    # Duration of the run in seconds (end time minus start time).
    latency: float
    # Total token count reported for the run.
    total_tokens: int
    # Feedback key -> feedback score attached to the run.
    feedbacks: dict[str, float]

# NOTE(review): not read anywhere in the visible cells; kept in case another
# cell relies on it.
use_existing = ""

client = Client()

# Metadata value -> list of RunStats for the runs whose project carries that
# value under `metadata_key` ("" when the project does not define the key).
runs_by_key = {}
# Every feedback key seen on any collected run.
feedback_keys = set()
# Project (session) id -> grouping key, so each project is fetched only once
# instead of once per run.
key_by_project = {}

for example in client.list_examples(dataset_name=dataset_name):
    for run in client.list_runs(reference_example_id=example.id):
        # Only the first run returned for the trace id is used.
        t = next(iter(client.list_runs(run_ids=[run.trace_id])), None)
        if t is None or t.start_time is None or t.end_time is None:
            # Without both timestamps there is neither a latency nor an x-axis
            # position for this run. Recording a placeholder would mix
            # non-datetime values into the timestamps and break the sorting
            # done when plotting, so the run is skipped.
            continue

        feedbacks = {}
        for f in client.list_feedback(run_ids=[t.id]):
            feedbacks[f.key] = f.score
            feedback_keys.add(f.key)

        if run.session_id not in key_by_project:
            pro = client.read_project(project_id=run.session_id)
            # `extra` (or its "metadata" entry) may be missing or None.
            metadata = (pro.extra or {}).get("metadata") or {}
            key_by_project[run.session_id] = metadata.get(metadata_key, "")
        key = key_by_project[run.session_id]

        runs_by_key.setdefault(key, []).append(
            RunStats(
                ts=t.end_time,
                latency=(t.end_time - t.start_time).total_seconds(),
                total_tokens=t.total_tokens,
                feedbacks=feedbacks,
            )
        )

print("Completed")
print(str(runs_by_key))
# Sorted so that the order of the feedback subplots is stable between executions.
feedback_set = sorted(feedback_keys)

In [None]:
import matplotlib.pyplot as main_plt
import math
main_plt.rcParams["figure.figsize"] = (12, 12)
import matplotlib.dates as mdates

# One subplot per metric: the two built-in metrics followed by every feedback
# key that was collected. Subplots are laid out in a two-column grid.
metrics = ["latency", "total_tokens"] + list(feedback_set)
fig, axs = main_plt.subplots(math.ceil(len(metrics) / 2), 2)
axes = list(axs.flat)

# Axis label / title for the built-in metrics; feedback keys are used verbatim.
labels = {"latency": "Latency (s)", "total_tokens": "Total Tokens"}

# `metric` and `ax` are used instead of `type` and `plt` so that the builtin
# `type` is not shadowed for the rest of the notebook session.
for ax, metric in zip(axes, metrics):
    all_ts = []
    # One line per metadata value, with the runs ordered by timestamp.
    for key, values in runs_by_key.items():
        values = sorted(values, key=lambda v: v.ts)
        if metric == "latency":
            y = [run.latency for run in values]
        elif metric == "total_tokens":
            y = [run.total_tokens for run in values]
        else:
            # Runs without this feedback key are plotted as -1.
            y = [run.feedbacks.get(metric, -1) for run in values]
        x = [run.ts for run in values]
        all_ts.extend(x)
        ax.plot(x, y, '-', lw=2, label=key)

    ylabel = labels.get(metric, metric)
    ax.set(xlabel="Time", ylabel=ylabel, title=ylabel)

    if not all_ts:
        # Nothing was plotted, so there is no time range to clamp to and no
        # legend entries to show.
        continue

    # Clamp the x-axis to the overall time range (seconds zeroed) and label
    # only its two ends.
    min_ts = min(all_ts).replace(second=0)
    max_ts = max(all_ts).replace(second=0)

    ax.set_xlim([min_ts, max_ts])
    ax.set_xticks([min_ts, max_ts])

    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
    ax.legend()

# With an odd number of metrics the last grid cell has no data; hide it rather
# than leaving an empty frame.
for unused_ax in axes[len(metrics):]:
    unused_ax.set_visible(False)

fig.tight_layout()
main_plt.show()