# Import the Zenbase Library

In [1]:
import sys
import subprocess

def install_package(package):
    try:
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])
    except subprocess.CalledProcessError as e:
        print(f"Failed to install {package}: {e}")
        raise

def install_packages(packages):
    for package in packages:
        install_package(package)

try:
    # Check if running in Google Colab
    import google.colab
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

if IN_COLAB:
    # Install the zenbase package if running in Google Colab
    # install_package('zenbase')
    # Install the zenbse package from a GitHub branch if running in Google Colab
    install_package('git+https://github.com/zenbase-ai/lib.git@main#egg=zenbase&subdirectory=py')

    # List of other packages to install in Google Colab
    additional_packages = [
        'python-dotenv',
        'arize-phoenix[evals]',
        'openai',
        'langchain',
        'langchain_openai'
    ]
    
    # Install additional packages
    install_packages(additional_packages)

# Now import the zenbase library
try:
    import zenbase
except ImportError as e:
    print("Failed to import zenbase: ", e)
    raise

# Configure the Environment

In [2]:
from pathlib import Path
from dotenv import load_dotenv

# import os
#
# os.environ["OPENAI_API_KEY"] = "..."
# os.environ["LANGFUSE_HOST"] = "..."
# os.environ["LANGFUSE_PUBLIC_KEY"] = "..."
# os.environ["LANGFUSE_SECRET_KEY"] = "..."

load_dotenv(Path("../../.env.test"), override=True)

True

In [3]:
import nest_asyncio

nest_asyncio.apply()

# Initial Setup

In [4]:
# initiate the phoenix app
import phoenix as px
px.launch_app()
# initiate the phoenix client
arize_phoenix = px.Client()

🌍 To view the Phoenix app in your browser, visit http://localhost:6006/
📖 For more information on how to use Phoenix, check out https://docs.arize.com/phoenix


In [5]:
from openai import OpenAI
openai = OpenAI()

In [24]:
from zenbase.utils import ksuid
from zenbase.adaptors.arize import ZenArizeAdaptor
# setup datasets
import datasets
gsm8k_dataset = datasets.load_dataset("gsm8k", "main")
zen_arize_adaptor = ZenArizeAdaptor(arize_phoenix)
TESTSET_SIZE = 2
TRAINSET_SIZE = 5
VALIDATIONSET_SIZE = 2


def create_dataset_with_examples(zen_arize_adaptor: ZenArizeAdaptor, prefix: str, item_set: list) -> str:
    dataset_name = ksuid(prefix=prefix)

    inputs = [{"question": example["question"]} for example in item_set]
    expected_outputs = [{"answer": example["answer"]} for example in item_set]
    zen_arize_adaptor.add_examples_to_dataset(dataset_name, inputs, expected_outputs)
    return dataset_name

train_set = create_dataset_with_examples(
        zen_arize_adaptor,
        "GSM8K_train_set_parea_dataset",
        list(gsm8k_dataset["train"].select(range(TRAINSET_SIZE))),
    )

validation_set = create_dataset_with_examples(
        zen_arize_adaptor,
        "GSM8K_validation_set_parea_dataset",
        list(gsm8k_dataset["train"].select(range(TRAINSET_SIZE + 1, TRAINSET_SIZE + VALIDATIONSET_SIZE + 1))),
    )

test_set = create_dataset_with_examples(
        zen_arize_adaptor,
        "GSM8K_test_set_parea_dataset",
        list(gsm8k_dataset["test"].select(range(TESTSET_SIZE))),
    )

📤 Uploading dataset...
💾 Examples uploaded: http://localhost:6006/datasets/RGF0YXNldDo0/examples
🗄️ Dataset version ID: RGF0YXNldFZlcnNpb246NA==
📤 Uploading dataset...
💾 Examples uploaded: http://localhost:6006/datasets/RGF0YXNldDo1/examples
🗄️ Dataset version ID: RGF0YXNldFZlcnNpb246NQ==
📤 Uploading dataset...
💾 Examples uploaded: http://localhost:6006/datasets/RGF0YXNldDo2/examples
🗄️ Dataset version ID: RGF0YXNldFZlcnNpb246Ng==


# What you already have should look like below:

## Your OpenAI Call should look like this with LangChain (It could be with OpenAI too, doesn't matter)

In [25]:
def langchain_chain(inputs) -> str:
    from langchain_core.output_parsers import StrOutputParser
    from langchain_core.prompts import ChatPromptTemplate
    from langchain_openai import ChatOpenAI

    messages = [
        (
            "system",
            "You are an expert math solver. Your answer must be just the number with no separators, and nothing else. Follow the format of the examples.",  # noqa
        )
    ]
    messages.append(("user", "{question}"))

    chain = ChatPromptTemplate.from_messages(messages) | ChatOpenAI(model="gpt-3.5-turbo") | StrOutputParser()

    print("Mathing...")
    answer = chain.invoke(inputs["inputs"])
    return answer

## Your Scoring Function should look like this:

In [28]:
def score_answer(output: str, expected: dict):
    print(output, expected["outputs"])
    """The first argument is the return value from the `langchain_chain` function above."""
    score = int(output == expected["outputs"]["answer"].split("#### ")[-1])
    return score

## Your Evaluation should look like this:

In [29]:
from phoenix.experiments import run_experiment

experiment = run_experiment(
                arize_phoenix.get_dataset(name=test_set),
                langchain_chain,
                experiment_name="Experiment-Name",
                evaluators=[score_answer],
            )

🧪 Experiment started.
📺 View dataset experiments: http://localhost:6006/datasets/RGF0YXNldDo2/experiments
🔗 View this experiment: http://localhost:6006/datasets/RGF0YXNldDo2/compare?experimentId=RXhwZXJpbWVudDo3


running tasks |          | 0/2 (0.0%) | ⏳ 00:00<? | ?it/s

Mathing...
Mathing...
✅ Task runs completed.
🧠 Evaluation started.


running experiment evaluations |          | 0/2 (0.0%) | ⏳ 00:00<? | ?it/s

16 - 3 - 4 = 9.  
9 * 2 = 18.  
Therefore, Janet makes 18 dollars every day at the farmers' market. {'answer': 'Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.\nShe makes 9 * 2 = $<<9*2=18>>18 every day at the farmer’s market.\n#### 18'}
3 {'answer': 'It takes 2/2=<<2/2=1>>1 bolt of white fiber\nSo the total amount of fabric is 2+1=<<2+1=3>>3 bolts of fabric\n#### 3'}

🔗 View this experiment: http://localhost:6006/datasets/RGF0YXNldDo2/compare?experimentId=RXhwZXJpbWVudDo3

Experiment Summary (07/11/24 01:44 PM -0600)
--------------------------------------------
      evaluator  n  n_scores  avg_score
0  score_answer  2         2        0.5

Tasks Summary (07/11/24 01:44 PM -0600)
---------------------------------------
   n_examples  n_runs  n_errors
0           2       2         0


# How you should do the few-shot learning

## Rewrite your langchain_chain function to use the `zenbase` decorators

In [30]:
from zenbase.core.managers import ZenbaseTracer

zenbase_tracer = ZenbaseTracer()


In [44]:
from zenbase.types import LMRequest, LMDemo
@zenbase_tracer
def zen_chain(request: LMRequest) -> str:
    print(request)
    from langchain_core.output_parsers import StrOutputParser
    from langchain_core.prompts import ChatPromptTemplate
    from langchain_openai import ChatOpenAI

    messages = [
        (
            "system",
            "You are an expert math solver. Your answer must be just the number with no separators, and nothing else. Follow the format of the examples.",  # noqa
        )
    ]
    for demo in request.zenbase.task_demos:
        messages += [
            ("user", demo.inputs["question"]),
            ("assistant", demo.outputs["answer"]),
        ]

    messages.append(("user", "{question}"))

    chain = ChatPromptTemplate.from_messages(messages) | ChatOpenAI(model="gpt-3.5-turbo") | StrOutputParser()

    print("Mathing...")
    answer = chain.invoke(request.inputs["inputs"])
    return answer


In [45]:
return_langchain = zen_chain({"inputs": {"question": "What is 2 + 2?"}})


LMRequest(zenbase=LMZenbase(task_demos=[], model_params={}), inputs={'inputs': {'question': 'What is 2 + 2?'}}, id='request_2j79Hit9mbvep0fAiAISWUiNoJT')
Mathing...


## Make your evaluation function to use the langfuse from the LMDemo and the langfuse from the Zenbase

In [46]:
def score_answer(output: str, expected: dict):
    print(output, expected["outputs"])
    """The first argument is the return value from the `langchain_chain` function above."""
    # if there is any #### in the output
    if "####" in expected["outputs"]["answer"]:
        output = output.split("#### ")[-1]

    score = int(output == expected["outputs"]["answer"].split("#### ")[-1])
    return score

## Optimize the few-shot learning

### Define your optimizer:


In [47]:
from zenbase.optim.metric.labeled_few_shot import LabeledFewShot

optimizer = LabeledFewShot(
    demoset=zen_arize_adaptor.fetch_dataset_demos(train_set), ## The dataset to use for the few-shot learning and training
    shots=3,
)

### Perform the optimization

In [49]:
best_fn, candidate_results, _ = optimizer.perform(
    zen_chain,
    evaluator=zen_arize_adaptor.metric_evaluator(
        dataset=arize_phoenix.get_dataset(name=test_set), evaluators=[score_answer]
    ),
    samples=4,
    concurrency=1,
    rounds=1,
)

🧪 Experiment started.
📺 View dataset experiments: http://localhost:6006/datasets/RGF0YXNldDo2/experiments
🔗 View this experiment: http://localhost:6006/datasets/RGF0YXNldDo2/compare?experimentId=RXhwZXJpbWVudDoyMA==


running tasks |          | 0/2 (0.0%) | ⏳ 00:00<? | ?it/s

LMRequest(zenbase=LMZenbase(task_demos=(LMDemo(inputs={'inputs': {'question': 'Julie is reading a 120-page book. Yesterday, she was able to read 12 pages and today, she read twice as many pages as yesterday. If she wants to read half of the remaining pages tomorrow, how many pages should she read?'}}, outputs={'outputs': {'answer': 'Maila read 12 x 2 = <<12*2=24>>24 pages today.\nSo she was able to read a total of 12 + 24 = <<12+24=36>>36 pages since yesterday.\nThere are 120 - 36 = <<120-36=84>>84 pages left to be read.\nSince she wants to read half of the remaining pages tomorrow, then she should read 84/2 = <<84/2=42>>42 pages.\n#### 42'}}, adaptor_object=None), LMDemo(inputs={'inputs': {'question': 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?'}}, outputs={'outputs': {'answer': 'Natalia sold 48/2 = <<48/2=24>>24 clips in May.\nNatalia sold 48+24 = <<48+24=72>>72 clips alto

running experiment evaluations |          | 0/2 (0.0%) | ⏳ 00:00<? | ?it/s

None {'answer': 'Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.\nShe makes 9 * 2 = $<<9*2=18>>18 every day at the farmer’s market.\n#### 18'}
[91mTraceback (most recent call last):
  File "/Users/amir/workspace/zenbase-lib/py/.venv/lib/python3.10/site-packages/phoenix/experiments/functions.py", line 560, in async_evaluate_run
    result = await evaluator.async_evaluate(
  File "/Users/amir/workspace/zenbase-lib/py/.venv/lib/python3.10/site-packages/phoenix/experiments/evaluators/base.py", line 76, in async_evaluate
    return self.evaluate(
  File "/Users/amir/workspace/zenbase-lib/py/.venv/lib/python3.10/site-packages/phoenix/experiments/evaluators/utils.py", line 149, in evaluate
    result = func(*bound_signature.args, **bound_signature.kwargs)
  File "/var/folders/47/_rbxm97j0kl5nm_6vv2m8h540000gn/T/ipykernel_52718/2607286871.py", line 6, in score_answer
    output = output.split("#### ")[-1]
AttributeError: 'NoneType' object has no attribute 'split'

The above exception 

running tasks |          | 0/2 (0.0%) | ⏳ 00:00<? | ?it/s

LMRequest(zenbase=LMZenbase(task_demos=(LMDemo(inputs={'inputs': {'question': 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?'}}, outputs={'outputs': {'answer': 'Natalia sold 48/2 = <<48/2=24>>24 clips in May.\nNatalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.\n#### 72'}}, adaptor_object=None), LMDemo(inputs={'inputs': {'question': 'Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?'}}, outputs={'outputs': {'answer': 'Weng earns 12/60 = $<<12/60=0.2>>0.2 per minute.\nWorking 50 minutes, she earned 0.2 x 50 = $<<0.2*50=10>>10.\n#### 10'}}, adaptor_object=None), LMDemo(inputs={'inputs': {'question': 'James writes a 3-page letter to 2 different friends twice a week.  How many pages does he write a year?'}}, outputs={'outputs': {'answer': 'He writes each friend 3*2=<<3*2=6>>6 pages a week\nSo he writ

running experiment evaluations |          | 0/2 (0.0%) | ⏳ 00:00<? | ?it/s

None {'answer': 'Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.\nShe makes 9 * 2 = $<<9*2=18>>18 every day at the farmer’s market.\n#### 18'}
[91mTraceback (most recent call last):
  File "/Users/amir/workspace/zenbase-lib/py/.venv/lib/python3.10/site-packages/phoenix/experiments/functions.py", line 560, in async_evaluate_run
    result = await evaluator.async_evaluate(
  File "/Users/amir/workspace/zenbase-lib/py/.venv/lib/python3.10/site-packages/phoenix/experiments/evaluators/base.py", line 76, in async_evaluate
    return self.evaluate(
  File "/Users/amir/workspace/zenbase-lib/py/.venv/lib/python3.10/site-packages/phoenix/experiments/evaluators/utils.py", line 149, in evaluate
    result = func(*bound_signature.args, **bound_signature.kwargs)
  File "/var/folders/47/_rbxm97j0kl5nm_6vv2m8h540000gn/T/ipykernel_52718/2607286871.py", line 6, in score_answer
    output = output.split("#### ")[-1]
AttributeError: 'NoneType' object has no attribute 'split'

The above exception 

running tasks |          | 0/2 (0.0%) | ⏳ 00:00<? | ?it/s

LMRequest(zenbase=LMZenbase(task_demos=(LMDemo(inputs={'inputs': {'question': 'James writes a 3-page letter to 2 different friends twice a week.  How many pages does he write a year?'}}, outputs={'outputs': {'answer': 'He writes each friend 3*2=<<3*2=6>>6 pages a week\nSo he writes 6*2=<<6*2=12>>12 pages every week\nThat means he writes 12*52=<<12*52=624>>624 pages a year\n#### 624'}}, adaptor_object=None), LMDemo(inputs={'inputs': {'question': 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?'}}, outputs={'outputs': {'answer': 'Natalia sold 48/2 = <<48/2=24>>24 clips in May.\nNatalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.\n#### 72'}}, adaptor_object=None), LMDemo(inputs={'inputs': {'question': 'Betty is saving money for a new wallet which costs $100. Betty has only half of the money she needs. Her parents decided to give her $15 for that purpose, and her g

running experiment evaluations |          | 0/2 (0.0%) | ⏳ 00:00<? | ?it/s

None {'answer': 'Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.\nShe makes 9 * 2 = $<<9*2=18>>18 every day at the farmer’s market.\n#### 18'}
[91mTraceback (most recent call last):
  File "/Users/amir/workspace/zenbase-lib/py/.venv/lib/python3.10/site-packages/phoenix/experiments/functions.py", line 560, in async_evaluate_run
    result = await evaluator.async_evaluate(
  File "/Users/amir/workspace/zenbase-lib/py/.venv/lib/python3.10/site-packages/phoenix/experiments/evaluators/base.py", line 76, in async_evaluate
    return self.evaluate(
  File "/Users/amir/workspace/zenbase-lib/py/.venv/lib/python3.10/site-packages/phoenix/experiments/evaluators/utils.py", line 149, in evaluate
    result = func(*bound_signature.args, **bound_signature.kwargs)
  File "/var/folders/47/_rbxm97j0kl5nm_6vv2m8h540000gn/T/ipykernel_52718/2607286871.py", line 6, in score_answer
    output = output.split("#### ")[-1]
AttributeError: 'NoneType' object has no attribute 'split'

The above exception 

running tasks |          | 0/2 (0.0%) | ⏳ 00:00<? | ?it/s

LMRequest(zenbase=LMZenbase(task_demos=(LMDemo(inputs={'inputs': {'question': 'Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?'}}, outputs={'outputs': {'answer': 'Weng earns 12/60 = $<<12/60=0.2>>0.2 per minute.\nWorking 50 minutes, she earned 0.2 x 50 = $<<0.2*50=10>>10.\n#### 10'}}, adaptor_object=None), LMDemo(inputs={'inputs': {'question': 'Julie is reading a 120-page book. Yesterday, she was able to read 12 pages and today, she read twice as many pages as yesterday. If she wants to read half of the remaining pages tomorrow, how many pages should she read?'}}, outputs={'outputs': {'answer': 'Maila read 12 x 2 = <<12*2=24>>24 pages today.\nSo she was able to read a total of 12 + 24 = <<12+24=36>>36 pages since yesterday.\nThere are 120 - 36 = <<120-36=84>>84 pages left to be read.\nSince she wants to read half of the remaining pages tomorrow, then she should read 84/2 = <<84/2=42>>42 pages.\n#### 42'}}, adaptor_object

running experiment evaluations |          | 0/2 (0.0%) | ⏳ 00:00<? | ?it/s

None {'answer': 'Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.\nShe makes 9 * 2 = $<<9*2=18>>18 every day at the farmer’s market.\n#### 18'}
[91mTraceback (most recent call last):
  File "/Users/amir/workspace/zenbase-lib/py/.venv/lib/python3.10/site-packages/phoenix/experiments/functions.py", line 560, in async_evaluate_run
    result = await evaluator.async_evaluate(
  File "/Users/amir/workspace/zenbase-lib/py/.venv/lib/python3.10/site-packages/phoenix/experiments/evaluators/base.py", line 76, in async_evaluate
    return self.evaluate(
  File "/Users/amir/workspace/zenbase-lib/py/.venv/lib/python3.10/site-packages/phoenix/experiments/evaluators/utils.py", line 149, in evaluate
    result = func(*bound_signature.args, **bound_signature.kwargs)
  File "/var/folders/47/_rbxm97j0kl5nm_6vv2m8h540000gn/T/ipykernel_52718/2607286871.py", line 6, in score_answer
    output = output.split("#### ")[-1]
AttributeError: 'NoneType' object has no attribute 'split'

The above exception 

### Use the best function

In [51]:
output = best_fn({"inputs":{"question": "What is 2+2?"}})
output

LMRequest(zenbase=LMZenbase(task_demos=[], model_params={}), inputs={'inputs': {'question': 'What is 2+2?'}}, id='request_2j7DLU9AD9JIwB44lPOp4yH1J2U')
Mathing...


'4'

### Save the best function

In [53]:
# You can also save the zenbase params for re-use
import pickle

pickled_zenbase = pickle.dumps(best_fn.zenbase)
zen_chain.zenbase = pickle.loads(pickled_zenbase)

zen_chain({"inputs":{"question": "What is 2+2?"}}) # uses the best few-shot demos

LMRequest(zenbase=LMZenbase(task_demos=[], model_params={}), inputs={'inputs': {'question': 'What is 2+2?'}}, id='request_2j7DNqKCZk1e4qH5wCRr5nqC7Ja')
Mathing...


'4'