# Analyzing Results with Zeno

This notebook shows how to use the Zeno library to analyze the results from our experiments.

To use the browser, first unzip the dataset:

```bash
cd data
tar -xzf flores200_dataset.tar.gz
cd ..
```

Then put the results in the `system_outputs` directory.

Run this notebook from start to finish.

Make sure that it ran without error, then travel to [http://localhost:8000](http://localhost:8000) to see the results.

In [1]:
import pandas as pd
import tarfile
import tempfile
import os
import csv
import evaluate
import math
from datasets import load_dataset

from zeno import MetricReturn, ZenoOptions, metric, distill, DistillReturn
from zeno_build.evaluation.text_features.length import input_length, output_length
from zeno_build.evaluation.text_metrics.critique import (
    avg_bert_score,
    bert_score,
)
from zeno_build.experiments.experiment_run import ExperimentRun
from zeno_build.reporting.visualize import visualize


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Reference list kept from the original notebook (per its name, the target languages
# that have GPT-4 results). It is commented out and therefore not used below.
# trg_langs_with_gpt4_results = [
#     "lvs_Latn", "tpi_Latn", "ukr_Cyrl", "lim_Latn", "kat_Geor",
#     "tam_Taml", "mag_Deva", "hau_Latn", "fra_Latn", "acm_Arab",
#     "ssw_Latn", "kmr_Latn", "war_Latn", "ajp_Arab", "pbt_Arab",
#     "gle_Latn", "ron_Latn", "sna_Latn", "ckb_Arab", "ibo_Latn",
# ]

# Target-language codes analyzed in this run; every later cell iterates over this list.
trg_langs = [
    "ssw_Latn",
]

In [5]:
# Load the FLORES-200 devtest split: the English source sentences and, for every
# language in `trg_langs`, the reference translations.
#
# The archive `data/flores200_dataset.tar.gz` must already be extracted (see the setup
# instructions at the top of the notebook).

# Directory that contains the extracted `flores200_dataset` folder. Relative to the
# working directory instead of a user-specific absolute path, so the notebook runs on
# any checkout that followed the setup instructions.
tmpdirname = "data"
# Number of sentences expected in every `devtest.*` file (this is the devtest split,
# not the 997-sentence dev split).
FLORES_DEVTEST_SIZE = 1012

devtest_dir = os.path.join(tmpdirname, "flores200_dataset", "devtest")

# Read the English side once and validate it before repeating it per target language.
with open(os.path.join(devtest_dir, "devtest.eng_Latn"), "r", encoding="utf-8") as f:
    src_sentences = [x.strip() for x in f]
if len(src_sentences) != FLORES_DEVTEST_SIZE:
    raise ValueError(f"{len(src_sentences)=} != {FLORES_DEVTEST_SIZE=}")
# One copy of the source sentences per target language, in `trg_langs` order.
src_data = src_sentences * len(trg_langs)

# Read the references of each target language.
trg_data = []
trg_lang_data = []
for trg_lang in trg_langs:
    with open(os.path.join(devtest_dir, f"devtest.{trg_lang}"), "r", encoding="utf-8") as f:
        trg_sentences = [x.strip() for x in f]
    # Check each language separately: a total-only check would let two miscounted
    # files cancel out and silently misalign sources, references and language tags.
    if len(trg_sentences) != FLORES_DEVTEST_SIZE:
        raise ValueError(f"{trg_lang=}: {len(trg_sentences)=} != {FLORES_DEVTEST_SIZE=}")
    trg_data.extend(trg_sentences)
    trg_lang_data.extend([trg_lang] * len(trg_sentences))

# Create the pandas dataframe: one row per (target language, devtest sentence) pair.
df = pd.DataFrame({"source": src_data, "label": trg_data, "trg_lang": trg_lang_data})

In [7]:
def load_hyp_from_tsv(filename: str) -> list[str]:
    """Read the hypotheses stored in the third column of a tab-separated file.

    The first row is treated as a header and skipped. Completely blank rows are
    ignored.

    Args:
        filename: Path of the TSV file to read (decoded as UTF-8).

    Returns:
        The third-column value of every data row, in file order. An empty file
        yields an empty list.

    Raises:
        ValueError: If a non-blank data row has fewer than three columns.
    """
    # newline="" leaves newline handling to the csv module, as its documentation recommends.
    with open(filename, "r", encoding="utf-8", newline="") as f:
        reader = csv.reader(f, delimiter="\t")
        # Skip the header; a default avoids leaking StopIteration on an empty file.
        if next(reader, None) is None:
            return []
        hypotheses = []
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines (e.g. a trailing empty line).
                continue
            if len(row) < 3:
                raise ValueError(
                    f"{filename}:{reader.line_num}: expected at least 3 tab-separated "
                    f"columns but found {len(row)}"
                )
            hypotheses.append(row[2])
        return hypotheses

# Build one ExperimentRun per sub-directory of the system-outputs folder. Each
# sub-directory must contain, for every language in `trg_langs`, exactly one
# `*{trg_lang}.tsv` file or exactly one `*{trg_lang}-devtest.hyp` file.
experiment_runs = []
# Relative to the working directory instead of a user-specific absolute path.
# NOTE(review): the notebook intro mentions a `system_outputs` directory; confirm
# which directory name is intended.
system_outputs_path = "exp_outputs_tsv"
# Number of hypotheses every system must provide per language (FLORES-200 devtest size).
FLORES_DEVTEST_SIZE = 1012
# Sorted so the order of the runs does not depend on os.listdir's arbitrary ordering.
for subdir in sorted(os.listdir(system_outputs_path)):
    # The sub_directory must be a directory
    subdir_path = os.path.join(system_outputs_path, subdir)
    if not os.path.isdir(subdir_path):
        raise ValueError(f"{subdir_path=} is not a directory")
    # get all the files in the sub_directory
    all_files = os.listdir(subdir_path)
    trg_results = []
    for trg_lang in trg_langs:
        tsv_files = [f for f in all_files if f.endswith(f"{trg_lang}.tsv")]
        hyp_files = [f for f in all_files if f.endswith(f"{trg_lang}-devtest.hyp")]
        if len(tsv_files) == 1:
            # Prefer the tsv file; hypotheses are in its third column.
            lang_results = load_hyp_from_tsv(os.path.join(subdir_path, tsv_files[0]))
        elif len(hyp_files) == 1:
            # Fall back to the plain-text file with one hypothesis per line.
            with open(os.path.join(subdir_path, hyp_files[0]), "r", encoding="utf-8") as f:
                lang_results = [x.strip() for x in f]
        else:
            # Die
            raise ValueError(f"Expected tsv or hyp file with {trg_lang=} but found none in {subdir_path}")
        # Validate per language so the error names the offending system and language,
        # and so miscounts in two files cannot cancel out and misalign predictions.
        if len(lang_results) != FLORES_DEVTEST_SIZE:
            raise ValueError(
                f"{subdir_path}: {trg_lang=} {len(lang_results)=} != {FLORES_DEVTEST_SIZE=}"
            )
        trg_results.extend(lang_results)
    experiment_run = ExperimentRun(
        name=subdir,
        parameters={"name": subdir},
        predictions=trg_results,
    )
    experiment_runs.append(experiment_run)
    

In [9]:
# Load the "chrf" metric once; `chrf` below reuses this evaluator for every row.
chrf_evaluator = evaluate.load("chrf")


@distill
def chrf(df, ops: ZenoOptions):
    """Score every row of ``df`` individually with the chrF evaluator.

    Each value in ``df[ops.output_column]`` is scored on its own against the single
    reference found in the same row of ``df[ops.label_column]``.

    Returns:
        A ``DistillReturn`` whose ``distill_output`` is the list of per-row scores,
        in row order.
    """
    hypotheses = df[ops.output_column]
    references = df[ops.label_column]
    sentence_scores = []
    for hypothesis, reference in zip(hypotheses, references):
        # One prediction with one reference per call yields a sentence-level score.
        scored = chrf_evaluator.compute(predictions=[hypothesis], references=[[reference]])
        sentence_scores.append(scored["score"])
    return DistillReturn(distill_output=sentence_scores)

@metric
def avg_chrf(df, ops: ZenoOptions):
    """Average the per-row chrF scores stored in the ``chrf`` distill column.

    Returns:
        A ``MetricReturn`` holding the mean score, or 0 when the mean is
        null/NaN (for example when ``df`` has no rows).
    """
    mean_score = df[ops.distill_columns["chrf"]].mean()
    score_is_missing = pd.isnull(mean_score) or math.isnan(mean_score)
    return MetricReturn(metric=0 if score_is_missing else mean_score)

Downloading builder script: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9.01k/9.01k [00:00<00:00, 43.2MB/s]


In [10]:
# Feature (distill) and metric functions handed to Zeno. The BERTScore pair is
# imported at the top of the notebook but left disabled here.
functions = [
    output_length,
    input_length,
    chrf,
    avg_chrf,
    # bert_score,
    # avg_bert_score,
]

# Launch the Zeno report; the arguments are positional, so their order matters.
visualize(
    # dataframe with the "source", "label" and "trg_lang" columns built above
    df,
    # reference translations, in the same row order as `df`
    trg_data,
    # one ExperimentRun per system-output sub-directory
    experiment_runs,
    # presumably the Zeno view used to render each instance — TODO confirm
    "text-classification",
    # name of the `df` column holding the English source text
    "source",
    functions,
    # directory name given to Zeno as its cache path
    zeno_config={"cache_path": "zeno_cache"},
    
)
     

tt-zero {'name': 'tt-zero'}

[1mZeno[0m running on http://localhost:8000
Running predistill functions



preprocessing input_length: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 248.29it/s]


Running inference


Inference on tt-zero: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 257.73it/s]


Running postdistill functions


postprocessing output_length on tt-zero: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 447.30it/s]

postprocessing chrf on tt-zero:   0%|                                                                                                                                                      | 0/1 [00:00<?, ?it/s][A
postprocessing chrf on tt-zero: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:04<00:00,  4.03s/it][A


Done processing


  filt_df.groupby([pd.cut(filt_df[str(col)], bucs)])  # type: ignore
  filt_df.groupby([pd.cut(filt_df[str(col)], bucs)])  # type: ignore
  filt_df.groupby([pd.cut(filt_df[str(col)], bucs)])  # type: ignore
  filt_df.groupby([pd.cut(filt_df[str(col)], bucs)])  # type: ignore
  filt_df.groupby([pd.cut(filt_df[str(col)], bucs)])  # type: ignore
  filt_df.groupby([pd.cut(filt_df[str(col)], bucs)])  # type: ignore
  filt_df.groupby([pd.cut(filt_df[str(col)], bucs)])  # type: ignore
  filt_df.groupby([pd.cut(filt_df[str(col)], bucs)])  # type: ignore
  filt_df.groupby([pd.cut(filt_df[str(col)], bucs)])  # type: ignore
  filt_df.groupby([pd.cut(filt_df[str(col)], bucs)])  # type: ignore
  filt_df.groupby([pd.cut(filt_df[str(col)], bucs)])  # type: ignore
  filt_df.groupby([pd.cut(filt_df[str(col)], bucs)])  # type: ignore
  filt_df.groupby([pd.cut(filt_df[str(col)], bucs)])  # type: ignore
  filt_df.groupby([pd.cut(filt_df[str(col)], bucs)])  # type: ignore
  filt_df.groupby([pd.cut(filt_df[