In [None]:
# Install pinned versions of s3fs and datasets
# (`%pip` installs into the environment of the running kernel).
%pip install s3fs==2024.5.0 datasets==2.20.0

In [1]:
# Reload imported modules automatically before each cell runs, so edits to
# local modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import lumigator_demo as ld
from datasets import load_dataset, load_from_disk
from IPython.display import clear_output
import time

In [3]:
# Team name: it is used below as the name of every submitted experiment,
# so that each group can track its own jobs more easily.
team_name = "testing_lumigator"

# Running the Experiment

## Dataset Preprocessing

### Loading data
The following dataset is already in the format that we need as input: 
- one field called `examples` containing the text to summarize
- one field called `ground_truth` containing the reference summaries to compare the models' outputs against

Although the data is a single CSV file with no predefined splits, we still specify `split="train"`, as `train` is the default split name for non-split datasets.

Note that you can load many different types of file formats in a similar way (see https://huggingface.co/docs/datasets/loading#local-and-remote-files)

In [72]:
# Read the local CSV file with the generic "csv" loader, then display it
# as a pandas DataFrame.
ds = load_dataset(
    "csv",
    data_files="dialogsum.csv",
    split="train",
)
ds.to_pandas()

Unnamed: 0,examples,ground_truth
0,"#Person1#: Hello, how are you doing today?\n#P...",#Person2# has trouble breathing. The doctor as...
1,#Person1#: Hey Jimmy. Let's go workout later t...,#Person1# invites Jimmy to go workout and pers...
2,#Person1#: I need to stop eating such unhealth...,#Person1# plans to stop eating unhealthy foods...
3,#Person1#: Do you believe in UFOs?\n#Person2#:...,#Person2# believes in UFOs and can see them in...
4,#Person1#: Did you go to school today?\n#Perso...,#Person1# didn't go to school today. #Person2#...
...,...,...
495,"#Person1#: Now that it's the new year, I've de...",#Person1# decides to stop smoking and come out...
496,"#Person1#: You married Joe, didn't you? \n#Per...",#Person1# thought #Person2# married Joe. #Pers...
497,#Person1#: How can I help you mam?\n#Person2#:...,#Person2#'s car makes noises. #Person1# thinks...
498,"#Person1#: Hello, Amazon's customer service. H...",#Person2# calls Amazon's customer service beca...


### Converting data

The following dataset *does not* contain fields in the expected format. The code below shows how you can convert them to make the dataset compatible.

In this specific case we load the `validation` split as it contains fewer samples (500 instead of 12.5K).

In [4]:
### PREPARING DATASET (WITH GROUND TRUTH) FOR LM-BUDDY JOB
# Identifier of the dataset to load; only its `validation` split is used.
dataset = "knkarthick/dialogsum"
ds = load_dataset(
    dataset,
    split="validation",
)

# Display the raw dataset before any column is removed or renamed.
ds.to_pandas()

Unnamed: 0,id,dialogue,summary,topic
0,dev_0,"#Person1#: Hello, how are you doing today?\n#P...",#Person2# has trouble breathing. The doctor as...,see a doctor
1,dev_1,#Person1#: Hey Jimmy. Let's go workout later t...,#Person1# invites Jimmy to go workout and pers...,do exercise
2,dev_2,#Person1#: I need to stop eating such unhealth...,#Person1# plans to stop eating unhealthy foods...,healthy foods
3,dev_3,#Person1#: Do you believe in UFOs?\n#Person2#:...,#Person2# believes in UFOs and can see them in...,UFOs and aliens
4,dev_4,#Person1#: Did you go to school today?\n#Perso...,#Person1# didn't go to school today. #Person2#...,go to school
...,...,...,...,...
495,dev_495,"#Person1#: Now that it's the new year, I've de...",#Person1# decides to stop smoking and come out...,the new year
496,dev_496,"#Person1#: You married Joe, didn't you? \n#Per...",#Person1# thought #Person2# married Joe. #Pers...,fall in love
497,dev_497,#Person1#: How can I help you mam?\n#Person2#:...,#Person2#'s car makes noises. #Person1# thinks...,noises
498,dev_498,"#Person1#: Hello, Amazon's customer service. H...",#Person2# calls Amazon's customer service beca...,a missing page


In [5]:
# Drop the fields that are not needed ("id", "topic") and rename the summary
# column to the expected "ground_truth" name.
ds = (
    ds
    .remove_columns(["id", "topic"])
    .rename_column("summary", "ground_truth")
)

In [6]:
# Rename the input text column to the expected "examples" name and convert the
# result to a pandas DataFrame (from here on `ds` is a DataFrame).
ds = ds.rename_column("dialogue", "examples").to_pandas()

### Extra code to add prompt to each sample:
### (this assumes you have not renamed the "dialogue" column yet, i.e. that
### only the `to_pandas()` conversion has been applied)
# def make_prompt(row):
#     prompt = """<s>[INST]Please summarize the following in at most two sentences:
# {text}[/INST]"""
#     return prompt.format(text=row.dialogue)

# ds['examples'] = ds.apply(make_prompt,axis=1)
# ds = ds.drop(columns=["dialogue"])

# Display the converted data.
ds

Unnamed: 0,examples,ground_truth
0,"#Person1#: Hello, how are you doing today?\n#P...",#Person2# has trouble breathing. The doctor as...
1,#Person1#: Hey Jimmy. Let's go workout later t...,#Person1# invites Jimmy to go workout and pers...
2,#Person1#: I need to stop eating such unhealth...,#Person1# plans to stop eating unhealthy foods...
3,#Person1#: Do you believe in UFOs?\n#Person2#:...,#Person2# believes in UFOs and can see them in...
4,#Person1#: Did you go to school today?\n#Perso...,#Person1# didn't go to school today. #Person2#...
...,...,...
495,"#Person1#: Now that it's the new year, I've de...",#Person1# decides to stop smoking and come out...
496,"#Person1#: You married Joe, didn't you? \n#Per...",#Person1# thought #Person2# married Joe. #Pers...
497,#Person1#: How can I help you mam?\n#Person2#:...,#Person2#'s car makes noises. #Person1# thinks...
498,"#Person1#: Hello, Amazon's customer service. H...",#Person2# calls Amazon's customer service beca...


### Saving the dataset

Once the data is converted, you can save the dataset locally so that it is ready for upload.

In [7]:
# File name of the converted dataset; it is also used later in the
# experiment descriptions.
dataset_name = "dialogsum_converted.csv"
# Write the converted data to a local CSV file, without the index column.
ds.to_csv(dataset_name, index=False)

## Dataset Upload

In [8]:
# To upload the locally saved file and get the id of the new dataset,
# uncomment the two lines below:
# r = ld.dataset_upload(dataset_name)
# dataset_id = ld.get_resource_id(r)

# Ids of datasets that were uploaded previously.
# NOTE(review): these UUIDs are hard-coded and presumably specific to the
# deployment this notebook was written against — confirm they exist on yours,
# or use the upload lines above instead.
dataset_id = "7f514028-766e-4445-8241-369f142951e2" # dialogsum_converted
# dataset_id = "e4c78c34-1869-45b4-8e73-f0aa8e44741b" # dialogsum_converted_prompts

### Check dataset info

At any point, one can get dataset info by just providing its UUID:

In [9]:
# Request the info of the dataset identified by `dataset_id`;
# the response is kept in `r`.
r = ld.dataset_info(dataset_id)

{
  "id": "7f514028-766e-4445-8241-369f142951e2",
  "filename": "dialogsum_converted.csv",
  "format": "experiment",
  "size": 430961,
  "created_at": "2024-07-26T12:23:43.870732Z"
}


## Model Choice

For now we just provide models as a list; later we'll have an API endpoint that returns models for a given task (still as a list).

In [10]:
# Candidate models, grouped by family. The prefix of each entry selects where
# the model comes from: "hf://" entries are Hugging Face model ids, "oai://"
# entries are OpenAI model names.

# Encoder-decoder summarization models.
enc_dec_models = [
    "hf://facebook/bart-large-cnn",
    "hf://mikeadimech/longformer-qmsum-meeting-summarization",
    "hf://mrm8488/t5-base-finetuned-summarize-news",
    "hf://Falconsai/text_summarization",
]

# Decoder-only models.
dec_models = [
    "hf://mistralai/Mistral-7B-Instruct-v0.3",
    "hf://meta-llama/Meta-Llama-3-8B",
    "hf://microsoft/Phi-3-mini-4k-instruct",
]

# GPT models.
gpts = [
    "oai://gpt-4o-mini",
    "oai://gpt-4-turbo",
    "oai://gpt-3.5-turbo-0125",
]

# Alternative selections (uncomment exactly one to replace the default below):
# models = enc_dec_models + dec_models
# models = [gpts[1]]
# models = enc_dec_models
# models = [dec_models[0]]
# models = [
#     "hf://mistralai/Mistral-7B-Instruct-v0.3",
# ]

# Default selection: one model from each family.
models = [
    "hf://facebook/bart-large-cnn",
    "hf://mistralai/Mistral-7B-Instruct-v0.3",
    "oai://gpt-4-turbo",
]

In [11]:
# Display the list of models selected for evaluation.
models

['hf://facebook/bart-large-cnn',
 'hf://mistralai/Mistral-7B-Instruct-v0.3',
 'oai://gpt-4-turbo']

## Run Evaluations

In [18]:
# Maximum number of samples evaluated per experiment;
# change it to 0 to use all samples in the dataset.
max_samples = 10

# Submit one experiment per model. Every experiment is named after the team
# and described by the model / dataset pair; the responses are collected so
# that the job ids can be extracted from them afterwards.
responses = []
for model in models:
    response = ld.experiments_submit(
        model,
        team_name,
        f"Testing {model} summarization model on {dataset_name}",
        dataset_id,
        max_samples,
    )
    responses.append(response)

{
  "id": "7a7452cf-8f7a-40c7-904c-0b048e5a9801",
  "name": "testing_lumigator",
  "description": "Testing hf://facebook/bart-large-cnn summarization model on dialogsum_converted.csv",
  "status": "created",
  "created_at": "2024-07-26T13:38:51.027861Z",
  "updated_at": null
}
{
  "id": "7ee2d8e0-63a9-4f30-95bc-96a2fc588bc4",
  "name": "testing_lumigator",
  "description": "Testing hf://mistralai/Mistral-7B-Instruct-v0.3 summarization model on dialogsum_converted.csv",
  "status": "created",
  "created_at": "2024-07-26T13:38:51.603253Z",
  "updated_at": null
}
{
  "id": "a188c2ad-32a5-4eba-bae2-7d5e68dbe92f",
  "name": "testing_lumigator",
  "description": "Testing oai://gpt-4-turbo summarization model on dialogsum_converted.csv",
  "status": "created",
  "created_at": "2024-07-26T13:38:52.169213Z",
  "updated_at": null
}


### Track evaluation jobs

You can track the jobs at the following URLs:

In [19]:
# Extract the job id from each submission response.
job_ids = list(map(ld.get_resource_id, responses))

# Print one Ray job link per submitted experiment.
for job_id in job_ids:
    ray_link = ld.get_ray_link(job_id, ld.RAY_SERVER_URL)
    print(ray_link)

http://10.144.20.102:8265/#/jobs/7a7452cf-8f7a-40c7-904c-0b048e5a9801
http://10.144.20.102:8265/#/jobs/7ee2d8e0-63a9-4f30-95bc-96a2fc588bc4
http://10.144.20.102:8265/#/jobs/a188c2ad-32a5-4eba-bae2-7d5e68dbe92f


... or you can run the following cell to keep watching them until they are all completed:

In [20]:
# Keep showing the job statuses until no job is in progress any more.
# The return value of `show_experiment_statuses` is used as a "work in
# progress" flag — presumably truthy while at least one job has not finished
# (TODO confirm against lumigator_demo).
POLL_INTERVAL_SECONDS = 5  # pause between two consecutive status checks

wip = ld.show_experiment_statuses(job_ids)
while wip:  # plain truthiness test instead of `== True`
    time.sleep(POLL_INTERVAL_SECONDS)
    # wait=True delays the clearing until the new statuses are printed,
    # which avoids the output area flickering at every refresh
    clear_output(wait=True)
    wip = ld.show_experiment_statuses(job_ids)

7a7452cf-8f7a-40c7-904c-0b048e5a9801: SUCCEEDED
7ee2d8e0-63a9-4f30-95bc-96a2fc588bc4: SUCCEEDED
a188c2ad-32a5-4eba-bae2-7d5e68dbe92f: SUCCEEDED


## See / Compare Evaluation Results

In [21]:
# Once the jobs have completed, download the evaluation results of each one
# (same order as `job_ids`).
eval_results = [ld.experiments_result_download(job_id) for job_id in job_ids]

# Turn the per-job results into a single pandas dataframe.
eval_table = ld.eval_results_to_table(models, eval_results)

In [22]:
# Display the comparison table built from the evaluation results.
eval_table

Unnamed: 0,Model,Meteor,BERT Precision,BERT Recall,BERT F1,ROUGE-1,ROUGE-2,ROUGE-L
0,facebook/bart-large-cnn,0.327604,0.867442,0.864527,0.865518,0.291195,0.140441,0.232568
1,mistralai/Mistral-7B-Instruct-v0.3,0.28717,0.798274,0.877465,0.835923,0.126657,0.045201,0.103069
2,gpt-4-turbo,0.277295,0.82732,0.870236,0.847986,0.146265,0.012322,0.094301


In [23]:
# Inspect the raw results of the third job (index 2 of `job_ids`).
eval_results[2]

{'bertscore': {'precision': [0.8324013352394104,
   0.8265461921691895,
   0.8362937569618225,
   0.8076360821723938,
   0.822287917137146,
   0.8309663534164429,
   0.824184000492096,
   0.8419346809387207,
   0.8172837495803833,
   0.8336615562438965],
  'recall': [0.883567750453949,
   0.8466815948486328,
   0.8746528029441833,
   0.8509311079978943,
   0.8974119424819946,
   0.8834987878799438,
   0.8926336765289307,
   0.8897188901901245,
   0.8822887539863586,
   0.8009738922119141],
  'f1': [0.8572217226028442,
   0.8364927768707275,
   0.8550432324409485,
   0.8287185430526733,
   0.8582090735435486,
   0.8564277291297913,
   0.8570443391799927,
   0.865167498588562,
   0.8485430479049683,
   0.816990852355957],
  'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.38.0)',
  'precision_mean': 0.8273195624351501,
  'recall_mean': 0.8702359199523926,
  'f1_mean': 0.8479858815670014},
 'rouge': {'rouge1': [0.1565217391304348,
   0.05633802816901408,
   0.1196581196581

# Observability Tools

This section contains tools to inspect the system, e.g. check the filesystem, the DB, etc.

## S3

### Check (local) AWS credentials

In [None]:
### Keeping the following commented to avoid having secrets saved by mistake:
### uncomment and run to see if AWS credentials are properly set
### NOTE: this prints the secret values into the cell output, so clear the
### output before saving or sharing the notebook. `os.environ[k]` raises a
### KeyError if one of the variables is not set.
# import os
# for k in ["AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY", "AWS_ENDPOINT_URL"]:
#     print(f"[{k}]: {os.environ[k]}")

### Look into S3 storage

In [11]:
# List the top-level contents of the `lumigator-storage` bucket with s5cmd,
# using the endpoint given by AWS_ENDPOINT_URL.
!s5cmd --endpoint-url $AWS_ENDPOINT_URL ls s3://lumigator-storage

                                  DIR  datasets/
                                  DIR  experiments/
