In [1]:
#!pip install lm-eval==0.3.0 -qqq
!git clone https://github.com/EleutherAI/lm-evaluation-harness
!pip install -e ./lm-evaluation-harness/. -qqq

In [2]:
import glob
import json
import os
import random
import timeit
import urllib

import ctranslate2
import lm_eval
import pandas as pd
import peft
import torch
import wandb
import wandb.apis.reports as wr
from datasets import load_dataset
from lm_eval import tasks, evaluator, utils
from peft import get_peft_model, LoraConfig, TaskType
from peft import PeftModel, PeftConfig
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, pipeline

In [3]:
# W&B-related environment variables. WANDB_ENTITY and WANDB_PROJECT are also
# read back later in this notebook to build artifact paths.
os.environ.update(
    {
        "WANDB_PROJECT": "Autocompletion with evaluation",
        "WANDB_ENTITY": "reviewco",
        "WANDB_USERNAME": "keisuke-kamata",
        "WANDB_LOG_MODEL": "checkpoint",
        "WANDB_WATCH": "gradients",
    }
)

# String constants naming the tables and the model.
SCORE_TABLE_NAME = "Score"
EVALUATION_TABLE_NAME = "Validation Responses"
LATENCY_TABLE_NAME = "Model Latencies"
MODEL_NAME = "Finetuned-Review-Autocompletion"

In [4]:
# Start a W&B run for the evaluation job and download the finetuned-model artifact.
run = wandb.init(job_type="evaluation")
artifact = run.use_artifact('reviewco/Autocompletion with evaluation/finetuned-model:v1', type='model')
artifact_dir = artifact.download()

# Load the base model in float16.
base_llm = AutoModelForCausalLM.from_pretrained("facebook/opt-125m", device_map="auto", torch_dtype=torch.float16)
# `local_files_only=True` was dropped here: loading the model above does not
# fetch the tokenizer files, so on a fresh cache the offline-only lookup fails.
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")

# Wrap the base model with the adapter stored in the artifact directory, then
# merge the adapter into the base weights so `model` is a plain model object.
model = PeftModel.from_pretrained(base_llm, artifact_dir,torch_dtype=torch.float16)
model = model.merge_and_unload()

In [5]:
# Run the evaluation: 3-shot on three tasks with the merged model.
results = lm_eval.evaluator.simple_evaluate(
    model=model,
    tasks=["arc_easy", "hellaswag", "squad2"],
    num_fewshot=3,
    batch_size=16,
    device="cuda",
)

In [6]:
# Display the full dictionary returned by simple_evaluate.
results

{'results': {'arc_easy': {'acc': 0.42045454545454547,
   'acc_stderr': 0.010129114278546526,
   'acc_norm': 0.4057239057239057,
   'acc_norm_stderr': 0.010075755540128871},
  'hellaswag': {'acc': 0.29047998406691894,
   'acc_stderr': 0.004530560646902538,
   'acc_norm': 0.31467835092611035,
   'acc_norm_stderr': 0.004634385694170044},
  'squad2': {'exact': 2.661500884359471,
   'f1': 5.770005969375321,
   'HasAns_exact': 5.280026990553306,
   'HasAns_f1': 11.505951564506272,
   'NoAns_exact': 0.050462573591253154,
   'NoAns_f1': 0.050462573591253154,
   'best_exact': 50.07159100480081,
   'best_f1': 50.07159100480081}},
 'versions': {'arc_easy': 0, 'hellaswag': 0, 'squad2': 1},
 'config': {'model': 'pretrained=facebook/opt-125m',
  'model_args': None,
  'num_fewshot': 3,
  'batch_size': 16,
  'batch_sizes': [],
  'device': 'cuda',
  'no_cache': True,
  'limit': None,
  'bootstrap_iters': 100000,
  'description_dict': None}}

In [7]:
# Display the full evaluation dictionary (same expression as the previous cell).
results

{'results': {'arc_easy': {'acc': 0.42045454545454547,
   'acc_stderr': 0.010129114278546526,
   'acc_norm': 0.4057239057239057,
   'acc_norm_stderr': 0.010075755540128871},
  'hellaswag': {'acc': 0.29047998406691894,
   'acc_stderr': 0.004530560646902538,
   'acc_norm': 0.31467835092611035,
   'acc_norm_stderr': 0.004634385694170044},
  'squad2': {'exact': 2.661500884359471,
   'f1': 5.770005969375321,
   'HasAns_exact': 5.280026990553306,
   'HasAns_f1': 11.505951564506272,
   'NoAns_exact': 0.050462573591253154,
   'NoAns_f1': 0.050462573591253154,
   'best_exact': 50.07159100480081,
   'best_f1': 50.07159100480081}},
 'versions': {'arc_easy': 0, 'hellaswag': 0, 'squad2': 1},
 'config': {'model': 'pretrained=facebook/opt-125m',
  'model_args': None,
  'num_fewshot': 3,
  'batch_size': 16,
  'batch_sizes': [],
  'device': 'cuda',
  'no_cache': True,
  'limit': None,
  'bootstrap_iters': 100000,
  'description_dict': None}}

In [8]:
# Display only the per-task metrics stored under the "results" key.
results["results"]

{'arc_easy': {'acc': 0.42045454545454547,
  'acc_stderr': 0.010129114278546526,
  'acc_norm': 0.4057239057239057,
  'acc_norm_stderr': 0.010075755540128871},
 'hellaswag': {'acc': 0.29047998406691894,
  'acc_stderr': 0.004530560646902538,
  'acc_norm': 0.31467835092611035,
  'acc_norm_stderr': 0.004634385694170044},
 'squad2': {'exact': 2.661500884359471,
  'f1': 5.770005969375321,
  'HasAns_exact': 5.280026990553306,
  'HasAns_f1': 11.505951564506272,
  'NoAns_exact': 0.050462573591253154,
  'NoAns_f1': 0.050462573591253154,
  'best_exact': 50.07159100480081,
  'best_f1': 50.07159100480081}}

In [9]:
# Display the metrics recorded for the arc_easy task.
results["results"]["arc_easy"]

{'acc': 0.42045454545454547,
 'acc_stderr': 0.010129114278546526,
 'acc_norm': 0.4057239057239057,
 'acc_norm_stderr': 0.010075755540128871}

In [10]:
# Display the "acc" value of the arc_easy task.
results["results"]["arc_easy"]["acc"]

0.42045454545454547

In [11]:
# Display the id of the W&B run created by wandb.init.
run.id

'00vh05cd'

In [12]:
# Display the per-task metrics stored under the "results" key.
results["results"]

{'arc_easy': {'acc': 0.42045454545454547,
  'acc_stderr': 0.010129114278546526,
  'acc_norm': 0.4057239057239057,
  'acc_norm_stderr': 0.010075755540128871},
 'hellaswag': {'acc': 0.29047998406691894,
  'acc_stderr': 0.004530560646902538,
  'acc_norm': 0.31467835092611035,
  'acc_norm_stderr': 0.004634385694170044},
 'squad2': {'exact': 2.661500884359471,
  'f1': 5.770005969375321,
  'HasAns_exact': 5.280026990553306,
  'HasAns_f1': 11.505951564506272,
  'NoAns_exact': 0.050462573591253154,
  'NoAns_f1': 0.050462573591253154,
  'best_exact': 50.07159100480081,
  'best_f1': 50.07159100480081}}

In [13]:
# Display the "acc" value of the hellaswag task.
results["results"]["hellaswag"]["acc"]

0.29047998406691894

In [14]:
# Display the per-task metrics stored under the "results" key.
results["results"]

{'arc_easy': {'acc': 0.42045454545454547,
  'acc_stderr': 0.010129114278546526,
  'acc_norm': 0.4057239057239057,
  'acc_norm_stderr': 0.010075755540128871},
 'hellaswag': {'acc': 0.29047998406691894,
  'acc_stderr': 0.004530560646902538,
  'acc_norm': 0.31467835092611035,
  'acc_norm_stderr': 0.004634385694170044},
 'squad2': {'exact': 2.661500884359471,
  'f1': 5.770005969375321,
  'HasAns_exact': 5.280026990553306,
  'HasAns_f1': 11.505951564506272,
  'NoAns_exact': 0.050462573591253154,
  'NoAns_f1': 0.050462573591253154,
  'best_exact': 50.07159100480081,
  'best_f1': 50.07159100480081}}

In [15]:
# Display the metrics recorded for the squad2 task.
# (The key was previously wrapped in doubled quotes, which is a syntax error.)
results["results"]["squad2"]

In [16]:
# Display the metrics recorded for the squad2 task.
results["results"]["squad2"]

{'exact': 2.661500884359471,
 'f1': 5.770005969375321,
 'HasAns_exact': 5.280026990553306,
 'HasAns_f1': 11.505951564506272,
 'NoAns_exact': 0.050462573591253154,
 'NoAns_f1': 0.050462573591253154,
 'best_exact': 50.07159100480081,
 'best_f1': 50.07159100480081}

In [17]:
# Display the "f1" value of the squad2 task.
results["results"]["squad2"]["f1"]

5.770005969375321

In [18]:
# Log one headline metric per task as a single-row W&B table.
# The column headers come from a local list of task names: at this point the
# bare name `tasks` is the module imported from lm_eval, and adding a module
# to a list raises TypeError.
task_names = ["arc_easy", "hellaswag", "squad2"]
table_contents = [
    run.id,
    results["results"]["arc_easy"]["acc"],
    results["results"]["hellaswag"]["acc"],
    results["results"]["squad2"]["f1"],
]
table = wandb.Table(columns=['finetuned-model:v1'] + task_names,
                    data=[table_contents])
run.log({'result_table':table})
# Also save the notebook/code alongside the run.
run.log_code()

In [19]:
# Preview of the table column headers.
# NOTE(review): unless `tasks` has been rebound to a list, it is still the
# module imported from lm_eval, so this concatenation presumably fails with
# TypeError — confirm and consider removing this scratch cell.
['finetuned-model:v1']+tasks

In [20]:
# Names of the evaluated tasks; note this rebinds the notebook-level name `tasks`.
tasks = ["arc_easy", "hellaswag", "squad2"]
# Preview the table column headers: a label column followed by one column per task.
['finetuned-model:v1', *tasks]

['finetuned-model:v1', 'arc_easy', 'hellaswag', 'squad2']

In [21]:
# Task names used as column headers (rebinds the notebook-level name `tasks`).
tasks = ["arc_easy", "hellaswag", "squad2"]

# Single table row: the run id followed by one metric per task
# ("acc" for arc_easy and hellaswag, "f1" for squad2).
table_contents = [
    run.id,
    results["results"]["arc_easy"]["acc"],
    results["results"]["hellaswag"]["acc"],
    results["results"]["squad2"]["f1"],
]
table = wandb.Table(
    columns=['finetuned-model:v1', *tasks],
    data=[table_contents],
)

# Log the table under 'result_table' and save the code with the run.
run.log({'result_table': table})
run.log_code()

<Artifact source-Autocompletion_with_evaluation-evaluation.ipynb>

In [22]:
# Task names used as column headers (rebinds the notebook-level name `tasks`).
tasks = ["arc_easy", "hellaswag", "squad2"]

# Single table row: the run id followed by one metric per task
# ("acc" for arc_easy and hellaswag, "f1" for squad2).
table_contents = [
    run.id,
    results["results"]["arc_easy"]["acc"],
    results["results"]["hellaswag"]["acc"],
    results["results"]["squad2"]["f1"],
]
table = wandb.Table(
    columns=['finetuned-model:v1', *tasks],
    data=[table_contents],
)

# Log the table and the code, then close the run.
run.log({'result_table': table})
run.log_code()
run.finish()

In [23]:
# Display the full evaluation dictionary.
results

{'results': {'arc_easy': {'acc': 0.42045454545454547,
   'acc_stderr': 0.010129114278546526,
   'acc_norm': 0.4057239057239057,
   'acc_norm_stderr': 0.010075755540128871},
  'hellaswag': {'acc': 0.29047998406691894,
   'acc_stderr': 0.004530560646902538,
   'acc_norm': 0.31467835092611035,
   'acc_norm_stderr': 0.004634385694170044},
  'squad2': {'exact': 2.661500884359471,
   'f1': 5.770005969375321,
   'HasAns_exact': 5.280026990553306,
   'HasAns_f1': 11.505951564506272,
   'NoAns_exact': 0.050462573591253154,
   'NoAns_f1': 0.050462573591253154,
   'best_exact': 50.07159100480081,
   'best_f1': 50.07159100480081}},
 'versions': {'arc_easy': 0, 'hellaswag': 0, 'squad2': 1},
 'config': {'model': 'pretrained=facebook/opt-125m',
  'model_args': None,
  'num_fewshot': 3,
  'batch_size': 16,
  'batch_sizes': [],
  'device': 'cuda',
  'no_cache': True,
  'limit': None,
  'bootstrap_iters': 100000,
  'description_dict': None}}

In [24]:
def get_completion_responses_batch(prompts, ft_path):
    """Generate completions for the same prompts with the base and finetuned models.

    Args:
        prompts: Prompts forwarded unchanged to
            ``get_huggingface_completion_batch``.
        ft_path: Model argument used for the finetuned ("staging") model.

    Returns:
        A list of dicts, one per pair of completions, with the keys
        ``"Production"`` (from ``facebook/opt-125m``) and
        ``"Staging (finetuned)"`` (from ``ft_path``).
    """
    production_outputs = get_huggingface_completion_batch(prompts, "facebook/opt-125m")
    staging_outputs = get_huggingface_completion_batch(prompts, ft_path)
    return [
        {"Production": production_text, "Staging (finetuned)": staging_text}
        for production_text, staging_text in zip(production_outputs, staging_outputs)
    ]

def get_huggingface_completion_batch(prompts, model):
    """Generate one completion per prompt with a text-generation pipeline.

    Args:
        prompts: Sequence of prompt strings.
        model: Value passed as ``model=`` to ``pipeline``.

    Returns:
        A list of stripped completion strings, one per prompt. When the
        generated text starts with the prompt, that prefix is removed;
        otherwise the generated text is returned whole.
    """
    text_generator = pipeline('text-generation', model=model)
    generations = text_generator(prompts, max_new_tokens=50)

    def _without_prompt(prompt, generated):
        # Drop the echoed prompt only when it is an exact prefix.
        if generated.startswith(prompt):
            generated = generated[len(prompt):]
        return generated.strip()

    return [
        _without_prompt(prompt, generations[idx][0]["generated_text"])
        for idx, prompt in enumerate(prompts)
    ]

def get_model_comparison_df(prompts, ft_path):
    """Build a DataFrame comparing base and finetuned completions.

    Each prompt is cut down to its first N whitespace-separated words, where
    N is drawn per prompt with ``random.randint(5, 12)``.

    Args:
        prompts: Iterable of prompt strings.
        ft_path: Model argument for the finetuned model, forwarded to
            ``get_completion_responses_batch``.

    Returns:
        A DataFrame built from the responses, with the trimmed prompts
        inserted as the first column, named ``"prompt"``.
    """
    trimmed_prompts = []
    for full_prompt in prompts:
        word_budget = random.randint(5,12)
        trimmed_prompts.append(" ".join(full_prompt.split()[:word_budget]))

    comparison = pd.DataFrame(get_completion_responses_batch(trimmed_prompts, ft_path))
    comparison.insert(0, "prompt", trimmed_prompts)
    return comparison

def get_latency_df(prompts, num_prompts, ft_path, ct2_path):
    """Time one batch of completions for the base and the finetuned model.

    Args:
        prompts: Object exposing ``to_list()``; only the first ``num_prompts``
            items are used.
        num_prompts: Number of prompts to time.
        ft_path: Model argument for the finetuned model, passed to
            ``get_huggingface_completion_batch``.
        ct2_path: Currently unused; kept so existing callers keep working.

    Returns:
        A two-row DataFrame with a ``"Model"`` column and a
        ``"Latency for {num_prompts} Reviews"`` column holding the
        ``timeit`` result of a single run per model.
    """
    prompts = prompts.to_list()[:num_prompts]
    # Time the finetuned model via `ft_path`; the earlier code referenced the
    # notebook-global `model` here, ignoring the parameter.
    ft_time = timeit.timeit(lambda: get_huggingface_completion_batch(prompts, ft_path), number=1)
    opt_time = timeit.timeit(lambda: get_huggingface_completion_batch(prompts, "facebook/opt-125m"), number=1)

    return pd.DataFrame({
        "Model": ["Production", "Staging (finetuned)"],
        f"Latency for {num_prompts} Reviews": [opt_time, ft_time],
    })

In [25]:
# Run the evaluation: 10-shot on five tasks with the merged model.
results = lm_eval.evaluator.simple_evaluate(
    model=model,
    tasks=["arc_easy", "hellaswag", "squad2", "drop", "sciq"],
    num_fewshot=10,
    batch_size=16,
    device="cuda",
)

In [26]:
# Display the dictionary returned by the latest simple_evaluate call.
results

{'results': {'arc_easy': {'acc': 0.4377104377104377,
   'acc_stderr': 0.010179856486006918,
   'acc_norm': 0.41624579124579125,
   'acc_norm_stderr': 0.010114819404500876},
  'hellaswag': {'acc': 0.2900816570404302,
   'acc_stderr': 0.004528723951878255,
   'acc_norm': 0.3136825333598885,
   'acc_norm_stderr': 0.004630407476835194},
  'squad2': {'exact': 3.4869030573570288,
   'f1': 6.788581445337238,
   'HasAns_exact': 6.831983805668016,
   'HasAns_f1': 13.444808957572375,
   'NoAns_exact': 0.15138772077375945,
   'NoAns_f1': 0.15138772077375945,
   'best_exact': 50.07159100480081,
   'best_f1': 50.07159100480081},
  'sciq': {'acc': 0.811,
   'acc_stderr': 0.01238678458811771,
   'acc_norm': 0.792,
   'acc_norm_stderr': 0.012841374572096921},
  'drop': {'em': 0.0006291946308724832,
   'em_stderr': 0.00025680027497240645,
   'f1': 0.04319630872483222,
   'f1_stderr': 0.0011526100580973827}},
 'versions': {'arc_easy': 0,
  'hellaswag': 0,
  'squad2': 1,
  'drop': 1,
  'sciq': 0},
 'conf

In [27]:
# Notebook configuration as a plain dict (read with config["KEY"]).
# The literal previously listed "MODEL_NAME" twice; in a dict display the last
# value wins, so the first entry was dead. Only the effective value is kept.
# NOTE(review): confirm "Instruction-tuned-model" is the intended name.
config = {
    "MODEL_NAME": "Instruction-tuned-model",
    "BASE_MODEL": "facebook/opt-125m",
}

In [28]:
# `config` is a plain dict, so the base model id is read by key
# (attribute access on a dict raises AttributeError).
tokenizer = AutoTokenizer.from_pretrained(config["BASE_MODEL"])
# NOTE(review): an earlier cell calls run.finish(); presumably a new
# wandb.init() is needed before use_artifact works here — confirm.
artifact = run.use_artifact(f'{os.environ["WANDB_ENTITY"]}/{os.environ["WANDB_PROJECT"]}/data:production')
artifact_dir = artifact.download()
# Load test_data.json from the downloaded artifact directory.
test_data = load_dataset('json', data_files=artifact_dir+'/test_data.json',split="train")

In [29]:
# Notebook configuration as a plain dict (read with config["KEY"]).
# The literal previously listed "MODEL_NAME" twice; in a dict display the last
# value wins, so the first entry was dead. Only the effective value is kept.
# NOTE(review): confirm "Instruction-tuned-model" is the intended name.
config = {
    "MODEL_NAME": "Instruction-tuned-model",
    "BASE_MODEL": "facebook/opt-125m",
}

In [30]:
# `config` is a plain dict, so the base model id is read by key
# (attribute access on a dict raises AttributeError).
tokenizer = AutoTokenizer.from_pretrained(config["BASE_MODEL"])
# NOTE(review): an earlier cell calls run.finish(); presumably a new
# wandb.init() is needed before use_artifact works here — confirm.
artifact = run.use_artifact(f'{os.environ["WANDB_ENTITY"]}/{os.environ["WANDB_PROJECT"]}/data:production')
artifact_dir = artifact.download()
# Load test_data.json from the downloaded artifact directory.
test_data = load_dataset('json', data_files=artifact_dir+'/test_data.json',split="train")

In [31]:
# Display the configured base model id; `config` is a dict, so index by key.
config["BASE_MODEL"]

In [32]:
# Display the configured base model id.
config["BASE_MODEL"]

'facebook/opt-125m'

In [33]:
# Tokenizer for the configured base model.
tokenizer = AutoTokenizer.from_pretrained(config["BASE_MODEL"])

# Fetch the `data:production` artifact of the configured entity/project.
artifact = run.use_artifact(
    f'{os.environ["WANDB_ENTITY"]}/{os.environ["WANDB_PROJECT"]}/data:production'
)
artifact_dir = artifact.download()

# Load test_data.json from the downloaded artifact directory.
test_data = load_dataset(
    'json',
    data_files=artifact_dir+'/test_data.json',
    split="train",
)

In [34]:
# Tokenizer for the configured base model.
tokenizer = AutoTokenizer.from_pretrained(config["BASE_MODEL"])

# Fetch the `data:production` artifact of the configured entity/project.
artifact = run.use_artifact(
    f'{os.environ["WANDB_ENTITY"]}/{os.environ["WANDB_PROJECT"]}/data:production'
)
artifact_dir = artifact.download()

# Load test_data.json from the downloaded artifact directory.
test_data = load_dataset(
    'json',
    data_files=artifact_dir+'/test_data.json',
    split="train",
)

In [35]:
# Start a new W&B run (the earlier one was closed with run.finish()).
run = wandb.init()

# Tokenizer for the configured base model.
tokenizer = AutoTokenizer.from_pretrained(config["BASE_MODEL"])

# Fetch the `data:production` artifact of the configured entity/project.
artifact = run.use_artifact(
    f'{os.environ["WANDB_ENTITY"]}/{os.environ["WANDB_PROJECT"]}/data:production'
)
artifact_dir = artifact.download()

# Load test_data.json from the downloaded artifact directory.
test_data = load_dataset(
    'json',
    data_files=artifact_dir+'/test_data.json',
    split="train",
)

In [36]:
import ctranslate2
import glob
import os
import pandas as pd
import peft
import random
import timeit
import urllib
import torch
import json
import os
from datasets import load_dataset  # load_dataset is a function in `datasets`, not a module
from lm_eval import tasks, evaluator, utils
import lm_eval
import wandb
from peft import get_peft_model, LoraConfig, TaskType
from peft import PeftModel, PeftConfig
import wandb.apis.reports as wr
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, pipeline

In [37]:
import ctranslate2
import glob
import os
import pandas as pd
import peft
import random
import timeit
import urllib
import torch
import json
import os
from datasets import load_dataset
from lm_eval import tasks, evaluator, utils
import lm_eval
import wandb
from peft import get_peft_model, LoraConfig, TaskType
from peft import PeftModel, PeftConfig
import wandb.apis.reports as wr
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, pipeline

In [38]:
# Start a new W&B run (the earlier one was closed with run.finish()).
run = wandb.init()

# Tokenizer for the configured base model.
tokenizer = AutoTokenizer.from_pretrained(config["BASE_MODEL"])

# Fetch the `data:production` artifact of the configured entity/project.
artifact = run.use_artifact(
    f'{os.environ["WANDB_ENTITY"]}/{os.environ["WANDB_PROJECT"]}/data:production'
)
artifact_dir = artifact.download()

# Load test_data.json from the downloaded artifact directory.
test_data = load_dataset(
    'json',
    data_files=artifact_dir+'/test_data.json',
    split="train",
)