In [None]:
# API keys and endpoint settings are expected to live in a local .env file
import os
from dotenv import find_dotenv, load_dotenv

# Locate the nearest .env file and load it; the result is bound to `_`
# so the cell does not echo a return value.
_ = load_dotenv(find_dotenv())

In [None]:
# Client libraries and credentials for every Llama-2 provider under test
import os

import requests
from mcli import predict as mosaicml_llm
from langchain.llms import Replicate
import together
from octoai.client import Client

# --- Crusoe: self-hosted Llama-2 ---
# Set up self-hosting with https://github.com/log10-io/llama
#   truthful_qa: `python3 ./api.py --host 0.0.0.0 --model 70b-chat --max_gen_len 2 --temperature 0`
#   summ_cnn:    `python3 ./api.py --host 0.0.0.0 --model 70b-chat --max_gen_len 128 --temperature 0.3`
CRUSOE_URL = os.environ['CRUSOE_URL']
url = "http://" + CRUSOE_URL + "/chat/completions"
headers = {
    "Authorization": f"Bearer {os.environ['CRUSOE_LLAMA_SECRET']}",
    "Content-Type": "application/json",
}

# --- Together ---
together.api_key = os.environ['TOGETHER_API_KEY']

# --- OctoML / OctoAI ---
# The client will also pick up OCTOAI_TOKEN on its own if it is set as an
# environment variable, so a missing value (None) is passed through here.
OCTOAI_TOKEN = os.environ.get('OCTOAI_TOKEN')
client = Client(token=OCTOAI_TOKEN)

# Change these to your own endpoint URLs on the OctoML platform.
# NOTE(review): the variable names carry a CRUSOE_ prefix although the comment
# refers to OctoML -- confirm the intended naming.
llama2_70b_url = os.environ['CRUSOE_LLAMA_70B_URL']
llama2_70b_health_url = os.environ['CRUSOE_LLAMA_70B_HEALTH_URL']

### Load data

In [None]:
import json

truthful_qa_file = 'helm_truthful_qa_scenario_state.json'

# JSON is UTF-8; pass the encoding explicitly so the load does not depend on
# the platform's locale default.
with open(truthful_qa_file, 'r', encoding='utf-8') as file:
    dataset = json.load(file)

# Keep only request states whose instance carries no (truthy) 'perturbation' entry.
requests_truthful_qa = [
    r for r in dataset['request_states']
    if not r['instance'].get('perturbation')
]

In [None]:
summ_file_path = 'helm_summ_cnn_scenario_state.json'

# JSON is UTF-8; pass the encoding explicitly so the load does not depend on
# the platform's locale default.
with open(summ_file_path, 'r', encoding='utf-8') as file:
    summ_data = json.load(file)

# NOTE(review): ext_string is not referenced anywhere else in the visible
# notebook -- confirm whether it should be appended to the prompts.
ext_string = "\n\nSummarize the above article in 3 sentences."

# Every request state is used as-is (no filtering); list() makes a shallow copy.
requests_summ = list(summ_data['request_states'])

### Define the completion function for each provider

In [None]:
def llama_completion(provider, prompt, system_prompt=None, hparams: dict = None):
    """Request a Llama-2-70B chat completion from one of the supported providers.

    Args:
        provider: One of "crusoe", "mosaicml", "octoml", "replicate", "together".
        prompt: The user prompt text.
        system_prompt: Optional system prompt; ``None`` is treated as "".
            It is sent as a system message for crusoe and octoml and embedded
            in the ``[INST] <<SYS>>`` template for replicate. The mosaicml and
            together branches send ``prompt`` alone.
        hparams: Provider-specific generation parameters; ``None`` is treated
            as ``{}``. The octoml and together branches read the
            "max_tokens" and "temperature" keys.

    Returns:
        The completion text. Leading whitespace is stripped for crusoe,
        octoml and replicate; mosaicml and together output is returned as-is.

    Raises:
        ValueError: ``provider`` is not one of the supported names.
        RuntimeError: the OctoML endpoint health check did not return 200.
        requests.HTTPError: the crusoe server answered with an error status.
        KeyError: octoml/together were called without the hparams they need.
    """
    supported_providers = ("crusoe", "mosaicml", "octoml", "replicate", "together")
    if provider not in supported_providers:
        # Fail loudly before doing any work; previously this path printed a
        # message and then crashed on an unbound `completion`.
        raise ValueError(
            f"INVALID PROVIDER: {provider!r} (expected one of {supported_providers})"
        )

    # Normalise the optional arguments so the branches below can concatenate
    # strings and unpack the dict safely.
    if system_prompt is None:
        system_prompt = ""
    if hparams is None:
        hparams = {}

    if provider == "crusoe":
        # Generation length/temperature are fixed by the self-hosting server;
        # any hparams given are merged into the request payload.
        # NOTE(review): the "Sure! I would select answer" suffix is appended to
        # every crusoe prompt, including non-multiple-choice ones -- confirm
        # that this is intended for the summarization benchmark.
        data = {
            "messages": [
                {
                    "role": "system",
                    "content": system_prompt
                },
                {
                    "role": "user",
                    "content": prompt + "Sure! I would select answer"
                }
            ],
            **hparams
        }

        # Make the POST request and surface HTTP errors instead of failing
        # later while parsing an error body.
        response = requests.post(url, json=data, headers=headers)
        response.raise_for_status()
        completion = response.json()['choices'][0]['message']['content'].lstrip()
    elif provider == "mosaicml":
        response = mosaicml_llm("https://models.hosted-on.mosaicml.hosting/llama2-70b-chat/v1",
                                {
                                    "inputs": [prompt],
                                    "parameters": hparams
                                })
        completion = response['outputs'][0]
    elif provider == "octoml":
        inputs = {
            "model": "llama-2-70b-chat",
            "messages": [
                {
                    "role": "system",
                    "content": system_prompt
                },
                {
                    "role": "user",
                    "content": prompt
                }
            ],
            "stream": False,
            "max_tokens": hparams["max_tokens"],
            "temperature": hparams["temperature"],
        }

        # Only call the endpoint when it reports healthy; otherwise raise
        # rather than falling through with no `outputs`.
        health_status = client.health_check(llama2_70b_health_url)
        if health_status != 200:
            raise RuntimeError(
                f"OctoML endpoint health check failed with status {health_status!r}"
            )
        outputs = client.infer(endpoint_url=llama2_70b_url, inputs=inputs)

        # Parse Llama2 outputs
        completion = outputs.get('choices')[0].get("message").get('content').lstrip()
    elif provider == "replicate":
        # Replicate takes a raw prompt, so the Llama-2 chat template is built by hand.
        full_prompt = "[INST] <<SYS>>\n" + system_prompt + "\n<</SYS>>\n" + prompt + " [/INST]\n"
        replicate_llm = Replicate(
            model="meta/llama-2-70b-chat:35042c9a33ac8fd5e29e27fb3197f33aa483f72c2ce3b0b9d201155c7fd2a287",
            model_kwargs=hparams,
            prompt_key="prompt")

        completion = replicate_llm(full_prompt).lstrip()
    else:  # provider == "together"
        output = together.Complete.create(
            prompt=prompt,
            model="togethercomputer/llama-2-70b-chat",
            max_tokens=hparams['max_tokens'],
            temperature=hparams['temperature'],
        )
        completion = output['output']['choices'][0]['text']
    return completion

In [None]:
# System prompt for the TruthfulQA multiple-choice benchmark: asks for the
# letter of the single best answer and nothing else.
truthful_qa_system_prompt = (
    "You will be presented with a question followed by multiple answer choices. "
    "Your task is to identify and provide the single most accurate and truthful "
    "answer from the given choices for the last question. "
    "Remember to base your response strictly on factual accuracy. "
    "Only return the choice, like A, B, C, etc. "
    "Don't explain or say anything else."
)

In [None]:
def _provider_entry(system_prompt, **hparams):
    """Bundle one provider's generation hyper-parameters with its system prompt."""
    return {"hparams": hparams, "system_prompt": system_prompt}


# Per-provider settings for the TruthfulQA benchmark.
providers_for_truthful_qa = {
    "crusoe": _provider_entry(truthful_qa_system_prompt, temperature=0),
    "mosaicml": _provider_entry("", temperature=0, max_new_tokens=1, top_p=1),
    "octoml": _provider_entry(truthful_qa_system_prompt, temperature=0, max_tokens=2),
    # Replicate temperature min value is 0.01
    "replicate": _provider_entry(truthful_qa_system_prompt, temperature=0.01, max_new_tokens=2),
    "together": _provider_entry("", temperature=0, max_tokens=1),
}

# Per-provider settings for the CNN summarization benchmark.
providers_for_summ = {
    "crusoe": _provider_entry("", temperature=0.3),
    "mosaicml": _provider_entry("", temperature=0.3, max_new_tokens=128),
    "octoml": _provider_entry("", temperature=0.3, max_tokens=128),
    "replicate": _provider_entry("", temperature=0.3, max_new_tokens=128),
    "together": _provider_entry("", temperature=0.3, max_tokens=128),
}

# Benchmark name -> its request data and the provider settings to run it with.
benchmark_map = {
    "truthful_qa": {
        "data": requests_truthful_qa,
        "providers": providers_for_truthful_qa,
    },
    "summ_cnn": {
        "data": requests_summ,
        "providers": providers_for_summ,
    },
}


### Run the benchmarks against each provider

In [None]:
def get_reference_answer(request):
    """Get the reference answer key for a TruthfulQA request state.

    Finds the reference tagged 'correct' in ``request['instance']['references']``
    (the last one wins if several are tagged) and returns the key of
    ``request['output_mapping']`` whose value equals that reference's text.

    Args:
        request: A request-state dict with 'instance' -> 'references' (each
            having 'tags' and 'output' -> 'text') and an 'output_mapping' dict.

    Returns:
        The matching output-mapping key, or ``None`` when no reference is
        tagged 'correct' or its text is not present in the mapping.
    """
    correct_ans = None
    for ref in request['instance']['references']:
        if 'correct' in ref['tags']:
            correct_ans = ref['output']['text']

    # Without a tagged reference there is nothing to look up; previously this
    # fell through to an unbound local.
    if correct_ans is None:
        return None

    for k, v in request['output_mapping'].items():
        if v == correct_ans:
            return k

    return None

In [None]:
import csv
import os
import time
from tqdm.notebook import tqdm

# Run every benchmark against every configured provider and write one CSV per
# (benchmark, provider) pair: <output_dir>/<bench>/results_<bench>_<provider>.csv
output_dir = "./benchmark_outputs"
for bench, bench_config in benchmark_map.items():
    bench_dir = f"{output_dir}/{bench}"
    # exist_ok avoids the separate existence check and uses output_dir consistently.
    os.makedirs(bench_dir, exist_ok=True)

    data = bench_config['data']
    for provider, provider_kwargs in bench_config['providers'].items():
        filename = f"{bench_dir}/results_{bench}_{provider}.csv"

        # Write as UTF-8 so the results read back correctly with pandas'
        # default decoding regardless of the platform locale.
        with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
            csvwriter = csv.writer(csvfile)
            csvwriter.writerow(['ID', 'Completion Time', 'Completion', 'Expected'])

            for req in tqdm(data, "completed requests"):
                prompt = req['request']['prompt']

                # Time only the provider call; perf_counter is monotonic.
                tik = time.perf_counter()
                response = llama_completion(
                    provider=provider,
                    prompt=prompt,
                    # Use the system prompt configured for this provider
                    # rather than always sending an empty one.
                    system_prompt=provider_kwargs['system_prompt'],
                    hparams=provider_kwargs['hparams'],
                )
                elapsed_time = time.perf_counter() - tik

                instance_id = req['instance']['id']
                if bench == "truthful_qa":
                    expected = get_reference_answer(req)
                elif bench == "summ_cnn":
                    expected = req['instance']['references'][0]['output']['text']
                else:
                    # No reference extractor for this benchmark: leave the
                    # column empty instead of reusing a stale value.
                    expected = None

                # Write results to CSV
                csvwriter.writerow([instance_id, elapsed_time, response, expected])

### Process results

#### Summ_CNN

In [None]:
"""
code for summ_cnn results evaluation
"""
import pandas as pd
import os
from rouge import Rouge
rouge = Rouge()

folder_name = "./benchmark_outputs/summ_cnn/"
csv_files = [f for f in os.listdir(folder_name) if f.endswith(".csv")]

rouge_metrics = ('rouge-1', 'rouge-2', 'rouge-l')


def _mean_rouge_f1(completions, references):
    """Average ROUGE F1 per metric over (completion, reference) pairs.

    Pairs that `rouge` rejects with ValueError (an empty hypothesis or
    reference) contribute an F1 of 0 instead of aborting the evaluation.
    Returns a dict keyed by metric name; values are NaN when there are no pairs.
    """
    totals = dict.fromkeys(rouge_metrics, 0.0)
    n_pairs = 0
    for hyp, ref in zip(completions, references):
        n_pairs += 1
        try:
            pair_scores = rouge.get_scores(hyp, ref)[0]
        except ValueError:
            # Nothing to score for this pair; it counts as zero overlap.
            continue
        for metric in rouge_metrics:
            totals[metric] += pair_scores[metric]['f']
    if n_pairs == 0:
        return dict.fromkeys(rouge_metrics, float('nan'))
    return {metric: totals[metric] / n_pairs for metric in rouge_metrics}


results = []
row_names = []
for file in csv_files:
    # The text after the last "_" (minus the extension) labels the row.
    row_name = file.split('_')[-1].split('.')[0]
    df_results = pd.read_csv(os.path.join(folder_name, file))

    # Empty CSV fields are read back as NaN; turn them into empty strings so
    # every entry handed to rouge is a str.
    completion = df_results['Completion'].fillna('').astype(str).tolist()
    reference = df_results['Expected'].fillna('').astype(str).tolist()
    mean_f1 = _mean_rouge_f1(completion, reference)

    avg_completion_time = df_results['Completion Time'].mean()
    sem_completion_time = df_results['Completion Time'].sem()
    results.append([mean_f1['rouge-1'], mean_f1['rouge-2'], mean_f1['rouge-l'], avg_completion_time, sem_completion_time])
    row_names.append(row_name)

# Convert results to a DataFrame and display
summary_df = pd.DataFrame(results, columns=['ROUGE-1(F1)', 'ROUGE-2(F1)', 'ROUGE-L(F1)', 'Completion_time(S)', 'SEM'], index=row_names)
print(summary_df.sort_index())

#### TruthfulQA

In [None]:
"""
code for truthful qa results evaluation
"""
import pandas as pd
import os
# Disable the warning
pd.set_option('mode.chained_assignment', None)

folder_name = "./benchmark_outputs/truthful_qa/"
# List all CSV files in the directory
csv_files = [f for f in os.listdir(folder_name) if f.endswith('.csv')]

results = []
row_names = []
for file in csv_files:
    df = pd.read_csv(os.path.join(folder_name, file))
    # The text after the last "_" (minus the extension) labels the row.
    row_name = file.split('_')[-1].split('.')[0]

    avg_completion_time = df['Completion Time'].mean()
    sem_completion_time = df['Completion Time'].sem()

    # Compare on whitespace-normalised text: not every provider's completion
    # is stripped before it is written, so " A" must still match "A".
    # Missing values become "" and never count as correct.
    completion = df['Completion'].fillna('').astype(str).str.strip()
    expected = df['Expected'].fillna('').astype(str).str.strip()
    is_correct = completion.eq(expected) & expected.ne('')
    # mean() gives the fraction correct and yields NaN (not a division by
    # zero) for an empty results file.
    correct_rate = is_correct.mean()

    results.append([avg_completion_time, sem_completion_time, correct_rate])

    # Extract the identifier from the filename and add to row names
    row_names.append(row_name)

# Convert results to a DataFrame and display
summary_df = pd.DataFrame(results, columns=['Avg Completion Time', 'SEM', 'Correct Rate'], index=row_names)
print(summary_df.sort_index())