In [1]:
import random
import string
from algorithm import BaseAlgorithm
from datasets import load_dataset, Dataset

import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="1"


In [2]:
from vllm import LLM, SamplingParams
from tqdm import tqdm

class MetaLearnAlgorithm(BaseAlgorithm):
    """
    This does not utilize the meta_learning_database, and generates from vllm using previous rows only.

    """

    def __init__(self, meta_learning_database: Dataset = None):
        super().__init__(meta_learning_database)
        self.llm = LLM(
            model="meta-llama/Meta-Llama-3-8B-Instruct",
            trust_remote_code=True,
            tensor_parallel_size=1,
            download_dir="/shared/share_mala/andrew/huggingface/cache",
            disable_log_stats=True,
        )

    def rag_prompt_response(self, row: dict) -> str:
        pass

    def generate_response(self, row: dict) -> str:
        prompt = "Below are some examples of the user's past conversation history."
        for i in range(int(row["user_history_length"])):
            past_prompt = row["prompt_" + str(i + 1)]
            past_response = row["response_" + str(i + 1)]
            past_reward = str(row["reward_" + str(i + 1)])
            prompt += (
                "User: "
                + past_prompt
                + "\nAssistant: "
                + past_response
                + "\nReward: "
                + past_reward
                + "\n\n"
            )
        prompt += "Use the contexts above to generate a good response for the user prompt below. Stop after answering the User Prompt, don't give a reward.\n"
        prompt += "User: " + row['test_prompt']
        sampling_params = SamplingParams(temperature=0.5, top_p=0.95, max_tokens=512)
        output = self.llm.generate([prompt], sampling_params)
        return output[0].outputs[0].text

    def generate_evaluation_responses(self, debug=False) -> Dataset:
        dataset = self.eval_dataset
        if debug:
            dataset = dataset.select(range(3))
        responses = [self.generate_response(dataset[i]) for i in tqdm(range(len(dataset)), desc="generating_evaluation_response")]
        dataset = dataset.add_column("test_response", responses)
        return dataset

test_gen = MetaLearnAlgorithm()

INFO 06-04 21:20:14 llm_engine.py:161] Initializing an LLM engine (v0.4.3) with config: model='meta-llama/Meta-Llama-3-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Meta-Llama-3-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=8192, download_dir='/shared/share_mala/andrew/huggingface/cache', load_format=LoadFormat.AUTO, tensor_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), seed=0, served_model_name=meta-llama/Meta-Llama-3-8B-Instruct)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


INFO 06-04 21:20:15 weight_utils.py:207] Using model weights format ['*.safetensors']
INFO 06-04 21:20:17 model_runner.py:146] Loading model weights took 14.9595 GB
INFO 06-04 21:20:18 gpu_executor.py:83] # GPU blocks: 27895, # CPU blocks: 2048
INFO 06-04 21:20:20 model_runner.py:854] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 06-04 21:20:20 model_runner.py:858] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 06-04 21:20:24 model_runner.py:924] Graph capturing finished in 4 secs.


In [3]:
updated_dataset = test_gen.generate_evaluation_responses(debug=True)

Processed prompts: 100%|██████████| 1/1 [00:01<00:00,  1.29s/it, Generation Speed: 68.35 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:06<00:00,  6.60s/it, Generation Speed: 77.66 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:04<00:00,  4.88s/it, Generation Speed: 75.43 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:04<00:00,  4.57s/it, Generation Speed: 74.36 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:03<00:00,  3.12s/it, Generation Speed: 74.46 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:06<00:00,  6.66s/it, Generation Speed: 76.90 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:06<00:00,  6.09s/it, Generation Speed: 75.69 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:03<00:00,  3.75s/it, Generation Speed: 76.28 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:05<00:00,  5.20s/it, Generation Speed: 75.44 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:06<00:00,  6.81s/it, Generation Speed: 75.20 toks/s]
Processed prompts: 100%|██████