In [1]:
import argparse
import json
from pathlib import Path

import pandas as pd
import torch
from torch.nn.utils.rnn import pad_sequence
from transformers import AutoModelForCausalLM, AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
tokenizer = AutoTokenizer.from_pretrained("/nfs/public/hf/models/meta-llama/Meta-Llama-3-8B-Instruct")
model = AutoModelForCausalLM.from_pretrained("/nfs/public/hf/models/meta-llama/Meta-Llama-3-8B-Instruct",
    # token='hf_xxx'
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    local_files_only=True
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|██████████| 4/4 [00:04<00:00,  1.02s/it]


In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = 'left'

In [4]:
def assemble_messages(data, system_content):


    inputs = []
    for example in data:
        for explanation in example['explanation']:
            user_content = ""
            user_content += f"Question: {example['question']} Explanation: {explanation}."

            messages = [
                {
                    "role": "system",
                    "content": system_content
                },
                {
                    "role": "user",
                    "content": user_content
                }
            ]

            inputs.append(messages)

    return inputs

In [5]:
def generate_response(inputs):

    input_messages = tokenizer.apply_chat_template(inputs, padding=True, tokenize=True, return_tensors="pt", add_generation_prompt=True).to(device)
    outputs = model.generate(input_messages, max_new_tokens=256)
    responses = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]

    return responses

In [10]:
def save_results(path, system_content, data, responses):

    for e, r in zip(data, responses):

        # check if the key already exists
        if 'llm-rewrites' not in e.keys():
            e['llm-rewrites'] = []

        result = {
            'system_prompt': system_content,
            'response': r
        }
        e['llm-rewrites'].append(result)

    with path.open(mode='w', encoding='utf-8') as f:
        json.dump(data, f, indent=4)

In [12]:
system_content = "'You are a query answering system explaining the methodology used to generate an answer. Rewrite the following explanation while using formal language and reducing redundancy, but do not add new information. Round population figures two three significant figures - for example, 69112701.18 rounds to 69.1 million, 65343184.0 rounds to 65.3 million."

path = Path("../resources/rewriting/examples.json")
with path.open(encoding='utf-8') as f:
    data = json.load(f)

inputs = assemble_messages(data, system_content)
responses = generate_response(inputs)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [13]:
save_results(path, system_content, data, responses)