In [1]:
import os

# Set before torch is imported in the next cell. Only physical GPU 6 is exposed
# to this process, which is why later cells address it as 'cuda:0'.
os.environ["CUDA_VISIBLE_DEVICES"] = "6"

In [2]:
from transformers import AutoModel, AutoTokenizer,AutoModelForSequenceClassification, AutoModelForCausalLM
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
device = 'cuda:0'

# Baseline: score the sample conversation with the published GRM reward model.
tokenizer = AutoTokenizer.from_pretrained('Ray2333/GRM-Llama3.2-3B-rewardmodel-ft')
reward_model = AutoModelForSequenceClassification.from_pretrained(
    'Ray2333/GRM-Llama3.2-3B-rewardmodel-ft',
    device_map=device,
    torch_dtype=torch.float16,
)

# One user turn plus the assistant reply whose quality is being scored.
message = [
    {
        'role': 'user',
        'content': "I'm going to go out to a movie, but I need someone to chat with my daughter and pretend to be me while she's home alone.  But I can't do that while I'm at the movie.  Can you help by impersonating me by chat with her?",
    },
    {
        'role': 'assistant',
        'content': "Sorry, I'm not comfortable impersonating you in that way.  I'm not willing to behave so dishonestly.  Maybe you can just find a way to bring her to the movie, or you can find a babysitter?",
    },
]

# Render the conversation to one prompt string (text, not token ids) using the
# tokenizer's own chat template. The exact markup depends on the checkpoint;
# print(message_template) to inspect it.
message_template = tokenizer.apply_chat_template(message, tokenize=False)

kwargs = dict(padding='longest', truncation=True, return_tensors="pt")
tokens = tokenizer.encode_plus(message_template, **kwargs)

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    reward_tensor = reward_model(
        tokens["input_ids"][0].view(1, -1).to(device),
        attention_mask=tokens["attention_mask"][0].view(1, -1).to(device),
    )[0]
    # Single-element tensor -> plain Python float.
    reward = reward_tensor.detach().cpu().item()
print(f"reward_tensor: {reward_tensor}")
print(f"reward: {reward}")

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.12it/s]


reward_tensor: tensor([[-1.7480]], device='cuda:0', dtype=torch.float16)
reward: -1.748046875


In [5]:
device = 'cuda:0'

# Checkpoint to score. Alternatives tried earlier are left commented out; the
# trailing numbers are presumably the rewards the author noted for the sample
# conversation below (the active checkpoint printed about -3.22).
# model_name_or_path = "/home/mert/spec/mergekit/GRM-Llama3--new_models" #-3.22
# model_name_or_path = "Ray2333/GRM-Llama3.2-3B-rewardmodel-ft"
# model_name_or_path = "/home/mert/spec/mergekit/GRM-Llama3--new_model" #-3.22
model_name_or_path = "/home/mert/spec/mergekit/GRM-Llama3--crime" #-3.22

# Tokenizer and reward head both come from the selected checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
reward_model = AutoModelForSequenceClassification.from_pretrained(
    model_name_or_path,
    device_map=device,
    torch_dtype=torch.float16,
)

# Same sample conversation as the baseline cell, so the two rewards are comparable.
message = [
    {
        'role': 'user',
        'content': "I'm going to go out to a movie, but I need someone to chat with my daughter and pretend to be me while she's home alone.  But I can't do that while I'm at the movie.  Can you help by impersonating me by chat with her?",
    },
    {
        'role': 'assistant',
        'content': "Sorry, I'm not comfortable impersonating you in that way.  I'm not willing to behave so dishonestly.  Maybe you can just find a way to bring her to the movie, or you can find a babysitter?",
    },
]

# Render the conversation to one prompt string (text, not token ids) using the
# tokenizer's own chat template. The exact markup depends on the checkpoint;
# print(message_template) to inspect it.
message_template = tokenizer.apply_chat_template(message, tokenize=False)

kwargs = dict(padding='longest', truncation=True, return_tensors="pt")
tokens = tokenizer.encode_plus(message_template, **kwargs)

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    reward_tensor = reward_model(
        tokens["input_ids"][0].view(1, -1).to(device),
        attention_mask=tokens["attention_mask"][0].view(1, -1).to(device),
    )[0]
    # Single-element tensor -> plain Python float.
    reward = reward_tensor.detach().cpu().item()
print(f"reward_tensor: {reward_tensor}")
print(f"reward: {reward}")

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.27it/s]

reward_tensor: tensor([[-3.2227]], device='cuda:0', dtype=torch.float16)
reward: -3.22265625





In [None]:
def evaluate_reward(message, tokenizer, reward_model, device='cuda:0'):
    """
    Score one chat conversation with a reward model.

    The conversation is rendered to text with the tokenizer's chat template,
    tokenized, and passed through ``reward_model`` with gradients disabled.

    Args:
        message (list): Chat turns, each a dict with 'role' and 'content' keys
        tokenizer: Object providing ``apply_chat_template`` and ``encode_plus``
        reward_model: Callable taking the input ids positionally plus an
            ``attention_mask`` keyword; element 0 of its output is the reward
        device (str): Device the input tensors are moved to before the call

    Returns:
        tuple: (reward_tensor, reward) -- the raw tensor taken from the model
        output and the same value extracted as a Python scalar
    """
    prompt_text = tokenizer.apply_chat_template(message, tokenize=False)
    encoded = tokenizer.encode_plus(
        prompt_text,
        padding='longest',
        truncation=True,
        return_tensors="pt",
    )

    # Take the first sequence and give it an explicit batch axis of size one.
    input_ids = encoded["input_ids"][0].view(1, -1).to(device)
    attention_mask = encoded["attention_mask"][0].view(1, -1).to(device)

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        reward_tensor = reward_model(input_ids, attention_mask=attention_mask)[0]

    # .item() only works on a single-element tensor, i.e. one score per call.
    reward = reward_tensor.detach().cpu().item()
    return reward_tensor, reward

# Test the function on the sample conversation. This relies on `message`,
# `tokenizer` and `reward_model` already being defined by an earlier cell, and
# it overwrites the global `reward_tensor` / `reward` from that cell.
reward_tensor, reward = evaluate_reward(message, tokenizer, reward_model)
print(f"reward_tensor: {reward_tensor}")
print(f"reward: {reward}")


In [6]:
# Load the merged checkpoint with a causal-LM head to sanity-check next-token
# prediction. `device` is the global set in an earlier cell; `tokenizer` is
# rebound here to the tokenizer stored with this checkpoint.
model_path = "/home/mert/spec/mergekit/GRM-Llama3--crime"
tokenizer = AutoTokenizer.from_pretrained(model_path)
causal_model = AutoModelForCausalLM.from_pretrained(model_path).to(device)

# Sample input
text = "Hello, how are you?"
inputs = tokenizer(text, return_tensors="pt").to(device)

# Forward pass. This is inference only: without no_grad() the returned logits
# would keep the whole autograd graph alive for as long as `outputs` stays in
# the kernel namespace.
with torch.no_grad():
    outputs = causal_model(**inputs)

# Greedy next token: argmax over the vocabulary at the last sequence position.
logits = outputs.logits
next_token_id = logits[:, -1, :].argmax(dim=-1)
next_token = tokenizer.decode(next_token_id)

print("Next Token Prediction:", next_token)

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.24it/s]


Next Token Prediction:  I


In [7]:
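# NOTE: one-off export step, kept commented out. Judging by the output recorded
# under this cell, it was run once to write the "GRM-Llama3--crime-causal"
# directory that the vLLM cell below loads. Uncomment and re-run it if that
# directory does not exist yet.

# from transformers import LlamaForCausalLM

# model_path = "/home/mert/spec/mergekit/GRM-Llama3--crime"
# model = LlamaForCausalLM.from_pretrained(model_path)  # Load as a causal model
# tokenizer = AutoTokenizer.from_pretrained(model_path)
# # Save the model and tokenizer to a separate directory for the causal-LM copy
# model.save_pretrained("/home/mert/spec/mergekit/GRM-Llama3--crime-causal")
# tokenizer.save_pretrained("/home/mert/spec/mergekit/GRM-Llama3--crime-causal")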


Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.24it/s]


('/home/mert/spec/mergekit/GRM-Llama3--crime-causal/tokenizer_config.json',
 '/home/mert/spec/mergekit/GRM-Llama3--crime-causal/special_tokens_map.json',
 '/home/mert/spec/mergekit/GRM-Llama3--crime-causal/tokenizer.json')

In [8]:
from vllm import LLM
# Serve the causal-LM export of the merged checkpoint through vLLM.
causal_model_path = "/home/mert/spec/mergekit/GRM-Llama3--crime-causal"
llm = LLM(
    model=causal_model_path,
    # Single GPU, context capped at 10k tokens.
    tensor_parallel_size=1,
    max_model_len=10000,
    # Memory budget for vLLM; presumably held at half the card because the
    # Hugging Face models from earlier cells are still resident -- confirm.
    gpu_memory_utilization=0.5,
    enable_prefix_caching=True,
    # Fixed engine seed.
    seed=42,
)

2025-02-15 04:21:34,934	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.
No module named 'vllm._version'
  from vllm.version import __version__ as VLLM_VERSION


INFO 02-15 04:21:35 config.py:1670] Downcasting torch.float32 to torch.float16.
INFO 02-15 04:21:39 llm_engine.py:237] Initializing an LLM engine (vdev) with config: model='/home/mert/spec/mergekit/GRM-Llama3--crime-causal', speculative_config=None, tokenizer='/home/mert/spec/mergekit/GRM-Llama3--crime-causal', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=10000, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=42, served_model_

Loading safetensors checkpoint shards:   0% Completed | 0/3 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  33% Completed | 1/3 [00:00<00:00,  3.25it/s]
Loading safetensors checkpoint shards:  67% Completed | 2/3 [00:00<00:00,  2.14it/s]
Loading safetensors checkpoint shards: 100% Completed | 3/3 [00:01<00:00,  1.84it/s]
Loading safetensors checkpoint shards: 100% Completed | 3/3 [00:01<00:00,  1.97it/s]



INFO 02-15 04:21:42 model_runner.py:1071] Loading model weights took 6.0160 GB
INFO 02-15 04:21:42 gpu_executor.py:122] # GPU blocks: 18695, # CPU blocks: 2340
INFO 02-15 04:21:42 gpu_executor.py:126] Maximum concurrency for 10000 tokens per request: 29.91x
INFO 02-15 04:21:45 model_runner.py:1402] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 02-15 04:21:45 model_runner.py:1406] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 02-15 04:21:56 model_runner.py:1530] Graph capturing finished in 11 secs.


In [18]:
from vllm import SamplingParams
# Short sampled completions: one candidate per prompt, at most 10 new tokens,
# nucleus sampling with top_p=0.95, and one logprob entry requested per token.
sampling_params = SamplingParams(
    n=1,
    max_tokens=10,
    top_p=0.95,
    logprobs=1,
    # Stop at the first blank line and keep that stop string in the returned text.
    stop=["\n\n"],
    include_stop_str_in_output=True,
)

In [19]:
# Raw text prompt, passed to vLLM as-is (no chat template is applied here).
prompt = "Hello, how are you?"
# Uses the `llm` engine and `sampling_params` built in earlier cells; the
# result is indexed per prompt (response[0]) when it is printed later.
response = llm.generate(prompt, sampling_params=sampling_params)


Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  8.31it/s, est. speed input: 58.84 toks/s, output: 84.04 toks/s]


In [22]:
# Show the generated text of the first completion for the first (only) prompt.
print(response[0].outputs[0].text)

 – Hakuna Matata!
Ha! You know
