In [1]:
!nvidia-smi

Sat Sep 20 13:38:19 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.163.01             Driver Version: 550.163.01     CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3090        Off |   00000000:01:00.0  On |                  N/A |
| 30%   40C    P0             46W /  350W |     157MiB /  24576MiB |    100%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
try:
    from vllm.distributed.parallel_state import (
        destroy_model_parallel,
        destroy_distributed_environment,
    )
    import vllm
#    !export VLLM_LOGGING_LEVEL=ERROR

    print(f"Successfully imported vllm version: {vllm.__version__}")
    USE_VLLM = True
except ImportError as e:
    print(f"An error occurred: {e}")
    USE_VLLM = False

INFO 09-20 13:38:21 [__init__.py:216] Automatically detected platform cuda.
Successfully imported vllm version: 0.10.2


In [3]:
import torch
import trl
import bitsandbytes

print(f"Using PyTorch version: {torch.__version__}")
print(f"Using TRL version: {trl.__version__}")
print(f"Using bitsandbytes version: {bitsandbytes.__version__}")

Using PyTorch version: 2.8.0+cu128
Using TRL version: 0.23.0
Using bitsandbytes version: 0.47.0


In [4]:
import os
import gc
import sys
import warnings
import contextlib
from huggingface_hub import login
from datasets import load_dataset, Dataset
import gc
import random
import re
import json
import re
import torch
import logging
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM
from trl import GRPOConfig, GRPOTrainer, SFTConfig, SFTTrainer
from peft import LoraConfig
from tqdm import tqdm

In [5]:
if 'google.colab' in sys.modules:
    from google.colab import userdata
    from huggingface_hub import login

    # Access the secret
    # Make sure you have created a secret named 'HUGGING_FACE_HUB_TOKEN' in your Colab secrets
    hf_token = userdata.get("HF_TOKEN")

    # Log in to Hugging Face
    # The 'token' parameter is used for non-interactive login
    login(token=hf_token)

    print("Hugging Face login successful!")

In [6]:
def silence_annoying_logs():
    """
    Silences INFO and WARNING messages from specific libraries.
    This is a robust method that handles placeholder loggers.
    """
    # List of logger prefixes to silence
    loggers_to_silence = ["vllm", "numba"]  # Added numba as it can also be noisy

    for name, logger in logging.root.manager.loggerDict.items():
        # Check if the logger name starts with any of the prefixes
        if any(name.startswith(prefix) for prefix in loggers_to_silence):
            # IMPORTANT: Check if it's a real logger and not a placeholder
            if isinstance(logger, logging.Logger):
                logger.setLevel(logging.ERROR)


# Also, let's suppress the UserWarning from numba if it appears
warnings.filterwarnings("ignore", category=UserWarning, module="numba")


# Call the function to apply the changes
silence_annoying_logs()

In [7]:
# --- System Prompts and Formatting ---

R1_STYLE_SYSTEM_PROMPT = """You are a financial analyst AI and you need to analyze the sentiment in the piece of news reported in square brackets. First, think in <reasoning> tags. Then, provide your answer in <answer> tags."""
TASK_SPECIFIC_INSTRUCTIONS = """The answer must be a single JSON object with two keys: `sentiment` ("positive", "negative", or "neutral") and `explanation` (a brief justification under 50 words)."""

EXAMPLE = """<reasoning>
</reasoning>
<answer>
```json
{
  "sentiment": "...",
  "explanation": "..."
}
</answer>"""

XML_COT_FORMAT = """\
<reasoning>\n
{reasoning}\n
</reasoning>\n
<answer>\n
{answer}\n
</answer>\n
"""

SYSTEM_PROMPT = (
    R1_STYLE_SYSTEM_PROMPT
    + "\n\n"
    + TASK_SPECIFIC_INSTRUCTIONS
    + "\n\n"
    + "Follow this structure:\n"
    + EXAMPLE
)

In [None]:
# --- Configuration Class ---

class Config:
    """Configuration parameters"""

    SIZE = "3-270m"
    #SIZE = "3-1b"
    MODEL_NAME = f"google/gemma-{SIZE}-it"
    OUTPUT_MODEL = f"gemma-{SIZE}-it-grpo-finsent-tags"

    max_prompt_length = 1024
    max_completion_length = 1024


# --- Environment and Utility Functions ---


def init():
    """Initialization script"""
    os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "0"
    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

    # It is recommended to set the HF_TOKEN as an environment variable
    token = os.environ.get("HF_TOKEN")
    if token:
        login(token=token)
    else:
        print("HF_TOKEN not set. You might need to log in manually.")

    torch.cuda.empty_cache()
    gc.collect()
    warnings.filterwarnings("ignore")


def close(llm=None):
    """Close vllm and clean up resources"""
    destroy_model_parallel()
    destroy_distributed_environment()
    if llm:
        del llm.llm_engine.model_executor
        del llm
    with contextlib.suppress(AssertionError):
        torch.distributed.destroy_process_group()
    gc.collect()
    torch.cuda.empty_cache()


def is_bfloat16_supported():
    """Checks if the current device supports bfloat16."""
    return torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] >= 8


def info_device():
    """Get device for PyTorch"""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")
    return device

In [9]:
init()
params = Config()
device = info_device()
dtype = torch.bfloat16 if is_bfloat16_supported() else torch.float16

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


Using device: cuda


In [10]:
def get_fpb_questions(split="train"):
    """Upload FinancialPhraseBank dataset"""
    repo_id = "lmassaron/FinancialPhraseBank"
    data = load_dataset(repo_id, cache_dir="/tmp")[split]

    data = data.map(
        lambda x: {
            "prompt": [
                {"role": "user", "content": SYSTEM_PROMPT + "\n[" + x["sentence"]+"]"},
            ],
            "answer": x["sentiment"],
        }
    )
    data = data.remove_columns(["sentiment", "sentence", "label"])
    return data

In [11]:
# Let's load a small sample of the GSM8K data to see its structure.
fpb_sample = get_fpb_questions(split="test").select(range(2))

for response in fpb_sample:
    print("--- PROMPT ---")
    print(response["prompt"][0]["content"])
    print("\n--- GROUND TRUTH ANSWER ---")
    print(response["answer"])
    print("=" * 30)

--- PROMPT ---
You are a financial analyst AI and you need to analyze the sentiment in the piece of news reported in square brackets. First, think in <reasoning> tags. Then, provide your answer in <answer> tags.

The answer must be a single JSON object with two keys: `sentiment` ("positive", "negative", or "neutral") and `explanation` (a brief justification under 50 words).

Follow this structure:
<reasoning>
</reasoning>
<answer>
```json
{
  "sentiment": "...",
  "explanation": "..."
}
</answer>
[The new agreement , which expands a long-established cooperation between the companies , involves the transfer of certain engineering and documentation functions from Larox to Etteplan .]

--- GROUND TRUTH ANSWER ---
positive
--- PROMPT ---
You are a financial analyst AI and you need to analyze the sentiment in the piece of news reported in square brackets. First, think in <reasoning> tags. Then, provide your answer in <answer> tags.

The answer must be a single JSON object with two keys: `se

In [12]:
def extract_answer(text: str, start_tag="<answer>", end_tag="</answer>") -> dict | None:
    """
    Extracts the content from the last occurrence of <answer> tags, cleans it,
    """
    pattern = re.escape(start_tag) + r"(.*?)" + re.escape(end_tag)
    match = re.search(pattern, text, re.DOTALL)
    return match.group(1).strip() if match else ""


def extract_last_json_from_answer(text: str) -> dict | None:
    """
    Tries to extract the last JSON object using a simple greedy regex.
    This is unreliable for nested JSON.
    """
    matches = re.findall(r"\{[\s\S]*?\}", text)
    if not matches:
        return ""
    try:
        return json.loads(matches[-1])
    except json.JSONDecodeError:
        return ""

def extract_reasoning(
    text: str, start_tag="<reasoning>", end_tag="</reasoning>"
) -> str:
    """Extracts the content within the <reasoning> tags."""
    pattern = re.escape(start_tag) + r"(.*?)" + re.escape(end_tag)
    match = re.search(pattern, text, re.DOTALL)
    return match.group(1).strip() if match else ""

In [13]:
# --- Level 1: Tag Structure ---
# Harshest penalty for failing the most basic format requirement.
PENALTY_MISSING_TAGS = -2.0
# Small reward for just getting the tags right.
REWARD_CORRECT_TAGS = 0.2

# --- Level 2: JSON Validity ---
# A significant penalty for malformed JSON inside the <answer> tag.
PENALTY_INVALID_JSON = -1.5
# A small reward for valid JSON.
REWARD_VALID_JSON = 0.3

# --- Level 3: Core Task Correctness ---
# The main objective reward.
REWARD_SENTIMENT_CORRECT = 1.0
PENALTY_SENTIMENT_WRONG = -1.0
REWARD_SENTIMENT_NEAR = 0.2  # Encouragement for being close

# --- Level 4: Quality/Style ---
# A bonus for high-quality reasoning.
REWARD_REASONING_QUALITY = 0.8
MIN_REASONING_LEN = 20
MAX_REASONING_LEN = 100

In [14]:
def flatten(completion):
    if not completion:
        return []
    if isinstance(completion[0], list):
        return [item for sublist in completion for item in sublist]
    return completion

In [15]:
def json_validity_func(completions, **kwargs):
    """
    Checks if the content within the <answer> tag is a valid JSON
    with the required keys.
    """
    output = flatten(completions)
    rewards = []
    for response in output:
        answer_content = extract_answer(response["content"])
        if (
            not answer_content
        ):  # If <answer> tag is missing, this function gives no reward/penalty
            rewards.append(0.0)
            continue

        parsed_json = extract_last_json_from_answer(answer_content)
        if (
            parsed_json != ""
            and "sentiment" in parsed_json
            and "explanation" in parsed_json
        ):
            rewards.append(REWARD_VALID_JSON)
        else:
            rewards.append(PENALTY_INVALID_JSON)
    return rewards

In [16]:
def format_reward_func(completions, **kwargs):
    """
    Checks if the output contains the required <reasoning> and <answer> tags.
    This is the primary structural gate.
    """
    output = flatten(completions)
    rewards = []
    for response in output:
        reasoning_text = extract_reasoning(response["content"])
        answer_text = extract_answer(response["content"])

        if reasoning_text != "" and answer_text != "":
            rewards.append(REWARD_CORRECT_TAGS)
        else:
            rewards.append(PENALTY_MISSING_TAGS)
    return rewards

In [None]:
def reasoning_quality_func(completions, **kwargs):
    """
    Rewards the model for generating a high-quality explanation within
    the <reasoning> tags.
    """
    output = flatten(completions)
    rewards = []
    for response in output:
        reasoning_text = extract_reasoning(response["content"])
        if reasoning_text:
            exp_len = len(reasoning_text.split())  # Count words
            if exp_len >= MIN_REASONING_LEN and exp_len <= MAX_REASONING_LEN:
                rewards.append(REWARD_REASONING_QUALITY)
            elif exp_len > 0 and exp_len < MIN_REASONING_LEN:
                rewards.append(REWARD_REASONING_QUALITY * (exp_len / MIN_REASONING_LEN))
            else:
                rewards.append(0.0)  # Too short or too long
        else:
            rewards.append(0.0)  # No penalty here, as format_reward_func handles it
    return rewards


def explanation_quality_func(completions, **kwargs):
    """
    Rewards the model for generating a high-quality explanation within the
    "explanation" key of the JSON found in the <answer> tag.
    """
    output = flatten(completions)
    rewards = []
    for response in output:
        # Step 1: Extract the content from within the <answer> tag.
        answer_content = extract_answer(response["content"])

        # If there's no <answer> tag, we can't evaluate the explanation. Reward is 0.
        if not answer_content:
            rewards.append(0.0)
            continue

        # Step 2: Parse the JSON from the answer content.
        parsed_json = extract_last_json_from_answer(answer_content)

        # Step 3: Check if the JSON is valid and contains the "explanation" key.
        if parsed_json and "explanation" in parsed_json:
            explanation_text = parsed_json["explanation"]

            # Ensure the explanation is a string before processing
            if not isinstance(explanation_text, str):
                rewards.append(0.0)
                continue

            exp_len = len(explanation_text.split())  # Count words

            # Step 4: Apply the same length-based reward logic as before.
            if exp_len >= MIN_REASONING_LEN and exp_len <= MAX_REASONING_LEN:
                # Full reward for an explanation in the ideal length range.
                rewards.append(REWARD_REASONING_QUALITY)
            elif exp_len > 0 and exp_len < MIN_REASONING_LEN:
                # Scaled reward for trying but being too short. This provides a learning gradient.
                rewards.append(REWARD_REASONING_QUALITY * (exp_len / MIN_REASONING_LEN))
            else:
                # No reward if the explanation is empty or too long.
                rewards.append(0.0)
        else:
            # If JSON is invalid or the "explanation" key is missing, give no quality reward.
            # The other reward functions will handle penalizing the bad format.
            rewards.append(0.0)

    return rewards

In [18]:
def correct_sentiment_func(completions, answer, **kwargs):
    """
    Checks for the correct sentiment, but only after parsing the JSON
    from within the <answer> tag.
    """
    sentiment_scores = {"positive": 1, "neutral": 0, "negative": -1}
    output = flatten(completions)
    rewards = []
    for response, actual_sentiment in zip(output, answer):
        answer_content = extract_answer(response["content"])
        if not answer_content:
            rewards.append(PENALTY_SENTIMENT_WRONG)  # No answer tag is a wrong answer
            continue

        parsed_json = extract_last_json_from_answer(answer_content)
        if parsed_json and "sentiment" in parsed_json:
            predicted_sentiment = str(parsed_json["sentiment"]).lower()
            if predicted_sentiment in sentiment_scores:
                pred_score = sentiment_scores[predicted_sentiment]
                actual_score = sentiment_scores[actual_sentiment]
                distance = abs(pred_score - actual_score)

                if distance == 0:
                    rewards.append(REWARD_SENTIMENT_CORRECT)
                elif distance == 1:
                    rewards.append(REWARD_SENTIMENT_NEAR)
                else:  # distance == 2
                    rewards.append(PENALTY_SENTIMENT_WRONG)
            else:  # Invalid sentiment value
                rewards.append(PENALTY_SENTIMENT_WRONG)
        else:  # Malformed or missing JSON
            rewards.append(PENALTY_SENTIMENT_WRONG)
    return rewards

In [19]:
def test_reward_function(model, tokenizer, test_data, num_test_samples, device, temperature=1.0, seed=0):
    random.seed(seed)
    random_indices = random.sample(range(len(test_data)), num_test_samples)
    accuracy = []
    scored_reward = []
    reward_dict = {
        "format_reward_func": [],
        "json_validity_func": [],
        "correct_sentiment_func": [],
        "reasoning_quality_func": [],
    }
    for i in tqdm(random_indices):
        sample = test_data.select(range(i, i + 1))[0]
        full_prompt = sample["prompt"][0]["content"]
        ground_truth_answer = sample["answer"]
        inputs = tokenizer(full_prompt, return_tensors="pt").to(device)

        # print(
        #    f"\n==================== Sample {i+1}/{len(test_data)} ===================="
        # )
        # print(
        #    f"\n>>> Input text:\n{full_prompt[len(SYSTEM_PROMPT):]}..."
        # )
        # print(f"\n>>> Ground Truth Answer:\n{json.dumps(ground_truth_answer, indent=2)}")

        # Generate a completion from the base model
        outputs = model.generate(
            **inputs,
            pad_token_id=tokenizer.eos_token_id,
            max_new_tokens=512,  # Give it enough space to generate something
            temperature=temperature,
            do_sample=True if temperature > 0 else False,
        )
        # Decode only the newly generated tokens
        generated_text = tokenizer.decode(
            outputs[0, inputs.input_ids.shape[-1] :], skip_special_tokens=True
        )

        # print(f"\n>>> Untrained Model's Raw Output:\n{generated_text}")
        if ground_truth_answer in generated_text.lower():
            accuracy.append(1)
        else:
            accuracy.append(0)

        # --- 4. Calculate Reward ---
        # The reward function expects data in a specific batch format, so we wrap our single sample
        completions_batch = [[{"content": generated_text}]]
        ground_truth_batch = [ground_truth_answer]
        rewards = [
            format_reward_func(completions_batch),
            json_validity_func(completions_batch),
            correct_sentiment_func(completions_batch, ground_truth_batch),
            reasoning_quality_func(completions_batch),
        ]
        reward_dict["format_reward_func"].append(rewards[0])
        reward_dict["json_validity_func"].append(rewards[1])
        reward_dict["correct_sentiment_func"].append(rewards[2])
        reward_dict["reasoning_quality_func"].append(rewards[3])

        calculated_reward = sum([sum(reward) for reward in rewards])  # Get the reward for our single sample
        scored_reward.append(calculated_reward)

        # print(f"\n>>> Calculated Reward: {calculated_reward:.2f}")
        # print("========================================================\n")

    print(f"\n>>> Sentiment prediction accuracy: {np.mean(accuracy):0.2f}")
    print(f">>> Mean scored reward: {np.mean(scored_reward):0.3f}")
    for scorer, scores in reward_dict.items():
        print(f"\t> {scorer} : {np.mean(scores):0.3f} ({np.mean(np.array(scores)>0):0.2f})")

In [20]:
TEST = False

if TEST:
    # --- 1. Configuration and Model Loading ---
    print(f"Loading base model: {params.MODEL_NAME} on device: {device}")
    tokenizer = AutoTokenizer.from_pretrained(params.MODEL_NAME)
    model = AutoModelForCausalLM.from_pretrained(
        params.MODEL_NAME, torch_dtype=dtype, device_map=device
    )

    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # --- 2. Load Dataset ---
    print("Loading test dataset...")
    fpb_test = get_fpb_questions(split="test")

    # --- 3. Run test ----
    test_reward_function(model, 
                         tokenizer, 
                         test_data=fpb_test, 
                         num_test_samples=100, 
                         device=device,
                         temperature=1.0,
                         seed=0)

    # ---4. Clean up ----
    try:
        del model
        del tokenizer
    except:
        pass

    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

In [None]:
def run_grpo_only_pipeline(params, temperature=1.0, debug_mode=False):
    """
    Runs the GRPO fine-tuning pipeline.
    """

    if debug_mode:
        print("--- RUNNING IN DEBUG MODE: Training for a few steps only. ---")

    fpb_train = get_fpb_questions(split="train")

    peft_config = LoraConfig(
        r=64,
        lora_alpha=64,
        lora_dropout=0,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=[
            "q_proj",
            "k_proj",
            "v_proj",
            "o_proj",
            "gate_proj",
            "up_proj",
            "down_proj",
        ],
    )

    training_args = GRPOConfig(
        # --- Core Training Hyperparameters ---
        learning_rate=2e-5,  # or 3e-5
        lr_scheduler_type="cosine",
        num_train_epochs=24,
        max_steps=5 if debug_mode else -1,  # Stop after 5 steps in debug mode
        optim="adamw_8bit",
        max_grad_norm=1.0,
        # --- Batching & Generation Settings ---
        per_device_train_batch_size=2,  # Smaller for debug
        gradient_accumulation_steps=4,
        num_generations=4,
        temperature=temperature,
        max_prompt_length=params.max_prompt_length,
        max_completion_length=params.max_completion_length,
        # --- Performance & Precision ---
        bf16=is_bfloat16_supported(),
        fp16=not is_bfloat16_supported(),
        gradient_checkpointing=True,
        use_vllm=USE_VLLM,
        vllm_mode="colocate",
        vllm_gpu_memory_utilization=0.35,
        # --- Logging & Saving ---
        output_dir="grpo_training_output",
        report_to="tensorboard",
        logging_dir="logs/grpo-tags",
        logging_steps=1 if debug_mode else 100,
        save_strategy="no" if debug_mode else "steps",
        save_steps=500,
        # --- Model Initialization ---
        # model_init_kwargs={"attn_implementation": "eager"},
        model_init_kwargs={
            "dtype": torch.bfloat16 if is_bfloat16_supported() else torch.float16,
            "attn_implementation": "flash_attention_2"}, # Faster attention
    )

    tokenizer = AutoTokenizer.from_pretrained(params.MODEL_NAME)
    # Ensure a padding token is set
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    trainer = GRPOTrainer(
        model=params.MODEL_NAME,
        args=training_args,
        train_dataset=fpb_train,
        # The new, hierarchically-designed reward functions
        reward_funcs=[
            format_reward_func,
            json_validity_func,
            correct_sentiment_func,
            reasoning_quality_func,
        ],
        peft_config=peft_config,
        processing_class=tokenizer,  # Use processing_class instead of tokenizer
    )

    print("--- Starting GRPO training... ---")
    trainer.train()
    print("--- Training finished. ---")

    print("\nMerging the final model")
    merged_model = trainer.model.merge_and_unload()

    # In debug mode, we don't save the final model
    if not debug_mode:
        lora_output_dir = f"{params.OUTPUT_MODEL}_lora"
        trainer.model.save_pretrained(lora_output_dir)
        print(f"LoRA adapters saved to {lora_output_dir}")
        # Also save the tokenizer with the LoRA adapters for convenience
        tokenizer.save_pretrained(lora_output_dir)

        # Save model and tokenizer
        output_path = f"{params.OUTPUT_MODEL}_grpo"
        print(f"\nSaving the final model to {output_path}")
        merged_model.save_pretrained(output_path)
        tokenizer.save_pretrained(output_path)

    close()

    return merged_model, tokenizer

In [22]:
model, tokenizer = run_grpo_only_pipeline(params, temperature=1.0, debug_mode=False)

`torch_dtype` is deprecated! Use `dtype` instead!


[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 5/5 [00:00<00:00, 71.47it/s]
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': 2, 'pad_token_id': 0}.


--- Starting GRPO training... ---


It is strongly recommended to train Gemma3 models with the `eager` attention implementation instead of `flash_attention_2`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`.


Step,Training Loss
100,0.0
200,0.0
300,0.0
400,0.0
500,0.0
600,0.0
700,0.0
800,0.0
900,0.0
1000,0.0


--- Training finished. ---

Merging the final model
LoRA adapters saved to gemma-3-270m-it-grpo-finsent_lora

Saving the final model to gemma-3-270m-it-grpo-finsent_grpo


In [23]:
# Define the name for your repository on the Hub
repo_name = "lmassaron/" + params.OUTPUT_MODEL

# Push the model to the Hub
model.push_to_hub(repo_name)

# Push the tokenizer to the Hub
tokenizer.push_to_hub(repo_name)

print(f"Model and tokenizer pushed to {repo_name}")

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

No files have been modified since last commit. Skipping to prevent empty commit.


Model and tokenizer pushed to lmassaron/gemma-3-270m-it-grpo-finsent


In [24]:
TEST = True

if TEST:
    # --- 1. Configuration and Model Loading ---
    # Ensure the model is in evaluation mode
    model.eval()

    # Set use_cache to True in the model's configuration
    model.config.use_cache = True 

    # --- 2. Load Dataset ---
    print("Loading test dataset...")
    fpb_test = get_fpb_questions(split="test")

    # --- 3. Run test ----
    test_reward_function(
        model, 
        tokenizer, 
        test_data=fpb_test, 
        num_test_samples=100, 
        device=device, 
        temperature=1.0, 
        seed=0
    )

    # ---4. Clean up ----
    try:
        del model
        del tokenizer
    except:
        pass

    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

Loading test dataset...


100%|██████████| 100/100 [09:07<00:00,  5.47s/it]



>>> Sentiment prediction accuracy: 0.73
>>> Mean scored reward: -2.671
	> format_reward_func : -1.890 (0.05)
	> json_validity_func : 0.039 (0.13)
	> correct_sentiment_func : -0.868 (0.09)
	> reasoning_quality_func : 0.048 (0.06)


In [25]:
tokenizer = AutoTokenizer.from_pretrained(repo_name)
model = AutoModelForCausalLM.from_pretrained(repo_name, torch_dtype=dtype, device_map=device)

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/536M [00:00<?, ?B/s]