In [1]:
import yaml
from src.interactivelearning.rewardmodel import RandomRewardModel, Llama3RwardModel
from src.interactivelearning.ppotrainer import CustomPPOTrainer
from trl import PPOConfig, AutoModelForCausalLMWithValueHead
from src.interactivelearning.datasetbuilder import IMDBDatasetBuilder, DatasetCombiner, TinyStoriesDatasetBuilder, WritingPromptsDatasetBuilder
from src.interactivelearning.utils import load_yaml_config
from src.interactivelearning.ppoconfig import CustomPPOConfig


  from .autonotebook import tqdm as notebook_tqdm


INFO 07-31 12:37:55 [__init__.py:244] Automatically detected platform cuda.


2025-07-31 12:37:57,877	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


[2025-07-31 12:37:58,108] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/cephfs/users/bashir/miniconda3/envs/myenv/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/cephfs/users/bashir/miniconda3/envs/myenv/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::runtime_error::~runtime_error()@GLIBCXX_3.4'
/cephfs/users/bashir/miniconda3/envs/myenv/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__gxx_personality_v0@CXXABI_1.3'
/cephfs/users/bashir/miniconda3/envs/myenv/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::tellp()@GLIBCXX_3.4'
/cephfs/users/bashir/miniconda3/envs/myenv/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::chrono::_V2::steady_clock::now()@GLIBCXX_3.4.19'
/cephfs/users/bashir/miniconda3/envs/myenv/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_M_replace_aux(unsigned long, unsigned long, unsi

[2025-07-31 12:37:59,822] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False


In [2]:
# Load the two YAML configuration files used by this notebook.
# Each path is kept next to the call that reads it.
ppo_config_path = "config/ppo.yaml"
ppo_cfg = load_yaml_config(ppo_config_path)

teacher_config_path = "config/teacher.yaml"
teacher_cfg = load_yaml_config(teacher_config_path)


In [3]:
# --- PPO hyper-parameters -------------------------------------------------
# Learning rate: if the config loader is PyYAML-based, exponent literals
# written without a decimal point (e.g. `1e-5`) come back as strings, so
# coerce string values to float before handing them to the config object.
learning_rate = ppo_cfg.get("learning_rate", 1.41e-5)
if isinstance(learning_rate, str):
    learning_rate = float(learning_rate)

# Batch sizes: honour an explicit `mini_batch_size` entry when the YAML
# provides one; otherwise keep the previous behaviour of using the full
# batch as a single mini-batch.
batch_size = ppo_cfg.get("batch_size")
mini_batch_size = ppo_cfg.get("mini_batch_size", batch_size)

ppo_config = CustomPPOConfig(
    model_name=ppo_cfg["model_name"],  # required key: fail loudly if missing
    learning_rate=learning_rate,
    log_with=ppo_cfg.get("log_with", None),
    mini_batch_size=mini_batch_size,
    batch_size=batch_size,
    output_min_length=ppo_cfg.get("output_min_length", 64),
    output_max_length=ppo_cfg.get("output_max_length", 128),
)

# --- Data / budget settings (None when the key is absent) -----------------
token_limit = ppo_cfg.get("token_limit")
data_path = ppo_cfg.get("data_path")

# Length bounds forwarded to the dataset builder as min_len / max_len.
query_min_length = ppo_cfg.get("query_min_length")
query_max_length = ppo_cfg.get("query_max_length")


In [4]:
    # Dataset builders
builder1 = WritingPromptsDatasetBuilder(
    ppo_config,
    cache_dir=data_path,
    min_len=query_min_length,
    max_len=query_max_length,
)

# Wrap the single builder in a combiner, apply the configured limit, then
# load the final dataset. The combiner keeps its own name so that
# `combined_dataset` only ever refers to the loaded dataset.
# NOTE(review): the recorded log for this cell reports 6228419 words consumed
# against a budget of 2000 — confirm whether the combiner is expected to
# truncate here or whether the budget is only enforced by the trainer.
combiner = DatasetCombiner([builder1])
combiner.set_token_limit(token_limit=token_limit)
combined_dataset = combiner.load()



2025-07-31 12:38:00,473 - ppo_trainer - INFO - Total word budget set to 2000 (greedy allocation)
2025-07-31 12:38:00,475 - ppo_trainer - INFO - Word‑limit set to 2000
2025-07-31 12:38:00,476 - ppo_trainer - INFO - Loading WritingPrompts (train)…
Map: 100%|██████████| 246043/246043 [00:23<00:00, 10538.73 examples/s]
Map: 100%|██████████| 246043/246043 [00:14<00:00, 16742.47 examples/s]
2025-07-31 12:38:41,035 - ppo_trainer - INFO - → saving cache to data/ppo/euclaise_writingprompts/tokenized
Saving the dataset (1/1 shards): 100%|██████████| 246043/246043 [00:00<00:00, 415686.50 examples/s]
2025-07-31 12:38:41,641 - ppo_trainer - INFO - WritingPrompts ready: 246043 rows
2025-07-31 12:39:16,952 - ppo_trainer - INFO - Builder euclaise/writingprompts consumed 6228419 words → -6226419 remaining
2025-07-31 12:39:16,954 - ppo_trainer - INFO - Concatenating 1 datasets (final budget used: 6228419 / 2000)


In [5]:
# Preview only the first few prompts rather than echoing the whole column
# into the notebook output.
combined_dataset["query"][:10]

['Story idea: The moon is actually a giant egg , and it has just started to hatch . Story:',
 'Story idea: You find a rip in time walking through the alleys . You enter it to find yourself on a metal table with surgical instruments on a chair next to you . Story:',
 'Story idea: You glance at your watch 10:34 am , roughly 10 seconds later your plane explodes over the Pacific Ocean . Your eyes open as you jolt awake . The familiar hum of the planes engine remains . Checking your watch it is 9:35 Story:',
 'Story idea: Through Iron And Flame Story:',
 'Story idea: You live in a world where there has never been sickness , and you are the first to have ever experienced being sick . Story:',
 'Story idea: Write a horror story from the perspective of the antagonist . Make them as sympathetic as possible . Story:',
 'Story idea: An alien invasion happens during an alien invasion . Story:',
 'Story idea: Season 30 of Game of Thrones Story:',
 'Story idea: The worst job interview anyone has eve

In [5]:
# Reward callable that is later passed to CustomPPOTrainer as `reward_fn`.
# NOTE(review): presumably a random-score stand-in for smoke-testing the PPO
# loop; Llama3RwardModel is imported above but unused in the visible cells —
# confirm which reward model is intended for real runs.
reward_model = RandomRewardModel()

In [6]:
# Two separate copies are loaded from the same checkpoint: one is handed to
# the trainer as `model`, the other as `ref_model`.
checkpoint_name = ppo_cfg["model_name"]
model = AutoModelForCausalLMWithValueHead.from_pretrained(checkpoint_name)
ref_model = AutoModelForCausalLMWithValueHead.from_pretrained(checkpoint_name)

# Reuse the tokenizer already held by the dataset builder.
tokenizer = builder1.tokenizer


In [7]:
# Assemble the PPO trainer from the objects built in the previous cells.
trainer = CustomPPOTrainer(
    config=ppo_config,
    model=model,
    ref_model=ref_model,
    tokenizer=tokenizer,
    dataset=combined_dataset,
    reward_fn=reward_model,
    # The config key is called `token_limit` while the trainer parameter is
    # `word_budget`. NOTE(review): confirm both refer to the same unit.
    word_budget=token_limit,
    hf_org=ppo_cfg.get("hf_org", "llm-slice"),
    save_base_dir=ppo_cfg.get("save_base_dir", "saved_models"),
)

# Generation kwargs from config. An empty `generation_kwargs:` entry in YAML
# loads as None, which cannot be unpacked with `**`; treat it like a missing
# key and pass no overrides.
generation_kwargs = ppo_cfg.get("generation_kwargs") or {}
trainer.set_generation_kwargs(**generation_kwargs)


[34m[1mwandb[0m: Currently logged in as: [33mbizalihamza[0m ([33mbizalihamza-fraunhofer-iais[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


2025-07-23 12:41:30,479 - ppo_trainer - INFO - &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
2025-07-23 12:41:30,481 - ppo_trainer - INFO - saved_models/gpt2_ppo_2K__2025-07-23__12-41-30/meta_data


In [None]:
# Launch PPO training; the epoch count is read from the YAML config and
# defaults to a single epoch when the key is absent.
num_epochs = ppo_cfg.get("num_epochs", 1)
trainer.run_training_loop(num_epochs=num_epochs)


2025-07-23 12:41:30,489 - ppo_trainer - INFO - Start training w/ budgets: prompt=2000, gen=100000000 (per epoch=1)
2025-07-23 12:41:30,489 - ppo_trainer - INFO - Epoch 1/1 …
  "input_ids": [torch.tensor(b["input_ids"]) for b in batch],
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  std_scores = data["scores"].std()
  stats["tokens/queries_len_std"] = torch.std(query_lens).cpu().numpy().item()
  stats["tokens/responses_len_std"] = torch.std(response_lens).cpu().numpy().item()
epoch 1/1: 100%|██████████| 27/27 [00:24<00:00,  1.10it/s]
2025-07-23 12:41:55,069 - ppo_trainer - INFO - Epoch 1 finished: prompt=1981, gen=1704
2025-07-23 12:42:08,867 - ppo_trainer - INFO - Pushed → llm-slice/gpt2_ppo_2K:main
2025-07-23 12:42:08,869 - ppo_trainer - INFO - Logs saved → saved_models/gpt2_ppo_2K__2025-07-23__12-41-30/met

: 

In [6]:
# Scratch value: a scientific-notation literal is already a Python float,
# so no explicit conversion is needed.
a = 1.0e-5

In [7]:
# Echo the scratch value bound in the previous cell (recorded output: 1e-05).
a

1e-05

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hub checkpoint to inspect; `branch` may be a branch name or a tag.
repo = "llm-slice/babylm-gpt2-small-90M-seed41"
branch = "chck_900M"

# NOTE: this rebinds `tokenizer` and `model`, replacing the PPO objects
# created in the earlier cells for the remainder of the kernel session.
tokenizer = AutoTokenizer.from_pretrained(repo, revision=branch)
model = AutoModelForCausalLM.from_pretrained(repo, revision=branch)

# Report the three sizes side by side so a mismatch between the tokenizer,
# the model config and the embedding matrix is easy to spot.
vocab_report = (
    ("len(tokenizer)          :", len(tokenizer)),
    ("model.config.vocab_size :", model.config.vocab_size),
    ("embeddings.rows         :", model.get_input_embeddings().num_embeddings),
)
for label, value in vocab_report:
    print(label, value)
