In [1]:
import os
from datasets import load_dataset
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM

from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import TrainingArguments

In [2]:
os.environ["WANDB_PROJECT"] = "entity_tracking"

In [3]:
dataset = load_dataset("nvidia/OpenMathInstruct-1")
train_dataset = dataset["train"]
corr_gsm8k_train = train_dataset.filter(
    lambda x: x["is_correct"] and x["dataset"] == "gsm8k")

In [4]:
print(len(corr_gsm8k_train))
print(corr_gsm8k_train[0])

897996
{'question': 'Martha has 18 crayons. She lost half of them, so she bought a new set of 20 crayons. How many crayons in total does Martha have after the purchase?', 'expected_answer': '29', 'predicted_answer': '29', 'error_message': '', 'is_correct': True, 'generation_type': 'masked_reference_solution', 'dataset': 'gsm8k', 'generated_solution': "Let's solve this problem using Python code.\n<llm-code>\namount_of_lost_crayons = 18 / 2\namount_of_new_crayons = 20\ntotal_amount = amount_of_lost_crayons + amount_of_new_crayons\ntotal_amount\n</llm-code>\n<llm-code-output>\n29.0\n</llm-code-output>\nThus Martha has \\boxed{29} crayons in total."}


In [5]:
897996/128

7015.59375

In [6]:
MODEL_NAME = "allenai/OLMo-1B"

local_rank = os.getenv("LOCAL_RANK")
device_string = "cuda:" + str(local_rank)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME, trust_remote_code=True, device_map={'': device_string})
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

RuntimeError: Invalid device string: 'cuda:None'

In [None]:
# Args
max_seq_length = 1536
output_dir = "/home/stoshniwal/Research/entity_tracking/models/olmo_openmath_sft"

per_device_train_batch_size = 16
gradient_accumulation_steps = 128/(per_device_train_batch_size * 2)

# Saving/Logging details
save_steps = 1000
save_total_limit=10
logging_steps = 10
num_train_epochs = 2

# Optimizer
optim = "adamw_hf"
learning_rate = 2e-5
warmup_ratio = 0.03
lr_scheduler_type = "linear"

training_arguments = TrainingArguments(
    output_dir=output_dir,
    # Device 
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    # Optimizer
    optim=optim,
    learning_rate=learning_rate,
    lr_scheduler_type=lr_scheduler_type,
    bf16=True,
    warmup_ratio=warmup_ratio,
    # Save steps
    save_steps=save_steps,
    save_total_limit=save_total_limit,
    logging_steps=logging_steps,
    num_train_epochs=num_train_epochs,
    
    group_by_length=True,
    gradient_checkpointing=False,
    report_to="wandb",
)

In [None]:
def formatting_prompts_func(example):
    output_texts = []
    for i in range(len(example['question'])):
        text = f"### Question: {example['question'][i]}\n ### Answer: {example['generated_solution'][i]}"
        output_texts.append(text)
    return output_texts


response_template = " ### Answer:"
collator = DataCollatorForCompletionOnlyLM(
    response_template, tokenizer=tokenizer)

In [None]:
trainer = SFTTrainer(
    model,
    train_dataset=dataset,
    formatting_func=formatting_prompts_func,
    data_collator=collator,
    args=training_arguments
)

In [None]:
# model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m")
# tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")

# instruction_template = "### Human:"
# response_template = "### Assistant:"
# collator = DataCollatorForCompletionOnlyLM(
#     instruction_template=instruction_template, response_template=response_template, tokenizer=tokenizer, mlm=False)

# trainer = SFTTrainer(
#     model,
#     train_dataset=dataset,
#     dataset_text_field="text",
#     data_collator=collator,
# )

# trainer.train()