In [1]:
#!pip install transformers datasets accelerate peft bitsandbytes


In [2]:
#!conda install datasets -y
#!conda install pandas 

In [None]:
from datasets import load_dataset

# Map each split name to its JSON-lines file, then load both in one call so
# the result can be indexed by split name ("train" / "test") later on.
split_files = {"train": "train.jsonl", "test": "test.jsonl"}
dataset = load_dataset("json", data_files=split_files)

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [None]:
import torch

from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = "amd/AMD-Llama-135m"

# The tokenizer and the weights are loaded independently from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token as the padding token so that
# padding="max_length" can be requested from this tokenizer.
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_name)


def tokenize(example, max_length=512):
    """Turn one instruction/output record into fixed-length training features.

    The prompt is ``"<instruction>\\n<output>"``, truncated/padded to
    ``max_length`` tokens. Labels copy the input ids except that instruction
    tokens and padding positions are set to -100, so only the response part
    contributes to the loss.

    Args:
        example: Mapping with string fields ``"instruction"`` and ``"output"``.
        max_length: Sequence length to truncate/pad to (default 512).

    Returns:
        dict with ``"input_ids"``, ``"attention_mask"`` and ``"labels"``,
        each a list of ``max_length`` ints.
    """
    instruction = example["instruction"]
    prompt = f"{instruction}\n{example['output']}"

    encoded = tokenizer(
        prompt,
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_attention_mask=True,
    )
    input_ids = list(encoded["input_ids"])
    # Use the tokenizer's own mask: the pad token is aliased to EOS, so
    # comparing ids against pad_token_id would also hide genuine EOS tokens.
    attention_mask = list(encoded["attention_mask"])

    # Number of tokens the instruction occupies, measured without padding.
    # NOTE: tokenizing the instruction on its own is an approximation of its
    # length inside the full prompt (tokenization at the boundary may differ).
    instruction_len = len(
        tokenizer(instruction, truncation=True, max_length=max_length)["input_ids"]
    )

    # Walk the sequence counting only real (non-padding) tokens, so the
    # instruction is masked correctly whether the tokenizer pads on the left
    # or on the right. Padding positions are always ignored by the loss.
    labels = []
    real_seen = 0
    for token_id, is_real in zip(input_ids, attention_mask):
        if is_real and real_seen >= instruction_len:
            labels.append(token_id)
        else:
            labels.append(-100)
        if is_real:
            real_seen += 1

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
    }

# tokenize() expects a single record, so map it example-by-example
# (batched=False) over the dataset.
tokenized_dataset = dataset.map(function=tokenize, batched=False)


You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.


Map:   0%|          | 0/281408 [00:00<?, ? examples/s]

In [None]:
from transformers import (
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
    default_data_collator,
)


training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    report_to="none",
    fp16=True,
    lr_scheduler_type="cosine",
    warmup_steps=200,
)


def data_collator(features):
    """Batch pre-tokenized features while keeping the dataset's own labels.

    DataCollatorForLanguageModeling(mlm=False) rebuilds ``labels`` from
    ``input_ids``, which would throw away the instruction masking computed in
    ``tokenize``. The examples are already padded to a fixed length, so plain
    stacking is sufficient.

    Args:
        features: List of dicts with ``input_ids``, ``attention_mask`` and
            ``labels`` (equal-length lists of ints).

    Returns:
        dict of tensors ready to be passed to the model.
    """
    batch = default_data_collator(features)
    # Make sure padding never contributes to the loss, regardless of whether
    # the dataset already marked those positions with -100.
    batch["labels"] = batch["labels"].masked_fill(batch["attention_mask"] == 0, -100)
    return batch


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
)


In [None]:
# Directory that receives both the fine-tuned weights and the tokenizer files.
FINETUNED_DIR = "finetuned-llama-dialogue"

trainer.train()

# Save model and tokenizer side by side so the directory can be reloaded
# with from_pretrained() on its own.
trainer.save_model(FINETUNED_DIR)
tokenizer.save_pretrained(FINETUNED_DIR)


In [None]:
#Inference script
from transformers import AutoModelForCausalLM, AutoTokenizer, StoppingCriteria, StoppingCriteriaList
import torch

# Reload the fine-tuned checkpoint written by the training step.
FINETUNED_DIR = "finetuned-llama-dialogue"
tokenizer = AutoTokenizer.from_pretrained(FINETUNED_DIR)
model = AutoModelForCausalLM.from_pretrained(FINETUNED_DIR)

class StopOnStudentToken(StoppingCriteria):
    """Stop generation once the first sequence ends with a stop string.

    Two checks are made after every generated token:

    1. The trailing token ids equal the ids obtained by encoding the stop
       string on its own (the original behaviour).
    2. The decoded tail of the sequence ends with the stop string. This
       catches the case where the model emits the same text through
       different token ids than the stand-alone encoding produced, which
       can happen because a tokenizer may encode a word differently
       depending on what precedes it.

    Only ``input_ids[0]`` is inspected, so this is meant for batch size 1.

    Args:
        stop_token: Text that ends generation, e.g. ``"Student:"``.
        tokenizer: Tokenizer used to encode the stop string and to decode
            the generated tail.
    """

    def __init__(self, stop_token, tokenizer):
        self.stop_token = stop_token
        self.stop_token_id = tokenizer.encode(stop_token, add_special_tokens=False)
        self.tokenizer = tokenizer
        # Upper bound on how many trailing tokens could spell the stop string:
        # its stand-alone token count, or one token per UTF-8 byte.
        self._tail_len = max(len(self.stop_token_id), len(stop_token.encode("utf-8")))

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs):
        # An empty stop string would match everything; never stop on it.
        if not self.stop_token or self._tail_len == 0:
            return False

        tail_ids = input_ids[0][-self._tail_len:].tolist()

        # Fast path: exact token-id match against the stand-alone encoding.
        stop_ids = self.stop_token_id
        if stop_ids and tail_ids[-len(stop_ids):] == stop_ids:
            return True

        # Fallback: compare text, which does not depend on how the stop
        # string happened to be split into tokens.
        tail_text = self.tokenizer.decode(tail_ids, skip_special_tokens=True)
        return tail_text.endswith(self.stop_token)

# System prompt placed at the start of the transcript. It is part of the text
# fed to the model on every turn, so its exact wording affects generation.
system_instruction = '''
You are Lucy a friendly, helpful teacher who loves to share knowledge and encourage curiosity. Keep your responses short and to the point, but always invite further exploration and thought. When answering questions, provide a simple explanation, followed by a prompt to encourage deeper thinking. Your tone is warm and positive, guiding students to engage more with the topic.

For example:

If asked, "Who is Newton?" you should respond with:
"Newton was a great scientist who discovered important ideas about gravity and motion. What do you think gravity is?"

If asked, "What’s a planet?" you should respond with:
"A planet is a large body that orbits a star, like Earth or Mars. Can you think of any planets besides Earth?"

You’re focused on sparking curiosity and engaging young minds in their learning. Always make your responses feel inviting and friendly.
Never end the conversation unless student stops it. Do not hallucinate. Do not say any nonsense. You are talking to kids, be careful and helpful.
'''

# Running transcript: starts with the system prompt plus an opening teacher
# turn; the chat loop appends "Student: ..." / "Teacher: ..." lines to it.
prompt = system_instruction + "\n\nTeacher: Hi, how can I help you today?\n"

# Interactive chat loop: read a student question, append it to the running
# transcript, generate the teacher's reply, and append that reply as well.
MAX_NEW_TOKENS = 100

# The stopping criteria do not change between turns, so build them once.
stopping_criteria = StoppingCriteriaList([
    StopOnStudentToken("Student:", tokenizer)
])

# Context window of the model, if the config exposes one; used to keep the
# growing transcript from exceeding what the model can attend to.
context_limit = getattr(model.config, "max_position_embeddings", None)

while True:
    try:
        query = input("Enter student question: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Ctrl-D / Ctrl-C (or an interrupted kernel) ends the session cleanly.
        print()
        break

    if not query:
        # Nothing was typed; ask again instead of indexing an empty string.
        continue

    if query[-1] not in ['!', '?', '.']:  # adding full stop to end if no punc
        query += '.'
    prompt += f"Student: {query}\nTeacher:"

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Keep only the most recent tokens when the transcript plus the reply
    # would not fit in the context window. This drops the oldest text first
    # (eventually including the system prompt).
    if context_limit is not None:
        budget = context_limit - MAX_NEW_TOKENS
        if budget > 0 and inputs["input_ids"].shape[-1] > budget:
            inputs = {name: tensor[:, -budget:] for name, tensor in inputs.items()}

    prompt_len = inputs["input_ids"].shape[-1]

    output = model.generate(
        **inputs,
        max_new_tokens=MAX_NEW_TOKENS,
        do_sample=True,
        stopping_criteria=stopping_criteria,
        pad_token_id=tokenizer.eos_token_id
    )

    # Decode only the newly generated tokens, then cut at the next student
    # turn in case the model started writing one.
    generated_text = tokenizer.decode(output[0][prompt_len:], skip_special_tokens=True)
    generated_text = generated_text.split("Student:")[0].strip()

    print(f"Teacher: {generated_text}\n")

    prompt += f" {generated_text}\n"  # keep the reply in the conversation history
