In [2]:
from datasets import load_dataset
from datasets import DatasetDict

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
dataset = load_dataset('HuggingFaceH4/ultrachat_200k', trust_remote_code=True)
dataset_train = dataset['train_sft']
dataset_test = dataset['test_sft']

`trust_remote_code` is not supported anymore.
Please check that the Hugging Face dataset 'HuggingFaceH4/ultrachat_200k' isn't based on a loading script and remove `trust_remote_code`.
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.


In [4]:
dataset = DatasetDict({
    'train': dataset_train.shuffle(seed=42).take(100_000),
    'test': dataset_test
})
dataset

DatasetDict({
    train: Dataset({
        features: ['prompt', 'prompt_id', 'messages'],
        num_rows: 100000
    })
    test: Dataset({
        features: ['prompt', 'prompt_id', 'messages'],
        num_rows: 23110
    })
})

In [5]:
dataset['train'][0]

{'prompt': 'How does the location of the Sydney Conservatorium of Music impact the academic and professional opportunities available to music students, and how does the conservatorium support student engagement with the music industry in Australia?',
 'prompt_id': 'bc82021755d49d219f182fdd76ccfbd97ec9db38b1d12e1b891434e1477057f1',
 'messages': [{'content': 'How does the location of the Sydney Conservatorium of Music impact the academic and professional opportunities available to music students, and how does the conservatorium support student engagement with the music industry in Australia?',
   'role': 'user'},
  {'content': "The location of the Sydney Conservatorium of Music, which is situated in the heart of Sydney's cultural precinct, impacts both the academic and professional opportunities available to music students. The conservatorium is located near several major performing arts venues and organizations, including the Sydney Opera House, the Australian Broadcasting Corporation, 

In [6]:
from transformers import AutoTokenizer

template_tokenizer = AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-Chat-v1.0')

template_tokenizer

LlamaTokenizerFast(name_or_path='TinyLlama/TinyLlama-1.1B-Chat-v1.0', vocab_size=32000, model_max_length=2048, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '</s>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [7]:
def format_prompt(example):
    chat =  example['messages']
    prompt = template_tokenizer.apply_chat_template(chat, tokenize=False)
    return {
        'text': prompt
    }

In [8]:
print(format_prompt(dataset['train'][1]))

{'text': "<|user|>\nWrite an informative and persuasive article of 1000 words or more that discusses the environmental and economic benefits of using renewable energy sources, such as solar, wind and hydropower, for powering mining operations. Your article should highlight various case studies, statistics and research findings that demonstrate the feasibility, efficiency and cost-effectiveness of renewable energy solutions in the mining industry. Also, provide insight into the potential challenges, barriers and solutions for implementing renewable energy projects in mining sites worldwide. Use credible sources, technical terms and examples to strengthen your arguments and engage your readers. Finally, offer practical recommendations, policy suggestions or call to actions that inspire readers to support and promote the transition towards a cleaner and more sustainable mining industry.</s>\n<|assistant|>\nThe global mining industry is one of the major contributors to economic growth and 

In [9]:
dataset = dataset.map(format_prompt, remove_columns=dataset['train'].column_names)

Map: 100%|██████████| 100000/100000 [00:24<00:00, 4162.04 examples/s]


In [10]:
dataset['train'][0]

{'text': "<|user|>\nHow does the location of the Sydney Conservatorium of Music impact the academic and professional opportunities available to music students, and how does the conservatorium support student engagement with the music industry in Australia?</s>\n<|assistant|>\nThe location of the Sydney Conservatorium of Music, which is situated in the heart of Sydney's cultural precinct, impacts both the academic and professional opportunities available to music students. The conservatorium is located near several major performing arts venues and organizations, including the Sydney Opera House, the Australian Broadcasting Corporation, and the Sydney Symphony Orchestra, providing students with easy access to performances, rehearsals, and networking opportunities.\n\nOne of the primary ways the conservatorium supports student engagement with the music industry in Australia is through its strong industry connections. The conservatorium has established partnerships with numerous profession

### Testing Base LLAMA Model
- Let's see how base Tiny-LLAMA performs out of the box

In [None]:
# from transformers import pipeline
# import os
# import torch


# os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
# torch.cuda.empty_cache()
# torch.cuda.set_per_process_memory_fraction(0.95)

model_name = 'TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T'
# pipe = pipeline('text-generation', model=model_name, device_map='auto')

In [12]:
# prompt = """
# <|user|>
# My car makes a squeaking noise when braking. What could be wrong?
# <|assistant|>
# """

# output =  pipe(prompt)
# output

## Model Configuration for Training

In [13]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

In [14]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    llm_int8_enable_fp32_cpu_offload=True
)

In [15]:
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = '<PAD>'
tokenizer.padding_size = 'left'

In [16]:
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map='auto',
    quantization_config=bnb_config
)

In [17]:
model.config.use_cache=False
model.config.pretraining_tp=1

In [18]:
model


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear4bit(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear4bit(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=2048, out_features=5632, bias=False)
          (up_proj): Linear4bit(in_features=2048, out_features=5632, bias=False)
          (down_proj): Linear4bit(in_features=5632, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,), e

# Prepare LoRA Configuration for PEFT Fine tuning


In [19]:
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model

peft_config = LoraConfig(
    lora_alpha=32,
    lora_dropout=0.1,
    r=64,
    bias='none',
    task_type='CAUSAL_LM',
    target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj']
)

model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)

In [20]:
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 2048)
        (layers): ModuleList(
          (0-21): 22 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=64, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=64, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.

## Model Fine Tuning

In [21]:
from trl import SFTTrainer, SFTConfig

output_dir = 'train_dir'

config = SFTConfig(
    output_dir=output_dir,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=1,
    gradient_accumulation_steps=4,
    optim="paged_adamw_8bit",
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    logging_steps=10,
    eval_steps=10,
    do_eval=True,
    fp16=True,
    gradient_checkpointing=True,
    packing=True,
    max_length=512,
    dataset_text_field="text",
    completion_only_loss=False,
)

trainer = SFTTrainer(
    model=model,
    args=config,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    peft_config=peft_config,
    processing_class=tokenizer
)


Padding-free training is enabled, but the attention implementation is not set to 'flash_attention_2'. Padding-free training flattens batches into a single sequence, and 'flash_attention_2' is the only known attention mechanism that reliably supports this. Using other implementations may lead to unexpected behavior. To ensure compatibility, set `attn_implementation='flash_attention_2'` in the model configuration, or verify that your attention mechanism can handle flattened sequences.
You are using packing, but the attention implementation is not set to 'flash_attention_2' or 'kernels-community/vllm-flash-attn3'. Packing flattens batches into a single sequence, and Flash Attention is the only known attention mechanisms that reliably support this. Using other implementations may lead to cross-contamination between batches. To avoid this, either disable packing by setting `packing=False`, or set `attn_implementation='flash_attention_2'` or `attn_implementation='kernels-community/vllm-flash

In [22]:
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 0}.
  return fn(*args, **kwargs)


Step,Training Loss
10,1.6966
20,1.5201
30,1.4248
40,1.5391
50,1.4858
60,1.4311
70,1.4676
80,1.4703
90,1.4787
100,1.4381


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


TrainOutput(global_step=6234, training_loss=1.3173271205358716, metrics={'train_runtime': 70723.4118, 'train_samples_per_second': 1.41, 'train_steps_per_second': 0.088, 'total_flos': 3.2961480296973926e+17, 'train_loss': 1.3173271205358716, 'entropy': 1.333254030117622, 'num_tokens': 50633236.0, 'mean_token_accuracy': 0.669756871003371, 'epoch': 1.0})

In [23]:
trainer.model.save_pretrained("TinyLlama-1.1B-Chat-v1.1")

## Load Pre-Trained PEFT Model for Prediction

In [1]:
from peft import AutoPeftModelForCausalLM
import os
import torch

os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
torch.cuda.empty_cache()
torch.cuda.set_per_process_memory_fraction(0.95)

model = AutoPeftModelForCausalLM.from_pretrained(
    "TinyLlama-1.1B-Chat-v1.1",
    device_map='auto',
    load_in_8bit=True,
)

merged_model = model.merge_and_unload()

  from .autonotebook import tqdm as notebook_tqdm
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


In [2]:
from transformers import pipeline, AutoTokenizer

model_name = 'TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T'


tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = '<PAD>'
tokenizer.padding_size = 'left'

pipe = pipeline('text-generation', model=merged_model, tokenizer=tokenizer)


Device set to use cuda:0


In [3]:
conversation_history = []

def add_to_conversation(role, message):
    conversation_history.append(f"<|{role}|>\n{message}")

def get_full_prompt():
    return "\n".join(conversation_history) + "\n<|assistant|>\n"

In [4]:
# Test Case 1: Engine Diagnostics
print("=== ENGINE DIAGNOSTICS TEST ===")
add_to_conversation("user", "My engine is making a knocking sound when I accelerate. What should I check first?")
prompt = get_full_prompt()
output = pipe(prompt)
response = output[0]['generated_text']
print("Q1:", response)

add_to_conversation("assistant", response.split("<|assistant|>")[-1].strip())
add_to_conversation("user", "I checked the oil level and it's fine. The sound happens mostly when going uphill. Any other ideas?")
prompt = get_full_prompt()
output = pipe(prompt)
print("Q2:", output[0]['generated_text'])

add_to_conversation("assistant", output[0]['generated_text'].split("<|assistant|>")[-1].strip())
add_to_conversation("user", "How can I test if it's the fuel injectors causing the problem?")
prompt = get_full_prompt()
output = pipe(prompt)
print("Q3:", output[0]['generated_text'])

=== ENGINE DIAGNOSTICS TEST ===
Q1: <|user|>
My engine is making a knocking sound when I accelerate. What should I check first?
<|assistant|>
Before you fix your engine, check the following:
1. Check the oil level in your engine. If the oil is low, add more oil to maintain the correct oil level.
2. Check the water level in the radiator. If it is low, add more water to maintain the correct water level.
3. Check the coolant level in your engine. If it is low, add more coolant to maintain the correct coolant level.
4. Check the timing belt tensioner. If the belt is slack, tighten it.
5. Check the timing chain tensioner. If the tensioner is loose, tighten it.
6. Check the timing marks on your engine. If they are out of sync, adjust the timing marks to correct the timing.
7. Check the oil filters in your engine. If they are grinding, clean them. If they are clogged with debris, clean them.
8. Check the piston, piston rings, and cylinder walls in your engine. If they are worn, replace them.
