In [1]:
from datasets import load_dataset

dataset = load_dataset("json", data_files="trained.json")["train"]

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

model_name = "microsoft/phi-3-mini-4k-instruct"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    trust_remote_code=True,
    device_map="auto"
)

`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.
Loading checkpoint shards: 100%|██████████| 2/2 [00:13<00:00,  6.86s/it]


In [None]:
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model

model = prepare_model_for_kbit_training(model)

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["qkv_proj", "o_proj"], 
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 9,437,184 || all params: 3,830,516,736 || trainable%: 0.2464


In [4]:
def tokenize(batch):
    all_input_ids = []
    all_labels = []
    
    for text in batch["text"]:
        # Split on the delimiter
        if "### Humanised:" not in text:
            print(f"WARNING: Missing delimiter in: {text[:50]}")
            continue
            
        parts = text.split("### Humanised:")
        prompt_part = parts[0] + "### Humanised:"
        completion_part = parts[1].strip()
        
        # Tokenize separately
        prompt_tokens = tokenizer(prompt_part, add_special_tokens=True)
        completion_tokens = tokenizer(completion_part, add_special_tokens=False)
        
        # Combine
        input_ids = prompt_tokens["input_ids"] + completion_tokens["input_ids"]
        labels = [-100] * len(prompt_tokens["input_ids"]) + completion_tokens["input_ids"]
        
        # Pad to fixed length immediately
        if len(input_ids) < 256:
            padding_length = 256 - len(input_ids)
            input_ids = input_ids + [tokenizer.pad_token_id] * padding_length
            labels = labels + [-100] * padding_length
        else:
            # Truncate if too long
            input_ids = input_ids[:256]
            labels = labels[:256]
        
        # Create attention mask
        attention_mask = [1 if id != tokenizer.pad_token_id else 0 for id in input_ids]
        
        all_input_ids.append(input_ids)
        all_labels.append(labels)
    
    return {
        "input_ids": all_input_ids,
        "attention_mask": [
            [1 if id != tokenizer.pad_token_id else 0 for id in ids] 
            for ids in all_input_ids
        ],
        "labels": all_labels
    }

tokenized = dataset.map(tokenize, batched=True, remove_columns=["text"])

# Verify
print("\nVerification:")
for i in range(min(3, len(tokenized))):
    sample = tokenized[i]
    non_masked = sum(1 for x in sample['labels'] if x != -100)
    seq_len = len(sample['input_ids'])
    print(f"Sample {i}: {non_masked} trainable tokens, length {seq_len}")


Verification:
Sample 0: 40 trainable tokens, length 256
Sample 1: 24 trainable tokens, length 256
Sample 2: 12 trainable tokens, length 256


In [None]:
print("Checking first 3 training examples:")
for i in range(3):
    text = dataset["text"][i]
    parts = text.split("### Humanised:")
    if len(parts) == 2:
        neutral = parts[0].replace("### Neutral:", "").strip()
        human = parts[1].strip()
        print(f"\n--- Example {i+1} ---")
        print(f"Neutral: {neutral[:100]}")
        print(f"Human: {human[:100]}")

Checking first 3 training examples:

--- Example 1 ---
Neutral: You may consider his lower-priced cards, including the 93 TOTW RW, 94 TOTW RF, and 96 TOTY RW.
Human: You might wanna try for his lower and cheaper cards ( 93 TOTW RW, 94 TOTW RF, 96 TOTY RW).

--- Example 2 ---
Neutral: If the market conditions are the same as they were a month ago, it should be possible to obtain the 
Human: If the market is what it was a month back, you will be able to get the 93 Messi.

--- Example 3 ---
Neutral: TOTS Oblak has the Long Throw trait.
Human: TOTS Oblak has Long Throw Trait.


In [6]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

training_args = TrainingArguments(
    output_dir="./humaniser_lora",
    per_device_train_batch_size=4,  
    gradient_accumulation_steps=2,
    num_train_epochs=3,  
    learning_rate=2e-4,  
    logging_steps=10,
    save_strategy="epoch",
    save_total_limit=2,
    report_to="none",
    optim="paged_adamw_8bit",
    warmup_steps=100,  
    fp16=True,
    gradient_checkpointing=True,
    logging_first_step=True  
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized
)

trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
You are not running the flash-attention implementation, expect numerical differences.


Step,Training Loss
1,1.8461
10,2.1004
20,1.8988
30,1.4978
40,1.1035
50,1.0059
60,1.049
70,0.8804
80,1.0542
90,0.9427


TrainOutput(global_step=543, training_loss=0.8145125215224798, metrics={'train_runtime': 4133.1436, 'train_samples_per_second': 1.05, 'train_steps_per_second': 0.131, 'total_flos': 2.488424756989133e+16, 'train_loss': 0.8145125215224798, 'epoch': 3.0})

In [None]:
trainer.model.save_pretrained("./humaniser_lora")
tokenizer.save_pretrained("./humaniser_lora")

print("Saved both LoRA adapter and merged model!")



Saved both LoRA adapter and merged model!
