In [1]:
%%capture
! pip install -U torch transformers datasets bitsandbytes peft huggingface_hub

In [None]:
# Authenticate against the Hugging Face Hub. `login()` is called without
# arguments, so no access token is hard-coded in the notebook.
# NOTE(review): presumably required for downloading the meta-llama checkpoint
# and for the later `push_to_hub` call -- confirm.
from huggingface_hub import login
login()

In [5]:
# Load the base checkpoint (4-bit quantized) together with its tokenizer.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

modelname = 'meta-llama/Llama-3.2-3B'

# Quantization settings, kept in a named object so they are easy to inspect.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # 4-bit weight quantization
    bnb_4bit_quant_type='nf4',              # quantization type
    bnb_4bit_use_double_quant=True,         # double quantization enabled
    bnb_4bit_compute_dtype=torch.bfloat16,  # dtype used for computation
)

model = AutoModelForCausalLM.from_pretrained(
    modelname,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,  # default precision
    device_map={"": 0},          # map model to cuda
)

tokenizer = AutoTokenizer.from_pretrained(modelname)

config.json:   0%|          | 0.00/844 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

In [6]:
# Display the loaded model's module tree (last expression of the cell).
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 3072)
    (layers): ModuleList(
      (0-27): 28 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (k_proj): Linear4bit(in_features=3072, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=3072, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=3072, out_features=3072, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=3072, out_features=8192, bias=False)
          (up_proj): Linear4bit(in_features=3072, out_features=8192, bias=False)
          (down_proj): Linear4bit(in_features=8192, out_features=3072, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((3072,), eps=1e

In [7]:
from peft import LoraConfig, TaskType, get_peft_model

# LoRA hyper-parameters: rank-16 adapters on the attention q/k/v projections.
lora_settings = dict(
    r=16,                                           # lora rank
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=['q_proj', 'k_proj', 'v_proj'],
    task_type=TaskType.CAUSAL_LM,                   # specifying the task
    inference_mode=False,                           # set up for training
)
peft_config = LoraConfig(**lora_settings)

# Wrap the quantized model with the adapters and report how much is trainable.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 6,422,528 || all params: 3,219,172,352 || trainable%: 0.1995


In [8]:
# Read the whole text, split it into blank-line-separated paragraphs, flatten
# each paragraph onto a single line and drop the ones starting with 'BOOK'.
with open('/kaggle/input/meditations-marcus-aurelius/meditations.txt') as f:
    raw_text = f.read()

flattened = (paragraph.replace('\n', ' ') for paragraph in raw_text.split('\n\n'))
data = [entry for entry in flattened if not entry.startswith('BOOK')]
data[:5]

['Provided by The Internet Classics Archive. See bottom for copyright. Available online at     http://classics.mit.edu//Antoninus/meditations.html',
 'The Meditations By Marcus Aurelius',
 ' Translated by George Long',
 '----------------------------------------------------------------------',
 'From my grandfather Verus I learned good morals and the government of my temper. ']

In [9]:
from datasets import Dataset

# Wrap the list of paragraphs in a Dataset with a single "text" column.
dataset = Dataset.from_dict({"text": data})
dataset

Dataset({
    features: ['text'],
    num_rows: 525
})

In [10]:
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

def tokenize(sample):
    """Tokenize a string or list of strings, truncating to 128 tokens.

    No padding is applied and no tensors are requested here: the dataset
    stores plain lists, and the data collator used for training pads each
    mini-batch to its own longest sequence. Padding at this stage would
    stretch every row to the longest sample of the whole map batch.

    Args:
        sample: text (or batch of texts) to tokenize.

    Returns:
        The tokenizer output containing `input_ids` and `attention_mask`.
    """
    return tokenizer(
        sample,
        truncation=True,
        max_length=128,
    )

tokenized_dataset = dataset.map(
    lambda batch: tokenize(batch['text']),
    batched=True,
    remove_columns=['text'],  # keep only the tokenizer outputs
)
tokenized_dataset

Map:   0%|          | 0/525 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 525
})

In [11]:
%%capture

from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

if torch.cuda.device_count() > 1:
    model = torch.nn.DataParallel(model)

training_args = TrainingArguments(
    output_dir='output',
    remove_unused_columns=False,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    learning_rate= 2e-4,
    num_train_epochs=5,
    fp16=True,
    report_to='none',
    logging_steps=20,
)

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_dataset,
    data_collator=DataCollatorForLanguageModeling(
        tokenizer, 
        mlm=False, 
        return_tensors='pt',
    ),
)

In [12]:
trainer.train()

Step,Training Loss
25,3.057
50,2.8179
75,2.7233


TrainOutput(global_step=80, training_loss=2.8532593607902528, metrics={'train_runtime': 771.363, 'train_samples_per_second': 3.403, 'train_steps_per_second': 0.104, 'total_flos': 5528473310330880.0, 'train_loss': 2.8532593607902528, 'epoch': 4.848484848484849})

In [None]:
# Save the fine-tuned model under the local 'Llama-Aurelius' path, then upload
# it to the Hub under the same repository name.
# NOTE(review): `model` is the PEFT-wrapped model, so presumably only the
# adapter weights are written -- confirm.
model.save_pretrained('Llama-Aurelius')
model.push_to_hub('Llama-Aurelius')

In [16]:
# Reload the fine-tuned model from the Hub with the same 4-bit settings that
# were used for training.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # 4-bit weight quantization
    bnb_4bit_quant_type='nf4',              # quantization type
    bnb_4bit_use_double_quant=True,         # double quantization enabled
    bnb_4bit_compute_dtype=torch.bfloat16,  # dtype used for computation
)

model = AutoModelForCausalLM.from_pretrained(
    'mrochk/Llama-Aurelius',
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,  # default precision
    device_map={"": 0},          # map model to cuda
)


def tokenize_inf(sample):
    """Tokenize `sample` for inference and return PyTorch tensors."""
    return tokenizer(sample, return_tensors='pt')

adapter_config.json:   0%|          | 0.00/663 [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/25.7M [00:00<?, ?B/s]

In [41]:
def inference(prompt, max_new_tokens=100):
    """Greedily continue `prompt` and return it with the first generated sentence.

    Tokens are produced one at a time by taking the argmax of the logits at
    the last position. Generation stops after `max_new_tokens` tokens or as
    soon as the end-of-sequence token is produced. A progress marker is
    printed every 10 tokens.

    Args:
        prompt: text to continue.
        max_new_tokens: upper bound on the number of generated tokens.

    Returns:
        A string of the form '<prompt> ...<first generated sentence>.', where
        the sentence is the generated text up to (excluding) its first '.'.
    """
    model.eval()

    tokenized_prompt = tokenize_inf(prompt)
    device = model.device
    intokens = tokenized_prompt['input_ids'].to(device)
    mask = tokenized_prompt['attention_mask'].to(device)

    outokens = []

    with torch.no_grad():
        print('token', end=' ')

        for i in range(max_new_tokens):
            if (i+1) % 10 == 0: print(f'{i+1}/{max_new_tokens}', end=' ')

            output = model(input_ids=intokens, attention_mask=mask)
            last_token_logits = output.logits[:, -1, :]
            next_token_id = torch.argmax(last_token_logits, dim=-1).unsqueeze(0)

            # Nothing meaningful follows the end-of-sequence token.
            if next_token_id.item() == tokenizer.eos_token_id:
                break

            outokens.append(next_token_id.item())
            intokens = torch.cat([intokens, next_token_id], dim=1)
            # Keep the attention mask the same length as the input ids.
            mask = torch.cat([mask, torch.ones_like(next_token_id)], dim=1)

        print()

    # Decode all ids in one call: decoding token by token can break characters
    # whose bytes are split across several tokens.
    completion = tokenizer.decode(outokens)
    return f'{prompt} ...{completion.split(".")[0]}.'

***Some example prompts:***

In [42]:
inference('The best thing in life is')

token 10/100 20/100 30/100 40/100 50/100 60/100 70/100 80/100 90/100 100/100 


'The best thing in life is... to be able to do what thou wilt, and to be able to do it at the right time.'

In [35]:
inference('The way of life is')

token 10/100 20/100 30/100 40/100 50/100 60/100 70/100 80/100 90/100 100/100 


'The way of life is ...  the way of the gods, and the way of the gods is the way of the universe.'

In [40]:
inference('The proper reaction against anger is')

token 10/100 20/100 30/100 40/100 50/100 60/100 70/100 80/100 90/100 100/100 


'The proper reaction against anger is... not to be angry.'

In [38]:
inference('The man who can not control his emotions')

token 10/100 20/100 30/100 40/100 50/100 60/100 70/100 80/100 90/100 100/100 


'The man who can not control his emotions ...  is like a man who has no hands.'