# Install libraries

In [2]:
%pip install torch
%pip install transformers
%pip install bitsandbytes
%pip install accelerate
%pip install peft
%pip install datasets
%pip install evaluate
%pip install trl
%pip install matplotlib
%pip install tensorboard
%pip install sentencepiece



In [3]:
%pip list

Package                               Version
------------------------------------- -------------------
absl-py                               1.4.0
accelerate                            1.7.0
aiofiles                              24.1.0
aiohappyeyeballs                      2.6.1
aiohttp                               3.11.15
aiosignal                             1.3.2
alabaster                             1.0.0
albucore                              0.0.24
albumentations                        2.0.8
ale-py                                0.11.1
altair                                5.5.0
annotated-types                       0.7.0
antlr4-python3-runtime                4.9.3
anyio                                 4.9.0
argon2-cffi                           25.1.0
argon2-cffi-bindings                  21.2.0
array_record                          0.7.2
arviz                                 0.21.0
astropy                               7.1.0
astropy-iers-data                     0.2025.6.16.0.

In [4]:
from huggingface_hub import login
from google.colab import userdata

API_TOKEN = userdata.get('API_TOKEN')
login(token=API_TOKEN)

# Base model from HUB

### Load the model

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
import torch

# Merge LoRA adapters with base model
# lama has problem that you need to ask for access repositories of meta
#model_name = "meta-llama/Meta-Llama-3-8B-Instruct"
model_name = "mistralai/Mistral-7B-Instruct-v0.3"

# Quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
)

base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    torch_dtype=torch.float16,
    device_map="cuda",
)
print(base_model.get_input_embeddings())
print(base_model.get_output_embeddings())
print("Model Vocabulary Size:", base_model.config.vocab_size)


base_tokenizer = AutoTokenizer.from_pretrained(model_name)
print("before", len(base_tokenizer))
base_tokenizer.add_special_tokens({"pad_token": "<pad>"})
print("after", len(base_tokenizer))

# viz https://huggingface.co/docs/transformers/v4.41.3/en/model_doc/llama3#usage-tips
print("before 2", base_model.config.pad_token_id)
base_model.config.pad_token_id = base_tokenizer.pad_token_id
print("after 2", base_model.config.pad_token_id)

print("Model Vocabulary Size:", base_model.config.vocab_size)
base_model.resize_token_embeddings(len(base_tokenizer))
print("Model Vocabulary Size:", base_model.config.vocab_size)

config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Log model and tokenizer

In [None]:
# Model
print("---Model---")
print("Type:", type(base_model))
print("Architecture:", base_model)
print("Config:", base_model.config)
print("Model Vocabulary Size:", base_model.config.vocab_size)
print("Input embeddings:")
print(base_model.get_input_embeddings())
print("Output embeddings:")
print(base_model.get_output_embeddings())

# Tokenizer
print("---Tokenzier---")
print("Type:", type(base_tokenizer))
# print(tokenizer_loaded)
print("Special tokens:", base_tokenizer.special_tokens_map)
print("All tokens count:", len(base_tokenizer))
print("Padding side:", base_tokenizer.padding_side)

---Model---
Type: <class 'transformers.models.llama.modeling_llama.LlamaForCausalLM'>
Architecture: LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128257, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm)

### Test the model - OK

#### via Pipeline

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Create a pipeline for text generation
my_pipeline = pipeline(
    "text-generation", model=base_model, tokenizer=base_tokenizer, max_length=300
)

result = my_pipeline("What is AutoGen in abstract?")

print(result)

NameError: name 'base_model' is not defined

#### via Model and Tokenizer

In [None]:
# Tokenize the prompt
# input_ids = base_tokenizer.encode(
#     "What is AutoGen in abstract?", return_tensors="pt"
# ).to("cuda")
# print(input_ids)

chat = [{"role": "user", "content": "What is AutoGen in abstract?"}]
input_ids = base_tokenizer.apply_chat_template(chat, return_tensors ='pt').to("cuda")
print(input_ids)

# Generate text
result = base_model.generate(input_ids, max_length=300)
print(result)

# Decode and print the generated text
output_text = base_tokenizer.decode(result[0])
print("Answer:")
print(output_text)

### Save the model (to disk)

In [None]:
base_model.save_pretrained("SAVED_MODEL")

base_tokenizer.save_pretrained("SAVED_MODEL")

('SAVED_MODEL/tokenizer_config.json',
 'SAVED_MODEL/special_tokens_map.json',
 'SAVED_MODEL/tokenizer.json')

### Push the model (from HUB )to HUB)

In [None]:
base_model.push_to_hub(
    repo_id="lukaskellerstein/my-base-llama-4bit-from-hub",
    token=API_TOKEN,
)
base_tokenizer.push_to_hub(
    repo_id="lukaskellerstein/my-base-llama-4bit-from-hub",
    token=API_TOKEN,
)

model-00002-of-00002.safetensors:   0%|          | 0.00/1.05G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.65G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/lukaskellerstein/my-base-llama-4bit-from-hub/commit/afd77e4b6e548a7c58e60486c1d894ba9ff1edda', commit_message='Upload tokenizer', commit_description='', oid='afd77e4b6e548a7c58e60486c1d894ba9ff1edda', pr_url=None, pr_revision=None, pr_num=None)

# Hub model from HUB

### Load the model

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

model_hub_loaded_from_hub = AutoModelForCausalLM.from_pretrained(
    "lukaskellerstein/my-base-llama-4bit-from-hub", device_map="auto"
)
tokenizer_hub_loaded_from_hub = AutoTokenizer.from_pretrained(
    "lukaskellerstein/my-base-llama-4bit-from-hub"
)

config.json:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


model.safetensors.index.json:   0%|          | 0.00/132k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.65G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.05G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/194 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/51.2k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/434 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Log model and tokenizer

In [None]:
# Model
print("---Model---")
print("Type:", type(model_hub_loaded_from_hub))
print("Architecture:", model_hub_loaded_from_hub)
print("Config:", model_hub_loaded_from_hub.config)
print("Model Vocabulary Size:", model_hub_loaded_from_hub.config.vocab_size)
print("Input embeddings:")
print(model_hub_loaded_from_hub.get_input_embeddings())
print("Output embeddings:")
print(model_hub_loaded_from_hub.get_output_embeddings())

# Tokenizer
print("---Tokenzier---")
print("Type:", type(tokenizer_hub_loaded_from_hub))
# print(tokenizer_loaded)
print("Special tokens:", tokenizer_hub_loaded_from_hub.special_tokens_map)
print("All tokens count:", len(tokenizer_hub_loaded_from_hub))
print("Padding side:", tokenizer_hub_loaded_from_hub.padding_side)

---Model---
Type: <class 'transformers.models.llama.modeling_llama.LlamaForCausalLM'>
Architecture: LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128257, 4096, padding_idx=128256)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
     

### Test the model - OK

#### via Pipeline

In [None]:
from transformers import pipeline

# Create a pipeline for text generation
my_pipeline = pipeline(
    "text-generation",
    model=model_hub_loaded_from_hub,
    tokenizer=tokenizer_hub_loaded_from_hub,
    max_length=300,
)

result = my_pipeline("What is AutoGen in abstract?")

print(result)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[{'generated_text': 'What is AutoGen in abstract?azorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazo

#### via Model and Tokenizer

In [None]:
# Tokenize the prompt
# input_ids = tokenizer_hub_loaded_from_hub.encode(
#     "What is AutoGen in abstract?", return_tensors="pt"
# )
# print(input_ids)

chat = [{"role": "user", "content": "What is AutoGen in abstract?"}]
input_ids = base_tokenizer.apply_chat_template(chat, return_tensors ='pt').to("cuda")
print(input_ids)

# Generate text
result = model_hub_loaded_from_hub.generate(input_ids, max_length=300)
print(result)

# Decode and print the generated text
output_text = tokenizer_hub_loaded_from_hub.decode(result[0])
print("Answer:")
print(output_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


tensor([[128000, 128006,    882, 128007,    271,   3923,    374,   9156,  10172,
            304,   8278,     30, 128009]], device='cuda:0')
tensor([[128000, 128006,    882, 128007,    271,   3923,    374,   9156,  10172,
            304,   8278,     30, 128009, 128006,  78191, 128007,    271,  13556,
          10172,    374,    264,   3241,   5507,    430,   9651,  27983,  28725,
           1787,   2082,     11,   1778,    439,   4342,   3626,     11,   7557,
           7346,     11,    323,   1023,  14054,   2082,     11,    369,    264,
           3241,   2447,     13,   1102,    374,   6319,    311,   3665,  13707,
            892,    323,   8108,    279,   3392,    315,  59177,  11058,    814,
           1205,    311,    656,    382,  13556,  10172,   5829,    264,   3896,
           6108,   5603,     11,   1405,    264,    743,    315,  20506,    374,
           1511,    311,   7068,    279,  28725,   1787,   2082,     13,    578,
          20506,    527,   5439,    304,    264, 

# Base model from DISK

### Load the model

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

model_loaded = AutoModelForCausalLM.from_pretrained(
    "SAVED_MODEL",
    torch_dtype=torch.float16,
    device_map="cuda",
)

tokenizer_loaded = AutoTokenizer.from_pretrained("SAVED_MODEL")

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Log model and tokenizer

In [None]:
# Model
print("---Model---")
print("Type:", type(model_loaded))
print("Architecture:", model_loaded)
print("Config:", model_loaded.config)
print("Model Vocabulary Size:", model_loaded.config.vocab_size)
print("Input embeddings:")
print(model_loaded.get_input_embeddings())
print("Output embeddings:")
print(model_loaded.get_output_embeddings())

# Tokenizer
print("---Tokenzier---")
print("Type:", type(tokenizer_loaded))
# print(tokenizer_loaded)
print("Special tokens:", tokenizer_loaded.special_tokens_map)
print("All tokens count:", len(tokenizer_loaded))
print("Padding side:", tokenizer_loaded.padding_side)

---Model---
Type: <class 'transformers.models.llama.modeling_llama.LlamaForCausalLM'>
Architecture: LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128257, 4096, padding_idx=128256)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
     

### Test the model - OK

#### via Pipeline

In [None]:
from transformers import pipeline

# Create a pipeline for text generation
my_pipeline = pipeline(
    "text-generation", model=model_loaded, tokenizer=tokenizer_loaded, max_length=300
)

result = my_pipeline("What is AutoGen in abstract?")

print(result)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[{'generated_text': 'What is AutoGen in abstract?azorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazorazo

#### via Model and Tokenizer

In [None]:
# Tokenize the prompt
# input_ids = tokenizer_loaded.encode(
#     "What is AutoGen in abstract?", return_tensors="pt"
# ).to("cuda")
# print(input_ids)

chat = [{"role": "user", "content": "What is AutoGen in abstract?"}]
input_ids = tokenizer_loaded.apply_chat_template(chat, return_tensors ='pt').to("cuda")
print(input_ids)

# Generate text
result = model_loaded.generate(input_ids, max_length=1000)
print(result)

# Decode and print the generated text
output_text = tokenizer_loaded.decode(result[0])
print("Answer:")
print(output_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


tensor([[128000, 128006,    882, 128007,    271,   3923,    374,   9156,  10172,
            304,   8278,     30, 128009]], device='cuda:0')
tensor([[128000, 128006,    882, 128007,    271,   3923,    374,   9156,  10172,
            304,   8278,     30, 128009, 128006,  78191, 128007,    271,  13556,
          10172,    374,    459,   1825,  31874,   5507,    430,   9651,  27983,
          28725,   1787,   2082,    369,   5370,  15840,  15823,     11,   2737,
            356,     11,    356,  23240,   8102,     11,    323,   3885,     13,
           1102,   5829,    264,   4382,     11,   9632,   1413,  20047,    311,
           7664,    279,   2082,   9659,   1920,     11,  10923,  13707,    311,
           5357,    389,    279,  12496,    315,    872,   2082,   4856,   1109,
            279,  66838,   3465,    315,   4477,  59177,  28725,   1787,   2082,
            382,    644,  28591,     11,   9156,  10172,  14385,    439,    264,
           2082,  14143,     11,   4737,    304, 

### Push my base model from DISK to HUB

In [None]:
model_loaded.push_to_hub(
    repo_id="lukaskellerstein/my-base-llama-8bit-from-disk",
    token=API_TOKEN,
)
tokenizer_loaded.push_to_hub(
    repo_id="lukaskellerstein/my-base-llama-8bit-from-disk",
    token=API_TOKEN,
)

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.65G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.05G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/lukaskellerstein/my-base-llama-8bit-from-disk/commit/54c3cc2036ee76b63dfb0a31f89eae24b4f694ae', commit_message='Upload tokenizer', commit_description='', oid='54c3cc2036ee76b63dfb0a31f89eae24b4f694ae', pr_url=None, pr_revision=None, pr_num=None)

# Disk model from HUB

### Load the model

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

model_loaded_fromHF = AutoModelForCausalLM.from_pretrained(
    "lukaskellerstein/my-base-llama-4bit-from-disk", device_map="auto"
)
tokenizer_loaded_fromHF = AutoTokenizer.from_pretrained(
    "lukaskellerstein/my-base-llama-4bit-from-disk"
)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


model.safetensors.index.json:   0%|          | 0.00/132k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.65G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.05G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/194 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/51.2k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/434 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Log model and tokenizer

In [None]:
# Model
print("---Model---")
print("Type:", type(model_loaded_fromHF))
print("Architecture:", model_loaded_fromHF)
print("Config:", model_loaded_fromHF.config)
print("Model Vocabulary Size:", model_loaded_fromHF.config.vocab_size)
print("Input embeddings:")
print(model_loaded_fromHF.get_input_embeddings())
print("Output embeddings:")
print(model_loaded_fromHF.get_output_embeddings())

# Tokenizer
print("---Tokenzier---")
print("Type:", type(tokenizer_loaded_fromHF))
# print(tokenizer_loaded)
print("Special tokens:", tokenizer_loaded_fromHF.special_tokens_map)
print("All tokens count:", len(tokenizer_loaded_fromHF))
print("Padding side:", tokenizer_loaded_fromHF.padding_side)

---Model---
Type: <class 'transformers.models.llama.modeling_llama.LlamaForCausalLM'>
Architecture: LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128257, 4096, padding_idx=128256)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
     

###  Test the model - OK

#### via Pipeline

In [None]:
from transformers import pipeline

# Create a pipeline for text generation
my_pipeline = pipeline(
    "text-generation",
    model=model_loaded_fromHF,
    tokenizer=tokenizer_loaded_fromHF,
    max_length=300,
)

result = my_pipeline("What is AutoGen in abstract?")

print(result)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[{'generated_text': 'What is AutoGen in abstract?osuosuosudbaounderounderüstüounderamageounder�üstü�ounderoundingounderounder금.posterdbaounder-partamageWER�ounderosuoundingamage.posterbowerosubowerosuounder� Ampounder�osu YÖoundingblank Amp.posterounder-partounderounderosuesty�금.poster TREEónico Amp금.poster.posteraldoaldo.poster Amp Topsestyestyesty�bowerosu.poster�dba.posterounderounder.compress.poster�� Ampounderaldo.poster Amp.poster� YÖounderaldoaldo.poster Ampensaestyensaescaping�kbd.posterكال Ampblankounderesty�.posterounderblank.posterounder TREE.poster-partounderounderฤษ.compress�blankounder�blank YÖ篣�ounder.posterkbdounder.posterounder�esty� Amp� YÖ�.poster_EMIT�tsy���.poster�.poster�كالesty� chargeounder Tanks�tsy�.poster�ónico夢-partฤษescapingesty.poster пласти вдруг緣esty��� TREE.poster���ensa�opic�.poster�blank�esty���� Tanks_EMIT�� Tanks Tanksesty Tanksestyesty�opic��ounder��tsy蒙��煣�����_EMIT�esty�_EMIT��ónico� вдруг�kbdكال_EMIT�esty���� вдруг YÖ_EMIT�_EMITopic� вдруг_EMIT�

#### via Model and Tokenizer

In [None]:
# Tokenize the prompt
# input_ids = tokenizer_loaded_fromHF.encode(
#     "What is AutoGen in abstract?", return_tensors="pt"
# ).to("cuda")
# print(input_ids)
chat = [{"role": "user", "content": "What is AutoGen in abstract?"}]
input_ids = tokenizer_loaded_fromHF.apply_chat_template(chat, return_tensors ='pt').to("cuda")
print(input_ids)

# Generate text
result = model_loaded_fromHF.generate(input_ids, max_length=300)
print(result)

# Decode and print the generated text
output_text = tokenizer_loaded_fromHF.decode(result[0])
print("Answer:")
print(output_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


tensor([[128000, 128006,    882, 128007,    271,   3923,    374,   9156,  10172,
            304,   8278,     30, 128009]], device='cuda:0')
tensor([[128000, 128006,    882, 128007,    271,   3923,    374,   9156,  10172,
            304,   8278,     30, 128009, 128006,  78191, 128007,    271,  13556,
          10172,    374,    459,   1825,  31874,   5507,    430,  27983,  28725,
           1787,   2082,    369,   5370,  15840,  15823,     11,   2737,    356,
             11,    356,  23240,   8102,     11,  13325,     11,    323,   3885,
             13,   1102,    596,   6319,    311,  69711,    279,   9886,    315,
          59177,     11,  69782,   2082,    430,    374,   3629,   2631,    304,
           3241,   4500,     11,   1778,    439,   1473,      9,   5830,  33728,
            323,  47728,    198,      9,   4703,  11850,    323,   4788,  11850,
           2082,    198,      9,  45565,   6170,    198,      9,  26230,   5865,
            323,   6989,    271,  13556,  10172, 