# Merge LoRA Adapter Weights and aply AWQ quantization

In [1]:
model_path = "/mnt/personal/mlynatom/thesis_models/it-Llama-3.1-8B-Instruct-mix_11_cs_en_alpaca_dolly"
#base_model_path = "/mnt/personal/mlynatom/thesis_models/cp_Llama-3.1-8B-full_cs_fineweb2_seed42_neptune_bs128_samples500000/merge_16bit"

## 16 bit merge v3

https://colab.research.google.com/drive/12c_sx8pIwiStqKr_7CF5BVwyyJpXmMTf?usp=sharing#scrollTo=c9yLWqKRKKyd

https://kaitchup.substack.com/p/lora-adapters-when-a-naive-merge

https://huggingface.co/docs/peft/developer_guides/lora#a-more-convenient-way

In [2]:
from peft import replace_lora_weights_loftq, get_peft_model, LoraConfig
from transformers import BitsAndBytesConfig, AutoModelForCausalLM, AutoTokenizer
import torch

bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16)
base_model = AutoModelForCausalLM.from_pretrained(base_model_path, device_map={"": 0}, quantization_config=bnb_config, torch_dtype=torch.bfloat16)
# note: don't pass init_lora_weights="loftq" or loftq_config!


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [3]:
### Benjamin Marie's code #### https://kaitchup.substack.com/p/lora-adapters-when-a-naive-merge ###
import torch
import peft
import json
import shutil
from peft.utils import _get_submodules
import os
import bitsandbytes as bnb
from bitsandbytes.functional import dequantize_4bit
from peft import PeftModel
from transformers import AutoModelForCausalLM, LlamaForCausalLM, LlamaTokenizer, BitsAndBytesConfig
import gc
import copy


def dequantize_model(model, to='./dequantized_model', dtype=torch.bfloat16, device="cuda"):
    """
    'model': the peftmodel you loaded with qlora.
    'tokenizer': the model's corresponding hf's tokenizer.
    'to': directory to save the dequantized model
    'dtype': dtype that the model was trained using
    'device': device to load the model to
    """


    os.makedirs(to, exist_ok=True)

    cls = bnb.nn.Linear4bit

    with torch.no_grad():
        for name, module in model.named_modules():
            if isinstance(module, cls):
                print(f"Dequantizing `{name}`...")
                quant_state = copy.deepcopy(module.weight.quant_state)
                quant_state.dtype = dtype

                weights = dequantize_4bit(module.weight.data, quant_state=quant_state, quant_type="nf4").to(dtype)

                new_module = torch.nn.Linear(module.in_features, module.out_features, bias=None, dtype=dtype)
                new_module.weight = torch.nn.Parameter(weights)
                new_module.to(device=device, dtype=dtype)

                parent, target, target_name = _get_submodules(model, name)
                setattr(parent, target_name, new_module)

        # a hack, setting this to avoid hf's saving error because hf
        # itself does not support saving a model that is registered to be loaded in 4bit.
        model.is_loaded_in_4bit = False

        print("Saving dequantized model...")
        model.save_pretrained(to)
        #tokenizer.save_pretrained(to)
        config_data = json.loads(open(os.path.join(to, 'config.json'), 'r').read())
        config_data.pop("quantization_config", None)
        config_data.pop("pretraining_tp", None)
        with open(os.path.join(to, 'config.json'), 'w') as config:
            config.write(json.dumps(config_data, indent=2))

        return model

In [4]:
import torch
import peft
import json
import shutil
from peft.utils import _get_submodules
import os
import bitsandbytes as bnb
from bitsandbytes.functional import dequantize_4bit
from peft import PeftModel
from transformers import AutoModelForCausalLM, LlamaForCausalLM, LlamaTokenizer, BitsAndBytesConfig
import gc
import copy

dtype = torch.bfloat16

try:
    #print(f"Starting to load the model {model_name} into memory")

    base_model = dequantize_model(base_model, to=f"{model_path}/merge_16bit_v3",dtype=dtype)
    print(base_model)
    model = PeftModel.from_pretrained(base_model, f"{model_path}/final")
    print(model)
    model = model.merge_and_unload()
    print(model)

    #print(f"Successfully loaded the model {model_name} into memory")
    model.save_pretrained(f"{model_path}/merge_16bit_v3", safe_serialization=True)
except Exception as e:
    print(f"An error occurred: {e}")

    # Delete the model object if it exists
    if 'model' in locals():
        del model

    # Clear the GPU cache
    torch.cuda.empty_cache()

    # Run the garbage collection
    gc.collect()

    print("Model, GPU cache, and garbage have been cleared.")

Dequantizing `model.layers.0.self_attn.q_proj`...
Dequantizing `model.layers.0.self_attn.k_proj`...
Dequantizing `model.layers.0.self_attn.v_proj`...
Dequantizing `model.layers.0.self_attn.o_proj`...
Dequantizing `model.layers.0.mlp.gate_proj`...
Dequantizing `model.layers.0.mlp.up_proj`...
Dequantizing `model.layers.0.mlp.down_proj`...
Dequantizing `model.layers.1.self_attn.q_proj`...
Dequantizing `model.layers.1.self_attn.k_proj`...
Dequantizing `model.layers.1.self_attn.v_proj`...
Dequantizing `model.layers.1.self_attn.o_proj`...
Dequantizing `model.layers.1.mlp.gate_proj`...
Dequantizing `model.layers.1.mlp.up_proj`...
Dequantizing `model.layers.1.mlp.down_proj`...
Dequantizing `model.layers.2.self_attn.q_proj`...
Dequantizing `model.layers.2.self_attn.k_proj`...
Dequantizing `model.layers.2.self_attn.v_proj`...
Dequantizing `model.layers.2.self_attn.o_proj`...
Dequantizing `model.layers.2.mlp.gate_proj`...
Dequantizing `model.layers.2.mlp.up_proj`...
Dequantizing `model.layers.2.m

In [5]:
tokenizer = AutoTokenizer.from_pretrained(f"{model_path}/final")
tokenizer.save_pretrained(f"{model_path}/merge_16bit_v3")

('/mnt/personal/mlynatom/thesis_models/it-cp_Llama-3.1-8B-full_fineweb2-cs_finewebedu-en_31_500k_seed42_samples500000-full_mix_11_cs_en/merge_16bit_v3/tokenizer_config.json',
 '/mnt/personal/mlynatom/thesis_models/it-cp_Llama-3.1-8B-full_fineweb2-cs_finewebedu-en_31_500k_seed42_samples500000-full_mix_11_cs_en/merge_16bit_v3/special_tokens_map.json',
 '/mnt/personal/mlynatom/thesis_models/it-cp_Llama-3.1-8B-full_fineweb2-cs_finewebedu-en_31_500k_seed42_samples500000-full_mix_11_cs_en/merge_16bit_v3/tokenizer.json')

## 16 bit merge v2

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import torch

model_name = model_path.split("/")[-1]

In [3]:
base_model = AutoModelForCausalLM.from_pretrained(base_model_path, device_map="auto", torch_dtype=torch.bfloat16)
model = PeftModel.from_pretrained(base_model, f"{model_path}/final")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [4]:
model = model.merge_and_unload()

In [5]:
model.save_pretrained(f"{model_path}/merge_16bit_v2")

In [6]:
tokenizer = AutoTokenizer.from_pretrained(f"{model_path}/final")
tokenizer.save_pretrained(f"{model_path}/merge_16bit_v2")

('/mnt/personal/mlynatom/thesis_models/it-cp_Llama-3.1-8B-full_cs_fineweb2_seed42_neptune_bs128_samples500000-full_cs_instruction_tuning_collection/merge_16bit_v2/tokenizer_config.json',
 '/mnt/personal/mlynatom/thesis_models/it-cp_Llama-3.1-8B-full_cs_fineweb2_seed42_neptune_bs128_samples500000-full_cs_instruction_tuning_collection/merge_16bit_v2/special_tokens_map.json',
 '/mnt/personal/mlynatom/thesis_models/it-cp_Llama-3.1-8B-full_cs_fineweb2_seed42_neptune_bs128_samples500000-full_cs_instruction_tuning_collection/merge_16bit_v2/tokenizer.json')

## 16 bit merge

In [2]:
from unsloth import FastLanguageModel
import torch

max_seq_length = 1024 # Choose any! We auto support RoPE Scaling internally!
dtype = torch.bfloat16 # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = False # Use 4bit quantization to reduce memory usage. Can be False.


model_name = model_path.split("/")[-1]

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = f"{model_path}/final",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.4.7: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    NVIDIA A100-SXM4-80GB. Num GPUs = 1. Max memory: 79.254 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Unsloth 2025.4.7 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [3]:
# Merge to 16bit - local
model.save_pretrained_merged(f"{model_path}/merge_16bit", tokenizer, save_method = "merged_16bit",)

Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 367.2 out of 503.53 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 32/32 [00:00<00:00, 63.01it/s]


Unsloth: Saving tokenizer... Done.
Done.


In [None]:
# #merge # to 16bit - push to hub
# model.push_to_hub_merged(f"aic/model", tokenizer, save_method = "merged_16bit", token = "")

In [None]:
# Merge to 4bit - local
# model.save_pretrained_merged(f"{model_path}/merge_4bit", tokenizer, save_method = "merged_4bit",)

In [None]:
# merge to 4bit - push to hub
#model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_4bit", token = "")

## AWQ

In [None]:
#%pip install autoawq

In [None]:
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

model_id = f"{model_path}/merge_16bit"
quant_path = f"{model_path}/awq"
quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" }

# Load model
model = AutoAWQForCausalLM.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

# Quantize
model.quantize(tokenizer, quant_config=quant_config)

# Save quantized model
model.save_quantized(quant_path)
tokenizer.save_pretrained(quant_path)

print(f'Model is quantized and saved at "{quant_path}"')

## Eval Perplexity differences

In [2]:
#from unsloth import FastLanguageModel
from datasets import load_dataset
import numpy as np
import torch
from torch.nn import CrossEntropyLoss
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset, load_from_disk
from tqdm import tqdm
import argparse
import os
class Perplexity:
    def __init__(self, model, tokenizer, device=None)->None:
        # self.model_id = model_id

        # #check device
        if device is not None:
            assert device in ["gpu", "cpu", "cuda"], "device should be either gpu or cpu."
            if device == "gpu":
                device = "cuda"
        else:
            device = "cuda" if torch.cuda.is_available() else "cpu"

        self.device = device
        
        # # #load model and move to desired device
        # # if load_in_16bit:
        # #     
        # # else:
        # #     self.model = AutoModelForCausalLM.from_pretrained(self.model_id)

        # #self.model = AutoModelForCausalLM.from_pretrained(self.model_id, torch_dtype=torch.bfloat16)
        # print(load_in_4bit, "load_in_4bit")

        # self.model, self.tokenizer = FastLanguageModel.from_pretrained(
        #     model_name = model_id, # YOUR MODEL YOU USED FOR TRAINING
        #     max_seq_length = 1024,
        #     dtype = torch.bfloat16,
        #     load_in_4bit = load_in_4bit,
        # )
        # FastLanguageModel.for_inference(self.model)


        # #self.model.to(device)


        # # #load tokenizer
        # #self.tokenizer = AutoTokenizer.from_pretrained(self.model_id)
        self.model = model
        self.tokenizer = tokenizer

    def compute(self, predictions, batch_size: int = 16, add_start_token: bool = True, max_length=None):
        # if batch_size > 1 (which generally leads to padding being required), and
        # if there is not an already assigned pad_token, assign an existing
        # special token to also be the padding token
        if self.tokenizer.pad_token is None and batch_size > 1:
            existing_special_tokens = list(self.tokenizer.special_tokens_map_extended.values())
            # check that the model already has at least one special token defined
            assert (
                len(existing_special_tokens) > 0
            ), "If batch_size > 1, model must have at least one special token to use for padding. Please use a different model or set batch_size=1."
            # assign one of the special tokens to also be the pad token
            self.tokenizer.add_special_tokens({"pad_token": existing_special_tokens[0]})

        if add_start_token and max_length:
            # leave room for <BOS> token to be added:
            assert (
                self.tokenizer.bos_token is not None
            ), "Input model must already have a BOS token if using add_start_token=True. Please use a different model, or set add_start_token=False"
            max_tokenized_len = max_length - 1
        else:
            max_tokenized_len = max_length

        ppls = []
        loss_fct = CrossEntropyLoss(reduction="none")

        for start_index in tqdm(range(0, len(predictions), batch_size)):
            end_index = min(start_index + batch_size, len(predictions))

            #compute encodings
            encodings = self.tokenizer(
                predictions[start_index:end_index],
                add_special_tokens=False,
                padding=True,
                truncation=True if max_tokenized_len else False,
                max_length=max_tokenized_len,
                return_tensors="pt",
                return_attention_mask=True,
            )

            encoded_batch = encodings["input_ids"]
            attn_mask = encodings["attention_mask"]


            # check that each input is long enough:
            if add_start_token:
                assert torch.all(torch.ge(attn_mask.sum(1), 1)), "Each input text must be at least one token long."
            else:
                assert torch.all(
                    torch.ge(attn_mask.sum(1), 2)
                ), "When add_start_token=False, each input text must be at least two tokens long. Run with add_start_token=True if inputting strings of only one token, and remove all empty input strings."

            if add_start_token:
                bos_tokens_tensor = torch.tensor([[self.tokenizer.bos_token_id]] * encoded_batch.size(dim=0))
                encoded_batch = torch.cat([bos_tokens_tensor, encoded_batch], dim=1)
                attn_mask = torch.cat([torch.ones(bos_tokens_tensor.size(), dtype=torch.int64), attn_mask], dim=1)


            #now move to gpu
            encoded_batch = encoded_batch.to(self.device)
            attn_mask = attn_mask.to(self.device)

            labels = encoded_batch

            with torch.no_grad():
                out_logits = self.model(encoded_batch, attention_mask=attn_mask).logits

            shift_logits = out_logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            shift_attention_mask_batch = attn_mask[..., 1:].contiguous()

            perplexity_batch = torch.exp(
                (loss_fct(shift_logits.transpose(1, 2), shift_labels) * shift_attention_mask_batch).sum(1)
                / shift_attention_mask_batch.sum(1)
            )

            ppls += perplexity_batch.tolist()

        return {"perplexities": ppls, "mean_perplexity": np.mean(ppls)}

In [2]:
dataset = load_dataset("HuggingFaceFW/fineweb-2", "ces_Latn", split="test")

Resolving data files:   0%|          | 0/25 [00:00<?, ?it/s]

In [4]:
model = AutoModelForCausalLM.from_pretrained(f"{model_path}/merge_16bit_v3", device_map="auto", torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(f"{model_path}/merge_16bit_v3", trust_remote_code=True)

perplexity_evaluator = Perplexity(model, tokenizer)
result = perplexity_evaluator.compute(dataset["text"][:100], 4, True, 1024)

print(result)


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

100%|██████████| 25/25 [00:15<00:00,  1.62it/s]

{'perplexities': [9.3125, 4.875, 10.75, 9.3125, 10.125, 8.25, 8.625, 9.1875, 9.1875, 11.8125, 6.53125, 12.9375, 9.9375, 9.5, 9.8125, 5.9375, 16.875, 9.3125, 17.125, 7.625, 10.9375, 10.5625, 10.9375, 6.125, 6.0625, 14.25, 6.625, 11.625, 11.0625, 12.5625, 24.625, 7.5, 27.5, 23.875, 9.625, 9.9375, 7.375, 7.0625, 7.5, 7.15625, 13.1875, 9.9375, 8.5, 16.625, 5.09375, 9.0625, 11.4375, 17.5, 10.9375, 135.0, 13.8125, 6.84375, 9.1875, 7.34375, 24.25, 11.4375, 8.625, 14.4375, 18.25, 8.75, 6.53125, 12.0, 15.1875, 7.0625, 8.25, 9.1875, 11.4375, 9.9375, 9.5, 11.625, 15.625, 11.625, 6.21875, 12.1875, 14.0, 47.5, 20.125, 12.0, 24.625, 10.75, 4.71875, 10.25, 11.625, 6.9375, 10.75, 7.75, 8.25, 11.8125, 17.5, 7.625, 13.1875, 11.25, 9.3125, 8.9375, 14.0, 8.125, 9.5, 17.5, 6.1875, 7.28125], 'mean_perplexity': 12.6803125}





In [5]:
model = AutoModelForCausalLM.from_pretrained(f"{model_path}/merge_16bit_v2", device_map="auto", torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(f"{model_path}/merge_16bit_v2")
perplexity_evaluator = Perplexity(model, tokenizer)
result = perplexity_evaluator.compute(dataset["text"][:100], 4, True, 1024)

print(result)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

100%|██████████| 25/25 [00:15<00:00,  1.65it/s]

{'perplexities': [9.3125, 4.875, 10.75, 9.3125, 10.125, 8.25, 8.625, 9.1875, 9.1875, 11.8125, 6.53125, 12.9375, 9.9375, 9.5, 9.8125, 5.9375, 16.875, 9.3125, 17.125, 7.625, 10.9375, 10.5625, 10.9375, 6.125, 6.0625, 14.25, 6.625, 11.625, 11.0625, 12.5625, 24.625, 7.5, 27.5, 23.875, 9.625, 9.9375, 7.375, 7.0625, 7.5, 7.15625, 13.1875, 9.9375, 8.5, 16.625, 5.09375, 9.0625, 11.4375, 17.5, 10.9375, 135.0, 13.8125, 6.84375, 9.1875, 7.34375, 24.25, 11.4375, 8.625, 14.4375, 18.25, 8.75, 6.53125, 12.0, 15.1875, 7.0625, 8.25, 9.1875, 11.4375, 9.9375, 9.5, 11.625, 15.625, 11.625, 6.21875, 12.1875, 14.0, 47.5, 20.125, 12.0, 24.625, 10.75, 4.71875, 10.25, 11.625, 6.9375, 10.75, 7.75, 8.25, 11.8125, 17.5, 7.625, 13.1875, 11.25, 9.3125, 8.9375, 14.0, 8.125, 9.5, 17.5, 6.1875, 7.28125], 'mean_perplexity': 12.6803125}





In [None]:
model = AutoModelForCausalLM.from_pretrained(f"{model_path}/merge_16bit", device_map="auto", torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(f"{model_path}/merge_16bit")
perplexity_evaluator = Perplexity(model, tokenizer)
result = perplexity_evaluator.compute(dataset["text"][:100], 4, True, 1024)

print(result)

In [7]:
from unsloth import FastLanguageModel

model, tokenizer = FastLanguageModel.from_pretrained(
            model_name = f"{model_path}/final", # YOUR MODEL YOU USED FOR TRAINING
            max_seq_length = 1024,
            dtype = torch.bfloat16,
            load_in_4bit = False,
)
FastLanguageModel.for_inference(model)

perplexity_evaluator = Perplexity(model, tokenizer)
result = perplexity_evaluator.compute(dataset["text"][:100], 4, True, 1024)

print(result)

==((====))==  Unsloth 2025.4.7: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    NVIDIA A100-SXM4-80GB. Num GPUs = 1. Max memory: 79.254 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Unsloth 2025.4.7 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.
100%|██████████| 25/25 [00:20<00:00,  1.20it/s]

{'perplexities': [9.3125, 4.8125, 10.9375, 9.3125, 10.125, 8.125, 8.5, 8.375, 9.1875, 11.625, 6.3125, 12.9375, 9.5, 9.3125, 9.8125, 5.96875, 16.125, 9.8125, 16.875, 7.625, 11.0625, 9.9375, 10.4375, 6.1875, 6.0625, 14.25, 6.5625, 11.25, 11.0625, 12.375, 23.875, 7.5, 26.625, 23.125, 9.625, 9.9375, 7.375, 7.0625, 7.28125, 7.0, 13.1875, 9.9375, 8.25, 16.125, 5.15625, 8.0, 10.9375, 17.125, 10.9375, 123.0, 13.1875, 6.28125, 9.0625, 7.21875, 22.75, 11.625, 8.625, 13.8125, 18.0, 8.5, 6.71875, 11.8125, 15.1875, 7.0625, 8.375, 9.0625, 11.25, 9.3125, 9.625, 11.8125, 14.6875, 11.625, 6.40625, 12.1875, 14.0, 46.75, 20.375, 12.375, 22.75, 10.5625, 4.71875, 9.9375, 11.4375, 6.125, 10.25, 7.5, 8.25, 11.8125, 16.875, 7.75, 13.1875, 10.4375, 9.3125, 8.625, 14.0, 8.125, 9.3125, 17.5, 6.1875, 7.21875], 'mean_perplexity': 12.3540625}





### Other

In [9]:
dataset = load_dataset("HuggingFaceFW/fineweb-2", "ces_Latn", split="test")

perplexity_evaluator = Perplexity(model_id="/mnt/personal/mlynatom/thesis_models/cp_Llama-3.1-8B-full_fineweb2-cs_finewebedu-en_31_500k_seed42_samples500000/merge_16bit", load_in_4bit=False)
result = perplexity_evaluator.compute(dataset["text"][:100], 4, True, 1024)

print(result)

Resolving data files:   0%|          | 0/25 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

100%|██████████| 25/25 [00:14<00:00,  1.67it/s]

{'perplexities': [8.25, 4.4375, 9.8125, 8.5, 8.5, 7.375, 7.75, 6.6875, 7.625, 9.8125, 5.75, 12.5625, 8.0, 8.5, 8.25, 5.5, 14.25, 7.625, 14.9375, 6.84375, 9.5, 5.25, 9.5, 5.53125, 5.5625, 12.1875, 5.84375, 9.8125, 9.9375, 10.9375, 20.375, 6.78125, 25.0, 18.25, 8.25, 9.1875, 6.71875, 6.5625, 6.3125, 6.0625, 10.9375, 9.1875, 6.78125, 13.8125, 4.625, 7.375, 9.625, 14.9375, 9.5, 105.0, 10.5625, 5.40625, 8.125, 6.53125, 18.25, 10.25, 7.625, 11.4375, 16.625, 6.03125, 5.84375, 11.0625, 13.375, 6.0625, 7.625, 7.875, 9.3125, 8.5, 8.5, 10.75, 12.375, 10.5625, 5.53125, 10.9375, 12.0, 18.0, 16.375, 10.75, 18.25, 9.8125, 4.09375, 9.0625, 10.4375, 5.5625, 7.5, 6.875, 7.34375, 10.25, 13.5625, 6.46875, 10.4375, 8.25, 8.25, 7.625, 12.375, 7.375, 8.0, 12.9375, 5.40625, 6.28125], 'mean_perplexity': 10.4275}





In [None]:
dataset = load_dataset("HuggingFaceFW/fineweb-2", "ces_Latn", split="test")

perplexity_evaluator = Perplexity(model_id="/mnt/personal/mlynatom/thesis_models/cp_Llama-3.1-8B-full_fineweb2-cs_finewebedu-en_31_500k_seed42_samples500000/final", load_in_4bit=False)
result = perplexity_evaluator.compute(dataset["text"][:100], 4, True, 1024)

print(result)

Resolving data files:   0%|          | 0/25 [00:00<?, ?it/s]

False load_in_4bit
==((====))==  Unsloth 2025.4.7: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    NVIDIA A100-SXM4-80GB. Num GPUs = 1. Max memory: 79.254 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Unsloth 2025.4.7 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.
100%|██████████| 25/25 [00:21<00:00,  1.19it/s]

{'perplexities': [8.25, 4.4375, 9.9375, 8.5, 8.5, 7.375, 7.75, 6.5625, 7.625, 9.9375, 5.75, 12.5625, 8.0, 8.5, 8.25, 5.53125, 14.25, 8.0, 15.375, 6.84375, 9.625, 5.25, 9.5, 5.53125, 5.53125, 12.375, 5.84375, 9.8125, 9.9375, 10.9375, 20.375, 6.78125, 25.0, 18.625, 8.25, 9.1875, 6.6875, 6.5625, 6.28125, 6.125, 10.9375, 9.1875, 6.78125, 13.8125, 4.59375, 7.375, 9.625, 14.9375, 9.5, 112.0, 10.9375, 5.4375, 8.25, 6.40625, 18.875, 10.125, 7.625, 11.4375, 16.625, 6.0625, 5.8125, 11.0625, 13.1875, 6.0625, 7.625, 7.875, 9.3125, 8.5, 8.625, 10.75, 12.5625, 10.5625, 5.53125, 10.9375, 12.375, 18.0, 16.375, 11.0625, 18.25, 9.8125, 4.09375, 9.1875, 10.4375, 5.5625, 7.5, 7.0625, 7.34375, 10.25, 13.5625, 6.46875, 10.5625, 8.375, 8.25, 7.625, 12.375, 7.34375, 8.0, 13.8125, 5.40625, 6.28125], 'mean_perplexity': 10.5446875}





In [3]:
from peft import PeftModel

base_model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B", device_map="auto", torch_dtype=torch.bfloat16)
model = PeftModel.from_pretrained(base_model, "/mnt/personal/mlynatom/thesis_models/cp_Llama-3.1-8B-full_fineweb2-cs_finewebedu-en_31_500k_seed42_samples500000/final")
tokenizer = AutoTokenizer.from_pretrained("/mnt/personal/mlynatom/thesis_models/cp_Llama-3.1-8B-full_fineweb2-cs_finewebedu-en_31_500k_seed42_samples500000/final")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [4]:
dataset = load_dataset("HuggingFaceFW/fineweb-2", "ces_Latn", split="test")

perplexity_evaluator = Perplexity(model, tokenizer)
result = perplexity_evaluator.compute(dataset["text"][:100], 4, True, 1024)

print(result)

Resolving data files:   0%|          | 0/25 [00:00<?, ?it/s]

100%|██████████| 25/25 [00:26<00:00,  1.04s/it]

{'perplexities': [8.25, 4.375, 9.625, 8.5, 8.5, 7.15625, 7.625, 6.125, 7.625, 9.0625, 5.65625, 12.1875, 7.75, 8.375, 8.25, 5.5, 13.8125, 7.0, 14.6875, 6.84375, 9.1875, 5.125, 9.3125, 5.53125, 5.5625, 12.1875, 5.75, 9.625, 9.625, 10.4375, 19.5, 6.78125, 23.875, 17.125, 8.25, 9.1875, 6.71875, 6.46875, 6.125, 5.96875, 10.4375, 9.0625, 6.625, 13.375, 4.53125, 6.84375, 9.5, 14.9375, 9.5, 96.0, 9.9375, 4.71875, 7.875, 6.28125, 17.75, 10.25, 7.625, 10.75, 15.875, 5.65625, 5.3125, 11.0625, 13.375, 5.5625, 7.375, 7.875, 9.1875, 8.375, 8.5, 10.25, 12.1875, 10.4375, 5.53125, 10.9375, 11.0625, 18.0, 16.375, 10.4375, 16.875, 9.5, 4.0625, 8.75, 10.125, 5.5, 7.09375, 6.71875, 7.34375, 10.25, 12.75, 5.96875, 10.4375, 7.625, 8.25, 7.375, 12.375, 7.375, 7.75, 11.8125, 5.40625, 6.1875], 'mean_perplexity': 10.0815625}





In [None]:
merged_model =  model.merge_and_unload()
dataset = load_dataset("HuggingFaceFW/fineweb-2", "ces_Latn", split="test")

perplexity_evaluator = Perplexity(merged_model, tokenizer)
result = perplexity_evaluator.compute(dataset["text"][:100], 4, True, 1024)

print(result)

Resolving data files:   0%|          | 0/25 [00:00<?, ?it/s]

100%|██████████| 25/25 [00:15<00:00,  1.65it/s]

{'perplexities': [8.25, 4.4375, 9.625, 8.5, 8.5, 7.15625, 7.625, 6.125, 7.625, 8.9375, 5.65625, 12.1875, 7.75, 8.375, 8.25, 5.5, 13.5625, 7.0, 14.6875, 6.84375, 9.1875, 5.125, 9.3125, 5.53125, 5.5625, 12.1875, 5.75, 9.625, 9.5, 10.4375, 19.5, 6.78125, 23.875, 17.125, 8.25, 9.1875, 6.71875, 6.46875, 6.125, 5.9375, 10.4375, 9.1875, 6.625, 13.375, 4.53125, 6.9375, 9.5, 14.9375, 9.5, 96.0, 9.9375, 4.6875, 7.875, 6.28125, 17.125, 10.25, 7.625, 10.75, 15.875, 5.65625, 5.3125, 11.0625, 13.375, 5.65625, 7.375, 7.875, 9.1875, 8.25, 8.5, 10.25, 12.1875, 10.4375, 5.53125, 10.9375, 10.9375, 18.0, 16.375, 10.4375, 16.875, 9.5, 4.09375, 8.75, 10.125, 5.4375, 7.15625, 6.71875, 7.34375, 10.25, 12.75, 5.96875, 10.4375, 7.625, 8.25, 7.375, 12.375, 7.375, 7.75, 12.0, 5.40625, 6.0625], 'mean_perplexity': 10.071875}





Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/25 [00:00<?, ?it/s]

100%|██████████| 25/25 [00:15<00:00,  1.61it/s]

{'perplexities': [8.25, 4.4375, 9.625, 8.5, 8.5, 7.15625, 7.625, 6.125, 7.625, 8.9375, 5.65625, 12.1875, 7.75, 8.375, 8.25, 5.5, 13.5625, 7.0, 14.6875, 6.84375, 9.1875, 5.125, 9.3125, 5.53125, 5.5625, 12.1875, 5.75, 9.625, 9.5, 10.4375, 19.5, 6.78125, 23.875, 17.125, 8.25, 9.1875, 6.71875, 6.46875, 6.125, 5.9375, 10.4375, 9.1875, 6.625, 13.375, 4.53125, 6.9375, 9.5, 14.9375, 9.5, 96.0, 9.9375, 4.6875, 7.875, 6.28125, 17.125, 10.25, 7.625, 10.75, 15.875, 5.65625, 5.3125, 11.0625, 13.375, 5.65625, 7.375, 7.875, 9.1875, 8.25, 8.5, 10.25, 12.1875, 10.4375, 5.53125, 10.9375, 10.9375, 18.0, 16.375, 10.4375, 16.875, 9.5, 4.09375, 8.75, 10.125, 5.4375, 7.15625, 6.71875, 7.34375, 10.25, 12.75, 5.96875, 10.4375, 7.625, 8.25, 7.375, 12.375, 7.375, 7.75, 12.0, 5.40625, 6.0625], 'mean_perplexity': 10.071875}



