In [None]:
from evaluate import load

perplexity = load("perplexity", module_type="metric")

In [4]:
predictions = ["This is a test", "This is another test", "Toto je test", "asdfasdf"]

model_id = "meta-llama/Llama-3.2-3B-Instruct"

results = perplexity.compute(predictions=predictions, model_id=model_id)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

In [16]:
results

{'perplexities': [44.88575744628906,
  212.6642303466797,
  5619.5263671875,
  1284.4708251953125],
 'mean_perplexity': np.float64(1790.3867950439453)}

In [6]:
from datasets import load_dataset

model_id = "meta-llama/Llama-3.2-3B-Instruct"

cs_dataset = load_dataset("HuggingFaceFW/fineweb-2", "ces_Latn", split="test")
cs_dataset

TypeError: ParquetConfig.__init__() got an unexpected keyword argument 'subset'

In [2]:
# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Perplexity Metric."""

import numpy as np
import torch
from torch.nn import CrossEntropyLoss
from transformers import AutoModelForCausalLM, AutoTokenizer

from tqdm import tqdm

class Perplexity:
    def __init__(self, model_id:str, device:str=None, load_in_16bit=False)->None:
        self.model_id = model_id

        #check device
        if device is not None:
            assert device in ["gpu", "cpu", "cuda"], "device should be either gpu or cpu."
            if device == "gpu":
                device = "cuda"
        else:
            device = "cuda" if torch.cuda.is_available() else "cpu"

        self.device = device
        
        #load model and move to desired device
        if load_in_16bit:
            self.model = AutoModelForCausalLM.from_pretrained(self.model_id, torch_dtype=torch.float16)
        else:
            self.model = AutoModelForCausalLM.from_pretrained(self.model_id)
            
        self.model.to(device)

        #load tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_id)

    def compute(self, predictions, batch_size: int = 16, add_start_token: bool = True, max_length=None):
        # if batch_size > 1 (which generally leads to padding being required), and
        # if there is not an already assigned pad_token, assign an existing
        # special token to also be the padding token
        if self.tokenizer.pad_token is None and batch_size > 1:
            existing_special_tokens = list(self.tokenizer.special_tokens_map_extended.values())
            # check that the model already has at least one special token defined
            assert (
                len(existing_special_tokens) > 0
            ), "If batch_size > 1, model must have at least one special token to use for padding. Please use a different model or set batch_size=1."
            # assign one of the special tokens to also be the pad token
            self.tokenizer.add_special_tokens({"pad_token": existing_special_tokens[0]})

        if add_start_token and max_length:
            # leave room for <BOS> token to be added:
            assert (
                self.tokenizer.bos_token is not None
            ), "Input model must already have a BOS token if using add_start_token=True. Please use a different model, or set add_start_token=False"
            max_tokenized_len = max_length - 1
        else:
            max_tokenized_len = max_length

        ppls = []
        loss_fct = CrossEntropyLoss(reduction="none")

        for start_index in tqdm(range(0, len(predictions), batch_size)):
            end_index = min(start_index + batch_size, len(predictions))

            #compute encodings
            encodings = self.tokenizer(
                predictions[start_index:end_index],
                add_special_tokens=False,
                padding=True,
                truncation=True if max_tokenized_len else False,
                max_length=max_tokenized_len,
                return_tensors="pt",
                return_attention_mask=True,
            )

            encoded_batch = encodings["input_ids"]
            attn_mask = encodings["attention_mask"]


            # check that each input is long enough:
            if add_start_token:
                assert torch.all(torch.ge(attn_mask.sum(1), 1)), "Each input text must be at least one token long."
            else:
                assert torch.all(
                    torch.ge(attn_mask.sum(1), 2)
                ), "When add_start_token=False, each input text must be at least two tokens long. Run with add_start_token=True if inputting strings of only one token, and remove all empty input strings."

            if add_start_token:
                bos_tokens_tensor = torch.tensor([[self.tokenizer.bos_token_id]] * encoded_batch.size(dim=0))
                encoded_batch = torch.cat([bos_tokens_tensor, encoded_batch], dim=1)
                attn_mask = torch.cat([torch.ones(bos_tokens_tensor.size(), dtype=torch.int64), attn_mask], dim=1)


            #now move to gpu
            encoded_batch = encoded_batch.to(self.device)
            attn_mask = attn_mask.to(self.device)

            labels = encoded_batch

            with torch.no_grad():
                out_logits = self.model(encoded_batch, attention_mask=attn_mask).logits

            shift_logits = out_logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            shift_attention_mask_batch = attn_mask[..., 1:].contiguous()

            perplexity_batch = torch.exp(
                (loss_fct(shift_logits.transpose(1, 2), shift_labels) * shift_attention_mask_batch).sum(1)
                / shift_attention_mask_batch.sum(1)
            )

            ppls += perplexity_batch.tolist()

        return {"perplexities": ppls, "mean_perplexity": np.mean(ppls)}

In [3]:
perplexity_evaluator = Perplexity("meta-llama/Llama-3.2-3B-Instruct", device="cuda", load_in_16bit=True)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
perplexity_evaluator.compute(predictions= ["This is a test", "This is another test", "Toto je test", "asdfasdf"])

100%|██████████| 1/1 [00:02<00:00,  2.34s/it]


{'perplexities': [44.90625, 213.375, 5612.0, 1287.0],
 'mean_perplexity': np.float64(1789.3203125)}

In [5]:
cs_res = perplexity_evaluator.compute(predictions=cs_dataset["text"], batch_size=4, max_length=4096)

  4%|▍         | 290/6658 [02:47<1:01:16,  1.73it/s]


KeyboardInterrupt: 

In [2]:
en_dataset = load_dataset("HuggingFaceFW/fineweb", "sample-10BT")

README.md:   0%|          | 0.00/40.3k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/25428 [00:00<?, ?it/s]

000_00000.parquet:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

001_00000.parquet:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

002_00000.parquet:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

003_00000.parquet:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

004_00000.parquet:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

005_00000.parquet:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

006_00000.parquet:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

007_00000.parquet:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

008_00000.parquet:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

009_00000.parquet:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

010_00000.parquet:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

011_00000.parquet:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

012_00000.parquet:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

013_00000.parquet:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

014_00000.parquet:   0%|          | 0.00/576M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Loading dataset shards:   0%|          | 0/102 [00:00<?, ?it/s]

In [5]:
en_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'id', 'dump', 'url', 'date', 'file_path', 'language', 'language_score', 'token_count'],
        num_rows: 14873731
    })
})