[Feature] Support inference ppl datasets (#1315)
* commit inference ppl datasets

* revised format

* revise
Quehry committed Jul 22, 2024
1 parent e938482 commit a244453
Showing 12 changed files with 662 additions and 0 deletions.
26 changes: 26 additions & 0 deletions configs/datasets/inference_ppl/README.md
@@ -0,0 +1,26 @@
# Inference-PPL Datasets

- **Description**: Compute the loss only on the labeled positions; intended especially for reasoning corpora.
- **Datasets**: cn-reasoning-val.jsonl (an example dataset; inference-ppl generalizes to other corpora). An illustrative data line is shown below.
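
An illustrative data line (a sketch, not drawn from the actual dataset; the `text` field matches the reader config, while the `label` field name and span values here are assumptions, each span being a `(start, end, 1)` tuple of character offsets to score):

```json
{"text": "因此,两数之和为 7。", "label": [[9, 10, 1]]}
```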

# PPL Computation

$$ \text{ppl} = - \frac{1}{n} \sum_{i=0}^{n-1} \sum_{c=0}^{V-1} y_{i,c} \log p_{i,c} \tag{1} $$

where $n$ is the number of tokens, $V$ the vocabulary size, $y_{i,c}$ the one-hot target, and $p_{i,c}$ the predicted probability. Eq. (1) is the standard mean PPL computed over all positions; for inference-ppl, the same average is taken only over the pre-labeled positions.
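
As a minimal sketch of this restriction (illustrative only, not the repository's implementation), the per-token losses are masked before averaging:

```python
import torch
import torch.nn.functional as F

def labeled_mean_nll(logits: torch.Tensor, targets: torch.Tensor,
                     labeled: torch.Tensor) -> torch.Tensor:
    """Mean negative log-likelihood over labeled positions only.

    logits: [seq_len, vocab_size] raw model outputs
    targets: [seq_len] gold token ids
    labeled: [seq_len] boolean mask, True at pre-labeled positions
    """
    per_token = F.cross_entropy(logits, targets, reduction='none')
    return (per_token * labeled).sum() / labeled.sum()
```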

# Quick Start

```shell
cd opencompass
python run.py configs/eval_inference_ppl.py
```

# Some Results

| Model | Result |
| ----------- | ----------- |
| Qwen1.5-7b | 0.59 |
| Qwen1.5-14b | 0.54 |
| Llama2-7b | 0.49 |
| Llama2-13b | 0.43 |
38 changes: 38 additions & 0 deletions configs/datasets/inference_ppl/inference_ppl.py
@@ -0,0 +1,38 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import InferencePPLOnlyInferencer
from opencompass.openicl.icl_evaluator import AverageInferencePPLEvaluator

from opencompass.datasets import InferencePPLDataset

# Build InferencePPLDataset
inference_ppl_datasets = []

llm_cmp_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template='{text}',
    ),
    # No in-context examples, so use ZeroRetriever
    retriever=dict(type=ZeroRetriever),
    # Compute inference-ppl
    inferencer=dict(type=InferencePPLOnlyInferencer),
)

# Average the inference-ppl scores
llm_cmp_eval_cfg = dict(evaluator=dict(type=AverageInferencePPLEvaluator))

inference_ppl_datasets.append(
    dict(
        abbr='inference-ppl',
        type=InferencePPLDataset,
        path='./data/inference_ppl',
        name='cn-reasoning-val',
        samples=None,  # Set to a small number of samples for testing
        reader_cfg=dict(
            input_columns=['text'],
            output_column=None,
        ),
        infer_cfg=llm_cmp_infer_cfg,
        eval_cfg=llm_cmp_eval_cfg,
    ))
62 changes: 62 additions & 0 deletions configs/eval_inference_ppl.py
@@ -0,0 +1,62 @@
from mmengine.config import read_base

with read_base():
    # Inference PPL datasets
    from .datasets.inference_ppl.inference_ppl import inference_ppl_datasets

    # Model configs
    from .models.qwen.hf_qwen1_5_7b import models as qwen1_5_7b
    from .models.qwen.hf_qwen1_5_14b import models as qwen1_5_14b
    from .models.hf_llama.hf_llama2_7b import models as llama2_7b
    from .models.hf_llama.hf_llama2_13b import models as llama2_13b


from opencompass.partitioners import NaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask


# -------------Inference Stage ----------------------------------------

datasets = [*inference_ppl_datasets]
workdir = 'outputs/inference_ppl'

models = [
    *qwen1_5_7b,
    *qwen1_5_14b,
    *llama2_7b,
    *llama2_13b,
]



# Set custom batch_size and num_gpus for faster loss calculation
# A smaller batch_size should give more precise results, at the cost of lower efficiency
model_cfg = dict(
    batch_size=8,
    run_cfg=dict(num_gpus=4, num_procs=1),
)

for mdl in models:
    mdl.update(model_cfg)


infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(
        type=LocalRunner,
        task=dict(type=OpenICLInferTask),
        max_num_workers=256,  # Maximum number of concurrent inference tasks
    ),
)


# -------------Evaluation Stage ----------------------------------------
eval = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(
        type=LocalRunner,
        task=dict(type=OpenICLEvalTask),
        max_num_workers=256,
    )
)
1 change: 1 addition & 0 deletions opencompass/datasets/__init__.py
@@ -53,6 +53,7 @@
from .humanevalx import * # noqa: F401, F403
from .hungarian_math import * # noqa: F401, F403
from .IFEval.ifeval import IFEvalDataset, IFEvaluator # noqa: F401, F403
from .inference_ppl import InferencePPLDataset # noqa: F401, F403
from .infinitebench import * # noqa: F401, F403
from .iwslt2017 import * # noqa: F401, F403
from .jigsawmultilingual import * # noqa: F401, F403
37 changes: 37 additions & 0 deletions opencompass/datasets/inference_ppl.py
@@ -0,0 +1,37 @@
import os.path as osp
from typing import Optional

from datasets import load_dataset

from opencompass.registry import LOAD_DATASET

from .base import BaseDataset


@LOAD_DATASET.register_module()
class InferencePPLDataset(BaseDataset):

    @staticmethod
    def load(path: str, name: Optional[str] = None,
             samples: Optional[int] = None):

        # Check whether the file exists at the given path
        supported_extensions = ['jsonl']
        for ext in supported_extensions:
            filename = osp.join(
                path, f'{name}.{ext}')  # `name` refers to the data subset name

            if osp.exists(filename):
                break
        else:
            raise FileNotFoundError(f'{filename} not found.')

        samples = 'test' if samples is None else f'test[:{samples}]'

        data_files = {'test': filename}

        dataset = load_dataset('json', data_files=data_files, split=samples)

        # Filter out empty samples
        dataset = dataset.filter(lambda example: len(example['text']) > 0)

        return dataset
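
# Hypothetical usage (an illustrative sketch; the path and subset name follow
# the config above, and `samples` limits the split for quick tests):
#   ds = InferencePPLDataset.load(path='./data/inference_ppl',
#                                 name='cn-reasoning-val', samples=8)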
36 changes: 36 additions & 0 deletions opencompass/models/base.py
@@ -85,6 +85,28 @@ def get_ppl(self,
                                  ' ppl-based evaluation yet, try gen-based '
                                  'instead.')

    @abstractmethod
    def get_ppl_tokenwise(
            self,
            inputs: List[str],
            label: List[List[int]],
            mask_length: Optional[List[int]] = None) -> List[float]:
        """Get tokenwise perplexity scores given a list of inputs.

        Args:
            inputs (List[str]): A list of strings.
            label (List[List[int]]): Spans to score for each input, each
                given as a (start, end, 1) tuple of character offsets.
            mask_length (Optional[List[int]]): A list of mask lengths. If
                provided, the perplexity scores will be calculated with the
                first mask_length[i] tokens masked out. It's okay to skip
                its implementation if advanced features in PPLInferencer are
                not needed.

        Returns:
            List[float]: A list of perplexity scores.
        """
        raise NotImplementedError(f'{self.__class__.__name__} does not support'
                                  ' ppl-based evaluation yet, try gen-based '
                                  'instead.')

    @abstractmethod
    def encode(self, prompt: str) -> torch.Tensor:
        """Encode prompt to tokens. Not necessary for most cases.
@@ -151,6 +173,20 @@ def get_ppl_from_template(self,
        inputs = self.parse_template(templates, mode='ppl')
        return self.get_ppl(inputs, mask_length)

    def get_ppl_tokenwise_from_template(self,
                                        templates: List[PromptType],
                                        label: List[List[int]],
                                        mask_length=None):
        """Get token-wise perplexity given a list of templates.

        Args:
            templates (List[PromptType]): A list of templates.
            label (List[List[int]]): Spans to score for each input.
            mask_length (List[int]): A list of mask lengths. If provided, the
                perplexity will be calculated only on the unmasked tokens.
        """
        inputs = self.parse_template(templates, mode='ppl')
        return self.get_ppl_tokenwise(inputs, label, mask_length)

    def generate_from_template(self, templates: List[PromptType],
                               max_out_len: int, **kwargs):
        """Generate completion from a list of templates.
159 changes: 159 additions & 0 deletions opencompass/models/huggingface_above_v4_33.py
@@ -226,6 +226,165 @@ def _load_model(self, path: str, kwargs: dict, peft_path: Optional[str] = None,
        self.model.eval()
        self.model.generation_config.do_sample = False


    def get_ppl_tokenwise(
            self,
            inputs: List[str],
            label: List[List[int]],
            mask_length: Optional[List[int]] = None) -> List[float]:
        """Get inference-ppl per token given a list of inputs and labels.

        Args:
            inputs (List[str]): A list of strings.
            label (List[List[int]]): A list of label lists; each label is a
                tuple of (start, end, 1) character offsets.
            mask_length (Optional[List[int]]): A list of mask lengths. If
                provided, the perplexity scores will be calculated with the
                first mask_length[i] tokens masked out. It's okay to skip
                its implementation if advanced features in PPLInferencer are
                not needed.

        Returns:
            List[float]: A list of perplexity scores.
        """
        assert self.tokenizer.pad_token
        import torch
        import torch.nn.functional as F
        pad_token_id = self.tokenizer.pad_token_id
        messages = _convert_base_messages(inputs)

        tokenize_kwargs = dict(
            return_tensors='pt',
            padding=True,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_seq_len,
        )

        self.tokenizer.padding_side = 'right'
        self.tokenizer.truncation_side = 'right'

        tokens = self.tokenizer.batch_encode_plus(messages, **tokenize_kwargs)

        tokens = {k: v.to(self.model.device) for k, v in tokens.items()}
        outputs = self.model(**tokens)[0]

        batch_size, seq_len, vocab_size = outputs.shape
        shift_logits = outputs[:, :-1, :].contiguous().float()
        shift_labels = tokens['input_ids'][:, 1:].contiguous()
        loss = F.cross_entropy(
            shift_logits.view(-1, vocab_size),
            shift_labels.view(-1),
            ignore_index=pad_token_id,
            reduction='none').view(batch_size, seq_len - 1)
        lens = (tokens['input_ids'] != pad_token_id).sum(-1).cpu().numpy()

        if mask_length is not None:
            import numpy as np
            mask = torch.zeros_like(shift_labels)  # [batch, seq_len]
            for i in range(len(mask)):
                for j in range(mask_length[i] - 1, len(mask[i])):
                    mask[i][j] = 1
            loss = loss * mask
            lens -= np.array(mask_length)

        loss = loss.cpu().numpy()

        decode_messages = [[self.tokenizer.decode([input_id])
                            for input_id in token]
                           for token in tokens['input_ids']]
        char_messages = [[ch for ch in message] for message in messages]

        # Shift the decoded tokens to align labels with losses
        for i in range(len(decode_messages)):
            decode_messages[i] = decode_messages[i][1:]

        aggregated_label_list = [[] for _ in range(len(decode_messages))]

        tag_list = [[] for _ in range(len(decode_messages))]

        for tmp_index, label_list in enumerate(label):
            for single_label in label_list:
                left = single_label[0]
                right = single_label[1]
                for i in range(left, right):
                    aggregated_label_list[tmp_index].append(i)


        def align_sequences(seq1, seq2, sep_len):
            """Align decoded tokens with the original characters.

            seq1: decoded sequence from tokens; one token may cover
                multiple characters
            seq2: original sequence of separate characters
            """
            i, j = 0, 0
            matched_pairs = []
            while i < len(seq1) and j < len(seq2):
                word = seq1[i]
                if len(word) == 0:
                    matched_pairs.append((word, []))
                    i += 1
                    continue

                if '\ufffd' in word:
                    # A replacement character marks a multi-byte character
                    # split across sep_len tokens: consume sep_len tokens
                    # for one original character
                    for _ in range(sep_len):
                        matched_pairs.append((word, [j]))
                        i += 1
                    j += 1
                    continue

                char_sequence = ''
                while j < len(seq2) and (char_sequence != word):
                    char_sequence += seq2[j]
                    if char_sequence == word:
                        matched_pairs.append(
                            (word, list(range(j - len(word) + 1, j + 1))))
                        j += 1
                        break
                    elif len(char_sequence) > len(word):
                        if word == char_sequence[-len(word):]:
                            matched_pairs.append(
                                (word, list(range(j - len(word) + 1, j + 1))))
                            j += 1
                            break
                        else:
                            j += 1
                    else:
                        j += 1
                i += 1

            return matched_pairs
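
        # Illustrative example of the alignment (not part of the original
        # diff): two decoded tokens mapped back to character positions:
        #   align_sequences(['你好', '世界'], list('你好世界'), sep_len=3)
        #   -> [('你好', [0, 1]), ('世界', [2, 3])]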



        if 'qwen' in self.path or 'Qwen' in self.path:
            sep_len = 2
        elif 'Llama-3' in self.path:
            sep_len = 2
        elif 'Yi' in self.path:
            sep_len = 3
        elif 'Llama-2' in self.path:
            sep_len = 3
        elif 'deepseek' in self.path:
            sep_len = 2
        else:
            sep_len = 3

        matched_pairs_list = [
            align_sequences(decode_messages[i], char_messages[i], sep_len)
            for i in range(len(decode_messages))
        ]
        for match_index, matched_pairs in enumerate(matched_pairs_list):
            for i, (word, indices) in enumerate(matched_pairs):
                for j in indices:
                    if j in aggregated_label_list[match_index]:
                        tag_list[match_index].append(i)
                        break

        inference_loss_list = []
        token_len_list = []
        for i in range(len(loss)):
            inference_loss = 0
            token_len = 0
            for j in range(len(loss[i])):
                if j in tag_list[i]:
                    inference_loss += loss[i][j]
                    token_len += 1
            inference_loss_list.append(inference_loss)
            token_len_list.append(token_len)

        return inference_loss_list, token_len_list
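
    # Hypothetical usage sketch (illustrative, not from the original diff);
    # returns per-input summed loss over labeled spans and labeled-token counts:
    #   losses, token_lens = model.get_ppl_tokenwise(
    #       ['some reasoning text'], label=[[(0, 4, 1)]])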

    def _get_potential_stop_words(self, path: Optional[str]):
        from transformers import GenerationConfig
        potential_stop_words = []
1 change: 1 addition & 0 deletions opencompass/openicl/icl_evaluator/__init__.py
@@ -6,6 +6,7 @@
from .icl_em_evaluator import EMEvaluator # noqa
from .icl_hf_evaluator import * # noqa
from .icl_jieba_rouge_evaluator import JiebaRougeEvaluator # noqa
from .icl_misc_evaluator import AverageInferencePPLEvaluator # noqa
from .icl_misc_evaluator import AverageMinKEvaluator # noqa
from .icl_misc_evaluator import AveragePPLEvaluator # noqa
from .icl_plugin_evaluator import TEvalEvaluator # noqa