Skip to content

Commit

Permalink
Minor change to fix the incorrect response truncation (#3986)
Browse files Browse the repository at this point in the history
  • Loading branch information
amankhandelia committed Apr 8, 2024
1 parent 337f407 commit f28a3c5
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 8 deletions.
2 changes: 1 addition & 1 deletion ludwig/features/text_feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ def get_decoded_targets_and_predictions(
"""Returns the decoded targets and predictions, accounting for IGNORE_INDEX_TOKEN_ID."""
sanitized_targets = torch.where(targets != IGNORE_INDEX_TOKEN_ID, targets, tokenizer.pad_token_id)
sanitized_predictions = torch.where(
predictions[PREDICTIONS] != IGNORE_INDEX_TOKEN_ID,
targets != IGNORE_INDEX_TOKEN_ID,
predictions[PREDICTIONS],
tokenizer.pad_token_id,
)
Expand Down
27 changes: 20 additions & 7 deletions tests/ludwig/features/test_text_feature.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import pandas as pd
import pytest
import torch
from transformers import AutoTokenizer

from ludwig.backend import LocalBackend
from ludwig.constants import LOGITS, PREDICTIONS, PROBABILITIES
from ludwig.constants import IGNORE_INDEX_TOKEN_ID, LOGITS, PREDICTIONS, PROBABILITIES
from ludwig.features import text_feature

TEST_MODEL_NAME = "hf-internal-testing/tiny-random-OPTForCausalLM"
Expand Down Expand Up @@ -108,14 +109,23 @@ def test_backwards_compatibility():
assert list(feature_data[1]) == [1, 3, 3]


def test_get_decoded_targets_and_predictions():
@pytest.mark.parametrize("vocab_size", [8])
@pytest.mark.parametrize(
"targets",
[
([78, 79, 504, 76, 397, 84, 0], [" first she 18 yearman our<s>"]),
([IGNORE_INDEX_TOKEN_ID, IGNORE_INDEX_TOKEN_ID, IGNORE_INDEX_TOKEN_ID, 76, 397, 84, 0], [" yearman our<s>"]),
],
)
@pytest.mark.parametrize("predictions", [[78, 79, 504, 76, 397, 84, 0]])
def test_get_decoded_targets_and_predictions(vocab_size, targets, predictions):
tokenizer = AutoTokenizer.from_pretrained(TEST_MODEL_NAME)
vocab_size = 8

# Scenario 1: Prediction and target tensors have the same length, so nothing should change
targets = torch.tensor([[78, 79, 504, 76, 397, 84, 0]])
targets, decoded_texts_gt = targets
targets = torch.tensor([targets])
predictions = {
PREDICTIONS: torch.tensor([[78, 79, 504, 76, 397, 84, 0]], dtype=torch.int64),
PREDICTIONS: torch.tensor([predictions], dtype=torch.int64),
PROBABILITIES: torch.randn(1, 7, vocab_size).to(torch.float32),
LOGITS: torch.randn(1, 7, vocab_size).to(torch.float32),
}
Expand All @@ -124,5 +134,8 @@ def test_get_decoded_targets_and_predictions():
decoded_predictions,
) = text_feature.get_decoded_targets_and_predictions(targets, predictions, tokenizer)

assert decoded_targets == [" first she 18 yearman our<s>"]
assert decoded_predictions == [" first she 18 yearman our<s>"]
assert isinstance(decoded_targets, list)
assert isinstance(decoded_predictions, list)
assert all(isinstance(x, str) for x in decoded_targets)
assert all(isinstance(x, str) for x in decoded_predictions)
assert decoded_targets == decoded_predictions == decoded_texts_gt

0 comments on commit f28a3c5

Please sign in to comment.