In [2]:
import torch
from transformers import (
    DistilBertTokenizer,
    DistilBertForSequenceClassification,
)  # For sentiment analysis.

from genlm_control import InferenceEngine
from genlm_control.potential import PromptedLLM, BoolFSA, Potential
from genlm_control.sampler import direct_token_sampler, eager_token_sampler

INFO 02-25 22:41:44 __init__.py:183] Automatically detected platform cuda.


# Sampling from a language model

In [34]:
# Load gpt2 (or any other HuggingFace model) using the HuggingFace backend.
# (Setting backend='vllm' will be much faster, but requires a GPU).
# NOTE(review): `temperature=0.5` presumably sharpens the model's next-token
# distribution relative to the default — confirm against the PromptedLLM docs.
mtl_llm = PromptedLLM.from_name("gpt2", backend="hf", temperature=0.5)

Task was destroyed but it is pending!
task: <Task cancelling name='Task-528' coro=<AsyncTokenCharacterTrie._background_loop() running at /home/mila/b/benjamin.lebrun/scratch/genlm-control/lib/python3.11/site-packages/genlm_backend/trie/async_impl.py:107> wait_for=<Future cancelled>>


In [35]:
# Set the fixed prompt prefix for the language model.
# All language model predictions will be conditioned on the
# token ids which this string encodes to (via the LM's tokenizer).
# The prompt is conditioning context only: the sampled sequences displayed
# further down contain just the continuation tokens.
mtl_llm.set_prompt_from_str("Montreal is")

In [36]:
# Build a token sampler whose proposals are drawn directly
# from the prompted language model's next-token distribution.
sampler = direct_token_sampler(mtl_llm)

In [37]:
# Create an inference engine around the sampler.
# Awaiting `engine(...)` (next cell) runs inference.
engine = InferenceEngine(sampler)

In [38]:
# Run SMC with 10 particles, a max sequence length of 10 tokens
# and an ESS threshold of 0.5.
sequences = await engine(n_particles=10, max_tokens=10, ess_threshold=0.5)

In [39]:
# Get the inferred posterior distribution over sequences.
# It is displayed as a table of token sequences (key) and their weights (value).
sequences.posterior

0,1
key,value
"(b' not', b' a', b' city', b' that', b' has', b' a', b' reputation', b' for', b' being', b' a')",0.10000089820812255
"(b' home', b' to', b' a', b' team', b' of', b' professional', b' soccer', b' players', b',', b' including')",0.10000035213773735
"(b' a', b' city', b' of', b' people', b' who', b' have', b' a', b' strong', b' sense', b' of')",0.10000033458578937
"(b' an', b' urban', b' area', b'.', b' The', b' city', b' is', b' a', b' pioneer', b' of')",0.10000022899048969
"(b' home', b' to', b' a', b' number', b' of', b' famous', b' movie', b' stars', b',', b' including')",0.10000021817279635
"(b' the', b' only', b' city', b' in', b' Canada', b' where', b' you', b' can', b' still', b' buy')",0.0999997843072465
"(b' also', b' the', b' most', b' populous', b' city', b' in', b' France', b',', b' with', b' more')",0.09999967070703746
"(b' a', b' city', b' with', b' a', b' population', b' of', b' over', b' 6', b' million', b' people')",0.09999964391270273
"(b' facing', b' a', b' budget', b' shortfall', b' of', b' $', b'8', b'.', b'6', b' billion')",0.09999944046738002


# Prompt intersection

In [40]:
# Spawn a new language model. This is a shallow copy, so both models
# share the same underlying language model.
bos_llm = mtl_llm.spawn()
# Set a different prompt for the new language model.
bos_llm.set_prompt_from_str("Boston is")

In [41]:
# Take the product of the two language models (one prompted with
# "Montreal is", the other with "Boston is").
product = mtl_llm * bos_llm

In [42]:
# Build a token sampler that samples next tokens directly from the
# product of the two language models. This rebinds `sampler`.
sampler = direct_token_sampler(product)

In [43]:
# Create an inference engine around the product sampler (rebinds `engine`).
engine = InferenceEngine(sampler)

In [44]:
# Run the inference engine for 10 particles with a max sequence length of 10 tokens
# and an ESS threshold of 0.5.
sequences = await engine(n_particles=10, max_tokens=10, ess_threshold=0.5)

In [45]:
# Get the inferred posterior distribution over sequences for the product.
sequences.posterior

0,1
key,value
"(b' a', b' great', b' place', b' to', b' live', b'.', b' It', b""'s"", b' a', b' great')",0.6
"(b' a', b' small', b' town', b' in', b' the', b' middle', b' of', b' nowhere', b',', b' and')",0.29999999999999993
"(b' a', b' great', b' place', b' to', b' live', b'.', b' I', b""'m"", b' looking', b' forward')",0.09999999999999999


# Adding a regex constraint

In [46]:
# Boolean FSA built from a regex: sequences must start with "is the best" or
# "is the worst" (single whitespace characters between words), followed by anything.
# NOTE(review): both prompts already end in "is" and the sampled continuations
# shown below start with b'is', so the full text reads "... isis the best ...".
# If the constraint is meant for the continuation only, r"\sthe\s(best|worst).*"
# may be what was intended — confirm before changing, since the recorded
# outputs depend on the current pattern.
best_fsa = BoolFSA.from_regex(r"is\sthe\s(best|worst).*")

In [47]:
# The following is valid but will be slow!
# slow_sampler = direct_token_sampler(
#    product * best_fsa.coerce(product, f=b''.join)
# )

# This sampler is much faster.
# NOTE(review): the positional arguments presumably map to
# `iter_potential=product` and `item_potential=best_fsa` (the keyword names
# used in the later `slow_sampler` example) — confirm against the signature.
sampler = eager_token_sampler(product, best_fsa)

In [48]:
# Create an inference engine around the regex-constrained sampler.
engine = InferenceEngine(sampler)

In [49]:
# Run SMC with 10 particles, a max sequence length of 20 tokens
# and an ESS threshold of 0.5.
sequences = await engine(n_particles=10, max_tokens=20, ess_threshold=0.5)

In [50]:
# Get the inferred posterior distribution over the constrained sequences.
sequences.posterior

0,1
key,value
"(b'is', b' the', b' best', b' team', b' in', b' the', b' league', b'.', b' They', b' are', b' the', b' best', b' team', b' in', b' the', b' league', b'.', b' They', b' are', b' the')",0.8700819897729549
"(b'is', b' the', b' best', b' place', b' to', b' start', b'.', b' It', b""'s"", b' a', b' great', b' place', b' to', b' start', b'.', b' It', b""'s"", b' a', b' great', b' place')",0.12595754038129175
"(b'is', b' the', b' best', b' team', b' in', b' the', b' league', b',', b' and', b' the', b' team', b' that', b' has', b' been', b' the', b' best', b' in', b' the', b' league', b' for')",0.002369224994716196
"(b'is', b' the', b' best', b' team', b' in', b' the', b' league', b',', b' and', b' it', b""'s"", b' hard', b' to', b' believe', b' that', b' the', b' team', b' will', b' be', b' able')",0.0007141910328063171
"(b'is', b' the', b' best', b' team', b' in', b' the', b' league', b',', b' and', b' it', b""'s"", b' hard', b' to', b' believe', b' that', b' they', b""'re"", b' going', b' to', b' be')",0.0005464385592377434
"(b'is', b' the', b' best', b' team', b' in', b' the', b' league', b',', b' and', b' it', b""'s"", b' hard', b' to', b' argue', b' with', b' that', b'.', b' The', b' team', b' is')",0.0003306152589931078


## Criticizing with a custom `Potential`

In [51]:
# A custom potential that does sentiment analysis.


class SentimentAnalysis(Potential):
    """Byte-level potential that scores text with a sentiment classifier.

    A context is a sequence of byte values. Its score is the log-probability
    that the wrapped sequence-classification model assigns to the requested
    sentiment label. `prefix` and `complete` return the same score, i.e. a
    partial sequence is scored exactly like a finished one.
    """

    def __init__(self, model, tokenizer, sentiment="POSITIVE"):
        """Store the classifier and resolve the target label index.

        Args:
            model: Classifier exposing `config.label2id` and returning an
                object with a `.logits` tensor when called on tokenizer output.
            tokenizer: Callable turning a list of strings into model inputs.
            sentiment: Label name to score; must be a key of
                `model.config.label2id`.

        Raises:
            ValueError: If `sentiment` is not one of the model's labels.
        """
        self.model = model
        self.tokenizer = tokenizer
        self.sentiment_idx = model.config.label2id.get(sentiment, None)
        if self.sentiment_idx is None:
            raise ValueError(f"Sentiment {sentiment} not found in model labels")

        super().__init__(vocabulary=list(range(256)))  # Defined over bytes.

    def _forward(self, contexts):
        """Score a batch of contexts.

        Args:
            contexts: Iterable of byte sequences (anything `bytes()` accepts).

        Returns:
            1-D numpy array holding, for each context, the log-probability of
            the target sentiment label.
        """
        # Convert bytes to strings. Undecodable bytes (e.g. a multi-byte
        # character cut off at the end of a prefix) are dropped.
        strings = [
            bytes(context).decode("utf-8", errors="ignore") for context in contexts
        ]
        if not strings:
            # Nothing to score: return an empty result instead of handing the
            # tokenizer and model an empty batch.
            return torch.empty(0).numpy()

        # Tokenize. `truncation=True` keeps over-long contexts within the
        # model's maximum input length instead of failing inside the model.
        inputs = self.tokenizer(
            strings, return_tensors="pt", padding=True, truncation=True
        )
        # Run on whatever device the model lives on (a no-op for a CPU model).
        device = getattr(self.model, "device", None)
        if device is not None and hasattr(inputs, "to"):
            inputs = inputs.to(device)

        with torch.no_grad():
            logits = self.model(**inputs).logits
        return logits.log_softmax(dim=-1)[:, self.sentiment_idx].cpu().numpy()

    async def prefix(self, context):
        """Return the sentiment log-probability of a (possibly partial) context."""
        return self._forward([context])[0].item()

    async def complete(self, context):
        """Return the sentiment log-probability of a complete context."""
        return self._forward([context])[0].item()

    async def batch_complete(self, contexts):
        """Batched `complete`: one score per context, as a numpy array."""
        return self._forward(contexts)

    async def batch_prefix(self, contexts):
        """Batched `prefix`: one score per context, as a numpy array."""
        return self._forward(contexts)


# Checkpoint name used to load both the classifier and its tokenizer.
model_name = "distilbert-base-uncased-finetuned-sst-2-english"

# Instantiate the potential so that it scores the "POSITIVE" label.
sentiment_analysis = SentimentAnalysis(
    model=DistilBertForSequenceClassification.from_pretrained(model_name),
    tokenizer=DistilBertTokenizer.from_pretrained(model_name),
    sentiment="POSITIVE",
)

In [52]:
# Sanity check: log-probability of the POSITIVE label for a positive and a
# negative phrase (the first should be close to 0, the second strongly negative).
await sentiment_analysis.prefix(b"so good"), await sentiment_analysis.prefix(b"so bad")

(-0.00015841660206206143, -8.44865894317627)

In [53]:
# Check that our custom potential satisfies the potential contract.
# Both checks run on a short example context and display nothing;
# presumably they raise if the contract is violated — confirm in the docs.
await sentiment_analysis.assert_logw_next_consistency(b"the best", top=5)
await sentiment_analysis.assert_autoreg_fact(b"the best")

In [54]:
# The following is valid but will be slow!
# slow_sampler = eager_token_sampler(
#    iter_potential=product, item_potential=best_fsa * sentiment_analysis
# )

# This setup will be much faster.
# The sentiment potential is kept out of the sampler and handed to the
# engine as a critic instead.
sampler = eager_token_sampler(product, best_fsa)
# `f=b"".join` concatenates a context's byte-string tokens into one bytes
# object; presumably `coerce` uses it to adapt the byte-level potential to the
# sampler's token-level target — confirm against the `coerce` docs.
critic = sentiment_analysis.coerce(sampler.target, f=b"".join)
engine = InferenceEngine(sampler, critic=critic)

Task was destroyed but it is pending!
task: <Task cancelling name='Task-1848' coro=<AsyncTokenCharacterTrie._background_loop() running at /home/mila/b/benjamin.lebrun/scratch/genlm-control/lib/python3.11/site-packages/genlm_backend/trie/async_impl.py:107> wait_for=<Future cancelled>>


In [55]:
# Run SMC with 10 particles, a max sequence length of 10 tokens
# and an ESS threshold of 0.5, using the sentiment critic.
sequences = await engine(n_particles=10, max_tokens=10, ess_threshold=0.5)

## Optimizing with autobatching

In [56]:
# This creates a new potential that automatically batches concurrent
# requests to the instance methods (`prefix`, `complete`, `logw_next`)
# and processes them using the batch methods (`batch_complete`, `batch_prefix`, `batch_logw_next`).
critic = critic.to_autobatched()
# Rebuild the engine so that it uses the autobatched critic.
engine = InferenceEngine(sampler, critic=critic)

In [57]:
# Run SMC with 10 particles, a max sequence length of 10 tokens
# and an ESS threshold of 0.5, now with the autobatched critic.
sequences = await engine(n_particles=10, max_tokens=10, ess_threshold=0.5)

In [58]:
# Get the inferred posterior distribution over sequences.
sequences.posterior

0,1
key,value
"(b'is', b' the', b' best', b' team', b' in', b' the', b' league', b',', b' but', b' they')",0.3356544170239431
"(b'is', b' the', b' best', b' team', b' in', b' the', b' league', b'.', b' The', b' only')",0.1458168939011587
"(b'is', b' the', b' best', b' team', b' in', b' the', b' league', b'.', b' They', b' are')",0.12042975039630092
"(b'is', b' the', b' best', b' team', b' in', b' the', b' league', b'.', b' They', b' have')",0.12042961370049911
"(b'is', b' the', b' best', b' team', b' in', b' the', b' league', b',', b' but', b' it')",0.1118988828314853
"(b'is', b' the', b' best', b' team', b' in', b' the', b' league', b',', b' and', b' they')",0.09623392011459835
"(b'is', b' the', b' best', b' team', b' in', b' the', b' league', b' and', b' has', b' the')",0.06355272991534008
"(b'is', b' the', b' best', b' place', b' to', b' be', b' a', b' kid', b' in', b' the')",0.0059837921166744435


# Criticizing with a CPU-intensive potential