In [1]:
from transformers import AutoTokenizer, GPT2LMHeadModel, pipeline, set_seed

# Single source of truth for the checkpoint: the tokenizer and the model are
# both loaded from `model_name`, so changing it can never pair one
# checkpoint's tokenizer with another checkpoint's weights.
model_name = "openai-community/gpt2"
# Smaller alternative checkpoint: "distilbert/distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token for padding so the tokenizer and the model
# config agree on a pad id.
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained(model_name)
model.config.pad_token_id = tokenizer.pad_token_id


# device=-1 keeps the pipeline on the CPU.
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=-1,
)

prompt = "AI is transforming industries by"


def _sentence_end(text: str) -> int:
    """Return the index just past the last sentence boundary in ``text``, or -1.

    A boundary is a ``.``, ``!`` or ``?`` that is optionally followed by
    closing quotes/brackets and then by whitespace or the end of the string.
    Requiring that context keeps decimal points and dotted names
    (``3.5``, ``example.com``) from being mistaken for sentence ends.
    """
    closers = "\"')]}\u201d\u2019"
    size = len(text)
    # Scan right-to-left so the first hit is the last boundary.
    for pos in range(size - 1, -1, -1):
        if text[pos] not in ".!?":
            continue
        end = pos + 1
        # Keep closing quotes/brackets attached to the sentence they close.
        while end < size and text[end] in closers:
            end += 1
        if end == size or text[end].isspace():
            return end
    return -1


def clean_to_last_sentence(full_text: str, prompt: str) -> str:
    """Trim generated text so it ends on a complete sentence.

    Args:
        full_text: Text returned by the generator; expected to contain
            ``prompt`` followed by the continuation.
        prompt: The prompt that was fed to the generator. Everything up to and
            including its first occurrence in ``full_text`` is discarded; if it
            does not occur, all of ``full_text`` is treated as the continuation.

    Returns:
        ``prompt`` (stripped), a single space, and the continuation cut after
        its last complete sentence. If the continuation has no sentence
        boundary, the trailing (possibly partial) word is dropped instead.
        If the continuation is empty, only the stripped prompt is returned.
    """
    # `find` returns 0 when the text starts with the prompt, so one lookup
    # covers both the "starts with" and the "appears later" cases.
    start = full_text.find(prompt)
    generated = full_text if start == -1 else full_text[start + len(prompt):]
    generated = generated.strip()

    end = _sentence_end(generated)
    if end != -1:
        generated = generated[:end]
    else:
        # Fallback: drop the last whitespace-delimited chunk so the text does
        # not end mid-word. Splitting on any whitespace (not just " ") also
        # handles continuations that contain newlines or tabs.
        head = generated.rsplit(None, 1)
        if len(head) == 2:
            generated = head[0]

    return (prompt.strip() + " " + generated).strip()


# Sampling is stochastic; seed the RNGs immediately before generating so that
# re-running this cell (or Restart & Run All) yields the same completions.
set_seed(42)

# NOTE(review): the prompt is passed with a trailing space. GPT-2's BPE
# attaches the space to the *following* word, so a dangling space is encoded
# as its own token; consider passing `prompt` unchanged — confirm the effect
# on output quality before changing.
outputs = generator(
    prompt + " ",
    max_new_tokens=100,  # leave room for the model to finish a sentence
    num_return_sequences=3,
    do_sample=True,
    temperature=0.7,
    top_k=50,
    top_p=0.9,
)

for i, out in enumerate(outputs, 1):
    raw = out["generated_text"]
    # Pass the exact string that was given to the generator (including the
    # added space) so the prompt prefix is matched and stripped correctly.
    cleaned = clean_to_last_sentence(raw, prompt + " ")
    print(f"\n=== Completion {i} (cleaned) ===")
    print(cleaned)

Device set to use cpu



=== Completion 1 (cleaned) ===
AI is transforming industries by treating them with the same care as the rest of the population.
In my book, I've discussed how we can increase productivity by using technology to change the way we live and work.  It's time to get back to that.  I have some interesting ideas for you.
In the beginning, I used to think that we should focus on technology.  I thought we needed to focus on the people and how they interacted with technology.

=== Completion 2 (cleaned) ===
AI is transforming industries by vernacularizing and reinventing the way we think and act.

In this year's edition of the American Economic Review, I'll talk about the latest in the "Big Five" research on the value of innovation. It's a fascinating look at how those big five groups are shaping the way we think, act and live.

=== Completion 3 (cleaned) ===
AI is transforming industries by creating new jobs and creating new revenue streams, while also bringing about a return to a world of "ze

In [2]:
sentence = "LLMs are powerful tools for natural language understanding."

# Three views of the same sentence: sub-word strings, their vocabulary ids,
# and the dict-like encoding returned by calling the tokenizer directly.
tokens = tokenizer.tokenize(sentence)
token_ids = tokenizer.convert_tokens_to_ids(tokens)
encoded = tokenizer(sentence)

# Print each labelled view in order; labels after the first begin with a
# newline so consecutive entries are separated by a blank line.
for label, value in (
    ("Tokens:", tokens),
    ("\nToken IDs:", token_ids),
    ("\nSequence Length:", len(token_ids)),
    ("\nFull Encoded Output:", encoded),
):
    print(label, value)

Tokens: ['LL', 'Ms', 'Ġare', 'Ġpowerful', 'Ġtools', 'Ġfor', 'Ġnatural', 'Ġlanguage', 'Ġunderstanding', '.']

Token IDs: [3069, 10128, 389, 3665, 4899, 329, 3288, 3303, 4547, 13]

Sequence Length: 10

Full Encoded Output: {'input_ids': [3069, 10128, 389, 3665, 4899, 329, 3288, 3303, 4547, 13], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
