In [4]:
import torch
from transformers import AutoTokenizer, GPT2LMHeadModel, pipeline

# Checkpoint name used for BOTH the tokenizer and the model, so the two can
# never drift apart. To try a different checkpoint (for example
# "distilbert/distilgpt2"), change this single line.
model_name = "openai-community/gpt2"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Use the end-of-sequence token as the padding token, and mirror that choice
# in the model config below so tokenizer and model agree on the pad id.
tokenizer.pad_token = tokenizer.eos_token

model = GPT2LMHeadModel.from_pretrained(model_name)
model.config.pad_token_id = tokenizer.pad_token_id


# Text-generation pipeline built from the explicit model/tokenizer pair above.
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=-1,  # -1 runs on CPU (the pipeline logs "Device set to use cpu")
)

# Prompt that every sampled completion continues.
prompt = "AI is transforming industries by"


def clean_to_last_sentence(full_text: str, prompt: str) -> str:
    """Trim a generated completion so that it ends on a complete sentence.

    The prompt is removed from ``full_text``; what remains (the continuation)
    is cut just after its last sentence terminator (``.``, ``!`` or ``?``),
    keeping any closing quote or bracket that immediately follows it. When the
    continuation has no terminator at all it is cut at its last space so it
    does not end in the middle of a word.

    Args:
        full_text: Text to clean, normally the prompt followed by the
            generated continuation.
        prompt: The prompt that produced ``full_text``. It may carry leading
            or trailing whitespace.

    Returns:
        ``prompt.strip()``, a single space, and the trimmed continuation, with
        outer whitespace removed. If nothing of the continuation survives,
        only the stripped prompt is returned.
    """
    # Characters that still belong to a sentence when they directly follow
    # its terminator, e.g. the closing quote in: He said "go."
    closers = "\"\u201d)]"

    # ``find`` returns 0 when the text starts with the prompt, so a single
    # lookup covers both "prompt at the start" and "prompt somewhere inside".
    start = full_text.find(prompt)
    if start != -1:
        continuation = full_text[start + len(prompt):]
    else:
        # No exact match. Before giving up, accept a prompt that matches the
        # start of the text up to surrounding whitespace (e.g. the prompt ends
        # in a space but the text continues with a newline). Without this the
        # whole text -- prompt included -- would be treated as continuation
        # and the prompt would appear twice in the result.
        continuation = full_text
        bare_prompt = prompt.strip()
        leading = full_text.lstrip()
        if bare_prompt and leading.startswith(bare_prompt):
            rest = leading[len(bare_prompt):]
            # Require a word boundary so "ab" does not match inside "abc".
            if not rest or rest[0].isspace():
                continuation = rest
    continuation = continuation.strip()

    # Index of the right-most sentence terminator, or -1 if there is none.
    end = max(continuation.rfind(mark) for mark in ".!?")

    if end != -1:
        end += 1
        # Keep closing quotes/brackets that directly follow the terminator.
        while end < len(continuation) and continuation[end] in closers:
            end += 1
        continuation = continuation[:end]
    else:
        # Fallback: cut at the last space so the text does not end mid-word.
        last_space = continuation.rfind(" ")
        if last_space != -1:
            continuation = continuation[:last_space]

    return (prompt.strip() + " " + continuation).strip()


# Seed PyTorch's global random number generator before sampling so that
# re-running this cell draws the same completions (for the same library
# versions and hardware) instead of different text on every run.
SEED = 42
torch.manual_seed(SEED)

# The text actually fed to the model. It is kept in one variable because the
# clean-up step must strip exactly the same string that was generated from.
generation_prompt = prompt + " "

outputs = generator(
    generation_prompt,
    max_new_tokens=100,      # more room to finish a sentence
    num_return_sequences=3,
    do_sample=True,
    temperature=0.7,
    top_k=50,
    top_p=0.9,
)

for i, out in enumerate(outputs, 1):
    raw = out["generated_text"]
    cleaned = clean_to_last_sentence(raw, generation_prompt)
    print(f"\n=== Completion {i} (cleaned) ===")
    print(cleaned)

Device set to use cpu



=== Completion 1 (cleaned) ===
AI is transforming industries by focusing on new technology.
The new tech will help transform our lives, our economy, and our lives around the world.
I've been a bit of a skeptic, but I was excited to hear about the big tech companies.
In the past few years, I've been learning about the emerging technologies that have made their way into our lives. They're like the Google Glass that's out there, the smart speaker that you hear every day.

=== Completion 2 (cleaned) ===
AI is transforming industries by making sure they have a better supply chain, and more efficient and cost-effective ways to sell products, and by making sure they are environmentally sound.
I've been involved in a lot of things for the past ten years, and I am very happy with the way things are moving forward. I'm a big fan of the new products that are being made here, and I think that the industry is very well positioned to do well in this new environment.

=== Completion 3 (cleaned) ===


In [5]:
sentence = "LLMs are powerful tools for natural language understanding."

# Tokenise in two explicit steps (text -> tokens -> ids), then once more with
# a direct call to the tokenizer to show its full encoded output.
tokens = tokenizer.tokenize(sentence)
token_ids = tokenizer.convert_tokens_to_ids(tokens)
encoded = tokenizer(sentence)

# Print each result under its label; the leading "\n" in a label puts a blank
# line between consecutive entries.
report = [
    ("Tokens:", tokens),
    ("\nToken IDs:", token_ids),
    ("\nSequence Length:", len(token_ids)),
    ("\nFull Encoded Output:", encoded),
]
for label, value in report:
    print(label, value)

Tokens: ['LL', 'Ms', 'Ġare', 'Ġpowerful', 'Ġtools', 'Ġfor', 'Ġnatural', 'Ġlanguage', 'Ġunderstanding', '.']

Token IDs: [3069, 10128, 389, 3665, 4899, 329, 3288, 3303, 4547, 13]

Sequence Length: 10

Full Encoded Output: {'input_ids': [3069, 10128, 389, 3665, 4899, 329, 3288, 3303, 4547, 13], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
