# Pipeline

In [1]:
from transformers import pipeline

In [2]:
generator = pipeline("text2text-generation", model="t5-small")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Device set to use cpu


In [3]:
prompt = "translate English to Italian: The house is small."

In [4]:
result = generator(prompt, max_length=64, num_beams=4) # beams is top_k aka number of top choices

Both `max_new_tokens` (=256) and `max_length`(=64) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


In [5]:
result #german

[{'generated_text': 'Das Haus ist klein.'}]

In [6]:
result #spanish

[{'generated_text': 'Das Haus ist klein.'}]

In [7]:
result #italian

[{'generated_text': 'Das Haus ist klein.'}]

In [8]:
result[0]["generated_text"]

'Das Haus ist klein.'

# Manual Inference

In [9]:
#Basic imports
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM


In [10]:
#Basic Model and tokenizer calling
model_name = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [11]:
text = "translate English to German: The weather is nice."
inputs = tokenizer(text, return_tensors="pt")

In [12]:
outputs = model.generate(
    inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_length=64,
    num_beams=4, # like top_k or take more into considrration but instead of tokens it does seq of highest prob tokens
    early_stopping=True #for cleaner and faster so it doenst hit the fan or limit and create half sense statements
)


In [13]:
outputs

tensor([[    0,   644, 14845,   229,  9685,     5,     1]])

In [14]:
decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)

In [15]:
decoded

['Das Wetter ist schön.']

In [16]:
decoded[0]

'Das Wetter ist schön.'

# Using Trainer Training

In [17]:
import torch
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer, AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments, Seq2SeqTrainer
)

In [18]:
# 1. Small toy dataset
train_data = {
    "input": [
        "translate English to German: The house is small.",
        "summarize: The quick brown fox jumps over the lazy dog."
    ],
    "target": [
        "Das Haus ist klein.",
        "A fox jumped over a dog."
    ]
}

eval_data = {
    "input": [
        "translate English to German: The cat sits on the mat."
    ],
    "target": [
        "Die Katze sitzt auf der Matte."
    ]
}

datasets = DatasetDict({
    "train": Dataset.from_dict(train_data),#Wraps your raw dictionary into a Hugging Face Dataset object.
    "eval": Dataset.from_dict(eval_data)#Wraps your raw dictionary into a Hugging Face Dataset object.
})

In [19]:
# 2. Load tokenizer & model
model_name = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [20]:
# 3. Preprocessing function
def preprocess(examples):
    # 1. Tokenize input text (English or source sentence)
    model_inputs = tokenizer(
        examples["input"],
        padding="max_length",
        truncation=True,
        max_length=64
    )

    # 2. Tokenize target text (German or summary, etc.)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples["target"],
            padding="max_length",
            truncation=True,
            max_length=64
        )

    # 3. Replace padding tokens in labels with -100
    model_inputs["labels"] = [
        [(t if t != tokenizer.pad_token_id else -100) for t in l]
        for l in labels["input_ids"]
    ]

    return model_inputs


In [21]:
tokenized = datasets.map(preprocess, batched=True)
tokenized

Map:   0%|          | 0/2 [00:00<?, ? examples/s]



Map:   0%|          | 0/1 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input', 'target', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 2
    })
    eval: Dataset({
        features: ['input', 'target', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1
    })
})

In [22]:
tokenized = datasets.map(preprocess, batched=True, remove_columns=["input", "target"])
tokenized

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2
    })
    eval: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1
    })
})

In [23]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model) #Padding tokens in labels must be set to ignore_index → so the model doesn’t compute loss on them
#Shifted labels for decoder (Teacher Forcing)-Takes previous tokens as input.- Predicts the next token at each step.

In [24]:
# 5. TrainingArguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=1,
    predict_with_generate=True,
    fp16=torch.cuda.is_available(),
    push_to_hub=False,
    report_to="none",
)

In [25]:
# 6. Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["eval"],
    tokenizer=tokenizer,
    data_collator=data_collator
)

  trainer = Seq2SeqTrainer(


In [26]:
trainer.train()
print(trainer.evaluate())



Epoch,Training Loss,Validation Loss
1,No log,0.187417




{'eval_loss': 0.18741720914840698, 'eval_runtime': 1.1461, 'eval_samples_per_second': 0.873, 'eval_steps_per_second': 0.873, 'epoch': 1.0}


In [27]:
# Save model
trainer.save_model("./custom_Seq2Seq_model")
tokenizer.save_pretrained("./custom_Seq2Seq_model")

('./custom_Seq2Seq_model/tokenizer_config.json',
 './custom_Seq2Seq_model/special_tokens_map.json',
 './custom_Seq2Seq_model/spiece.model',
 './custom_Seq2Seq_model/added_tokens.json',
 './custom_Seq2Seq_model/tokenizer.json')

In [28]:
hf_code = ''

In [29]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [30]:
name = './custom_Seq2Seq_model'
model = AutoModelForSeq2SeqLM.from_pretrained(name)
tokenizer = AutoTokenizer.from_pretrained(name)

# Push to Hub (your username/repo_name)
repo_name = "Noobhacker69/Custom_Seq2Seq_model"  # you choose the name
model.push_to_hub(repo_name)
tokenizer.push_to_hub(repo_name)

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Noobhacker69/Custom_Seq2Seq_model/commit/110871d2601245f3847ef9a892039030507a206e', commit_message='Upload tokenizer', commit_description='', oid='110871d2601245f3847ef9a892039030507a206e', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Noobhacker69/Custom_Seq2Seq_model', endpoint='https://huggingface.co', repo_type='model', repo_id='Noobhacker69/Custom_Seq2Seq_model'), pr_revision=None, pr_num=None)

In [34]:
from transformers import pipeline
mname = 'Noobhacker69/Custom_Seq2Seq_model'
tranli = pipeline(task="text2text-generation",model=mname,tokenizer=mname)


Device set to use cpu


In [35]:
prompt = "translate English to Italian: The house is small."

In [37]:
out = tranli(prompt)
out[0]['generated_text']

'Das Haus ist klein.'