In [None]:
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Read the training CSV (WebNLG 2020 train split, per the file name) into a DataFrame
csv_path = "/content/webNLG2020_train.csv"
data = pd.read_csv(csv_path)

In [None]:
# Inspect the column labels of the frame exactly as read from the CSV.
data.columns

Index(['Unnamed: 0', 'prefix', 'input_text', 'target_text'], dtype='object')

In [None]:
# Preview the first rows of the raw frame.
data.head()

Unnamed: 0.1,Unnamed: 0,prefix,input_text,target_text
0,0,webNLG,"Aarhus_Airport | cityServed | ""Aarhus, Denmark""","The Aarhus is the airport of Aarhus, Denmark."
1,1,webNLG,"Aarhus_Airport | cityServed | ""Aarhus, Denmark""","Aarhus Airport serves the city of Aarhus, Denm..."
2,2,webNLG,Aarhus_Airport | cityServed | Aarhus,Aarhus airport serves the city of Aarhus.
3,3,webNLG,Aarhus_Airport | elevationAboveTheSeaLevel | 25.0,Aarhus Airport is 25 metres above sea level.
4,4,webNLG,Aarhus_Airport | elevationAboveTheSeaLevel | 25.0,Aarhus airport is at an elevation of 25 metres...


In [None]:
# Give the unnamed leading column a real name, then verify the expected schema is present
data = data.rename(columns={"Unnamed: 0": "id"})
required_columns = ("prefix", "input_text", "target_text")
missing_columns = [name for name in required_columns if name not in data.columns]
assert not missing_columns, "Dataset must have 'prefix', 'input_text', and 'target_text' columns"

In [None]:
# Confirm that "Unnamed: 0" now appears as "id".
data.columns

Index(['id', 'prefix', 'input_text', 'target_text'], dtype='object')

In [None]:
# Preview the frame after the column rename.
data.head()

Unnamed: 0,id,prefix,input_text,target_text
0,0,webNLG,"Aarhus_Airport | cityServed | ""Aarhus, Denmark""","The Aarhus is the airport of Aarhus, Denmark."
1,1,webNLG,"Aarhus_Airport | cityServed | ""Aarhus, Denmark""","Aarhus Airport serves the city of Aarhus, Denm..."
2,2,webNLG,Aarhus_Airport | cityServed | Aarhus,Aarhus airport serves the city of Aarhus.
3,3,webNLG,Aarhus_Airport | elevationAboveTheSeaLevel | 25.0,Aarhus Airport is 25 metres above sea level.
4,4,webNLG,Aarhus_Airport | elevationAboveTheSeaLevel | 25.0,Aarhus airport is at an elevation of 25 metres...


In [None]:
# Replace underscores and pipes in the input_text column.
# regex=False pins literal matching: "|" is a regex metacharacter (alternation)
# and the default value of `regex` in Series.str.replace differs between pandas
# versions, so relying on the default makes this cell version-dependent.
data["input_text"] = (
    data["input_text"]
    .str.replace("_", " ", regex=False)
    .str.replace("|", ":", regex=False)
)

In [None]:
# Preview input_text after the underscore and pipe replacement.
data.head()

Unnamed: 0,id,prefix,input_text,target_text
0,0,webNLG,"Aarhus Airport : cityServed : ""Aarhus, Denmark""","The Aarhus is the airport of Aarhus, Denmark."
1,1,webNLG,"Aarhus Airport : cityServed : ""Aarhus, Denmark""","Aarhus Airport serves the city of Aarhus, Denm..."
2,2,webNLG,Aarhus Airport : cityServed : Aarhus,Aarhus airport serves the city of Aarhus.
3,3,webNLG,Aarhus Airport : elevationAboveTheSeaLevel : 25.0,Aarhus Airport is 25 metres above sea level.
4,4,webNLG,Aarhus Airport : elevationAboveTheSeaLevel : 25.0,Aarhus airport is at an elevation of 25 metres...


In [None]:
# Normalize specific cases in the input_text column.
#
# Every predicate is rewritten independently, so an input that contains several
# predicates gets all of them normalized (an if/elif chain would stop after the
# first match). The work is done with vectorized .str.replace calls, which also
# leave missing values (NaN) untouched instead of raising.
PREDICATE_PHRASES = {
    "cityServed": "serves the city of",
    "operatingOrganisation": "is operated by",
    "runwayLength": "has a runway length of",
}


def _elevation_with_unit(match):
    """Rewrite one `elevationAboveTheSeaLevel : <number>` span and append the unit."""
    value = match.group("value")
    if "." in value:
        # Drop a redundant fractional part: "25.0" -> "25", "25.50" -> "25.5".
        value = value.rstrip("0").rstrip(".")
    return f"is at an elevation of{match.group('sep')}{value} metres"


# Normalize elevation values. The unit is attached only to the number that
# directly follows the elevation predicate, so other numbers in the same text
# (e.g. "125.0" or a runway length) are never rewritten.
# NOTE(review): assumes bare elevationAboveTheSeaLevel values are in metres, as
# the earlier hard-coded "25.0" -> "25 metres" rule did — confirm against the data.
normalized_text = data["input_text"].str.replace(
    r"elevationAboveTheSeaLevel(?P<sep>\s*:\s*)(?P<value>-?\d+(?:\.\d+)?)(?![\w.])",
    _elevation_with_unit,
    regex=True,
)
# Any remaining occurrence (non-numeric or suffixed value) still gets the phrase.
normalized_text = normalized_text.str.replace(
    "elevationAboveTheSeaLevel", "is at an elevation of", regex=False
)

# Normalize cityServed, operatingOrganisation and runwayLength values
for predicate, phrase in PREDICATE_PHRASES.items():
    normalized_text = normalized_text.str.replace(predicate, phrase, regex=False)

# Update the column with the processed text
data["input_text"] = normalized_text

In [None]:
# Preview input_text after the predicate normalization.
data.head()

Unnamed: 0,id,prefix,input_text,target_text
0,0,webNLG,"Aarhus Airport : serves the city of : ""Aarhus,...","The Aarhus is the airport of Aarhus, Denmark."
1,1,webNLG,"Aarhus Airport : serves the city of : ""Aarhus,...","Aarhus Airport serves the city of Aarhus, Denm..."
2,2,webNLG,Aarhus Airport : serves the city of : Aarhus,Aarhus airport serves the city of Aarhus.
3,3,webNLG,Aarhus Airport : is at an elevation of : 25 me...,Aarhus Airport is 25 metres above sea level.
4,4,webNLG,Aarhus Airport : is at an elevation of : 25 me...,Aarhus airport is at an elevation of 25 metres...


In [None]:
# Hold out 20% of the rows for testing; the remaining 80% is the training pool
split_seed = 42
train_data = data.sample(frac=0.8, random_state=split_seed)
held_out_mask = ~data.index.isin(train_data.index)
test_data = data[held_out_mask]

# Shrink the training pool to 100 rows so a quick trial run stays short
train_data = train_data.sample(n=100, random_state=split_seed)

In [None]:
# Load the tokenizer and the seq2seq model from the same pretrained checkpoint
# (t5-small was noted as an alternative checkpoint)
checkpoint = "t5-large"
tokenizer = T5Tokenizer.from_pretrained(checkpoint)
model = T5ForConditionalGeneration.from_pretrained(checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


model.safetensors:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
# Function to fine-tune the model on CPU
def fine_tune(data, model, tokenizer, epochs=1, learning_rate=1e-4):
    """Fine-tune ``model`` on the rows of ``data``, one example per optimizer step.

    Each row is turned into the prompt ``"<prefix> <input_text>"`` and trained
    against ``target_text``. The per-row loss is printed as training proceeds.

    Args:
        data: DataFrame with ``prefix``, ``input_text`` and ``target_text`` columns.
        model: Model that accepts the tokenizer output plus ``labels`` and
            returns an object exposing ``.loss``.
        tokenizer: Callable tokenizer returning PyTorch tensors.
        epochs: Number of full passes over ``data``.
        learning_rate: Step size handed to the AdamW optimizer.

    Returns:
        None. ``model`` is updated in place and is left in training mode.
    """
    # Imported locally because the notebook's import cell does not bring in torch.
    from torch.optim import AdamW

    optimizer = AdamW(model.parameters(), lr=learning_rate)

    model.train()
    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}")
        for idx, row in data.iterrows():
            input_text = f"{row['prefix']} {row['input_text']}"
            target_text = row['target_text']

            # Tokenize inputs and labels
            inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True, padding=True)
            labels = tokenizer(target_text, return_tensors="pt", max_length=512, truncation=True, padding=True).input_ids

            # Compute loss and optimize. Without backward() and step() the loss
            # is only measured and the weights never change.
            optimizer.zero_grad()
            outputs = model(**inputs, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            print(f"Loss for row {idx}: {loss.item()}")


In [None]:
# Run a single fine-tuning pass over the sampled training rows
print("Fine-tuning the model...")
fine_tune(data=train_data, model=model, tokenizer=tokenizer, epochs=1)

Fine-tuning the model...
Epoch 1


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Loss for row 18567: 3.3029465675354004
Loss for row 32806: 2.6341726779937744
Loss for row 26115: 2.460157632827759
Loss for row 21915: 3.1865859031677246
Loss for row 4755: 3.2486493587493896
Loss for row 21517: 2.8817672729492188
Loss for row 9568: 2.743863821029663
Loss for row 1955: 4.4881672859191895
Loss for row 34244: 2.52679705619812
Loss for row 8739: 5.360865592956543
Loss for row 25171: 3.169844388961792
Loss for row 26382: 3.558657646179199
Loss for row 11647: 3.9049222469329834
Loss for row 21509: 3.12100887298584
Loss for row 18522: 2.115838050842285
Loss for row 32109: 2.4868338108062744
Loss for row 27907: 2.263439416885376
Loss for row 21106: 2.9294674396514893
Loss for row 9118: 2.3519461154937744
Loss for row 5987: 5.063332557678223
Loss for row 30174: 3.7041478157043457
Loss for row 24174: 2.296049118041992
Loss for row 896: 3.622509002685547
Loss for row 25082: 2.2325096130371094
Loss for row 3628: 1.9458823204040527
Loss for row 12272: 2.4086902141571045
Loss for 

In [None]:
def generate_text(input_text, prefix="data2text"):
    """Generate a sentence for ``input_text`` with the notebook-level model.

    The prompt is ``"<prefix> <input_text>"``. It is tokenized with the global
    ``tokenizer`` and decoded from the first sequence returned by the global
    ``model``'s ``generate`` call.

    Args:
        input_text: Linearized input to verbalize.
        prefix: Task prefix placed in front of ``input_text``. Pass the same
            prefix that was used when building the training prompts.

    Returns:
        The decoded text. If it contains ``":"`` separators, the pieces are
        joined with single spaces and the first character is upper-cased.
    """
    input_sequence = f"{prefix} {input_text}"
    inputs = tokenizer(input_sequence, return_tensors="pt", max_length=512, truncation=True)

    # fine_tune() switches the model to training mode; go back to eval mode so
    # dropout is disabled while generating.
    model.eval()

    # Generate with appropriate sampling settings
    outputs = model.generate(
        inputs["input_ids"],
        max_length=512,
        do_sample=True,  # Enable sampling for top_p and temperature to take effect
        num_beams=5,  # Beam search can still be useful for controlled exploration
        repetition_penalty=2.5,  # Penalize repeated phrases
        no_repeat_ngram_size=2,  # Avoid repeating n-grams
        temperature=1.0,  # Control randomness in sampling
        top_k=50,  # Top-k sampling
        top_p=0.95  # Nucleus sampling (only consider top p most probable tokens)
    )

    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Postprocess the output to clean formatting
    if ":" in generated_text:
        parts = [part.strip() for part in generated_text.split(":") if part.strip()]
        generated_text = " ".join(parts)
        # Upper-case only the first character: str.capitalize() would also
        # lower-case the rest and destroy proper nouns such as "Abilene".
        generated_text = generated_text[:1].upper() + generated_text[1:]

    return generated_text

# Example usage
sample_row = test_data.iloc[4]
sample_input = sample_row['input_text']
expected_output = sample_row['target_text']
print(f"Expected Target Text: {expected_output}")
# Pass the row's own prefix so the prompt has the same "<prefix> <input_text>"
# shape that fine_tune() builds; with generate_text's default prefix the prompt
# starts with "data2text", which the training prompts never contain.
print("Generated Text:", generate_text(sample_input, prefix=sample_row['prefix']))

Expected Target Text: The runway length of Abilene Regional Airport is 2195.0.
Generated Text: 2text abilene regional airport has a runway length of 2195.0 km.
