In [6]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

In [7]:
import transformers
print(transformers.__version__)


4.36.2


In [8]:
import torch
print(torch.__version__)

2.9.0+cu128


In [9]:
model = T5ForConditionalGeneration.from_pretrained('google/flan-t5-base')
tokenizer = T5Tokenizer.from_pretrained(
    'google/flan-t5-base',
    skip_chat_template=True
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [10]:
input_text = "summarize: Parents face video game lessons Ways of ensuring that parents know which video games are suitable for children are to be considered by the games industry.The issue was discussed at a meeting between UK government officials, industry representatives and the British Board of Film Classification. It follows concerns that children may be playing games aimed at adults which include high levels of violence. In 2003, Britons spent £1,152m on games, more than ever before. And this Christmas, parents are expected to spend millions on video games and consoles."
inputs = tokenizer(input_text, return_tensors='pt')

output = model.generate(**inputs, max_length=50)
summary = tokenizer.decode(output[0], skip_special_tokens=True)

print(summary)

Parents are being asked to consider whether video games are suitable for children.


## Read and combine the dataset

In [4]:
import os
import pandas as pd


base_path = "BBC News Summary" 

articles_dir = os.path.join(base_path, "News Articles")
summaries_dir = os.path.join(base_path, "Summaries")

data = []

# Loop through categories (business, politics, sport, etc.)
for category in os.listdir(articles_dir):
    article_cat_dir = os.path.join(articles_dir, category)
    summary_cat_dir = os.path.join(summaries_dir, category)
    
    for fname in os.listdir(article_cat_dir):
        article_path = os.path.join(article_cat_dir, fname)
        summary_path = os.path.join(summary_cat_dir, fname)
        
        if os.path.exists(article_path) and os.path.exists(summary_path):
            
            with open(article_path, "r", encoding="utf-8", errors="replace") as f:
                article = f.read().strip()
            with open(summary_path, "r", encoding="utf-8", errors="replace") as f:
                summary = f.read().strip()
            
            data.append({
                "category": category,
                "article": article,
                "summary": summary
            })


df = pd.DataFrame(data)


print(f"Total samples: {len(df)}")
print("Categories:", df['category'].unique())
print(df.head())


Total samples: 2225
Categories: ['business' 'entertainment' 'politics' 'sport' 'tech']
   category                                            article  \
0  business  Ad sales boost Time Warner profit\n\nQuarterly...   
1  business  Dollar gains on Greenspan speech\n\nThe dollar...   
2  business  Yukos unit buyer faces loan claim\n\nThe owner...   
3  business  High fuel prices hit BA's profits\n\nBritish A...   
4  business  Pernod takeover talk lifts Domecq\n\nShares in...   

                                             summary  
0  TimeWarner said fourth quarter sales rose 2% t...  
1  The dollar has hit its highest level against t...  
2  Yukos' owner Menatep Group says it will ask Ro...  
3  Rod Eddington, BA's chief executive, said the ...  
4  Pernod has reduced the debt it took on to fund...  


## Clean and inspect text

In [5]:
import re

def clean_text(text):
    text = re.sub(r'\s+', ' ', text) 
    text = re.sub(r'([.,!?;:])', r' \1 ', text)  
    text = re.sub(r'\s{2,}', ' ', text)
    return text.strip()

df["article"] = df["article"].apply(clean_text)
df["summary"] = df["summary"].apply(clean_text)

print("Average article length:", df["article"].apply(lambda x: len(x.split())).mean())
print("Average summary length:", df["summary"].apply(lambda x: len(x.split())).mean())

df.sample(5)


Average article length: 425.78426966292136
Average summary length: 189.6485393258427


Unnamed: 0,category,article,summary
1310,politics,Labour MP praises Tory campaign The Conservati...,A Labour party spokesman played down differenc...
2121,tech,Napster offers rented music to go Music downlo...,"This has outraged some digital music lovers , ..."
239,business,Economy 'strong' in election year UK businesse...,The BDO optimism index - a leading indicator o...
0,business,Ad sales boost Time Warner profit Quarterly pr...,TimeWarner said fourth quarter sales rose 2% t...
1774,sport,Navratilova hits out at critics Martina Navrat...,"Navratilova , who made a comeback after retiri..."


## Split into train, validation, test sets

In [6]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=42)

print(f"Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")


Train: 1801, Val: 201, Test: 223


## Save cleaned data for later stages

In [7]:
train_df.to_csv("data/train.csv", index=False)
val_df.to_csv("data/val.csv", index=False)
test_df.to_csv("data/test.csv", index=False)


## Quick sanity check

In [8]:
for i in range(3):
    print(f"\nCATEGORY: {train_df.iloc[i]['category']}")
    print("ARTICLE:", train_df.iloc[i]['article'][:500], "...")
    print("SUMMARY:", train_df.iloc[i]['summary'])


CATEGORY: tech
ARTICLE: Norway upholds 'Napster' ruling A Norwegian student who ran a website which linked to downloadable MP3 files has been ordered to pay compensation by the country's Supreme Court . Frank Allan Bruvik was ordered to pay 100 , 000 kroner (£8 , 000) to the music industry in Norway . He was a student when he set up his napster . no site , which allowed users to submit and receive links to MP3 files . Bruvik had earlier been cleared on appeal after a lower court had found for the music industry . Music ...
SUMMARY: Frank Allan Bruvik was ordered to pay 100 , 000 kroner (£8 , 000) to the music industry in Norway . Norway's music industry said it was satisfied with the ruling , because showed that music piracy would not be accepted . A Norwegian court ruled in 2003 that Bruvik would have to pay 100 , 000 kroner to the music industry , but the country's Court of Appeal cleared him , saying that the copyright violation occurred when others posted the music . Bruvik's site

## Prepare for tokenization (T5/BART) 

In [9]:
from transformers import AutoTokenizer


model_name_t5 = "google/flan-t5-base"
model_name_bart = "facebook/bart-base"

tokenizer_t5 = AutoTokenizer.from_pretrained(model_name_t5)
tokenizer_bart = AutoTokenizer.from_pretrained(model_name_bart)

# Example tokenization test
sample = train_df.iloc[0]["article"]
inputs = tokenizer_t5("summarize: " + sample, max_length=512, truncation=True, return_tensors="pt")
print(inputs["input_ids"].shape)


  _torch_pytree._register_pytree_node(


torch.Size([1, 512])


## Tokenize and prepare Hugging Face Datasets

In [10]:
from datasets import Dataset
import pandas as pd
#Load CSVs into Hugging Face Datasets

train_df = pd.read_csv("data/train.csv")
val_df = pd.read_csv("data/val.csv")
test_df = pd.read_csv("data/test.csv")

train_ds = Dataset.from_pandas(train_df)
val_ds = Dataset.from_pandas(val_df)
test_ds = Dataset.from_pandas(test_df)

# Define tokenization function

def tokenize_function_t5(batch):
    inputs = ["summarize: " + text for text in batch["article"]]
    model_inputs = tokenizer_t5(inputs, max_length=512, truncation=True)
    labels = tokenizer_t5(batch["summary"], max_length=128, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

def tokenize_function_bart(batch):
    inputs = batch["article"]
    model_inputs = tokenizer_bart(inputs, max_length=512, truncation=True)
    labels = tokenizer_bart(batch["summary"], max_length=128, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

#  Apply tokenization

tokenized_train_t5 = train_ds.map(tokenize_function_t5, batched=True)
tokenized_val_t5 = val_ds.map(tokenize_function_t5, batched=True)
tokenized_test_t5 = test_ds.map(tokenize_function_t5, batched=True)

tokenized_train_bart = train_ds.map(tokenize_function_bart, batched=True)
tokenized_val_bart = val_ds.map(tokenize_function_bart, batched=True)
tokenized_test_bart = test_ds.map(tokenize_function_bart, batched=True)

# Quick sanity check

print("T5 sample input_ids:", tokenized_train_t5[0]["input_ids"][:10])
print("BART sample input_ids:", tokenized_train_bart[0]["input_ids"][:10])


Map: 100%|██████████| 1801/1801 [00:02<00:00, 897.28 examples/s]
Map: 100%|██████████| 201/201 [00:00<00:00, 1163.48 examples/s]
Map: 100%|██████████| 223/223 [00:00<00:00, 856.31 examples/s]
Map: 100%|██████████| 1801/1801 [00:01<00:00, 933.80 examples/s] 
Map: 100%|██████████| 201/201 [00:00<00:00, 1389.56 examples/s]
Map: 100%|██████████| 223/223 [00:00<00:00, 1147.17 examples/s]


T5 sample input_ids: [21603, 10, 16491, 95, 6134, 7, 3, 31, 567, 9]
BART sample input_ids: [0, 29723, 1970, 16060, 29, 128, 37549, 3121, 108, 2255]
