# Data Cleaning


In [1]:
from datasets import load_dataset

ds = load_dataset("gursi26/wikihow-cleaned")
print(ds)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/792 [00:00<?, ?B/s]

wikihow-cleaned.csv:   0%|          | 0.00/619M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/214293 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['summary', 'title', 'text'],
        num_rows: 214293
    })
})


# Feature Engineering


In [2]:
input = ds["train"]["text"]
target = ds["train"]["summary"]

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
X = list(input)
y = list(target)
X_main, X_test, y_main, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_main, y_main, test_size=(0.15 / (1 - 0.15)), random_state=42)
print(f"Main set size: X: {len(X_main)}, Y: {len(y_main)}")
print(f"Training set size: X: {len(X_train)} , Y: {len(y_train)}")
print(f"Validation set size: {len(X_val)}, Y: {len(y_val)}")
print(f"Test set size: {len(X_test)}, Y: {len(y_test)}")
print(type(X_train))

Main set size: X: 182149, Y: 182149
Training set size: X: 150005 , Y: 150005
Validation set size: 32144, Y: 32144
Test set size: 32144, Y: 32144
<class 'list'>


In [4]:
X_train = [str(x) for x in X_train]
X_val = [str(x) for x in X_val]
X_test = [str(x) for x in X_test]

y_train = [str(y) for y in y_train]
y_val = [str(y) for y in y_val]
y_test = [str(y) for y in y_test]


In [5]:
from transformers import T5Tokenizer

# Load T5-small tokenizer
tokenizer = T5Tokenizer.from_pretrained("t5-small")

# Add "summarize: " prefix to inputs
def tokenize_batch(texts, summaries, max_input_length=256, max_output_length=128):
    texts = ["summarize: " + t for t in texts]

    model_inputs = tokenizer(
        texts,
        max_length=max_input_length,
        truncation=True,
        padding="max_length"
    )

    labels = tokenizer(
        summaries,
        max_length=max_output_length,
        truncation=True,
        padding="max_length"
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize datasets
train_enc = tokenize_batch(X_train, y_train)
val_enc   = tokenize_batch(X_val, y_val)
test_enc  = tokenize_batch(X_test, y_test)

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [6]:
import torch
from torch.utils.data import Dataset

class T5Dataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        return {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}

# Create dataset objects
train_dataset = T5Dataset(train_enc)
val_dataset   = T5Dataset(val_enc)
test_dataset  = T5Dataset(test_enc)


# Model Selection

In [7]:
from transformers import T5ForConditionalGeneration

# Load T5-small model
model = T5ForConditionalGeneration.from_pretrained("t5-small")


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

# Model Training

In [8]:
# from transformers import Trainer, TrainingArguments

# training_args = TrainingArguments(
#     output_dir="./t5-small-finetuned",
#     per_device_train_batch_size=8,   # can adjust based on GPU memory
#     per_device_eval_batch_size=8,
#     num_train_epochs=2,
#     eval_strategy="epoch",
#     save_strategy="epoch",
#     logging_steps=50,
#     fp16=True                       # use mixed precision for speed
#     # Removed predict_with_generate=True as it is causing a TypeError
# )

# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=val_dataset,
#     tokenizer=tokenizer
# )
# trainer.train()

In [9]:
# model.save_pretrained("t5-finetuned-xsum")
# tokenizer.save_pretrained("t5Q267-finetuned-xsum")

In [10]:
# !pip install huggingface_hub

In [11]:
# from huggingface_hub import login
# login()

In [12]:
# # Push the model
# model.push_to_hub("Sakshi-1234/NLPFinal")
# # Push the tokenizer
# tokenizer.push_to_hub("Sakshi-1234/NLPFinal")

# Inference Pipeline

In [13]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("Sakshi-1234/NLPFinal")
model = AutoModelForSeq2SeqLM.from_pretrained("Sakshi-1234/NLPFinal")

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

In [14]:
def generate_summary(text, max_input_length=512, max_summary_length=128, num_beams=4):
    """
    Generates a summary for a single text input.
    """
    # Tokenize input
    inputs = tokenizer(
        text,
        max_length=max_input_length,
        truncation=True,
        return_tensors="pt"
    )

    # Generate summary tokens
    summary_ids = model.generate(
      input_ids=inputs["input_ids"],
      attention_mask=inputs["attention_mask"],
      max_length=128,
      num_beams=6,
      early_stopping=True,
      repetition_penalty=2.0,
      length_penalty=1.0,
      no_repeat_ngram_size = 3,
    )


    # Decode tokens to string
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

# Example usage
article = X_test[90]
print("article: ", article)
summary = generate_summary(article)
print("Summary:", summary)

article:  sometimes you may feel sad or lonely and that is normal . try not to let your sadness affect your college or high school experience , though . it is essential for you to get involved with campus and school activities and to make new friends . dont let your schoolwork slide , even though you might feel lonely at first . try to make friends in each of your classes . if you are feeling down , get together with a friend or call your partner . if you are in a relationship with someone , you need to trust them . this will be one of the biggest challenges of maintaining a long distance relationship when your partner goes to a different school . its easy to assume your partner is out partying or cheating on you if you arent seeing them everyday . try not to allow negative thoughts to affect your trust in the other person . know that long distance couples who stay together are statistically more likely to stay together than traditional couples . their relationship has been tested by l

# Postprocessing

In [15]:
def clean_summary(summary_text):
    # Strip leading/trailing whitespace
    summary_text = summary_text.strip()

    # Optional: replace multiple spaces/newlines with a single space
    summary_text = ' '.join(summary_text.split())

    # Optional: additional cleaning logic
    # e.g., remove unwanted characters, fix punctuation

    return summary_text

# Apply postprocessing
cleaned_summary = clean_summary(summary)
print("Cleaned Summary:", cleaned_summary)

Cleaned Summary: you may feel sad or lonely and that is normal. try not to let your sadness affect your college or high school experience. it is essential for you to get involved with campus and school activities.


# Evaluation

In [16]:
!pip install rouge_score
!pip install evaluate

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=c9993bc9bdb7d2f2975cffbc163c00b136f562c038e2b653afbed0fc6cfae9c0
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2
Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [None]:
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL', 'rougeLsum'])

reference_summary = y_test[random_int]
candidate_summary = summary

# print("Reference Summary:", reference_summary)
# print("Candidate Summary:", candidate_summary)

scores = scorer.score(reference_summary, candidate_summary)
scores_wrapped = fill(str(scores), width=80)
print(scores_wrapped)

Downloading builder script: 0.00B [00:00, ?B/s]

{'rouge1': np.float64(0.23516481017125038), 'rouge2': np.float64(0.05041304553955303), 'rougeL': np.float64(0.15475740596492243), 'rougeLsum': np.float64(0.1542438586047078)}
