# Preprocessing and Model Fine-tuning


In [None]:
import torch
# Report which device will be used: the GPU when CUDA is usable, otherwise the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Device: {torch.device(device_name)}")

Device: cuda


In [None]:
!pip install -q transformers datasets sentencepiece
!pip install scikit-learn

import pandas as pd

import re
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
import torch

# Load the simplification corpus. The cells below read its `source_text` and
# `target_text` columns.
# The location can be overridden through the SIMPLIFICATION_DATASET_CSV environment
# variable so the notebook also runs on other machines; the default is the original path.
import os

DATASET_CSV = os.environ.get(
    "SIMPLIFICATION_DATASET_CSV",
    r"C:\Users\GAMING\Downloads\simplification_dataset_clean.csv",
)
df = pd.read_csv(DATASET_CSV)

## Data Cleaning

In [None]:
def clean_text(text):
    """Normalise a single sentence before tokenization.

    Steps: lowercase, collapse every whitespace run into one space, drop all
    non-ASCII characters, then collapse the gaps left by the removed characters
    and trim the ends.

    Args:
        text: The raw sentence. ``None`` and float NaN (how pandas represents a
            missing cell) are treated as an empty sentence; any other non-string
            value is converted with ``str()``.

    Returns:
        The cleaned, lowercase, ASCII-only string with single spaces between
        words and no leading/trailing whitespace.
    """
    # Missing values would otherwise crash on the string methods below.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()
    # Collapse whitespace first so Unicode spaces (e.g. NBSP) become a plain
    # space instead of being deleted by the non-ASCII filter below.
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\x00-\x7F]+', '', text)  # remove non-ASCII characters
    # Removing characters can leave doubled or dangling spaces; tidy them up.
    text = re.sub(r' {2,}', ' ', text)
    return text.strip()

# Run the cleaner over both sides of every sentence pair.
for text_column in ('source_text', 'target_text'):
    df[text_column] = df[text_column].apply(clean_text)

## Train/Validation split

In [None]:
# Hold out 10% of the pairs for validation; the fixed seed keeps the split reproducible.
train_df, val_df = train_test_split(df, random_state=42, test_size=0.1)

# Write each split to disk without the pandas index column.
for split_frame, csv_path in (
    (train_df, r"E:\train_dataset.csv"),
    (val_df, r"E:\val_dataset.csv"),
):
    split_frame.to_csv(csv_path, index=False)

## Load Pre-trained Base Model (T5-small) for Simplification Fine-tuning

In [None]:
!pip install hf_xet
MODEL_NAME = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

Collecting hf_xet
  Downloading hf_xet-1.1.10-cp37-abi3-win_amd64.whl.metadata (4.7 kB)
Downloading hf_xet-1.1.10-cp37-abi3-win_amd64.whl (2.8 MB)
   ---------------------------------------- 0.0/2.8 MB ? eta -:--:--
   --- ------------------------------------ 0.3/2.8 MB ? eta -:--:--
   ----------- ---------------------------- 0.8/2.8 MB 1.7 MB/s eta 0:00:02
   -------------- ------------------------- 1.0/2.8 MB 2.1 MB/s eta 0:00:01
   ---------------------- ----------------- 1.6/2.8 MB 2.0 MB/s eta 0:00:01
   --------------------------------- ------ 2.4/2.8 MB 2.2 MB/s eta 0:00:01
   ---------------------------------------- 2.8/2.8 MB 2.3 MB/s  0:00:01
Installing collected packages: hf_xet
Successfully installed hf_xet-1.1.10


## Tokenize and encode source/target sentences

In [None]:
from datasets import Dataset

# Token budget for the source and the target side; longer sequences are truncated
# and shorter ones padded to this length during tokenization.
MAX_INPUT_LENGTH = MAX_TARGET_LENGTH = 128

def preprocess_function(examples):
    """Tokenize a batch of sentence pairs for seq2seq training.

    Args:
        examples: A batch with the keys ``'source_text'`` and ``'target_text'``,
            each holding a list of strings (this function is passed to
            ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output for the source sentences, extended with a
        ``"labels"`` entry holding the target token ids. Sources and targets are
        truncated/padded to ``MAX_INPUT_LENGTH`` / ``MAX_TARGET_LENGTH``.
        Padding positions inside ``"labels"`` are set to ``-100`` so that the
        loss ignores them.
    """
    inputs = examples['source_text']
    targets = examples['target_text']

    model_inputs = tokenizer(
        inputs,
        max_length=MAX_INPUT_LENGTH,
        truncation=True,
        padding="max_length"
    )

    labels = tokenizer(
        text_target=targets,
        max_length=MAX_TARGET_LENGTH,
        truncation=True,
        padding="max_length"
    )

    # Because the targets are padded here (not by the data collator), the pad ids
    # must be masked explicitly; otherwise the loss is also computed on padding,
    # which deflates it and teaches the model to predict pad tokens.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [(token_id if token_id != pad_id else -100) for token_id in label_ids]
        for label_ids in labels["input_ids"]
    ]

    return model_inputs

# Wrap both pandas splits as Hugging Face datasets.
train_dataset, val_dataset = (
    Dataset.from_pandas(split_frame) for split_frame in (train_df, val_df)
)

# Tokenize in batches; each label is printed on the same line as its progress bar.
print("Tokenizing training dataset:", end=" ")
train_dataset = train_dataset.map(preprocess_function, batched=True)
print("Tokenizing validation dataset:", end=" ")
val_dataset = val_dataset.map(preprocess_function, batched=True)

# Expose only the model inputs, returned as torch tensors.
columns_to_keep = ["input_ids", "attention_mask", "labels"]
for tokenized_split in (train_dataset, val_dataset):
    tokenized_split.set_format(type="torch", columns=columns_to_keep)

Tokenizing training dataset: 100%|██████████| 97467/97467 [00:35<00:00, 2741.72 examples/s]
Tokenizing validation dataset: 100%|██████████| 10830/10830 [00:03<00:00, 3109.59 examples/s]


## Training Arguments

In [None]:
from transformers import Seq2SeqTrainingArguments

# Collator that assembles tokenized examples into batches for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Fine-tuning configuration.
# - Evaluation and checkpointing both happen every 1000 optimizer steps, and only
#   the two most recent checkpoints are kept.
# - 4 samples per device x 8 accumulation steps = 32 samples per optimizer step.
# - Mixed precision is enabled only when a CUDA device is present, so the
#   notebook also runs (slowly) on the CPU fallback instead of raising an error.
training_args = Seq2SeqTrainingArguments(
    output_dir="./t5_simplification",
    eval_strategy="steps",
    save_strategy="steps",
    save_steps=1000,
    eval_steps=1000,
    logging_steps=200,
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=torch.cuda.is_available(),
    push_to_hub=False
)

## Training

In [None]:
import numpy as np

def numpy_collator(features):
    """Collate ``features`` with ``data_collator`` and return int64 tensor labels.

    The batch's ``"labels"`` entry is routed through a NumPy array and rebuilt
    as a ``torch.int64`` tensor; every other entry is returned unchanged.
    """
    collated = data_collator(features)
    label_array = np.array(collated["labels"])
    collated["labels"] = torch.tensor(label_array, dtype=torch.int64)
    return collated

# Assemble the trainer from the model, its tokenizer, both tokenized splits,
# the batching collator and the training configuration.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    processing_class=tokenizer,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Run fine-tuning; as the last expression, the returned TrainOutput is displayed.
trainer.train()

Training Progress: [9138/9138 14:21:51, Epoch 3/3]

Step | Training Loss | Validation Loss
-----|--------------|----------------
1000 | 0.242100     | 0.225193
2000 | 0.236300     | 0.218573
3000 | 0.231000     | 0.214652
4000 | 0.226800     | 0.212884
5000 | 0.224700     | 0.211309
6000 | 0.225700     | 0.210139
7000 | 0.225700     | 0.209595
8000 | 0.218700     | 0.209358
9000 | 0.224400     | 0.208971

TrainOutput(global_step=9138, training_loss=0.25559080661999944, metrics={'train_runtime': 51716.5227, 'train_samples_per_second': 5.654, 'train_steps_per_second': 0.177, 'total_flos': 9893519523053568.0, 'train_loss': 0.25559080661999944, 'epoch': 3.0})


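Insights:
- Training and validation loss stay close throughout (≈0.224 vs. ≈0.209 at step 9000) → no sign of strong overfitting.  
- Validation loss decreases steadily (0.225 → 0.209) and flattens towards the end of training → the model generalizes well to the held-out split.  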


In [None]:
# Store the fine-tuned weights and the tokenizer files side by side so the
# directory can later be loaded as a single checkpoint.
export_dir = r"E:\simplification_model"
model.save_pretrained(export_dir)
tokenizer.save_pretrained(export_dir)

('E:\\simplification_model\\tokenizer_config.json',
 'E:\\simplification_model\\special_tokens_map.json',
 'E:\\simplification_model\\spiece.model',
 'E:\\simplification_model\\added_tokens.json',
 'E:\\simplification_model\\tokenizer.json')