# Preprocessing and Model Fine-tuning


In [3]:
!pip install -q transformers datasets sentencepiece
import pandas as pd
import seaborn as sns
import re
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
import torch



from google.colab import drive
drive.mount('/content/drive')

df = pd.read_csv("/content/drive/MyDrive/all_levels_with_readability.csv")

Mounted at /content/drive


## Data Cleaning

In [4]:
def clean_text(text):
    text = text.strip()  # remove leading/trailing spaces
    text = text.lower()  # lowercase
    text = re.sub(r'\s+', ' ', text)  # collapse multiple spaces
    text = re.sub(r'[^\x00-\x7F]+', '', text)  # remove non-ASCII (optional)
    return text

# Apply cleaning
df['source_text'] = df['source_text'].apply(clean_text)
df['target_text'] = df['target_text'].apply(clean_text)

## Train/Test split

In [5]:
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)
print("Train size:", train_df.shape)
print("Validation size:", val_df.shape)

Train size: (5405, 8)
Validation size: (601, 8)


## Load Pre-trained Simplification Model (T5-small)

In [6]:
MODEL_NAME = "t5-small"  # smaller model for Colab free GPU
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

## Tokenize and encode source/target sentences

In [7]:
MAX_INPUT_LENGTH = 128
MAX_TARGET_LENGTH = 128

def preprocess_function(examples):
    inputs = examples['source_text']
    targets = examples['target_text']
    model_inputs = tokenizer(inputs, max_length=MAX_INPUT_LENGTH, truncation=True, padding='max_length')
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=MAX_TARGET_LENGTH, truncation=True, padding='max_length')
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

from datasets import Dataset

train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

train_dataset = train_dataset.map(preprocess_function, batched=True)
val_dataset = val_dataset.map(preprocess_function, batched=True)

columns_to_keep = ["input_ids", "attention_mask", "labels"]
train_dataset.set_format(type="torch", columns=columns_to_keep)
val_dataset.set_format(type="torch", columns=columns_to_keep)

Map:   0%|          | 0/5405 [00:00<?, ? examples/s]



Map:   0%|          | 0/601 [00:00<?, ? examples/s]

To prepare the dataset for model we :

- Convert raw text to **token IDs** so the model can process it.
- Inputs: `source_text` (complex sentence)
- Targets: `target_text` (simplified sentence)
- Tokenization includes **truncation** and **padding** to fixed length.
- Use `map()` to apply preprocessing to the whole dataset.
- `set_format` converts to **PyTorch tensors** and keeps only required columns: `input_ids`, `attention_mask`, `labels`.

input_ids → tokenized source sentences

attention_mask → which tokens are real vs padding

labels → tokenized target sentences

## Training Arguments

In [8]:
from transformers import Seq2SeqTrainingArguments
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
training_args = Seq2SeqTrainingArguments(
    output_dir="/content/T5_finetuned_",
    eval_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=50,
    logging_steps=50,
    learning_rate=3e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=5,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=False,
    report_to=[]
)




## Training

In [10]:
seed=42
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    processing_class=tokenizer,
    data_collator=data_collator
)
checkpoint_path = "/content/drive/MyDrive/checkpoint-5900"

tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path)

trainer.train(resume_from_checkpoint=checkpoint_path)

#trainer.train()

There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Step,Training Loss,Validation Loss
5950,0.1507,0.17633
6000,0.1593,0.176205
6050,0.1771,0.176177
6100,0.1589,0.174564
6150,0.171,0.175182
6200,0.1611,0.174859
6250,0.1323,0.174644
6300,0.1775,0.17437
6350,0.2029,0.174048
6400,0.1815,0.173969


TrainOutput(global_step=6760, training_loss=0.02075029220101396, metrics={'train_runtime': 222.3495, 'train_samples_per_second': 121.543, 'train_steps_per_second': 30.403, 'total_flos': 914403046195200.0, 'train_loss': 0.02075029220101396, 'epoch': 5.0})

Insights:
- Small gap between train/val → no strong overfitting.  
- Validation loss stayed stable → model generalizes well.  
- Overall: fine-tuning successful, model learned effectively.

In [11]:
model.save_pretrained("/content/drive/MyDrive/simplification_model")
tokenizer.save_pretrained("/content/drive/MyDrive/simplification_model")

('/content/drive/MyDrive/simplification_model/tokenizer_config.json',
 '/content/drive/MyDrive/simplification_model/special_tokens_map.json',
 '/content/drive/MyDrive/simplification_model/spiece.model',
 '/content/drive/MyDrive/simplification_model/added_tokens.json',
 '/content/drive/MyDrive/simplification_model/tokenizer.json')