In [None]:
import os
print(os.getcwd())  # Should show '/content'

/content


In [None]:
%cd /content/t5_claim_normalization


/content/t5_claim_normalization


In [None]:
!pip install -r /content/requirements.txt


Collecting datasets (from -r /content/requirements.txt (line 3))
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets->-r /content/requirements.txt (line 3))
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets->-r /content/requirements.txt (line 3))
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets->-r /content/requirements.txt (line 3))
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets->-r /content/requirements.txt (line 3))
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->-r /content/requirements.txt (line 5))
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Col

In [None]:
!python run_training.py


2025-03-16 15:20:30.327596: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1742138430.590939    4564 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1742138430.671668    4564 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-16 15:20:31.218536: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Downloading data/train-eng.csv...
Saved dataset to data/train-eng.csv
Downloading data/dev-eng.csv...
Saved dataset to data/d

In [None]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

# ✅ Set model path
MODEL_DIR = "./model_output"  # Change this if needed

# ✅ Load Model & Tokenizer
try:
    tokenizer = T5Tokenizer.from_pretrained(MODEL_DIR)
    model = T5ForConditionalGeneration.from_pretrained(MODEL_DIR).to("cuda" if torch.cuda.is_available() else "cpu")
    print("✅ Model and tokenizer loaded successfully!")
except Exception as e:
    print(f"❌ Error loading model/tokenizer: {e}")
    exit()

# ✅ Function to Normalize Claims
def normalize_claim(post):
    prompt_text = "Normalize this claim: "  # Ensure prompt is consistent with training
    inputs = tokenizer(prompt_text + post, return_tensors="pt", truncation=True, max_length=512).to(model.device)

    # 🔥 Generate Output
    with torch.no_grad():
        output = model.generate(**inputs, max_length=50)

    return tokenizer.decode(output[0], skip_special_tokens=True)

# ✅ Test with Sample Claims
test_claims = [
    "The earth is flat and NASA is lying to us.",
    "Drinking lemon water cures cancer.",
    "COVID-19 vaccines contain microchips for tracking.",
    "5G technology causes coronavirus."
]

print("\n🔍 Testing Claim Normalization:\n")
for claim in test_claims:
    normalized = normalize_claim(claim)
    print(f"📌 Input Claim: {claim}")
    print(f"✅ Normalized Claim: {normalized}\n")


✅ Model and tokenizer loaded successfully!

🔍 Testing Claim Normalization:

📌 Input Claim: The earth is flat and NASA is lying to us.
✅ Normalized Claim: NASA is lying to us.

📌 Input Claim: Drinking lemon water cures cancer.
✅ Normalized Claim: Drinking lemon water cures cancer

📌 Input Claim: COVID-19 vaccines contain microchips for tracking.
✅ Normalized Claim: Covid-19 vaccines contain microchips for tracking.

📌 Input Claim: 5G technology causes coronavirus.
✅ Normalized Claim: 5G technology causes coronavirus



In [None]:
# 🛠 Install required libraries
!pip install -q transformers datasets accelerate
import nltk; nltk.download('wordnet'); nltk.download('omw-1.4')

# 📚 Imports
import os
import gc
import torch
import pandas as pd
import logging
from datasets import Dataset
from transformers import (
    T5ForConditionalGeneration,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq,
)
from transformers.utils import logging as hf_logging
from google.colab import files

# ✅ Logging setup
hf_logging.set_verbosity_info()
hf_logging.enable_propagation()
logging.basicConfig(level=logging.INFO)

# ✅ Model config
model_name = "google/flan-t5-base"
max_input_length = 32
max_target_length = 32
per_device_batch_size = 4
gradient_accum_steps = 8
output_dir = "./results"

# ✅ Upload data
print("📁 Upload your train-eng.csv and dev-eng.csv")
uploaded = files.upload()

# ✅ Load datasets
train_df = pd.read_csv("train-eng.csv")
dev_df = pd.read_csv("dev-eng.csv")

# ✅ Drop empty rows
train_df.dropna(subset=["post", "normalized claim"], inplace=True)
dev_df.dropna(subset=["post", "normalized claim"], inplace=True)

# ✅ Tokenizer fix (AutoTokenizer for FLAN)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# ✅ Preprocess
def preprocess_function(examples):
    prompt = "Normalize this claim: "
    inputs = [prompt + post for post in examples["post"]]
    targets = examples["normalized claim"]

    # Skip examples with empty targets
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, padding="max_length")
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# ✅ Convert to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df)
dev_dataset = Dataset.from_pandas(dev_df)

tokenized_train = train_dataset.map(preprocess_function, batched=True, num_proc=1)
tokenized_dev = dev_dataset.map(preprocess_function, batched=True, num_proc=1)

tokenized_train = tokenized_train.remove_columns(["post", "normalized claim"])
tokenized_dev = tokenized_dev.remove_columns(["post", "normalized claim"])

# ✅ Load model
model = T5ForConditionalGeneration.from_pretrained(model_name)
model.gradient_checkpointing_enable()
model.config.use_cache = False

# ✅ Safe training arguments
training_args = TrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=per_device_batch_size,
    per_device_eval_batch_size=per_device_batch_size,
    gradient_accumulation_steps=gradient_accum_steps,
    weight_decay=0.01,
    num_train_epochs=3,
    save_total_limit=2,
    fp16=torch.cuda.is_available(),
    fp16_full_eval=True,
    eval_accumulation_steps=2,
    logging_dir="./logs",
    logging_steps=1,
    logging_first_step=True,
    label_smoothing_factor=0.1,
    report_to="none",
    save_strategy="epoch",
    load_best_model_at_end=True,
    disable_tqdm=False,
)

# ✅ Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# ✅ Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_dev,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# ✅ Test one forward pass to check for NaNs
sample_input = tokenizer("Normalize this claim: The Earth is flat", return_tensors="pt").to(model.device)
sample_label = tokenizer("The Earth is not flat", return_tensors="pt").input_ids.to(model.device)
with torch.no_grad():
    test_loss = model(input_ids=sample_input["input_ids"], labels=sample_label).loss.item()
print(f"🧪 Test loss on dummy input: {test_loss:.4f}")

# ✅ Train
torch.cuda.empty_cache()
gc.collect()
print(f"💻 Using {torch.cuda.device_count()} GPU(s)...")
trainer.train()

# ✅ Evaluate & Save
results = trainer.evaluate()
print("✅ Evaluation results:", results)

output_path = "./model_result/t5_claim_normalization"
os.makedirs(output_path, exist_ok=True)
model.save_pretrained(output_path)
tokenizer.save_pretrained(output_path)
print(f"✅ Model and tokenizer saved to {output_path}")


📁 Upload your train-eng.csv and dev-eng.csv


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Saving dev-eng.csv to dev-eng (2).csv
Saving train-eng.csv to train-eng (2).csv


loading file spiece.model from cache at /root/.cache/huggingface/hub/models--google--flan-t5-base/snapshots/7bcac572ce56db69c1ea7c8af255c5d7c9672fc2/spiece.model
INFO:transformers.tokenization_utils_base:loading file spiece.model from cache at /root/.cache/huggingface/hub/models--google--flan-t5-base/snapshots/7bcac572ce56db69c1ea7c8af255c5d7c9672fc2/spiece.model
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--google--flan-t5-base/snapshots/7bcac572ce56db69c1ea7c8af255c5d7c9672fc2/tokenizer.json
INFO:transformers.tokenization_utils_base:loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--google--flan-t5-base/snapshots/7bcac572ce56db69c1ea7c8af255c5d7c9672fc2/tokenizer.json
loading file added_tokens.json from cache at None
INFO:transformers.tokenization_utils_base:loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--google--flan-t5-base/snapshots

Map:   0%|          | 0/11374 [00:00<?, ? examples/s]

Map:   0%|          | 0/1171 [00:00<?, ? examples/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--google--flan-t5-base/snapshots/7bcac572ce56db69c1ea7c8af255c5d7c9672fc2/config.json
INFO:transformers.configuration_utils:loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--google--flan-t5-base/snapshots/7bcac572ce56db69c1ea7c8af255c5d7c9672fc2/config.json
Model config T5Config {
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
 

🧪 Test loss on dummy input: 1.4245
💻 Using 1 GPU(s)...


***** Running training *****
INFO:transformers.trainer:***** Running training *****
  Num examples = 11,374
INFO:transformers.trainer:  Num examples = 11,374
  Num Epochs = 3
INFO:transformers.trainer:  Num Epochs = 3
  Instantaneous batch size per device = 4
INFO:transformers.trainer:  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 32
INFO:transformers.trainer:  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 8
INFO:transformers.trainer:  Gradient Accumulation steps = 8
  Total optimization steps = 1,065
INFO:transformers.trainer:  Total optimization steps = 1,065
  Number of trainable parameters = 247,577,856
INFO:transformers.trainer:  Number of trainable parameters = 247,577,856


Epoch,Training Loss,Validation Loss
1,0.0,
2,0.0,



***** Running Evaluation *****
INFO:transformers.trainer:
***** Running Evaluation *****
  Num examples = 1171
INFO:transformers.trainer:  Num examples = 1171
  Batch size = 4
INFO:transformers.trainer:  Batch size = 4
Saving model checkpoint to ./results/checkpoint-356
INFO:transformers.trainer:Saving model checkpoint to ./results/checkpoint-356
Configuration saved in ./results/checkpoint-356/config.json
INFO:transformers.configuration_utils:Configuration saved in ./results/checkpoint-356/config.json
Configuration saved in ./results/checkpoint-356/generation_config.json
INFO:transformers.generation.configuration_utils:Configuration saved in ./results/checkpoint-356/generation_config.json
Model weights saved in ./results/checkpoint-356/model.safetensors
INFO:transformers.modeling_utils:Model weights saved in ./results/checkpoint-356/model.safetensors
tokenizer config file saved in ./results/checkpoint-356/tokenizer_config.json
INFO:transformers.tokenization_utils_base:tokenizer config

Configuration saved in ./model_result/t5_claim_normalization/config.json
INFO:transformers.configuration_utils:Configuration saved in ./model_result/t5_claim_normalization/config.json
Configuration saved in ./model_result/t5_claim_normalization/generation_config.json
INFO:transformers.generation.configuration_utils:Configuration saved in ./model_result/t5_claim_normalization/generation_config.json


✅ Evaluation results: {'eval_loss': 19.802576065063477, 'eval_runtime': 18.0901, 'eval_samples_per_second': 64.731, 'eval_steps_per_second': 16.197, 'epoch': 2.9929676511954995}


Model weights saved in ./model_result/t5_claim_normalization/model.safetensors
INFO:transformers.modeling_utils:Model weights saved in ./model_result/t5_claim_normalization/model.safetensors
tokenizer config file saved in ./model_result/t5_claim_normalization/tokenizer_config.json
INFO:transformers.tokenization_utils_base:tokenizer config file saved in ./model_result/t5_claim_normalization/tokenizer_config.json
Special tokens file saved in ./model_result/t5_claim_normalization/special_tokens_map.json
INFO:transformers.tokenization_utils_base:Special tokens file saved in ./model_result/t5_claim_normalization/special_tokens_map.json
Copy vocab file to ./model_result/t5_claim_normalization/spiece.model
INFO:transformers.models.t5.tokenization_t5_fast:Copy vocab file to ./model_result/t5_claim_normalization/spiece.model


✅ Model and tokenizer saved to ./model_result/t5_claim_normalization


In [None]:
!zip -r model_output.zip ./model_result
from google.colab import files
files.download("model_output.zip")

  adding: model_result/ (stored 0%)
  adding: model_result/t5_claim_normalization/ (stored 0%)
  adding: model_result/t5_claim_normalization/special_tokens_map.json (deflated 86%)
  adding: model_result/t5_claim_normalization/tokenizer_config.json (deflated 95%)
  adding: model_result/t5_claim_normalization/spiece.model (deflated 48%)
  adding: model_result/t5_claim_normalization/model.safetensors (deflated 8%)
  adding: model_result/t5_claim_normalization/tokenizer.json (deflated 74%)
  adding: model_result/t5_claim_normalization/config.json (deflated 63%)
  adding: model_result/t5_claim_normalization/generation_config.json (deflated 29%)
  adding: model_result/t5_claim_normalization_peft/ (stored 0%)
  adding: model_result/t5_claim_normalization_peft/adapter_model.safetensors (deflated 7%)
  adding: model_result/t5_claim_normalization_peft/special_tokens_map.json (deflated 86%)
  adding: model_result/t5_claim_normalization_peft/tokenizer_config.json (deflated 95%)
  adding: model_res

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# === Load the saved model ===
model_path = "./model_result/t5_claim_normalization"
model = T5ForConditionalGeneration.from_pretrained(model_path)
tokenizer = T5Tokenizer.from_pretrained(model_path)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# === Inference Function ===
def normalize_claim(text: str, max_input_length=32, max_output_length=32):
    prompt = "Normalize this claim: " + text
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding="max_length", max_length=max_input_length)
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)

    outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=max_output_length)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# === Test your model ===
example_claim = "Magarmacch // Heavy Rain //Hyderabad //Crocodile // Alert Magarmacch // Heavy Rain //Hyderabad //Crocodile // Alert Magarmacch // Heavy Rain //Hyderabad //Crocodile // Alert None"
normalized = normalize_claim(example_claim)
print("🗣️ Original:", example_claim)
print("✅ Normalized:", normalized)


loading configuration file ./model_result/t5_claim_normalization/config.json
INFO:transformers.configuration_utils:loading configuration file ./model_result/t5_claim_normalization/config.json
Model config T5Config {
  "_name_or_path": "google/flan-t5-base",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
   

🗣️ Original: Magarmacch // Heavy Rain //Hyderabad //Crocodile // Alert Magarmacch // Heavy Rain //Hyderabad //Crocodile // Alert Magarmacch // Heavy Rain //Hyderabad //Crocodile // Alert None
✅ Normalized: Magarmacch is a crocodile in the rain.


In [None]:
!pip install -q nltk
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
from nltk.translate.meteor_score import meteor_score
from tqdm import tqdm

def evaluate_model_on_dev(model, tokenizer, dev_df, max_input_length=32, max_output_length=32):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    predictions = []
    references = []

    for _, row in tqdm(dev_df.iterrows(), total=len(dev_df), desc="Evaluating"):
        text = row["post"]
        reference = row["normalized claim"]

        prompt = "Normalize this claim: " + text
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding="max_length", max_length=max_input_length)
        input_ids = inputs.input_ids.to(device)
        attention_mask = inputs.attention_mask.to(device)

        with torch.no_grad():
            outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=max_output_length)
            prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)

        predictions.append(prediction)
        references.append(reference)

    # Compute METEOR
    meteor_scores = [meteor_score([ref.split()], pred.split()) for pred, ref in zip(predictions, references)]
    average_meteor = sum(meteor_scores) / len(meteor_scores)

    print(f"🔥 Average METEOR score on dev set: {average_meteor:.4f}")
    return average_meteor

# Assuming you already have model, tokenizer, and dev_df loaded
evaluate_model_on_dev(model, tokenizer, dev_df)


Evaluating: 100%|██████████| 1171/1171 [06:46<00:00,  2.88it/s]


🔥 Average METEOR score on dev set: 0.1073


0.10730207172943068

In [3]:
!pip install -q transformers datasets accelerate peft nltk evaluate
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')

import os
import gc
import torch
import pandas as pd
from datasets import Dataset
from transformers import (
    T5ForConditionalGeneration, AutoTokenizer, Trainer,
    TrainingArguments, DataCollatorForSeq2Seq,
)
from peft import PromptTuningConfig, TaskType, get_peft_model
import evaluate
from google.colab import files


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/487.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m487.4/487.4 kB[0m [31m29.5 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/183.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [4]:
files.upload()
train_df = pd.read_csv("train-eng.csv").dropna(subset=["post", "normalized claim"])
dev_df = pd.read_csv("dev-eng.csv").dropna(subset=["post", "normalized claim"])
test_df = pd.read_csv("test-eng.csv").dropna(subset=["post"])

print(train_df.head())


Saving dev-eng.csv to dev-eng.csv
Saving test-eng.csv to test-eng.csv
Saving train-eng.csv to train-eng.csv
                                                post  \
0  Lieutenant Retired General Asif Mumtaz appoint...   
1  A priceless clip of 1970 of Bruce Lee playing ...   
2  Hydrate\nYOURSELF\nW\nAfter Waking Up\nWater\n...   
3  Pa alam sayu idol🥺 Pa alam sayu idol🥺 Pa alam ...   
4  Look how the media LIE \n\nTO STIR UP TROUBLE\...   

                                    normalized claim  
0  Pakistani government appoints former army gene...  
1  Late actor and martial artist Bruce Lee playin...  
2  Drinking water at specific times can have diff...  
3  Mr. Bean actor Rowan Atkinson died on May 29, ...  
4  Kendall Jenner doctored a photo of her holding...  


In [7]:
model_name = "t5-base"
max_input_length = 256
max_target_length = 64
batch_size = 8
gradient_accumulation_steps = 4
output_dir = "./results"


In [8]:
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

# For TRAIN and DEV datasets (with labels)
def preprocess_function(examples):
    prompt = "Normalize this claim: "
    inputs = [prompt + post.encode('utf-8', 'ignore').decode('utf-8', 'ignore') for post in examples["post"]]
    targets = [t.encode('utf-8', 'ignore').decode('utf-8', 'ignore') for t in examples["normalized claim"]]

    model_inputs = tokenizer(
        inputs, max_length=max_input_length, truncation=True, padding="max_length"
    )
    labels = tokenizer(
        text_target=targets, max_length=max_target_length, truncation=True, padding="max_length"
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# For TEST dataset (no labels)
def preprocess_test_function(examples):
    prompt = "Normalize this claim: "
    inputs = [prompt + post.encode('utf-8', 'ignore').decode('utf-8', 'ignore') for post in examples["post"]]

    model_inputs = tokenizer(
        inputs, max_length=max_input_length, truncation=True, padding="max_length"
    )
    return model_inputs

train_ds = Dataset.from_pandas(train_df).map(preprocess_function, batched=True)
dev_ds = Dataset.from_pandas(dev_df).map(preprocess_function, batched=True)
test_ds = Dataset.from_pandas(test_df).map(preprocess_test_function, batched=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Map:   0%|          | 0/11374 [00:00<?, ? examples/s]

Map:   0%|          | 0/1171 [00:00<?, ? examples/s]

Map:   0%|          | 0/1285 [00:00<?, ? examples/s]

In [None]:

model = T5ForConditionalGeneration.from_pretrained(model_name)
model.gradient_checkpointing_enable()
model.config.use_cache = False

training_args = TrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    weight_decay=0.01,
    num_train_epochs=5,
    save_total_limit=2,
    fp16=torch.cuda.is_available(),
    eval_accumulation_steps=2,
    load_best_model_at_end=True,
    logging_steps=10,
    report_to="none",  # <-- Add this
)





data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=dev_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
)


  trainer = Trainer(


In [None]:
torch.cuda.empty_cache(); gc.collect()
trainer.train()

os.makedirs("./model_result/t5_claim_normalization", exist_ok=True)
model.save_pretrained("./model_result/t5_claim_normalization")
tokenizer.save_pretrained("./model_result/t5_claim_normalization")


Epoch,Training Loss,Validation Loss
1,0.9866,0.78771
2,0.9046,0.752239
3,0.8339,0.748444
4,0.8979,0.74873


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


('./model_result/t5_claim_normalization/tokenizer_config.json',
 './model_result/t5_claim_normalization/special_tokens_map.json',
 './model_result/t5_claim_normalization/spiece.model',
 './model_result/t5_claim_normalization/added_tokens.json',
 './model_result/t5_claim_normalization/tokenizer.json')

In [None]:
meteor = evaluate.load('meteor')

def evaluate_meteor(dataset):
    references = dataset['normalized claim']
    inputs = ["Normalize this claim: " + post for post in dataset['post']]
    predictions = []

    for input_text in inputs:
        inputs_tok = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=max_input_length).to(model.device)
        output_ids = model.generate(**inputs_tok, max_new_tokens=64)
        pred = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        predictions.append(pred)

    score = meteor.compute(predictions=predictions, references=references)
    return score

meteor_base_score = evaluate_meteor(dev_df)
print(f"🌟 METEOR Base Model Score: {meteor_base_score['meteor']:.4f}")


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


🌟 METEOR Base Model Score: 0.2655


In [None]:
def normalize_claim(model, tokenizer, text, max_input_length=256, max_new_tokens=64):
    prompt = "Normalize this claim: "
    input_text = prompt + text
    inputs_tok = tokenizer(
        input_text,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_length
    ).to(model.device)

    output_ids = model.generate(
        **inputs_tok,
        max_new_tokens=max_new_tokens,
        num_beams=4,               # smoother outputs
        early_stopping=True
    )

    normalized_claim = tokenizer.decode(
        output_ids[0],
        skip_special_tokens=True
    )
    return normalized_claim

# ✅ Example test
test_post = "The salary of a U.S. Senator is $174,000 per year. This is Joe Bidenâ€™s house.... seems legit ðŸ™„ The salary of a U.S. Senator is $174,000 per year. This is Joe Bidenâ€™s house.... seems legit ðŸ™„ The salary of a U.S. Senator is $174,000 per year. This is Joe Bidenâ€™s house.... seems legit ðŸ™„ None"
normalized_output = normalize_claim(model, tokenizer, test_post)
print(f"✅ Normalized Claim: {normalized_output}")


✅ Normalized Claim: Joe Biden’s house is worth $174,000 per year


In [9]:
from transformers import Trainer, DataCollatorForSeq2Seq

peft_config = PromptTuningConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode=False,
    num_virtual_tokens=50,
    prompt_tuning_init='TEXT',
    prompt_tuning_init_text="Normalize this claim:",
    tokenizer_name_or_path="/content/drive/MyDrive/model_no_prompt"
)

tokenizer = AutoTokenizer.from_pretrained("/content/drive/MyDrive/model_no_prompt")
prompt_tuned_model = T5ForConditionalGeneration.from_pretrained("/content/drive/MyDrive/model_no_prompt")
prompt_tuned_model = get_peft_model(prompt_tuned_model, peft_config)

training_args_peft = TrainingArguments(
    output_dir="./peft_results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    weight_decay=0.01,
    num_train_epochs=10,
    save_total_limit=2,
    fp16=torch.cuda.is_available(),
    load_best_model_at_end=True,
    logging_steps=10,
    report_to="none",
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=prompt_tuned_model)

# ✅ You forgot this step:
trainer_peft = Trainer(
    model=prompt_tuned_model,
    args=training_args_peft,
    train_dataset=train_ds,
    eval_dataset=dev_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

torch.cuda.empty_cache()
gc.collect()

# ✅ Now you can train
trainer_peft.train()

# ✅ Save the prompt-tuned model
os.makedirs("./model_result/t5_claim_normalization_peft", exist_ok=True)
prompt_tuned_model.save_pretrained("./model_result/t5_claim_normalization_peft")
tokenizer.save_pretrained("./model_result/t5_claim_normalization_peft")


  trainer_peft = Trainer(
No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss
1,1.1389,0.92929
2,1.1289,0.927982
3,1.046,0.926708
4,1.0474,0.925654
5,1.055,0.924779
6,1.053,0.924119
7,1.0708,0.923603
8,1.0867,0.923241
9,1.1057,0.922978


('./model_result/t5_claim_normalization_peft/tokenizer_config.json',
 './model_result/t5_claim_normalization_peft/special_tokens_map.json',
 './model_result/t5_claim_normalization_peft/spiece.model',
 './model_result/t5_claim_normalization_peft/added_tokens.json',
 './model_result/t5_claim_normalization_peft/tokenizer.json')

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [11]:
meteor = evaluate.load('meteor')

def evaluate_meteor_peft(dataset):
    references = dataset['normalized claim']
    inputs = ["Normalize this claim: " + post for post in dataset['post']]
    predictions = []

    for input_text in inputs:
        inputs_tok = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=max_input_length).to(prompt_tuned_model.device)
        output_ids = prompt_tuned_model.generate(**inputs_tok, max_new_tokens=64)
        pred = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        predictions.append(pred)

    score = meteor.compute(predictions=predictions, references=references)
    return score

meteor_peft_score = evaluate_meteor_peft(dev_df)
print(f"🚀 METEOR Prompt-Tuned Model Score: {meteor_peft_score['meteor']:.4f}")


Downloading builder script:   0%|          | 0.00/7.02k [00:00<?, ?B/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


🚀 METEOR Prompt-Tuned Model Score: 0.1934
