In [1]:
import subprocess
import sys
import os

In [2]:
import os
import json
import torch
import warnings
from datasets import Dataset, load_dataset
from transformers import (
    AutoModelForSeq2SeqLM, 
    AutoTokenizer, 
    Trainer, 
    TrainingArguments,
    DataCollatorForSeq2Seq,
    BitsAndBytesConfig
)
from peft import (
    LoraConfig, 
    get_peft_model, 
    TaskType, 
    PeftConfig,
    prepare_model_for_kbit_training
)

# Environment configuration
warnings.filterwarnings("ignore")
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["WANDB_DISABLED"] = "true"

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name()}")

print("Environment setup completed")

2025-09-16 14:30:16.151603: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1758033016.500315      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1758033016.595721      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Using device: cuda
CUDA available: True
GPU: Tesla T4
Environment setup completed


In [3]:

# Load action item dataset with robust error handling
DATASET_PATH = "/kaggle/input/chat-data/combined1.jsonl"

def load_action_item_dataset():
    """Load dataset with fallback options"""
    try:
        print(f"Loading dataset from: {DATASET_PATH}")
        raw_dataset = load_dataset('json', data_files=DATASET_PATH, split='train')
        print(f"Dataset loaded successfully with {len(raw_dataset)} samples")
        
        # Convert to instruction format
        def convert_format(example):
            return {
                "input_text": f"Extract action items as JSON from this dialogue: {example['dialogue']}",
                "target_text": example["actions"]
            }
        
        dataset = raw_dataset.map(convert_format)
        print("Dataset converted to instruction format")
        
        # Show sample
        print("Sample example:")
        print(f"Input: {dataset[0]['input_text'][:150]}...")
        print(f"Target: {dataset[0]['target_text']}")
        
        return dataset
        
    except Exception as e:
        print(f"Error loading main dataset: {e}")
        print("Creating a small fallback dataset to allow the notebook to run end-to-end.")
        # Tiny synthetic fallback data (repeated) so training can proceed if real data is missing
        sample_data = [
            {
                "input_text": "Extract action items as JSON from this dialogue:\nAlice: We need to finish the report by Thursday.\nBob: I will review it tomorrow.\nAlice: Please ping the team.",
                "target_text": '[{"task": "Finish report", "owner": "Alice", "due_date": "Thursday"}, {"task": "Review report", "owner": "Bob", "due_date": "tomorrow"}, {"task": "Ping the team", "owner": "Alice", "due_date": "None"}]'
            },
            {
                "input_text": "Extract action items as JSON from this dialogue:\nJohn: Can you book a meeting room for Friday?\nSara: Sure, I will also prepare slides.\nJohn: Great, send the calendar invite.",
                "target_text": '[{"task": "Book meeting room", "owner": "Sara", "due_date": "Friday"}, {"task": "Prepare slides", "owner": "Sara", "due_date": "None"}, {"task": "Send calendar invite", "owner": "John", "due_date": "None"}]'
            },
            {
                "input_text": "Extract action items as JSON from this dialogue:\nTeam: We should deploy on Wednesday.\nPM: David, please handle the rollout plan.\nDavid: Will do.\nEmily: I\'ll prepare the agenda.",
                "target_text": '[{"task": "Deploy", "owner": "Team", "due_date": "Wednesday"}, {"task": "Rollout plan", "owner": "David", "due_date": "None"}, {"task": "Prepare agenda", "owner": "Emily", "due_date": "None"}]'
            }
        ] * 150
        
        dataset = Dataset.from_list(sample_data)
        print(f"Created fallback dataset with {len(dataset)} examples")
        return dataset


# Load the dataset
dataset = load_action_item_dataset()


Loading dataset from: /kaggle/input/chat-data/combined1.jsonl


Generating train split: 0 examples [00:00, ? examples/s]

Dataset loaded successfully with 833 samples


Map:   0%|          | 0/833 [00:00<?, ? examples/s]

Dataset converted to instruction format
Sample example:
Input: Extract action items as JSON from this dialogue: Olivia: I just got an offer on LinkedIn
Anne: Where from?
Olivia: France, Project Manager
Mike: Ok, b...
Target: [
  {
    "task": "Research the job offer from the creative agency in Rennes, France",
    "owner": "Olivia",
    "due_date": "None"
  },
  {
    "task": "Discuss the pros and cons of moving to France with Olivia",
    "owner": "Group",
    "due_date": "None"
  },
  {
    "task": "Decide on Olivia's interest in the job offer",
    "owner": "Olivia",
    "due_date": "None"
  },
  {
    "task": "Let the creative agency know Olivia's interest in the job offer",
    "owner": "Olivia",
    "due_date": "None"
  }
]


In [4]:
# Model configuration
MODEL_NAME = "google/flan-t5-base"
MAX_INPUT_LENGTH = 512
MAX_TARGET_LENGTH = 256

def load_flan_t5_model():
    """Load FLAN-T5 with proper error handling"""
    try:
        print(f"Loading FLAN-T5 model: {MODEL_NAME}")
        
        # Load tokenizer
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        print(f"Tokenizer loaded with vocab size: {tokenizer.vocab_size}")
        
        # Quantization config for GPU
        if torch.cuda.is_available():
            quantization_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_quant_type="nf4", 
                bnb_4bit_compute_dtype=torch.float16,
                bnb_4bit_use_double_quant=True,
            )
            print("Using 4-bit quantization for GPU")
        else:
            quantization_config = None
            print("Using full precision for CPU")
        
        # Load model
        model = AutoModelForSeq2SeqLM.from_pretrained(
            MODEL_NAME,
            quantization_config=quantization_config,
            device_map="auto" if torch.cuda.is_available() else None,
            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        )
        
        # Prepare for quantized training if using GPU
        if quantization_config is not None:
            model = prepare_model_for_kbit_training(model)
        
        print(f"FLAN-T5 model loaded with {model.num_parameters():,} parameters")
        return model, tokenizer
        
    except Exception as e:
        print(f"Error loading model: {e}")
        # Fallback to basic loading
        print("Attempting fallback model loading")
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
        return model, tokenizer

# Load model and tokenizer
model, tokenizer = load_flan_t5_model()

Loading FLAN-T5 model: google/flan-t5-base


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Tokenizer loaded with vocab size: 32100
Error loading model: No package metadata was found for bitsandbytes
Attempting fallback model loading


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

PARAMETER-1 (PERFORMED BETTER)

In [5]:
# Configure LoRA for FLAN-T5
def setup_lora(model):
    """Setup LoRA configuration with error handling"""
    try:
        lora_config = LoraConfig(
            r=16,
            lora_alpha=32,
            target_modules=["q", "v"],
            lora_dropout=0.1,
            bias="none",
            task_type=TaskType.SEQ_2_SEQ_LM,
            inference_mode=False
        )
        
        print("Applying LoRA configuration...")
        model = get_peft_model(model, lora_config)
        
        # Display trainable parameters
        model.print_trainable_parameters()
        print("LoRA applied successfully")
        
        return model
        
    except Exception as e:
        print(f"Error applying LoRA: {e}")
        print("Training will continue without LoRA (full fine-tuning)")
        return model

# Apply LoRA
model = setup_lora(model)

Applying LoRA configuration...
trainable params: 1,769,472 || all params: 249,347,328 || trainable%: 0.7096
LoRA applied successfully


In [6]:
# --- Train/Test Split (added) ---
from datasets import DatasetDict

print("Creating train/test split...")
# 20% test split with fixed seed for reproducibility
splits = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = splits["train"]
test_dataset  = splits["test"]
print(f"Train size: {len(train_dataset)} | Test size: {len(test_dataset)}")

Creating train/test split...
Train size: 666 | Test size: 167


In [7]:
# Data preprocessing function
def preprocess_dataset(examples):
    """Preprocess dataset for FLAN-T5 training"""
    inputs = examples["input_text"]
    model_inputs = tokenizer(
        inputs,
        max_length=MAX_INPUT_LENGTH,
        truncation=True,
        padding=False,
        return_tensors=None
    )
    
    targets = examples["target_text"]  
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets,
            max_length=MAX_TARGET_LENGTH,
            truncation=True,
            padding=False,
            return_tensors=None
        )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

try:
    tokenized_dataset = dataset.map(preprocess_dataset, batched=True, remove_columns=dataset.column_names)
    data_collator = DataCollatorForSeq2Seq(
        tokenizer=tokenizer,
        model=model,
        padding=True,
        return_tensors="pt"
    )
    
    print(f"Dataset preprocessing completed: {tokenized_dataset}")
    
except Exception as e:
    print(f"Preprocessing error: {e}")
    # Fallback: simpler preprocessing
    print("Using fallback preprocessing")
    tokenized_dataset = dataset
    data_collator = None

# --- Preprocess train & test splits (added) ---
try:
    tokenized_train_dataset = train_dataset.map(preprocess_dataset, batched=True, remove_columns=train_dataset.column_names)
    tokenized_test_dataset  = test_dataset.map(preprocess_dataset,  batched=True, remove_columns=test_dataset.column_names)
    
    # Keep original variable name for training compatibility
    tokenized_dataset = tokenized_train_dataset
    
    print("Tokenization complete.")
    print(tokenized_train_dataset)
    print(tokenized_test_dataset)
except Exception as e:
    print(f"Split preprocessing error: {e}")
    # Fallback: use raw splits if mapping fails
    tokenized_train_dataset = train_dataset
    tokenized_test_dataset  = test_dataset
    tokenized_dataset = tokenized_train_dataset


Map:   0%|          | 0/833 [00:00<?, ? examples/s]

Dataset preprocessing completed: Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 833
})


Map:   0%|          | 0/666 [00:00<?, ? examples/s]

Map:   0%|          | 0/167 [00:00<?, ? examples/s]

Tokenization complete.
Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 666
})
Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 167
})


In [8]:
!pip install -U "transformers==4.36.0" "accelerate==0.24.1" "datasets==2.14.0" "peft==0.6.2" "safetensors==0.4.1"


Collecting transformers==4.36.0
  Downloading transformers-4.36.0-py3-none-any.whl.metadata (126 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.8/126.8 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate==0.24.1
  Downloading accelerate-0.24.1-py3-none-any.whl.metadata (18 kB)
Collecting datasets==2.14.0
  Downloading datasets-2.14.0-py3-none-any.whl.metadata (19 kB)
Collecting peft==0.6.2
  Downloading peft-0.6.2-py3-none-any.whl.metadata (23 kB)
Collecting safetensors==0.4.1
  Downloading safetensors-0.4.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting tokenizers<0.19,>=0.14 (from transformers==4.36.0)
  Downloading tokenizers-0.15.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets==2.14.0)
  Downloading dill-0.3.7-py3-none-any.whl.metadata (9.9 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.10.0->accelerate==0

In [9]:
import torch
from transformers import TrainingArguments, IntervalStrategy
OUTPUT_DIR = "/kaggle/working/flan-t5-action-extractor"

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    # Nudge: 1e-3 is hot for T5+LoRA; start safer.
    learning_rate=2e-4,
    weight_decay=0.01,
    warmup_ratio=0.05,
    logging_steps=10,
    save_strategy=IntervalStrategy.EPOCH,
    remove_unused_columns=False,
    push_to_hub=False,
    report_to=[],                              # more robust than None on some versions
    load_best_model_at_end=False,
    dataloader_pin_memory=False,
    gradient_checkpointing=False,
    bf16=torch.cuda.is_available() and hasattr(torch.cuda, "is_bf16_supported") and torch.cuda.is_bf16_supported(),
    fp16=torch.cuda.is_available() and not (hasattr(torch.cuda, "is_bf16_supported") and torch.cuda.is_bf16_supported()),
)
print("Training configuration completed")
print(f"Batch size: {training_args.per_device_train_batch_size}")
print(f"Learning rate: {training_args.learning_rate}")
print(f"Epochs: {training_args.num_train_epochs}")


Training configuration completed
Batch size: 4
Learning rate: 0.0002
Epochs: 3


In [10]:
# Initialize trainer and start training
def run_training():
    """Execute training with error handling"""
    try:
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_dataset,     # <-- uses train split via our reassignment
            tokenizer=tokenizer,
            data_collator=data_collator,
        )
        
        print("Starting FLAN-T5 training...")
        print("=" * 50)
        
        # Start training
        training_result = trainer.train()
        
        print("=" * 50)
        print("Training completed successfully")
        print(f"Final training loss: {training_result.training_loss:.4f}")
        
        return trainer
        
    except Exception as e:
        print(f"Training error: {e}")
        print("Training failed - check memory, batch size, or model configuration")
        return None

# Execute training
trainer = run_training()


No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Starting FLAN-T5 training...


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
10,2.6944
20,2.5593
30,2.3143
40,1.8809
50,1.549
60,1.4167
70,1.229
80,0.989
90,0.9209
100,0.7957


Training completed successfully
Final training loss: 1.0701


In [11]:
# --- Evaluation on held-out test set (added) ---
import numpy as np
import torch
from math import isfinite
from tqdm.auto import tqdm

model.eval()
model.to(device)

def _normalize_text(s: str):
    return " ".join(str(s).strip().split())

def _lcs_len(x_tokens, y_tokens):
    # Token-level LCS length (dynamic programming)
    n, m = len(x_tokens), len(y_tokens)
    dp = [[0]*(m+1) for _ in range(n+1)]
    for i in range(1, n+1):
        xi = x_tokens[i-1]
        row = dp[i]
        prev_row = dp[i-1]
        for j in range(1, m+1):
            if xi == y_tokens[j-1]:
                row[j] = prev_row[j-1] + 1
            else:
                row[j] = max(prev_row[j], row[j-1])
    return dp[n][m]

def _rouge_l_f1(pred, ref):
    # Simple token-level ROUGE-L F1
    ptoks = _normalize_text(pred).split()
    rtoks = _normalize_text(ref).split()
    if not ptoks or not rtoks:
        return 0.0
    lcs = _lcs_len(ptoks, rtoks)
    prec = lcs / max(1, len(ptoks))
    rec  = lcs / max(1, len(rtoks))
    if prec + rec == 0:
        return 0.0
    return 2 * prec * rec / (prec + rec)

def _bag_of_words_f1(pred, ref):
    from collections import Counter
    ptoks = _normalize_text(pred).split()
    rtoks = _normalize_text(ref).split()
    pc, rc = Counter(ptoks), Counter(rtoks)
    overlap = sum((pc & rc).values())
    prec = overlap / max(1, sum(pc.values()))
    rec  = overlap / max(1, sum(rc.values()))
    if prec + rec == 0:
        return 0.0
    return 2 * prec * rec / (prec + rec)

# Generate predictions for the test set
pred_texts = []
ref_texts  = []
input_texts = []

max_gen_tokens = MAX_TARGET_LENGTH
gen_kwargs = dict(max_new_tokens=max_gen_tokens)

print("Generating predictions on test split...")
for idx in tqdm(range(len(test_dataset))):
    inp = test_dataset[idx]["input_text"]
    ref = test_dataset[idx]["target_text"]
    enc = tokenizer(inp, return_tensors="pt", truncation=True, max_length=MAX_INPUT_LENGTH)
    enc = {k: v.to(device) for k, v in enc.items()}
    with torch.no_grad():
        out = model.generate(**enc, **gen_kwargs)
    pred = tokenizer.decode(out[0], skip_special_tokens=True)
    
    input_texts.append(inp)
    pred_texts.append(pred)
    ref_texts.append(ref)

# Compute metrics
def _exact_match(pred, ref):
    return int(_normalize_text(pred) == _normalize_text(ref))

ems = [ _exact_match(p, r) for p, r in zip(pred_texts, ref_texts) ]
rouges = [ _rouge_l_f1(p, r) for p, r in zip(pred_texts, ref_texts) ]
bows = [ _bag_of_words_f1(p, r) for p, r in zip(pred_texts, ref_texts) ]

metrics = {
    "test_size": len(test_dataset),
    "exact_match": float(np.mean(ems)),
    "rougeL_f1": float(np.mean(rouges)),
    "bow_f1": float(np.mean(bows)),
    "avg_pred_len": float(np.mean([len(_normalize_text(p).split()) for p in pred_texts])),
    "avg_ref_len": float(np.mean([len(_normalize_text(r).split()) for r in ref_texts])),
}

print("\n--- Test Metrics ---")
for k, v in metrics.items():
    print(f"{k}: {v:.4f}" if isinstance(v, float) else f"{k}: {v}")

# Show a few qualitative examples
print("\n--- Sample Predictions ---")
for i in range(min(5, len(test_dataset))):
    print(f"Example {i+1}")
    print("INPUT:", input_texts[i][:300].replace("\n", " "))
    print("REF:  ", ref_texts[i])
    print("PRED: ", pred_texts[i])
    print("-"*80)

Generating predictions on test split...


  0%|          | 0/167 [00:00<?, ?it/s]


--- Test Metrics ---
test_size: 167
exact_match: 0.0000
rougeL_f1: 0.4012
bow_f1: 0.4112
avg_pred_len: 18.8862
avg_ref_len: 28.0599

--- Sample Predictions ---
Example 1
INPUT: Extract action items as JSON from this dialogue: Katy: I'm at the entrance Katy: where are you? Lucy: as well, lol Lily: I'm with Lucy Jorge: I'm still in the library Katy: but what entrance? of the library? Lucy: no! the main entrance to the university Katy: ok, i'll be there in 5 min
REF:   [
  {
    "task": "Meet at the main entrance to the university",
    "owner": "Katy",
    "due_date": "in 5 min"
  }
]
PRED:  [  "task": "Be at the entrance of the library", "owner": "Katy", "due_date": "5 min"  ]
--------------------------------------------------------------------------------
Example 2
INPUT: Extract action items as JSON from this dialogue: Chloe: So guess what I heard today! Patrick: Omg girl what?? Chloe: I GOT INTO GRAD SCHOOL!  Patrick: I'M SO PROUD OF YOU!  Chloe: Thank you! We have to go out tonight

In [12]:
# --- Contextual Metrics (append-only) ---
# If your environment allows installs, keep these; otherwise the code will gracefully skip.
try:
    # quiet installs; remove -q if you want logs
    import sys, subprocess
    def _pip(pkg): 
        subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", pkg])
    _pip("bert-score")
    _pip("sentence-transformers")
except Exception as _e:
    print("Package install skipped or failed:", _e)

import numpy as np

ctx_metrics = {}

# 1) BERTScore (semantic precision/recall/F1) — robust for paraphrases
try:
    from bert_score import score as bertscore
    P, R, F1 = bertscore(pred_texts, ref_texts, lang='en', rescale_with_baseline=True)
    ctx_metrics['bertscore_precision'] = float(P.mean())
    ctx_metrics['bertscore_recall']    = float(R.mean())
    ctx_metrics['bertscore_f1']        = float(F1.mean())
except Exception as e:
    print("BERTScore skipped:", e)

# 2) Sentence-BERT cosine similarity — fast, strong baseline for contextual match
try:
    from sentence_transformers import SentenceTransformer, util
    st_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device=device.type)
    emb_pred = st_model.encode(pred_texts, convert_to_tensor=True, normalize_embeddings=True, batch_size=64, show_progress_bar=False)
    emb_ref  = st_model.encode(ref_texts,  convert_to_tensor=True, normalize_embeddings=True, batch_size=64, show_progress_bar=False)
    # cosine for aligned pairs (diagonal)
    cos = util.cos_sim(emb_pred, emb_ref).diagonal().detach().cpu().numpy()
    ctx_metrics['sbert_cosine_mean']   = float(np.mean(cos))
    ctx_metrics['sbert_cosine_median'] = float(np.median(cos))
except Exception as e:
    print("Sentence-BERT skipped:", e)
    # 3) Fallback: TF-IDF cosine (still contextual-ish, but weaker than embeddings)
    try:
        from sklearn.feature_extraction.text import TfidfVectorizer
        from sklearn.metrics.pairwise import cosine_similarity
        tfidf = TfidfVectorizer(ngram_range=(1,2), min_df=1)
        X = tfidf.fit_transform([*pred_texts, *ref_texts])
        n = len(pred_texts)
        A, B = X[:n], X[n:]
        sims = [cosine_similarity(A[i], B[i])[0, 0] for i in range(n)]
        ctx_metrics['tfidf_cosine_mean']   = float(np.mean(sims))
        ctx_metrics['tfidf_cosine_median'] = float(np.median(sims))
    except Exception as ee:
        print("TF-IDF fallback skipped:", ee)

print("\n--- Contextual Metrics ---")
for k, v in ctx_metrics.items():
    print(f"{k}: {v:.4f}")


   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 61.1/61.1 kB 1.6 MB/s eta 0:00:00
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 42.2/42.2 kB 941.6 kB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 11.6/11.6 MB 101.1 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 563.4/563.4 kB 27.1 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 485.8/485.8 kB 19.8 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.3/3.3 MB 74.3 MB/s eta 0:00:00


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

BERTScore skipped: cannot import name 'layer_type_validation' from 'transformers.configuration_utils' (/usr/local/lib/python3.11/dist-packages/transformers/configuration_utils.py)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Sentence-BERT skipped: cannot import name 'layer_type_validation' from 'transformers.configuration_utils' (/usr/local/lib/python3.11/dist-packages/transformers/configuration_utils.py)

--- Contextual Metrics ---
tfidf_cosine_mean: 0.3406
tfidf_cosine_median: 0.3311


In [13]:
# Save model with comprehensive verification
FINAL_OUTPUT_DIR = "/kaggle/working/flan-t5-action-extractor-final"

def save_and_verify_model():
    """Save model with thorough verification"""
    try:
        print("Saving FLAN-T5 LoRA adapter...")
        model.save_pretrained(FINAL_OUTPUT_DIR)
        tokenizer.save_pretrained(FINAL_OUTPUT_DIR)
        
        # Verification steps
        print("Verifying save integrity...")
        
        # Check files exist
        required_files = ['adapter_config.json', 'adapter_model.safetensors']
        for file in required_files:
            file_path = os.path.join(FINAL_OUTPUT_DIR, file)
            if os.path.exists(file_path):
                size = os.path.getsize(file_path)
                print(f"Found {file}: {size:,} bytes")
            else:
                print(f"Missing {file}")
        
        # Attempt to load PEFT config if present
        try:
            peft_config_path = os.path.join(FINAL_OUTPUT_DIR, "adapter_config.json")
            if os.path.exists(peft_config_path):
                with open(peft_config_path, "r") as f:
                    peft_cfg = json.load(f)
                base_model_path = peft_cfg.get("base_model_name_or_path", None)
                if base_model_path:
                    print(f"Base model recorded in adapter: {base_model_path}")
                else:
                    print("Warning: base_model_name_or_path not found in adapter_config.json")
            else:
                print("No adapter_config.json found; skipping PEFT config check.")
                    
        except Exception as e:
            print(f"PEFT verification failed: {e}")
            
        print(f"Model saved to: {FINAL_OUTPUT_DIR}")
        return True
        
    except Exception as e:
        print(f"Save error: {e}")
        return False

# Save model
save_success = save_and_verify_model()

Saving FLAN-T5 LoRA adapter...
Verifying save integrity...
Found adapter_config.json: 767 bytes
Found adapter_model.safetensors: 7,098,016 bytes
Base model recorded in adapter: google/flan-t5-base
Model saved to: /kaggle/working/flan-t5-action-extractor-final


____________________

PARAMETER SET-2 (PERFORMED WORSE)

In [14]:
# FLAN-T5 JSON action-item finetune (HF 4.52-safe)
# - Canonical JSON targets
# - Robust LoRA coverage (q,k,v,o,wi,wo)
# - SAFE decode (fixes OverflowError)
# - Version-adaptive TrainingArguments AND Trainer init (no label_names if unsupported)

import os, json, warnings, inspect
from typing import Any, Dict, List

import numpy as np
import torch
import transformers
from datasets import load_dataset, Dataset
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    set_seed,
)
from peft import LoraConfig, get_peft_model, TaskType

# -----------------------------
# Env & reproducibility
# -----------------------------
warnings.filterwarnings("ignore")
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["WANDB_DISABLED"] = "true"
SEED = 42
set_seed(SEED)

print("Transformers version:", transformers.__version__)
print("Device:", "cuda" if torch.cuda.is_available() else "cpu")
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name())

# -----------------------------
# Config
# -----------------------------
MODEL_NAME      = "google/flan-t5-base"
DATASET_PATH    = "/kaggle/input/cleaned-json/prepared_data_cleaned.jsonl"
OUTPUT_DIR      = "/kaggle/working/flan_t5_action_json"
MAX_INPUT_LEN   = 512
MAX_TARGET_LEN  = 256

# -----------------------------
# Data loading & canonicalization
# -----------------------------
def _canon_json(obj: Any) -> str:
    return json.dumps(obj, ensure_ascii=False, sort_keys=True)

def load_action_item_dataset(path: str) -> Dataset:
    if not os.path.exists(path):
        raise FileNotFoundError(f"Dataset not found at: {path}")
    print(f"Loading dataset from: {path}")
    ds = load_dataset("json", data_files=path, split="train")
    print(f"Loaded {len(ds)} rows.")
    return ds

raw = load_action_item_dataset(DATASET_PATH)

def to_instruction_format(ex: Dict[str, Any]) -> Dict[str, str]:
    tgt = ex.get("actions", [])
    if isinstance(tgt, str):
        try:
            tgt = _canon_json(json.loads(tgt))
        except Exception:
            pass
    else:
        tgt = _canon_json(tgt)
    prompt = (
        "Extract ONLY action items as a JSON list. "
        "Return JSON and nothing else. Use fields: task, owner, due_date.\n\n"
        f"Dialog:\n{ex['dialogue']}"
    )
    return {"input_text": prompt, "target_text": tgt}

ds = raw.map(to_instruction_format, remove_columns=raw.column_names)
splits = ds.train_test_split(test_size=0.2, seed=SEED)
train_ds, test_ds = splits["train"], splits["test"]
print(f"Train: {len(train_ds)} | Test: {len(test_ds)}")

# -----------------------------
# Tokenizer & preprocessing
# -----------------------------
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def preprocess_batch(batch: Dict[str, List[str]]) -> Dict[str, Any]:
    model_inputs = tokenizer(
        batch["input_text"],
        max_length=MAX_INPUT_LEN,
        truncation=True
    )
    try:
        labels = tokenizer(
            text_target=batch["target_text"],
            max_length=MAX_TARGET_LEN,
            truncation=True
        )
    except TypeError:
        # very old fallback
        labels = tokenizer(
            batch["target_text"],
            max_length=MAX_TARGET_LEN,
            truncation=True
        )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_train = train_ds.map(preprocess_batch, batched=True, remove_columns=train_ds.column_names)
tokenized_test  = test_ds.map(preprocess_batch,  batched=True, remove_columns=test_ds.column_names)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=MODEL_NAME)

# -----------------------------
# Model & LoRA
# -----------------------------
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
model.config.use_cache = False
model.config.pad_token_id = tokenizer.pad_token_id
if getattr(model.config, "decoder_start_token_id", None) is None:
    model.config.decoder_start_token_id = tokenizer.pad_token_id

lora_cfg = LoraConfig(
    r=16, lora_alpha=32, lora_dropout=0.05, bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM,
    target_modules=["q","k","v","o","wi","wo"]  # T5 attention + FFN
)
model = get_peft_model(model, lora_cfg)
model.print_trainable_parameters()

# -----------------------------
# Metrics (lightweight) + SAFE unwrap/sanitize
# -----------------------------
def _norm(s: str) -> str:
    return " ".join(str(s).strip().split())

def _lcs_len(x, y):
    n, m = len(x), len(y)
    dp = [[0]*(m+1) for _ in range(n+1)]
    for i in range(1, n+1):
        xi = x[i-1]; row = dp[i]; prev = dp[i-1]
        for j in range(1, m+1):
            row[j] = prev[j-1] + 1 if xi == y[j-1] else max(prev[j], row[j-1])
    return dp[n][m]

def rougeL_f1(pred, ref):
    pt, rt = _norm(pred).split(), _norm(ref).split()
    if not pt or not rt: return 0.0
    l = _lcs_len(pt, rt); p = l/max(1,len(pt)); r = l/max(1,len(rt))
    return 0.0 if p+r==0 else 2*p*r/(p+r)

def bow_f1(pred, ref):
    from collections import Counter
    pt, rt = _norm(pred).split(), _norm(ref).split()
    pc, rc = Counter(pt), Counter(rt)
    overlap = sum((pc & rc).values())
    p = overlap / max(1,sum(pc.values()))
    r = overlap / max(1,sum(rc.values()))
    return 0.0 if p+r==0 else 2*p*r/(p+r)

def _unwrap_sequences(preds):
    # Handle tuples from Trainer (sequences, scores, …), tensors, lists
    if isinstance(preds, (list, tuple)):
        preds = preds[0]
    if isinstance(preds, torch.Tensor):
        preds = preds.detach().cpu().numpy()
    preds = np.asarray(preds)
    if preds.dtype.kind != "i":
        preds = preds.astype(np.int64, copy=False)
    preds = np.where(preds < 0, tokenizer.pad_token_id, preds)
    return preds

def _unwrap_labels(labels):
    if isinstance(labels, torch.Tensor):
        labels = labels.detach().cpu().numpy()
    labels = np.asarray(labels)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    return labels

def compute_metrics(eval_pred):
    predictions = getattr(eval_pred, "predictions", None)
    labels_ids  = getattr(eval_pred, "label_ids",  None)
    if predictions is None:
        predictions, labels_ids = eval_pred

    preds_ids = _unwrap_sequences(predictions)
    labels_ids = _unwrap_labels(labels_ids)

    preds = tokenizer.batch_decode(preds_ids, skip_special_tokens=True)
    refs  = tokenizer.batch_decode(labels_ids,  skip_special_tokens=True)

    em  = float(np.mean([_norm(p)==_norm(r) for p,r in zip(preds, refs)]))
    rL  = float(np.mean([rougeL_f1(p,r) for p,r in zip(preds, refs)]))
    bow = float(np.mean([bow_f1(p,r) for p,r in zip(preds, refs)]))
    return {"exact_match": em, "rougeL_f1": rL, "bow_f1": bow}

# -----------------------------
# Version-adaptive TrainingArguments
# -----------------------------
def build_args(**kw):
    sig = inspect.signature(Seq2SeqTrainingArguments.__init__)
    allowed = set(sig.parameters.keys())
    out = {k:v for k,v in kw.items() if k in allowed}

    # eval strategy name across versions
    if "evaluation_strategy" in allowed and "evaluation_strategy" in kw:
        out["evaluation_strategy"] = kw["evaluation_strategy"]
    elif "eval_strategy" in allowed and "evaluation_strategy" in kw:
        out["eval_strategy"] = kw["evaluation_strategy"]

    if "predict_with_generate" in kw and "predict_with_generate" not in allowed:
        out.pop("predict_with_generate", None)
    if "generation_max_length" in kw and "generation_max_length" not in allowed:
        out.pop("generation_max_length", None)

    return Seq2SeqTrainingArguments(**out)

base_args = dict(
    output_dir=OUTPUT_DIR,
    seed=SEED,Training done.
Evaluating on test set…
test_loss: 0.4811
test_exact_match: 0.0000
test_rougeL_f1: 0.2995
test_bow_f1: 0.3027
test_runtime: 103.9697
test_samples_per_second: 0.5770
test_steps_per_second: 0.0770
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    num_train_epochs=5,
    learning_rate=1e-3,
    weight_decay=0.01,
    warmup_ratio=0.1,
    logging_steps=50,
    save_steps=200,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="rougeL_f1",
    greater_is_better=True,
    remove_unused_columns=False,
    dataloader_pin_memory=False,
    report_to=[],
    predict_with_generate=True,
    generation_max_length=MAX_TARGET_LEN,
    evaluation_strategy="steps",
    eval_accumulation_steps=1,
)

if torch.cuda.is_available():
    if hasattr(torch.cuda, "is_bf16_supported") and torch.cuda.is_bf16_supported():
        base_args["bf16"] = True
    else:
        base_args["fp16"] = True

training_args = build_args(**base_args)

# -----------------------------
# Trainer (version-adaptive kwargs: no label_names if unsupported)
# -----------------------------
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

sig_tr = inspect.signature(Seq2SeqTrainer.__init__)
if "label_names" in sig_tr.parameters:
    trainer_kwargs["label_names"] = ["labels"]  # only if supported

trainer = Seq2SeqTrainer(**trainer_kwargs)

print("Starting training…")
trainer.train()
print("Training done.")

# -----------------------------
# Evaluation / prediction
# -----------------------------
print("Evaluating on test set…")
predictions = trainer.predict(tokenized_test, max_length=MAX_TARGET_LEN, num_beams=4)

# safe manual decode
pred_ids = predictions.predictions[0] if isinstance(predictions.predictions, (list, tuple)) else predictions.predictions
pred_ids = np.asarray(pred_ids)
pred_ids = np.where(pred_ids < 0, tokenizer.pad_token_id, pred_ids)
decoded_preds = tokenizer.batch_decode(pred_ids.astype(np.int64, copy=False), skip_special_tokens=True)
decoded_refs  = tokenizer.batch_decode(
    np.where(predictions.label_ids!=-100, predictions.label_ids, tokenizer.pad_token_id),
    skip_special_tokens=True
)

for k, v in predictions.metrics.items():
    print(f"{k}: {v:.4f}" if isinstance(v, (int,float)) else f"{k}: {v}")

print("\n--- Sample Predictions ---")
for i in range(min(5, len(decoded_preds))):
    print("REF :", decoded_refs[i])
    print("PRED:", decoded_preds[i])
    print("-"*80)

# -----------------------------
# Save LoRA adapter
# -----------------------------
SAVE_DIR = os.path.join(OUTPUT_DIR, "lora_adapter")
os.makedirs(SAVE_DIR, exist_ok=True)
model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)
print("Saved LoRA adapter to:", SAVE_DIR)


Transformers version: 4.52.4
Device: cuda
GPU: Tesla T4
Loading dataset from: /kaggle/input/cleaned-json/prepared_data_cleaned.jsonl


Generating train split: 0 examples [00:00, ? examples/s]

Loaded 300 rows.


Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Train: 240 | Test: 60


Map:   0%|          | 0/240 [00:00<?, ? examples/s]

Map:   0%|          | 0/60 [00:00<?, ? examples/s]

trainable params: 4,620,288 || all params: 252,198,144 || trainable%: 1.8320


No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Starting training…


Step,Training Loss,Validation Loss,Exact Match,Rougel F1,Bow F1
50,0.9927,0.497748,0.0,0.433814,0.449217


Training done.
Evaluating on test set…


test_loss: 0.4811
test_exact_match: 0.0000
test_rougeL_f1: 0.2995
test_bow_f1: 0.3027
test_runtime: 103.9697
test_samples_per_second: 0.5770
test_steps_per_second: 0.0770

--- Sample Predictions ---
REF : ["due_date": "None", "owner": "Olivia", "task": "Email Shepperd and answer his questions about Amanda's offer", "due_date": "None", "owner": "Emily", "task": "Call Hoffmann"]
PRED: ["due_date": "None", "owner": "Emily", "task": "Email Shepperd", "due_date": "None", "owner": "Emily", "task": "Email Shepperd", "due_date": "None", "owner": "Emily", "task": "Email Shepperd", "due_date": "None", "owner": "Emily", "task": "Email Shepperd", "due_date": "None", "owner": "Emily", "task": "Email Shepperd", "due_date": "None", "owner": "Emily", "task": "Email Shepperd", "due_date": "None", "owner": "Emily", "task": "Email Shepperd", "due_date": "
--------------------------------------------------------------------------------
REF : ["due_date": "None", "owner": "Victoria", "task": "Find work in 

In [15]:
!zip -r /kaggle/working/flan_t5_action_json.zip /kaggle/working/flan_t5_action_json

  adding: kaggle/working/flan_t5_action_json/ (stored 0%)
  adding: kaggle/working/flan_t5_action_json/lora_adapter/ (stored 0%)
  adding: kaggle/working/flan_t5_action_json/lora_adapter/adapter_model.safetensors (deflated 8%)
  adding: kaggle/working/flan_t5_action_json/lora_adapter/tokenizer_config.json (deflated 95%)
  adding: kaggle/working/flan_t5_action_json/lora_adapter/tokenizer.json (deflated 74%)
  adding: kaggle/working/flan_t5_action_json/lora_adapter/README.md (deflated 66%)
  adding: kaggle/working/flan_t5_action_json/lora_adapter/adapter_config.json (deflated 55%)
  adding: kaggle/working/flan_t5_action_json/lora_adapter/spiece.model (deflated 48%)
  adding: kaggle/working/flan_t5_action_json/lora_adapter/special_tokens_map.json (deflated 85%)
  adding: kaggle/working/flan_t5_action_json/checkpoint-75/ (stored 0%)
  adding: kaggle/working/flan_t5_action_json/checkpoint-75/adapter_model.safetensors (deflated 8%)
  adding: kaggle/working/flan_t5_action_json/checkpoint-75/t