In [1]:
# ✨ INSTALL DEPENDENCIES
!pip install transformers datasets evaluate accelerate -q
!pip install rouge_score -q

# 🌐 IMPORTS
import pandas as pd
import torch
from datasets import Dataset , DatasetDict
from transformers import (
    AutoTokenizer, AutoModelForSeq2SeqLM,
    Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
)
import evaluate
from sklearn.model_selection import train_test_split

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m28.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m26.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m842.7 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
# 🗂️ LOAD DATA (JSON format is already uploaded)
import json

with open("/content/legalQueriesTrainingData.json") as f:
    raw_data = json.load(f)

df = pd.DataFrame(raw_data)

In [3]:
## Format input-output pairs for FLAN-T5
def format_example(row):
    input_text = f"Evaluate the factual correctness of the answer to this legal question.\nQuestion: {row['question']}\nAnswer: {row['answer']}"
    output_text = f"Score: {row['average_score']}. Rationale: {row['rationale']}"
    # Include 'question_id', 'average_score', and 'rationale' in the returned dictionary
    return {
        "input": input_text,
        "output": output_text,
        "question_id": row['question_id'],
        "average_score": row['average_score'], # Include average_score
        "rationale": row['rationale']         # Include rationale
    }

# Apply the function and create the new DataFrame
# No need for result_type="expand" when returning a dict with multiple keys
formatted_data_list = df.apply(format_example, axis=1).tolist()

# Convert the list of dictionaries to a DataFrame
formatted_data_df = pd.DataFrame(formatted_data_list)

# Convert to HuggingFace dataset
dataset = Dataset.from_pandas(formatted_data_df)
# %%
# Train/Validation/Test Split
# First split the initial dataset into train and test
train_test_split_result = dataset.train_test_split(test_size=0.2, seed=42)

# Split the training set further into training and validation
train_val_split_result = train_test_split_result['train'].train_test_split(test_size=0.1, seed=42)  # 10% of 80% = 8% for validation

# Create the final DatasetDict with train, validation, and test splits
# The 'question_id' column will now be present in all splits
dataset = DatasetDict({
    'train': train_val_split_result['train'],
    'validation': train_val_split_result['test'],
    'test': train_test_split_result['test']
})
# %%
# 📚 CONVERT TO HUGGINGFACE DATASETS
# Use DatasetDict created in the previous cell
# the columns to match the preprocess function
# Renaming should only apply to input and output if needed for preprocessing
train_dataset = dataset['train'].rename_columns({"input": "input_text", "output": "target_text"})
val_dataset = dataset['validation'].rename_columns({"input": "input_text", "output": "target_text"})

# The test dataset is not used in training/evaluation loop but can be used for final evaluation
# Keep the 'question_id' column in the test set
test_dataset = dataset['test'].rename_columns({"input": "input_text", "output": "target_text"})

In [4]:
# Split the training set further into training and validation
train_val_split_result = train_test_split_result['train'].train_test_split(test_size=0.1, seed=42)  # 10% of 80% = 8% for validation

# Create the final DatasetDict with train, validation, and test splits
dataset = DatasetDict({
    'train': train_val_split_result['train'],
    'validation': train_val_split_result['test'],
    'test': train_test_split_result['test']
})

In [5]:
# 📚 CONVERT TO HUGGINGFACE DATASETS
# Use DatasetDict created in the previous cell
# the columns to match the preprocess function
train_dataset = dataset['train'].rename_columns({"input": "input_text", "output": "target_text"})
val_dataset = dataset['validation'].rename_columns({"input": "input_text", "output": "target_text"})

# The test dataset is not used in training/evaluation loop but can be used for final evaluation
test_dataset = dataset['test'].rename_columns({"input": "input_text", "output": "target_text"})

In [6]:
# 🔧 LOAD TOKENIZER & MODEL
model_name = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [7]:
# 🔁 TOKENIZATION
def preprocess(example):
    model_inputs = tokenizer(
        example["input_text"], max_length=512, truncation=True, padding="max_length"
    )
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            example["target_text"], max_length=128, truncation=True, padding="max_length"
        )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

train_dataset = train_dataset.map(preprocess, batched=True)
val_dataset = val_dataset.map(preprocess, batched=True)

Map:   0%|          | 0/1931 [00:00<?, ? examples/s]



Map:   0%|          | 0/215 [00:00<?, ? examples/s]

In [8]:
# 🧊 DATA COLLATOR
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# ⚙️ TRAINING ARGUMENTS
training_args = Seq2SeqTrainingArguments(
    output_dir="./flan-t5-legal-factual",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=4,
    weight_decay=0.01,
    save_total_limit=1,
    predict_with_generate=True,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="rougeL"
)

In [9]:
# 📏 EVALUATION METRIC
import evaluate
import numpy as np
import torch # Import torch

rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    preds, labels = eval_pred

    # Ensure predictions are long integers, as expected by tokenizer.batch_decode
    # This might prevent the OverflowError if preds contains values outside the valid range
    # for integer conversion, potentially due to unexpected data types or values.

    # Convert to numpy array first for consistent handling
    if isinstance(preds, torch.Tensor):
        preds = preds.detach().cpu().numpy()

    # Clip predictions to the valid token ID range before casting
    preds = np.clip(preds, 0, tokenizer.vocab_size - 1)

    # Ensure data type is suitable for decoding (usually int64 or int32)
    # Check the data type after clipping
    # print(f"Data type of preds after clipping: {preds.dtype}")
    # print(f"Min value in preds after clipping: {np.min(preds)}")
    # print(f"Max value in preds after clipping: {np.max(preds)}")
    # Ensure preds is a numpy array of integers
    preds = preds.astype(np.int64) # Explicitly cast to int64

    # -100 in labels with the tokenizer's pad_token_id
    # Ensure labels are also long integers for batch_decode
    if isinstance(labels, torch.Tensor):
         labels = labels.detach().cpu().numpy()

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    # Ensure labels is a numpy array of integers
    labels = labels.astype(np.int64) # Explicitly cast to int64


    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_special_tokens=True)

    # Compute ROUGE scores
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels)

    # Return a dictionary with the metric names and values
    return {key: value for key, value in result.items()}

# Define a simple sentence splitting function if needed (adjust based on your data)
# This function was defined but not used in the compute_metrics function above.
# If you intended to use it for post-processing, you would need to call it
# before the rouge.compute call and modify how decoded_preds and decoded_labels
# are prepared. As it stands, it's unused.
def sent_split(text):
    return [text] # Assuming each example is a single "sentence" for simplicity

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [10]:
# 🚀 TRAINER
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Seq2SeqTrainer(


In [13]:
# 🏋️ START TRAINING
import wandb

# Initialize wandb run
# Replace "your-project-name" with the name you want for your wandb project
# You can also add configurations or specific run names here
wandb.init(project="flan-t5-legal-factual-training")

trainer.train()

# Optional: Finish the wandb run after training is complete
wandb.finish()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mkayfari20[0m ([33mkayfari20-university-of-ghana[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,3.6276,3.128525,0.007867,0.000685,0.006316,0.006364
2,2.368,1.862267,0.052554,0.026033,0.048735,0.048637
3,1.8324,1.358499,0.090979,0.049621,0.086944,0.086954


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,3.6276,3.128525,0.007867,0.000685,0.006316,0.006364
2,2.368,1.862267,0.052554,0.026033,0.048735,0.048637
3,1.8324,1.358499,0.090979,0.049621,0.086944,0.086954
4,1.6437,1.234341,0.092703,0.049728,0.088797,0.088746


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


0,1
eval/loss,█▃▁▁
eval/rouge1,▁▅██
eval/rouge2,▁▅██
eval/rougeL,▁▅██
eval/rougeLsum,▁▅██
eval/runtime,▁▆▄█
eval/samples_per_second,█▃▅▁
eval/steps_per_second,█▃▆▁
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇████
train/global_step,▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇▇████

0,1
eval/loss,1.23434
eval/rouge1,0.0927
eval/rouge2,0.04973
eval/rougeL,0.0888
eval/rougeLsum,0.08875
eval/runtime,354.9145
eval/samples_per_second,0.606
eval/steps_per_second,0.076
total_flos,1435818258333696.0
train/epoch,4.0


In [14]:
# 💾 SAVE THE FINAL MODEL
trainer.save_model("/content/flan-t5-factual-rationale")
tokenizer.save_pretrained("/content/flan-t5-factual-rationale")

('/content/flan-t5-factual-rationale/tokenizer_config.json',
 '/content/flan-t5-factual-rationale/special_tokens_map.json',
 '/content/flan-t5-factual-rationale/spiece.model',
 '/content/flan-t5-factual-rationale/added_tokens.json',
 '/content/flan-t5-factual-rationale/tokenizer.json')

In [23]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import re # Import the re module

# Load fine-tuned FLAN-T5-small model
model_path = "./flan-t5-factual-rationale"
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")

# Make sure model is in eval mode
model.eval()

# Choose a few test samples
sample_inputs = test_dataset.select(range(5))  # select first 5 for demonstration

# Define a regex pattern to extract the question and answer from the input_text
pattern = re.compile(r"Question: (.*?)\nAnswer: (.*)")

for sample in sample_inputs:
    # Extract question and answer from input_text using regex
    match = pattern.search(sample['input_text'])
    if match:
        question = match.group(1)
        answer = match.group(2)
    else:
        # Handle cases where the pattern doesn't match (though it should if the format is consistent)
        question = "Could not extract question"
        answer = "Could not extract answer"

    # Use the extracted question and answer to create the prompt
    prompt = f"Question: {question}\nAnswer: {answer}\n"

    # Tokenize
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(model.device)

    # Generate
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=100)

    # Decode
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Display
    print("="*100)
    print(f"Question ID: {sample['question_id']}")
    print(f"Prompt\n{prompt}")
    print(f"\nGenerated Output:\n{generated_text}")
    # Fetch the original score and rationale from the sample
    original_score = sample['average_score']
    original_rationale = sample['rationale']
    print(f"\nEvaluation Of Answer:\nScore: {original_score}. \nRationale: {original_rationale}")
    print("\n")

Question ID: 1555
Prompt
Question: Can Ghanaian courts adjudicate lunar real estate disputes under the Outer Space Act?
Answer: Ghana has no specific Outer Space Act. However, as a participant in the 1967 Outer Space Treaty, it cannot assert sovereignty over the Moon or adjudicate lunar land ownership. Ghanaian courts may assume jurisdiction over disputes involving Ghanaian citizens in space commerce if such a law is enacted, but not sovereignty claims.


Generated Output:
Ghanaian courts may assume jurisdiction over disputes involving Ghanaian citizens in space commerce if such a law is enacted, but not sovereignty claims.

Evaluation Of Answer:
Score: 4.0. 
Rationale: Legally sound within international and Ghanaian contexts. Properly outlines jurisdictional limits. Clearly structured for legal and lay audiences.


Question ID: 1774
Prompt
Question: Can a "time-traveling" witness be cross-examined if their testimony relies on knowledge from future events?
Answer: Under Ghanaian law, a