In [1]:
!pip install peft
!pip install datasets
!pip install evaluate
!pip install rouge_score 



In [2]:
from peft import LoraConfig, TaskType, get_peft_model

In [3]:
import peft

peft.__version__

'0.13.2'

In [4]:
import os

In [5]:
# !ls

In [6]:
# os.chdir("drive/MyDrive/Colab Notebooks/Practice 2024-2025")

## Load Model

### FLAN-T5

In [7]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM



tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")

model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")



In [8]:
peft_config = LoraConfig(

    task_type=TaskType.SEQ_2_SEQ_LM,

    inference_mode=False,

    r=4,

    lora_alpha=32,

    lora_dropout=0.1,

    )


In [9]:
model = get_peft_model(model, peft_config)

model.print_trainable_parameters()

trainable params: 442,368 || all params: 248,020,224 || trainable%: 0.1784


## Load Data

In [10]:
from datasets import load_dataset



# ds = load_dataset("tatsu-lab/alpaca")



ds = load_dataset("databricks/databricks-dolly-15k")

In [11]:
train_ds = ds["train"].shuffle(seed=42).select(range(1000))

sp1 = ds["train"].train_test_split(test_size=0.2, seed=42)

train_ds = sp1["train"]

sp2 = sp1["test"].train_test_split(test_size=0.5, seed=42)

validation_ds, test_ds = sp2["train"], sp2["test"]



# train_ds = ds["train"].shuffle(seed=42).select(range(1000))

# train_valid_split = train_ds.train_test_split(test_size=0.2, seed=42)

# train_ds = train_valid_split["train"]

# temp = train_valid_split["test"].train_test_split(test_size=0.5, seed=42)

# validation_ds, test_ds = temp["train"], temp["test"]

In [12]:
len(test_ds), len(train_ds), len(validation_ds)

(1502, 12008, 1501)

In [13]:
additional_ds = load_dataset("tatsu-lab/alpaca")

subset_size = 100

additional_subset = additional_ds['train'].shuffle(seed=42).select(range(subset_size))


## Tokenize

In [14]:
def preprocess_alpaca(data):

    input_text = f"Instruction: {data['instruction']}\nInput: {data['input']}\n"

    target_text = data['output']

    return {"input_ids": tokenizer(input_text, truncation=True, padding="max_length", max_length=256, return_tensors="pt")["input_ids"],

            "labels": tokenizer(target_text, truncation=True, padding="max_length", max_length=256, return_tensors="pt")["input_ids"]}

In [15]:
def preprocess_dolly(data):

    inputs = data["instruction"] + " " + data["context"]

    targets = data["response"]

    return {"input_ids": tokenizer(inputs, truncation=True, padding="max_length", max_length=256, return_tensors="pt",)["input_ids"],

            "labels": tokenizer(targets, truncation=True, padding="max_length", max_length=256, return_tensors="pt",)["input_ids"]}

In [16]:
tokenized_dataset = {"train": train_ds.map(preprocess_dolly, batch_size=4),

                     "test": test_ds.map(preprocess_dolly, batch_size=4),

                     "validation": validation_ds.map(preprocess_dolly, batch_size=4),

                     "additional": additional_subset.map(preprocess_alpaca, batch_size=4)}

Map:   0%|          | 0/12008 [00:00<?, ? examples/s]

Map:   0%|          | 0/1502 [00:00<?, ? examples/s]

Map:   0%|          | 0/1501 [00:00<?, ? examples/s]

## Instruction Training

In [17]:
import numpy as np

import pandas as pd

In [18]:
import evaluate

bleu = evaluate.load("bleu")

rouge = evaluate.load("rouge")

import torch

In [19]:
device = "cuda"
model.to(device)
model.eval()
''

''

In [20]:
from tqdm import tqdm

def generate_predictions(input_ids):
    # 模型生成
    with torch.no_grad():
        outputs = model.generate(input_ids=input_ids, max_length=256, num_beams=4, early_stopping=True)
    return outputs

def evaluate_loop(data_loader):
    for step, batch in enumerate(tqdm(data_loader)):
        input_ids = batch['input_ids'].squeeze(1).to(device)
        labels = batch['labels'].squeeze(1).to(device)
        
        predicted_ids = generate_predictions(input_ids)
        
        predicted_text = tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)
        target_text = tokenizer.batch_decode(labels, skip_special_tokens=True)

        for i in range(len(target_text)):
            bleu.add(prediction=predicted_text[i], reference=[target_text[i]])
            rouge.add(prediction=predicted_text[i], reference=target_text[i])
        
    bleu_score = bleu.compute()
    rouge_score = rouge.compute()
    
    print(f"BLEU score: {bleu_score}")
    print(f"ROUGE score: {rouge_score}")

evaluate_loop(tokenized_dataset['validation'])


100%|██████████| 1501/1501 [07:57<00:00,  3.14it/s]


BLEU score: {'bleu': 0.0001194536676941802, 'precisions': [0.39258719606593206, 0.1459430979978925, 0.08008504606661943, 0.05135170094425042], 'brevity_penalty': 0.0009641464354334158, 'length_ratio': 0.12587693154202392, 'translation_length': 10981, 'reference_length': 87236}
ROUGE score: {'rouge1': 0.12992106358599856, 'rouge2': 0.05028785915075314, 'rougeL': 0.12189537434644283, 'rougeLsum': 0.12206761176116535}


In [21]:
evaluate_loop(tokenized_dataset['test'])

100%|██████████| 1502/1502 [08:00<00:00,  3.13it/s]


BLEU score: {'bleu': 0.00011841289616344131, 'precisions': [0.42619808306709267, 0.17448246725813266, 0.1018957345971564, 0.07013897908819328], 'brevity_penalty': 0.0007798827612028123, 'length_ratio': 0.12260360592257674, 'translation_length': 10955, 'reference_length': 89353}
ROUGE score: {'rouge1': 0.14016299009362565, 'rouge2': 0.05305306561244673, 'rougeL': 0.1326894413014032, 'rougeLsum': 0.13259385950940167}


In [22]:
evaluate_loop(tokenized_dataset['additional'])

100%|██████████| 100/100 [00:50<00:00,  1.98it/s]


BLEU score: {'bleu': 0.020101169685288668, 'precisions': [0.3223350253807107, 0.12787550744248985, 0.0864903502501787, 0.06184012066365008], 'brevity_penalty': 0.16496359647414502, 'length_ratio': 0.35688405797101447, 'translation_length': 1576, 'reference_length': 4416}
ROUGE score: {'rouge1': 0.21123958806872062, 'rouge2': 0.10350711679897005, 'rougeL': 0.1912380587098892, 'rougeLsum': 0.19000298214322198}


In [23]:
from torch.utils.data import DataLoader
from transformers import default_data_collator
from tqdm import tqdm
from transformers import get_linear_schedule_with_warmup

def filter_columns(batch):
    return {key: value for key, value in batch.items() if key in ['input_ids', 'labels']}

# Update the DataLoader with the filtered dataset
train_dataloader = DataLoader(
    tokenized_dataset["train"].map(filter_columns, batched=True),
    shuffle=True,
    collate_fn=default_data_collator,
    batch_size=8,
    pin_memory=True
)

validation_dataloader = DataLoader(
    tokenized_dataset["validation"].map(filter_columns, batched=True),
    collate_fn=default_data_collator,
    batch_size=8,
    pin_memory=True
)

test_dataloader = DataLoader(
    tokenized_dataset["test"].map(filter_columns, batched=True),
    collate_fn=default_data_collator,
    batch_size=8,
    pin_memory=True
)

addi_dataloader = DataLoader(
    tokenized_dataset["additional"].map(filter_columns, batched=True),
    collate_fn=default_data_collator,
    batch_size=8,
    pin_memory=True
)

Map:   0%|          | 0/12008 [00:00<?, ? examples/s]

Map:   0%|          | 0/1501 [00:00<?, ? examples/s]

Map:   0%|          | 0/1502 [00:00<?, ? examples/s]

In [24]:
lr = 1e-4
num_epochs = 3


optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=(len(train_dataloader) * num_epochs),
)

# Send model to device
model = model.to(device)

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for step, batch in enumerate(tqdm(train_dataloader)):
        batch = {k: v.squeeze(1).to(device) for k, v in batch.items()}
        # print(batch["input_ids"].shape, batch["labels"].shape)
        outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.detach().float()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    # Evaluation loop
    model.eval()
    eval_loss = 0
    eval_preds = []
    with torch.no_grad():
        for step, batch in enumerate(tqdm(validation_dataloader)):
            batch = {k: v.squeeze(1).to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            eval_loss += loss.detach().float()
            eval_preds.extend(
                tokenizer.batch_decode(torch.argmax(outputs.logits, -1).detach().cpu().numpy(), skip_special_tokens=True)
            )

    # Calculate and print losses and metrics
    eval_epoch_loss = eval_loss / len(validation_dataloader)
    eval_ppl = torch.exp(eval_epoch_loss)
    train_epoch_loss = total_loss / len(train_dataloader)
    train_ppl = torch.exp(train_epoch_loss)

    print(f"Epoch {epoch}: train_ppl={train_ppl} train_epoch_loss={train_epoch_loss} "
          f"eval_ppl={eval_ppl} eval_epoch_loss={eval_epoch_loss}")

# Test the model after training
model.eval()
test_loss = 0
test_preds = []
with torch.no_grad():
    for step, batch in enumerate(tqdm(test_dataloader)):
        batch = {k: v.squeeze(1).to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        test_loss += loss.detach().float()
        test_preds.extend(
            tokenizer.batch_decode(torch.argmax(outputs.logits, -1).detach().cpu().numpy(), skip_special_tokens=True)
        )

# Calculate final test loss and metrics
test_epoch_loss = test_loss / len(test_dataloader)
test_ppl = torch.exp(test_epoch_loss)
print(f"Test results: test_ppl={test_ppl} test_epoch_loss={test_epoch_loss}")


100%|██████████| 1501/1501 [22:20<00:00,  1.12it/s]
100%|██████████| 188/188 [01:17<00:00,  2.43it/s]


Epoch 0: train_ppl=37.43816375732422 train_epoch_loss=3.6226906776428223 eval_ppl=2.2081003189086914 eval_epoch_loss=0.792132556438446


100%|██████████| 1501/1501 [22:21<00:00,  1.12it/s]
100%|██████████| 188/188 [01:17<00:00,  2.43it/s]


Epoch 1: train_ppl=2.528874635696411 train_epoch_loss=0.9277744293212891 eval_ppl=2.068662166595459 eval_epoch_loss=0.7269021272659302


100%|██████████| 1501/1501 [22:21<00:00,  1.12it/s]
100%|██████████| 188/188 [01:17<00:00,  2.43it/s]


Epoch 2: train_ppl=2.371427059173584 train_epoch_loss=0.8634918928146362 eval_ppl=2.0498759746551514 eval_epoch_loss=0.7177793383598328


100%|██████████| 188/188 [01:17<00:00,  2.44it/s]

Test results: test_ppl=2.0916249752044678 test_epoch_loss=0.7379412055015564





In [25]:
evaluate_loop(tokenized_dataset['test'])

100%|██████████| 1502/1502 [13:55<00:00,  1.80it/s]


BLEU score: {'bleu': 0.007531466629111084, 'precisions': [0.3658960542325743, 0.14679368029739778, 0.09089567655304708, 0.06714256155596937], 'brevity_penalty': 0.05597292488875263, 'length_ratio': 0.2575403176166441, 'translation_length': 23012, 'reference_length': 89353}
ROUGE score: {'rouge1': 0.1780411931548635, 'rouge2': 0.07503631243217185, 'rougeL': 0.16009409120132367, 'rougeLsum': 0.15975600948420282}


In [26]:
evaluate_loop(tokenized_dataset['additional'])

100%|██████████| 100/100 [01:13<00:00,  1.36it/s]


BLEU score: {'bleu': 0.03745128961654737, 'precisions': [0.2517894736842105, 0.09226713532513181, 0.06170018281535649, 0.04269449715370019], 'brevity_penalty': 0.42342942699601366, 'length_ratio': 0.5378170289855072, 'translation_length': 2375, 'reference_length': 4416}
ROUGE score: {'rouge1': 0.25020286428318306, 'rouge2': 0.12535026103552413, 'rougeL': 0.22422950833509545, 'rougeLsum': 0.22418124131803857}


In [27]:
torch.save(model, "flan-t5-dolly-3")

In [None]:
torch.load("flan-t5-dolly-3")