<a href="https://colab.research.google.com/github/nrimsky/LM-exp/blob/main/flan_finetune_hobbies.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
!pip install torch transformers tqdm

Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m72.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m32.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m72.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m72.2 MB/s[0m eta [36m0:00:0

In [3]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from tqdm import tqdm
import json
import shutil
import os
from datetime import datetime
from glob import glob

In [4]:
max_length = 128
models_path = '/content/gdrive/MyDrive/Models'

In [5]:
def save_checkpoint(model, epoch, iteration, save_to, max_checkpoints=2):
    checkpoint_path = f"{save_to}/checkpoint-{epoch}-{iteration}"
    model.save_pretrained(checkpoint_path)

    # Delete older checkpoints
    checkpoint_dirs = [d for d in os.listdir(save_to) if d.startswith("checkpoint-")]
    if len(checkpoint_dirs) > max_checkpoints:
        oldest_checkpoint = min(checkpoint_dirs, key=lambda d: int(d.split("-")[2]))
        shutil.rmtree(os.path.join(save_to, oldest_checkpoint))

def make_model_name():
    now = datetime.now()
    return f"flan-finetuned-{now.day}-{now.hour}-{now.minute}-{now.second}"

class CustomDataset(Dataset):
    def __init__(self, data, tokenizer, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        input_text, target_text = item["input_text"], item["target_text"]
        encoding = self.tokenizer(input_text, max_length=self.max_length, padding="max_length", truncation=True, return_tensors="pt")
        target_encoding = self.tokenizer(target_text, max_length=self.max_length, padding="max_length", truncation=True, return_tensors="pt")

        return {
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "labels": target_encoding["input_ids"].flatten(),
        }

def train(model, dataloader, optimizer, device, save_to, epoch, save_checkpoints = False):
    model.train()
    total_loss = 0
    num_batches = len(dataloader)
    for idx, batch in enumerate(tqdm(dataloader)):
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        total_loss += loss.item()
        if save_checkpoints:
          if idx == num_batches // 3 or idx == (num_batches * 2) // 3:
              save_checkpoint(model, epoch, idx, save_to)

    return total_loss / len(dataloader)

def read_data(data_path):
    with open(data_path, 'r') as dfile:
        return json.load(dfile)

def finetune(data_path, model_path="google/flan-t5-base", num_epochs=3, learning_rate=0.0005, batch_size=16, save_to=None):
    """
    Data should have format:
    [
        {"input_text": "Example input 1", "target_text": "Example target 1"},
        {"input_text": "Example input 2", "target_text": "Example target 2"},
        ...
    ]
    """
    torch.cuda.empty_cache()
    if save_to:
        save_dir = save_to
    else:
        save_dir = make_model_name()
    data = read_data(data_path)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Using device", device)
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
    model  = model.to(device)
    dataset = CustomDataset(data, tokenizer, max_length)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    optimizer = Adam(model.parameters(), lr=learning_rate)
    for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1}/{num_epochs}")
        train_loss = train(model, dataloader, optimizer, device, save_dir, epoch)
        print(f"Training loss: {train_loss:.4f}")
    model.save_pretrained(save_dir)
    print("Saved finetuned model to", save_dir)
    return model

def generate_text(prompt, model, tokenizer, device):
    model.eval()
    input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"].to(device)
    output_ids = model.generate(input_ids, max_new_tokens=max_length)[0]
    return tokenizer.decode(output_ids, skip_special_tokens=True)

def make_prompt(situation):
    return "How would someone feel in this situation: " + situation

def prepend_situation_task_to_prompt(dataset):
    for item in dataset:
        item["input_text"] = make_prompt(item["input_text"])
    return dataset

def finetune_on_hobbies():
    return finetune('/content/gdrive/MyDrive/TrainingData/hobbies.json', save_to=f"{models_path}/flan-finetuned-hobbies")

def inference_loop(model_path = f"{models_path}/flan-finetuned-hobbies", tokenizer_path = "google/flan-t5-base"):
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_path, local_files_only=True)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    model.eval()
    while True:
        situation_name = input("Enter a hobby related event, or 'q' to quit: ")
        if situation_name == 'q':
            break
        print(generate_text(make_prompt(situation_name), model, tokenizer, device))

In [6]:
model = finetune_on_hobbies()

Using device cuda


Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Epoch 1/3


100%|██████████| 99/99 [00:27<00:00,  3.57it/s]


Training loss: 1.6200
Epoch 2/3


100%|██████████| 99/99 [00:24<00:00,  4.05it/s]


Training loss: 0.2445
Epoch 3/3


100%|██████████| 99/99 [00:24<00:00,  4.05it/s]


Training loss: 0.1918
Saved finetuned model to /content/gdrive/MyDrive/Models/flan-finetuned-hobbies


In [7]:
inference_loop()

Enter a hobby related event, or 'q' to quit: Winning a poetry competition
The person would likely feel elated, proud, and elated for their talent and hard work paying off.
Enter a hobby related event, or 'q' to quit: Breaking a leg
A combination of pain, shock, and a sense of loss for the broken leg.
Enter a hobby related event, or 'q' to quit: q


In [8]:
def pad_tensors_to_same_size(tensor1, tensor2):
    # Ensure tensor2 is no larger than tensor1 along the second dimension
    if tensor2.size(1) > tensor1.size(1):
        tensor2 = tensor2[:, :tensor1.size(1), :]

    # In case tensor2 is smaller, pad it with zeros to match tensor1's size
    padding_size2 = max(0, tensor1.size(1) - tensor2.size(1))
    if padding_size2 > 0:
        padding2 = torch.zeros((tensor2.size(0), padding_size2, tensor2.size(2)), device=tensor2.device)
        tensor2 = torch.cat([tensor2, padding2], dim=1)

    return tensor1, tensor2

In [9]:
class BlockOutputWrapper(torch.nn.Module):
    def __init__(self, block):
        super().__init__()
        self.block = block
        self.last_hidden_state = None
        self.add_activations = None

    def forward(self, *args, **kwargs):
        output = self.block(*args, **kwargs)
        self.last_hidden_state = output[0]
        if self.add_activations is not None:
            o1, o2 = pad_tensors_to_same_size(output[0], self.add_activations)
            output = (o1 + o2,) + output[1:]
        return output

    def add(self, activations):
        self.add_activations = activations

    def reset(self):
        self.last_hidden_state = None
        self.add_activations = None

In [10]:
len(model.encoder.block)

12

In [11]:
block_num = 8 # Can change to mix activations at different layers

In [12]:
model_path = f"{models_path}/flan-finetuned-hobbies"
tokenizer_path = "google/flan-t5-base"

tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path, local_files_only=True)

In [13]:
model.encoder.block[block_num] = BlockOutputWrapper(model.encoder.block[block_num])

In [17]:
original_input = make_prompt("Winning a poetry competition")
mix_input = make_prompt("Breaking a leg")

In [18]:
model.encoder.block[block_num].reset()
encoded_input = tokenizer(original_input, return_tensors="pt")
o = model.generate(encoded_input["input_ids"], max_new_tokens=max_length)
hidden_state_1 = model.encoder.block[block_num].last_hidden_state
original_answer = tokenizer.decode(o[0], skip_special_tokens=True)
print(original_answer)

The person would likely feel elated, proud, and elated for their talent and hard work paying off.


In [19]:
model.encoder.block[block_num].reset()
encoded_input = tokenizer(mix_input, return_tensors="pt")
o = model.generate(encoded_input["input_ids"], max_new_tokens=max_length)
hidden_state_2 = model.encoder.block[block_num].last_hidden_state
mixing_answer = tokenizer.decode(o[0], skip_special_tokens=True)
print(mixing_answer)

A combination of pain, shock, and a sense of loss for the broken leg.


In [20]:
print("Original question:", original_input, "| Original answer:", original_answer)
print("Mixing question:", mix_input, "| Mixing answer:", mixing_answer)
multipliers = [0, 0.1, 1, 1.5, 10, 100]
for m in multipliers:
    model.encoder.block[block_num].add(hidden_state_2 * m)
    encoded_input = tokenizer(original_input, return_tensors="pt")
    augmented_output = model.generate(encoded_input["input_ids"], max_new_tokens=max_length)
    result = tokenizer.decode(augmented_output[0], skip_special_tokens=True)
    print(m, result)

Original question: How would someone feel in this situation: Winning a poetry competition | Original answer: The person would likely feel elated, proud, and elated for their talent and hard work paying off.
Mixing question: How would someone feel in this situation: Breaking a leg | Mixing answer: A combination of pain, shock, and a sense of loss for the broken leg.
0 The person would likely feel elated, proud, and elated for their talent and hard work paying off.
0.1 The person would likely feel elated, proud, and elated for their talent and hard work paying off.
1 Feelings of pain, surprise, and a sense of triumph for overcoming a challenging physical and mental challenge.
1.5 The person would likely feel devastated, upset, and a bit scared.
10 A combination of pain, shock, and determination to get back to the proper care.
100 A combination of pain, shock, and determination to get back to the proper care.
