In [None]:
import math

import requests
import torch
import torch.nn as nn
from datasets import Dataset, load_dataset
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForCausalLM

In [2]:
# Use the GPU when PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print(f"Using device: {device}")

Using device: cuda


In [None]:
# Optional Hugging Face Hub authentication, intentionally left disabled.
# If it is ever enabled, do not paste a token into the notebook: read it from
# an environment variable or the platform's secret store instead.
# from huggingface_hub import login

# login(token="")

In [None]:
class LoRALayer(nn.Module):
    """Low-rank adapter branch computing ``x @ (B @ A).T * (lora_alpha / r)``.

    The layer holds two trainable factors, ``lora_A`` of shape
    ``(r, in_features)`` and ``lora_B`` of shape ``(out_features, r)``, plus a
    frozen ``weight`` of shape ``(out_features, in_features)`` that starts at
    zero and is only used to hold the merged adapter delta.

    Args:
        in_features: Size of the last dimension of the input.
        out_features: Size of the last dimension of the output.
        r: Adapter rank. ``r <= 0`` disables the adapter (output is all zeros).
        lora_alpha: Numerator of the scaling factor ``lora_alpha / r``.
        lora_dropout: Dropout probability applied to the adapter input;
            ``0.0`` means no dropout.
        merge_weights: When True, switching to eval mode folds ``B @ A`` into
            ``weight`` and switching back to train mode removes it again.
    """

    def __init__(
        self,
        in_features: int,
        out_features: int,
        r: int,
        lora_alpha: int,
        lora_dropout: float,
        merge_weights: bool = False,
    ):
        super().__init__()

        self.r = r
        self.lora_alpha = lora_alpha
        self.merge_weights = merge_weights
        self.merged = False

        # nn.Identity (rather than a lambda) keeps the module picklable.
        if lora_dropout > 0.0:
            self.lora_drop = nn.Dropout(p=lora_dropout)
        else:
            self.lora_drop = nn.Identity()

        if r > 0:
            self.lora_A = nn.Parameter(torch.zeros((r, in_features)))
            self.lora_B = nn.Parameter(torch.zeros((out_features, r)))
            self.scaling = self.lora_alpha / r

        # Always defined (even for r <= 0) so forward() never hits a missing
        # attribute. Frozen: it only ever stores the merged adapter delta.
        self.weight = nn.Parameter(
            torch.zeros((out_features, in_features)), requires_grad=False
        )

        self.reset_parameters()

    def reset_parameters(self):
        """Initialise A randomly and B to zero so the adapter starts as a no-op."""
        if self.r > 0:
            nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5))
            # B = 0 makes B @ A = 0, so the first forward pass leaves the
            # wrapped layer's output unchanged.
            nn.init.zeros_(self.lora_B)

    def train(self, mode: bool = True):
        """Set train/eval mode and keep the merged state consistent with it.

        With ``merge_weights`` enabled, entering eval mode adds the scaled
        ``B @ A`` to ``weight``; entering train mode subtracts it again so the
        factors receive gradients through the explicit adapter path.

        Returns:
            ``self``, matching ``nn.Module.train`` so ``.train()``/``.eval()``
            can be chained.
        """
        super().train(mode)
        if self.r > 0 and self.merge_weights:
            if mode and self.merged:
                with torch.no_grad():
                    self.weight -= (self.lora_B @ self.lora_A) * self.scaling
                self.merged = False
            elif not mode and not self.merged:
                with torch.no_grad():
                    self.weight += (self.lora_B @ self.lora_A) * self.scaling
                self.merged = True
        return self

    def forward(self, x: torch.Tensor):
        """Return the adapter contribution for ``x``.

        The frozen ``weight`` term is zero unless the adapter has been merged,
        in which case it already contains the adapter delta and the explicit
        low-rank path is skipped.
        """
        # No bias: this module defines none.
        result = nn.functional.linear(x, self.weight)
        if self.r > 0 and not self.merged:
            result = result + (
                self.lora_drop(x)
                @ self.lora_A.transpose(0, 1)
                @ self.lora_B.transpose(0, 1)
                * self.scaling
            )
        return result

In [None]:
class LinearWithLoRA(nn.Module):
    """Dense layer whose output is summed with a LoRA branch fed the same input.

    Args:
        in_features: Input width shared by the dense layer and the adapter.
        out_features: Output width shared by the dense layer and the adapter.
        r: Rank passed to the adapter.
        lora_alpha: Scaling numerator passed to the adapter.
        lora_dropout: Dropout probability passed to the adapter.
        merge_weights: Forwarded unchanged to the adapter.
    """

    def __init__(
        self,
        in_features: int,
        out_features: int,
        r: int,
        lora_alpha: int,
        lora_dropout: float,
        merge_weights: bool = False,
    ):
        super().__init__()

        # Dense projection is created first, then the adapter branch.
        self.linear = nn.Linear(in_features, out_features)
        self.lora = LoRALayer(
            in_features=in_features,
            out_features=out_features,
            r=r,
            lora_alpha=lora_alpha,
            lora_dropout=lora_dropout,
            merge_weights=merge_weights,
        )

    def forward(self, x: torch.Tensor):
        """Return ``linear(x) + lora(x)``."""
        dense_out = self.linear(x)
        adapter_out = self.lora(x)
        return dense_out + adapter_out


class ModelWithLoRA(nn.Module):
    """Two-layer perceptron built from ``LinearWithLoRA`` blocks with a ReLU between them.

    Args:
        input_size: Width of the input features.
        hidden_size: Width of the hidden representation.
        output_size: Width of the output.
        r: Adapter rank used by both layers.
        lora_alpha: Adapter scaling numerator used by both layers.
        lora_dropout: Adapter dropout probability used by both layers.
    """

    def __init__(
        self,
        input_size: int,
        hidden_size: int,
        output_size: int,
        r: int,
        lora_alpha: int,
        lora_dropout: float,
    ):
        super().__init__()

        # Both layers share the same adapter hyper-parameters.
        adapter_args = (r, lora_alpha, lora_dropout)
        self.fc1 = LinearWithLoRA(input_size, hidden_size, *adapter_args)
        self.fc2 = LinearWithLoRA(hidden_size, output_size, *adapter_args)

    def forward(self, x):
        """Apply ``fc1``, ReLU, then ``fc2``."""
        hidden = torch.relu(self.fc1(x))
        return self.fc2(hidden)

In [None]:
# Load the pretrained checkpoint and its tokenizer. Later cells use both
# `model` and `tokenizer`, so this cell must run on a fresh kernel.
model_name = "EleutherAI/gpt-neo-125M"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Tokenisation later pads with padding="max_length", which needs a pad token;
# reuse the end-of-sequence token for that.
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/526M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/160 [00:00<?, ?it/s]

GPTNeoForCausalLM LOAD REPORT from: EleutherAI/gpt-neo-125M
Key                                                   | Status     |  | 
------------------------------------------------------+------------+--+-
transformer.h.{0...11}.attn.attention.masked_bias     | UNEXPECTED |  | 
transformer.h.{0, 2, 4, 6, 8, 10}.attn.attention.bias | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

In [None]:
def apply_lora_to_gpt_neo(model, r, lora_alpha, lora_dropout, freeze_base=True):
    """Wrap every ``q_proj`` / ``v_proj`` linear layer of ``model`` with a LoRA adapter.

    Each matching ``nn.Linear`` is replaced, on its direct parent module, by a
    ``LinearWithLoRA`` that reuses the existing linear layer as its dense
    branch, so the layer's current weights are kept.

    Args:
        model: Module tree to modify in place.
        r: Adapter rank.
        lora_alpha: Adapter scaling numerator.
        lora_dropout: Dropout probability on the adapter input.
        freeze_base: When True (default), every parameter that exists before
            the adapters are inserted has ``requires_grad`` set to False, so
            only the new adapter factors remain trainable.

    Returns:
        The same ``model`` object, modified in place.
    """
    target_names = ("q_proj", "v_proj")

    # Collect (parent, attribute name, layer) first; the tree must not be
    # mutated while it is being traversed. Matching on the child's own name
    # (not the dotted path) keeps a second call from wrapping the inner
    # `.linear` of an already-wrapped projection.
    replacements = []
    for parent in model.modules():
        for child_name, child in parent.named_children():
            if isinstance(child, nn.Linear) and any(
                target in child_name for target in target_names
            ):
                replacements.append((parent, child_name, child))

    if freeze_base:
        for param in model.parameters():
            param.requires_grad = False

    for parent, child_name, base_linear in replacements:
        lora_layer = LinearWithLoRA(
            base_linear.in_features,
            base_linear.out_features,
            r,
            lora_alpha,
            lora_dropout,
        )
        # Swap the freshly initialised dense branch for the existing layer so
        # its weights (and bias setting) carry over unchanged.
        lora_layer.linear = base_linear
        # Keep the adapter on the same device / dtype as the layer it wraps.
        lora_layer.to(device=base_linear.weight.device, dtype=base_linear.weight.dtype)
        # Assign on the direct parent: setattr() with a dotted path on the root
        # would not reach the nested attribute.
        setattr(parent, child_name, lora_layer)

    return model

In [8]:
# Adapter hyper-parameters: rank, scaling numerator, and dropout probability.
r, lora_alpha, lora_dropout = 16, 32, 0.1

# Insert the adapters into the loaded model.
finetuned_model = apply_lora_to_gpt_neo(
    model,
    r=r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
)

In [None]:
# Download the Tiny Shakespeare corpus.
url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
# A timeout avoids hanging the kernel on a stalled connection, and
# raise_for_status() stops an HTTP error page from being used as training text.
response = requests.get(url, timeout=30)
response.raise_for_status()
shakespeare_text = response.text

# One sample per non-blank line; blank lines would tokenise to pure padding.
text_lines = [line for line in shakespeare_text.splitlines() if line.strip()]

dataset = Dataset.from_dict({"text": text_lines})


def tokenize_function(examples):
    """Tokenise a batch of lines, padding / truncating each to 128 tokens.

    Args:
        examples: Batch mapping with a ``"text"`` list of strings.

    Returns:
        The tokenizer output for the batch.
    """
    # `return_tensors` is omitted on purpose: `Dataset.map` stores the
    # result as columns either way, so requesting tensors here adds nothing.
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=128,
    )


tokenized_datasets = dataset.map(tokenize_function, batched=True)

train_dataloader = DataLoader(tokenized_datasets, batch_size=4, shuffle=True)

NameError: name 'requests' is not defined

In [10]:
# Move the adapter-wrapped model to the device selected at the top of the notebook.
finetuned_model = finetuned_model.to(device)

In [13]:
# Sanity check: print only the first six batches (indices 0-5), then stop.
for i, batch in zip(range(6), train_dataloader):
    print(f"Batch {i}: {batch}")

Batch 0: {'text': [" Silver Bullet is a western @-@ themed steel inverted roller coaster designed by Bolliger & Mabillard located at Knott 's Berry Farm , an amusement park in Buena Park , California . The $ 16 million roller coaster was announced on December 1 , 2003 and opened on December 7 , 2004 . A first rider auction was also held where people would bid on seats to be the first riders . The track is approximately 3 @,@ 125 feet ( 952 m ) long and the lift hill is about 146 feet ( 45 m ) tall . The ride lasts two minutes and thirty seconds and features six inversions including a vertical loop , cobra roll , zero @-@ g roll , and two corkscrews . \n", " On reading Murray 's description in his official despatch covering the battle , and reprinted in a Paris edition of the ' Daily Mail ' , Chauvel wrote to his wife on 3 December 1916 @,@  \n", " Zartan invites the world leaders to a summit at historic Fort Sumter , where he blackmails them into disabling their nuclear arsenals , and 

In [None]:
def _as_batch_first(column):
    """Return a ``(batch, seq_len)`` tensor for one collated batch column.

    The default collate function turns a column of equal-length Python lists
    into a list of ``seq_len`` tensors, each of shape ``(batch,)``. Stacking
    those on dim 1 restores the batch-first layout; stacking on dim 0 would
    yield the transpose. A column that is already a tensor is returned as is.
    """
    if isinstance(column, torch.Tensor):
        return column
    return torch.stack(list(column), dim=1)


# Only hand the optimizer parameters that can actually receive gradients.
trainable_params = [p for p in finetuned_model.parameters() if p.requires_grad]
optimizer = AdamW(trainable_params, lr=5e-5)

finetuned_model.train()
num_epochs = 2

for epoch in range(num_epochs):
    running_loss = 0.0
    num_steps = 0

    for batch in train_dataloader:
        input_ids = _as_batch_first(batch["input_ids"]).to(device)
        attention_mask = _as_batch_first(batch["attention_mask"]).to(device)

        # Exclude padding positions from the loss; -100 is the index the
        # loss ignores. Without this, padding dominates the objective.
        labels = input_ids.masked_fill(attention_mask == 0, -100)

        # The causal-LM loss predicts token t+1 from token t, so only labels
        # from position 1 onward count. Skip batches with none of them:
        # their mean loss would be NaN and would corrupt the weights.
        if not (labels[:, 1:] != -100).any():
            continue

        optimizer.zero_grad()
        outputs = finetuned_model(
            input_ids, attention_mask=attention_mask, labels=labels
        )
        loss = outputs.loss

        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        num_steps += 1

    # Report the epoch mean rather than the last batch's loss; max() guards
    # against an empty loader.
    mean_loss = running_loss / max(num_steps, 1)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {mean_loss}")

finetuned_model.save_pretrained("fine_tuned_lora_gpt_neo")
tokenizer.save_pretrained("fine_tuned_lora_gpt_neo")

KeyboardInterrupt: 