In [1]:
import torch
import torch.nn as nn

class LoRALayer(nn.Module):
    def __init__(self, weight, r, alpha):
        super(LoRALayer, self).__init__()
        self.weight = weight
        self.weight.requires_grad = False
        self.r = r
        self.alpha = alpha
        out_features = self.weight.shape[0]
        in_features = self.weight.shape[1]
        self.A = nn.Parameter(self.weight.new_zeros(self.r, in_features))
        self.B = nn.Parameter(self.weight.new_zeros(out_features, r))
    
    def forward(self, x):
        result = x @ self.weight.T
        result += x @ (self.A.T @ self.B.T)
        return result

In [2]:
class FFN(nn.Module):
    def __init__(self, in_channels, hidden_dim, out_channels):
        super(FFN, self).__init__()
        self.linear1 = nn.Linear(in_channels, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, out_channels)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, x):
        x = self.linear1(x)
        x = self.relu(x)
        x = self.linear2(x)
        return self.sigmoid(x)

In [3]:
from torch.utils.data import DataLoader, TensorDataset

ffn = FFN(2, 16, 1)
x_xor = torch.tensor([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=torch.float32)
y_xor = torch.tensor([[0], [1], [1], [0]], dtype=torch.float32)

dataset_xor = TensorDataset(x_xor, y_xor)
dataloader_xor = DataLoader(dataset_xor, batch_size=1, shuffle=True)

def train_xor_model(model, dataloader):
    criterion = torch.nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    for epoch in range(400):
        for inputs, labels in dataloader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
        if (epoch + 1) % 10 == 0:
            print(f"Epoch {epoch + 1}, Loss: {loss.item()}")

def validate_xor_model(model, dataloader):
    model.eval()  # Set the model to evaluation mode
    with torch.no_grad():  # No gradients needed for predictions
        for inputs, labels in dataloader:
            outputs = model(inputs)
            print(f"Input: {inputs.numpy()}, Predicted: {outputs.numpy()}")

train_xor_model(ffn, dataloader_xor)
validate_xor_model(ffn, dataloader_xor)


  from .autonotebook import tqdm as notebook_tqdm


Epoch 10, Loss: 0.27170369029045105
Epoch 20, Loss: 0.24714870750904083
Epoch 30, Loss: 0.22865456342697144
Epoch 40, Loss: 0.0805884301662445
Epoch 50, Loss: 0.16461710631847382
Epoch 60, Loss: 0.12893199920654297
Epoch 70, Loss: 0.1552349179983139
Epoch 80, Loss: 0.10050231963396072
Epoch 90, Loss: 0.022541837766766548
Epoch 100, Loss: 0.040064822882413864
Epoch 110, Loss: 0.01124963816255331
Epoch 120, Loss: 0.017011670395731926
Epoch 130, Loss: 0.005818037781864405
Epoch 140, Loss: 0.011968064121901989
Epoch 150, Loss: 0.005529030226171017
Epoch 160, Loss: 0.007970748469233513
Epoch 170, Loss: 0.002493655076250434
Epoch 180, Loss: 0.0033821426331996918
Epoch 190, Loss: 0.0031717221718281507
Epoch 200, Loss: 0.0015826133312657475
Epoch 210, Loss: 0.002305325586348772
Epoch 220, Loss: 0.0033826909493654966
Epoch 230, Loss: 0.0030965874902904034
Epoch 240, Loss: 0.0009779471438378096
Epoch 250, Loss: 0.0014687471557408571
Epoch 260, Loss: 0.0022783263120800257
Epoch 270, Loss: 0.00073

In [4]:
ffn_weight = ffn.linear1.weight.detach().clone()
lora_layer = LoRALayer(ffn_weight, 1, 0.1)
setattr(ffn, 'linear1', lora_layer)

y_or = torch.tensor([[0], [1], [1], [1]], dtype=torch.float32)

dataset_xor = TensorDataset(x_xor, y_or)
dataloader_xor = DataLoader(dataset_xor, batch_size=1, shuffle=True)

train_xor_model(ffn, dataloader_xor)
validate_xor_model(ffn, dataloader_xor)

Epoch 10, Loss: 0.0014894329942762852
Epoch 20, Loss: 0.02709232084453106
Epoch 30, Loss: 0.013034832663834095
Epoch 40, Loss: 0.6437981128692627
Epoch 50, Loss: 0.24719078838825226
Epoch 60, Loss: 0.09623385965824127
Epoch 70, Loss: 5.1869765371748144e-08
Epoch 80, Loss: 3.4163608830795056e-08
Epoch 90, Loss: 0.17678774893283844
Epoch 100, Loss: 0.023231452330946922
Epoch 110, Loss: 0.00125793123152107
Epoch 120, Loss: 0.01699119806289673
Epoch 130, Loss: 1.4324768926599063e-08
Epoch 140, Loss: 0.09364911168813705
Epoch 150, Loss: 0.001319304807111621
Epoch 160, Loss: 0.0013102362863719463
Epoch 170, Loss: 0.06639643013477325
Epoch 180, Loss: 0.008611533790826797
Epoch 190, Loss: 8.036295184865594e-09
Epoch 200, Loss: 0.048913225531578064
Epoch 210, Loss: 0.044607315212488174
Epoch 220, Loss: 6.436508215301728e-09
Epoch 230, Loss: 0.037336550652980804
Epoch 240, Loss: 0.03434690460562706
Epoch 250, Loss: 5.235975208961463e-09
Epoch 260, Loss: 0.029291078448295593
Epoch 270, Loss: 4.64

In [5]:
ffn.modules

<bound method Module.modules of FFN(
  (linear1): LoRALayer()
  (linear2): Linear(in_features=16, out_features=1, bias=True)
  (relu): ReLU()
  (sigmoid): Sigmoid()
)>

In [8]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

In [9]:
import torch.nn as nn
from transformers.pytorch_utils import Conv1D

class LoRAConv1D(nn.Module):
    def __init__(self, weight, bias, r, alpha):
        super(LoRAConv1D, self).__init__()
        self.nx, self.nf = weight.shape
        self.weight = weight
        self.weight.requires_grad = False
        self.bias = bias
        self.r = r
        self.alpha = alpha
        self.A = nn.Parameter(self.weight.new_zeros(self.r, self.nx))
        self.B = nn.Parameter(self.weight.new_zeros(self.nf, self.r))
    
    def forward(self, x):
        size_out = x.size()[:-1] + (self.nf,)
        result = torch.addmm(self.bias, x.view(-1, x.size(-1)), self.weight)
        low_rank = self.B @ self.A
        result += x.view(-1, x.size(-1)) @ low_rank.T
        result = result.view(size_out)
        return result

In [10]:
#replace all the attention layers in model with LoRA layers
r = 2
alpha = 0
for name, module in model.named_modules():
    if isinstance(module, Conv1D) and "c_attn" in str(name):
        lora_layer = LoRAConv1D(module.weight, module.bias, r, alpha)
        # Replace the module directly in the parent's _modules dictionary
        parent_name, child_name = name.rsplit('.', 1)
        parent_module = dict(model.named_modules())[parent_name]
        parent_module._modules[child_name] = lora_layer

In [11]:
print(model)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): LoRAConv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)


In [12]:
for _, param in model.named_parameters():
    param.requires_grad = False

for name, module in model.named_modules():
    if isinstance(module, LoRAConv1D):
        for param in module.parameters():
            param.requires_grad = True

for name, param in model.named_parameters():
    if "attn.c_attn" in name: assert param.requires_grad == True
    else: assert param.requires_grad == False


In [13]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, AdamW, get_scheduler
from torch.utils.data import DataLoader, TensorDataset
import torch
from torch.cuda.amp import GradScaler, autocast
import tqdm
import torch.optim as optim

# Load dataset
dataset = load_dataset('wikitext', 'wikitext-2-raw-v1')
texts = dataset['train']['text']  # Using a small slice for quick training

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Tokenize data
encodings = tokenizer(texts, truncation=True, padding=True, max_length=256, return_tensors="pt")

# Setup device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device).half()

# Prepare data for training
input_ids = encodings['input_ids']
attention_mask = encodings['attention_mask']
dataset = TensorDataset(input_ids, attention_mask)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

# Optimizer and learning rate scheduler
optimizer = optim.AdamW(model.parameters(), lr=5e-5)
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=len(dataloader)*3)

# Setup for mixed-precision training
scaler = GradScaler()

# Training loop
model.train()
progress_bar = tqdm.tqdm(range(len(dataloader) * 5), desc="Training")
for epoch in range(5):  # 3 epochs
    for batch in dataloader:
        optimizer.zero_grad()
        
        input_ids, attention_mask = batch[0].to(device), batch[1].to(device)

        with autocast():
            outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
            loss = outputs.loss
        
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        #scheduler.step()

        progress_bar.update(1)
        progress_bar.set_postfix(loss=loss.item())

progress_bar.close()
print(f"Final loss: {loss.item()}")

Training:   0%|          | 0/6885 [00:47<?, ?it/s]]


ValueError: Attempting to unscale FP16 gradients.

In [None]:
torch.save(model, f"./model-r{r}-16b-256.pt")

In [None]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
#model = GPT2LMHeadModel.from_pretrained('gpt2')
model = torch.load("./model.pt")

In [None]:
start_sentence = "Tea is an aromatic beverage prepared by pouring hot or boiling water over cured or fresh leaves of Camellia sinensis, an evergreen shrub native to East Asia which probably originated in the borderlands of southwestern China and northern Myanmar."
inputs = tokenizer(start_sentence, truncation=True, max_length=256, return_tensors="pt")
inputs = {key: value.to(model.device) for key, value in inputs.items()}
model.eval()
output_sequences = model.generate(
    input_ids=inputs['input_ids'],  # Input tokens
    attention_mask=inputs['attention_mask'],  # Attention masks
    max_new_tokens=250,  # Specifies the maximum length of the sequence to be generated
    num_return_sequences=1,  # Number of sequences to generate
    temperature=1.0,  # Sampling temperature
    no_repeat_ngram_size=2,  # Prevents repeating n-grams
    top_p=0.92,  # Nucleus sampling
    top_k=0  # Top-k sampling
)
generated_text = tokenizer.decode(output_sequences[0], skip_special_tokens=True)

In [12]:
print(generated_text)

Tea is an aromatic beverage prepared by pouring hot or boiling water over cured or fresh leaves of Camellia sinensis, an evergreen shrub native to East Asia which probably originated in the borderlands of southwestern China and northern Myanmar. It is a popular beverage in China, and is often used as a tea in Chinese tea shops.

The Camella sinus is the most common plant in Asia, with a total of 1,000 species. The plant is found in many


In [21]:
print(generated_text)

Tea is an aromatic beverage prepared by pouring hot or boiling water over cured or fresh leaves of Camellia sinensis, an evergreen shrub native to East Asia which probably originated in the borderlands of southwestern China and northern Myanmar. It is also used as a flavoring in tea. 



In [13]:
# Setup device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
!nvidia-smi

AttributeError: 'NoneType' object has no attribute 'to'

In [14]:
import gc
model = None
gc.collect()
torch.cuda.empty_cache()

!nvidia-smi

Sun Jun  9 13:40:40 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.129.03             Driver Version: 535.129.03   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       On  | 00000000:0D:00.0 Off |                    0 |
| N/A   61C    P0              33W /  70W |    669MiB / 15360MiB |      7%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [13]:
import numpy as np
import torch.nn.functional as F
dataset = load_dataset("cais/mmlu")
choices = ['A', 'B', 'C', 'D']

def format_subject(subject):
    return ' '.join(subject.split('_'))

def format_example(df, idx, choices, include_answer=True):
    prompt = df['question'][idx]
    for j, choice in enumerate(choices):
        prompt += f"\n{j+1}. {df[choice][idx]}"
    if include_answer:
        prompt += f"\nAnswer: {df['correct_answer'][idx]}\n\n"
    return prompt

def gen_prompt(df, subject, n_examples=-1):
    subject_formatted = format_subject(subject)
    prompt = f"The following are multiple choice questions (with answers) about {subject_formatted}.\n\n"
    max_examples = df.shape[0] if n_examples == -1 else n_examples
    for i in range(max_examples):
        prompt += format_example(df, i, ['choice1', 'choice2', 'choice3', 'choice4'], include_answer=True)
    return prompt

# Evaluation function
@torch.no_grad()
def evaluate(model, tokenizer, dev_df, test_df, subject, num_train_examples):
    cors = []
    all_probs = []

    for i in range(len(test_df)):
        train_prompt = gen_prompt(dev_df, subject, num_train_examples)
        prompt_end = format_example(test_df, i, ['choice1', 'choice2', 'choice3', 'choice4'], include_answer=False)
        prompt = train_prompt + prompt_end
        
        # Tokenize and ensure input length is within the model's limits
        input_ids = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=512).input_ids.to("cuda")
        
        # Generate logits
        outputs = model(input_ids=input_ids)
        logits = outputs.logits
        
        probs = (
            F.softmax(
                torch.tensor(
                    [
                        logits[tokenizer("A").input_ids[0]],
                        logits[tokenizer("B").input_ids[0]],
                        logits[tokenizer("C").input_ids[0]],
                        logits[tokenizer("D").input_ids[0]],
                    ]
                ),
                dim=0,
            )
            .detach()
            .cpu()
            .numpy()
        )
        pred = {0: "A", 1: "B", 2: "C", 3: "D"}[np.argmax(probs)]
        pred = np.argmax(probs)

        # Check if the prediction is correct
        correct = test_df['correct_answer'][i]
        cor = pred == correct
        cors.append(cor)
        all_probs.append(probs)

    accuracy = np.mean(cors)
    print(f"Average accuracy: {accuracy:.3f} - Subject: {subject}")

    return np.array(cors), accuracy, np.array(all_probs)

# Example usage
dev_df = dataset['validation']
test_df = dataset['test']
subject = 'subject_name_here'  # Replace with actual subject
num_train_examples = 5  # Number of training examples to include in each prompt

# Evaluate model
results = evaluate(model, tokenizer, dev_df, test_df, subject, num_train_examples)

Downloading data: 100%|██████████| 3.50M/3.50M [00:00<00:00, 13.8MB/s]
Downloading data: 100%|██████████| 408k/408k [00:00<00:00, 2.52MB/s]
Downloading data: 100%|██████████| 76.5k/76.5k [00:00<00:00, 517kB/s]
Downloading data: 100%|██████████| 47.5M/47.5M [00:00<00:00, 110MB/s] 
Generating test split: 100%|██████████| 14042/14042 [00:00<00:00, 768671.99 examples/s]
Generating validation split: 100%|██████████| 1531/1531 [00:00<00:00, 557759.00 examples/s]
Generating dev split: 100%|██████████| 285/285 [00:00<00:00, 191015.76 examples/s]
Generating auxiliary_train split: 100%|██████████| 99842/99842 [00:00<00:00, 386056.02 examples/s]

1531



