In [1]:
!git clone https://github.com/shredder67/svd-lorafa

Cloning into 'svd-lorafa'...
remote: Enumerating objects: 64, done.[K
remote: Counting objects: 100% (64/64), done.[K
remote: Compressing objects: 100% (49/49), done.[K
remote: Total 64 (delta 27), reused 43 (delta 13), pack-reused 0[K
Receiving objects: 100% (64/64), 306.30 KiB | 3.33 MiB/s, done.
Resolving deltas: 100% (27/27), done.


In [2]:
import csv
import time
import json
import warnings
from functools import partial
warnings.filterwarnings('always')

import torch
import numpy as np
import pandas as pd
torch.manual_seed(0)
from torch import nn
import matplotlib.pyplot as plt
from tqdm.notebook import trange, tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaModel, RobertaForSequenceClassification

from lora import *
from datasets import GLUEDatasetRoberta

[1;30;43mStreaming output truncated to the last 5000 lines.[0m


We will follow LoRA-FA experiment setting, where in paper they state that they apply lora to all 4 linear layers in MultiHeadAttention (see section 2.2): 3 layers that produce $Q$, $K$ and $V$ and also final linear layer weights

<center>
<img src="imgs/mha.png" width=300 height=400/>
</center>

In [3]:
def construct_lorafa_config(model, rank, init_method='svd'):
    """Each layer has a unique name within module hierarchy, so we can identify
    them for lora parametrization"""
    config = {}
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear) and any([name.split('.')[-1] == n for n in ['dense', 'query', 'key', 'value',]]) and '.attention.' in name:
            config[name] = {
                nn.Linear: {
                    "weight": partial(
                        LoRAFAParametrization.from_linear,
                        rank=rank,
                        init_method=init_method, # set svd as initiazliation method
                        original_weights=module.weight # pass weights for svd init
                    ),
                }
            }
    return config

  and should_run_async(code)


# Fine-tuning stage

## MRPC

### Prepare model

In [4]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

  and should_run_async(code)


In [5]:
def freeze_nonlora(model):
    for name, parameters in model.named_parameters():
        if not name_is_lora(name) and not 'classifier' in name: # don't want to freeze last original layer too
            parameters.requires_grad = False

  and should_run_async(code)


In [6]:
def get_hot_parameters(model):
    for _, params in model.named_parameters():
            if params.requires_grad:
                 yield params

  and should_run_async(code)


### Prepare data

In [7]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

  and should_run_async(code)


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]



In [11]:
!unzip RTE.zip

Archive:  RTE.zip
   creating: RTE/
  inflating: RTE/dev.tsv             
  inflating: RTE/test.tsv            
  inflating: RTE/train.tsv           


  and should_run_async(code)


In [12]:
data_dir = 'RTE'
train_file = data_dir + '/train.tsv'
test_file = data_dir + '/dev.tsv'

  and should_run_async(code)


In [13]:
train_dataset = GLUEDatasetRoberta(train_file, tokenizer, benchmark='rte')
test_dataset = GLUEDatasetRoberta(test_file, tokenizer, benchmark='rte')

  and should_run_async(code)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.csv[label_column][self.csv[label_column].str.contains('not')] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.csv[label_column][self.csv[label_column] != 0] = 1
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some to

In [14]:
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size,)

  and should_run_async(code)


### Train

In [15]:
# HYPER PARAMETERS
rank = 2
n_epoch = 30
warmup_ratio = 0.06
n_steps = len(train_loader) * n_epoch
warmup_steps = warmup_ratio * n_steps
criteria = nn.CrossEntropyLoss()

def lr_lambda(current_step):
    if current_step <= warmup_steps:
        return (current_step + 1) / max(1, warmup_steps)
    else:
        return (n_steps - current_step) / (max(1, n_steps - warmup_steps))


  and should_run_async(code)


In [16]:
def train_epoch(model, optimizer, scheduler, pbar, mode_train=True):
    step_loss = []
    if mode_train:
        model.train()
        loader = train_loader
    else:
        model.eval()
        total = 0
        correct = 0
        loader = test_loader

    for input_ids, attention_mask, label in loader: # training
        output = model(input_ids.to(device), attention_mask.to(device))
        loss = criteria(output.logits, label.to(device))

        if mode_train:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()
        else:
            _, preds = torch.max(output.logits, 1)
            total += label.size(0)
            correct += (preds.detach().cpu() == label).sum().item()

        step_loss.append(loss.item())
        pbar.update()

    if mode_train:
        return np.mean(step_loss)
    else:
        return np.mean(step_loss), correct / total

  and should_run_async(code)


In [17]:
def train(model, optimizer, scheduler):

    pbar = tqdm(total=n_steps, desc='Training', position=0)
    pbar_test = tqdm(total=len(test_loader), desc='Validating', position=1)

    training_history= {'train_loss': [],
                    'val_loss': [],
                    'val_acc': [],}

    for epoch in range(n_epoch):
        if epoch != 0: pbar_test.update(-pbar_test.total) # for refreshing pbar

        train_loss = train_epoch(model, optimizer, scheduler, pbar)
        val_loss, val_acc = train_epoch(model, optimizer, scheduler, pbar_test, mode_train=False)

        # LOGGING
        training_history['val_loss'].append(val_loss)
        training_history['val_acc'].append(val_acc)
        training_history['train_loss'].append(train_loss)

        pbar_test.set_postfix({'val_loss': val_loss, 'val_acc': val_acc, 'max_acc': max(training_history['val_acc'])})
        pbar.set_postfix({'train_loss': train_loss})


    pbar.close()
    pbar_test.close()

    return training_history

  and should_run_async(code)


In [21]:
#comparisson = {}
for lr in [5e-5, 7e-5, 1e-4, 4e-4, 5e-3]:
    for rank in [8, 4, 2, 1]:
        if lr == 5e-5 and rank == 8:
          continue

        comparisson[(lr, rank)] = {}
        for init_method in ['kaiming', 'svd']:
            # PREPARE MODEL
            roberta = RobertaForSequenceClassification.from_pretrained('roberta-base')
            lora_roberta_config = construct_lorafa_config(roberta, rank=rank, init_method=init_method) # kaiming, svd
            add_lora_by_layer_names(roberta, lora_roberta_config)
            freeze_nonlora(roberta)
            roberta = roberta.to(device)
            parameters = [{"params": list(get_hot_parameters(roberta))}]

            # TRAIN
            optimizer = torch.optim.AdamW(parameters, lr=lr)
            scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lr_lambda)
            comparisson[(lr, rank)][init_method] = train(roberta, optimizer, scheduler)
            torch.cuda.empty_cache()

        with open(f'logs/log_{lr}_{rank}.json', 'w') as f:
            json.dump(comparisson[(lr, rank)], f)

with open(f'logs/full_logs.json', 'w') as f:
    json.dump({str(key): value for key, value in comparisson.items()}, f, indent=2)

  and should_run_async(code)
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training:   0%|          | 0/1170 [00:00<?, ?it/s]

Validating:   0%|          | 0/5 [00:00<?, ?it/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training:   0%|          | 0/1170 [00:00<?, ?it/s]

Validating:   0%|          | 0/5 [00:00<?, ?it/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training:   0%|          | 0/1170 [00:00<?, ?it/s]

Validating:   0%|          | 0/5 [00:00<?, ?it/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training:   0%|          | 0/1170 [00:00<?, ?it/s]

Validating:   0%|          | 0/5 [00:00<?, ?it/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training:   0%|          | 0/1170 [00:00<?, ?it/s]

Validating:   0%|          | 0/5 [00:00<?, ?it/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training:   0%|          | 0/1170 [00:00<?, ?it/s]

Validating:   0%|          | 0/5 [00:00<?, ?it/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training:   0%|          | 0/1170 [00:00<?, ?it/s]

Validating:   0%|          | 0/5 [00:00<?, ?it/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training:   0%|          | 0/1170 [00:00<?, ?it/s]

Validating:   0%|          | 0/5 [00:00<?, ?it/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training:   0%|          | 0/1170 [00:00<?, ?it/s]

Validating:   0%|          | 0/5 [00:00<?, ?it/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training:   0%|          | 0/1170 [00:00<?, ?it/s]

Validating:   0%|          | 0/5 [00:00<?, ?it/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training:   0%|          | 0/1170 [00:00<?, ?it/s]

Validating:   0%|          | 0/5 [00:00<?, ?it/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training:   0%|          | 0/1170 [00:00<?, ?it/s]

Validating:   0%|          | 0/5 [00:00<?, ?it/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training:   0%|          | 0/1170 [00:00<?, ?it/s]

Validating:   0%|          | 0/5 [00:00<?, ?it/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training:   0%|          | 0/1170 [00:00<?, ?it/s]

Validating:   0%|          | 0/5 [00:00<?, ?it/s]

KeyboardInterrupt: ignored

In [20]:
with open(f'logs/log_5e-05_8.json', 'w') as f:
  json.dump(comparisson[(5e-05, 8)], f)

  and should_run_async(code)


In [19]:
comparisson

  and should_run_async(code)


{(5e-05,
  8): {'kaiming': {'train_loss': [0.6952385688439394,
    0.6944346657166114,
    0.6935329284423437,
    0.6926953593889872,
    0.6893078318009009,
    0.6907015886062231,
    0.6904896421310229,
    0.6896107517755948,
    0.6908401541220837,
    0.6900820548717792,
    0.688371601777199,
    0.6917754457547114,
    0.688102275897295,
    0.6892273426055908,
    0.6879023175973159,
    0.6861430177321801,
    0.687669419325315,
    0.685344703686543,
    0.685987073641557,
    0.682884462368794,
    0.6842786364066296,
    0.6843553949625064,
    0.6847552611277654,
    0.6849426749425057,
    0.6835525402655969,
    0.6840199109835502,
    0.6833791045042185,
    0.6811878451934228,
    0.6843496454067719,
    0.6830343588804587],
   'val_loss': [0.6920580625534057,
    0.6914636492729187,
    0.6950017094612122,
    0.692646610736847,
    0.6898861289024353,
    0.6930441856384277,
    0.691651451587677,
    0.6924391746520996,
    0.693200159072876,
    0.690669822692871

In [None]:
assert False

In [None]:
with open(f'logs/log_7e-05_1.json', 'r') as f:
    comparisson = json.load(f)

In [None]:
plt.plot(comparisson['kaiming']['train_loss'], label='kaiming')
plt.plot(comparisson['svd']['train_loss'], label='svd')
plt.title('train_loss')
plt.xlabel('epoch')
plt.legend()
plt.show()

In [None]:
plt.plot(comparisson['kaiming']['val_loss'], label='kaiming')
plt.plot(comparisson['svd']['val_loss'], label='svd')
plt.title('val_loss')
plt.xlabel('epoch')
plt.legend()
plt.show()

In [None]:
plt.plot(comparisson['kaiming']['val_acc'], label='kaiming')
plt.plot(comparisson['svd']['val_acc'], label='svd')
plt.title('val_acc')
plt.xlabel('epoch')
plt.legend()
plt.show()