### 20e December

# LLM finetuning

In [1]:
# Fine-tune (or load a cached) BERT sentiment classifier on IMDB reviews and
# score one sample review. Needs `keras_hub` and `tensorflow_datasets`
# installed in the kernel environment.
import os

# The backend has to be chosen BEFORE keras is imported; setting the variable
# after the import has no effect on the already-initialised package.
os.environ['KERAS_BACKEND'] = 'tensorflow'

import keras
import keras_hub
import numpy as np
import tensorflow_datasets as tfds

# Path of the cached fine-tuned model; training is skipped when it exists.
MODEL_PATH = 'imdb_sentiment.keras'

if not os.path.exists(MODEL_PATH):
    # Pretrained BERT backbone with a 2-class softmax head.
    classifier = keras_hub.models.BertClassifier(
        'bert_base_en_uncased',
        activation='softmax',
        num_classes=2,
    )
    # (text, label) pairs, already batched by tfds.
    imdb_train, imdb_test = tfds.load(
        'imdb_reviews',
        split=['train', 'test'],
        as_supervised=True,
        batch_size=16,
    )
    # Persist the model to MODEL_PATH; with save_best_only the file is only
    # overwritten when the monitored metric improves.
    callbacks = [keras.callbacks.ModelCheckpoint(MODEL_PATH, save_best_only=True)]
    # `epochs` is not passed, so Keras' default number of epochs is used.
    classifier.fit(
        imdb_train,
        validation_data=imdb_test,
        callbacks=callbacks,
    )

# Always reload from disk so both code paths (fresh training / cached file)
# end up with the same checkpointed model.
classifier = keras.models.load_model(MODEL_PATH)

# Sample review to score (text kept exactly as in the original notebook).
review = """

Unfairly, this has been judged a lesser entry in the National Lampoon stable, though itís arguably only second to the original vacation and proves that Chevy Chase was once funny. Clarke (Chase) does his best to orchestrate a traditional happy family Christmas, but is thwarted by odious relatives and plain old bad luck.

If it doesn't make you at least giggle, then you clearly don't understand the true meaning of the festive season - which is of course combustible toilets and electrified cats.
"""

# predict() expects a batch of inputs, so the single review is wrapped in a
# list; the result then has one row of class probabilities per review.
preds = classifier.predict([review])

ModuleNotFoundError: No module named 'keras_hub'

### LoRA

- Low-Rank Adaptation
- A parameter-efficient fine-tuning technique for LLMs.
- Kan låsa parametrar som vi ändå inte använder.
- Förtränade modeller är ofta "överparametriserade" för vår önskade användning.
- Så även om vi lägger till ytterligare parametrar så sänks den totala mängden tränbara parametrar!
- Fixed rank matrisen (kommer ingen förklaring då det är lite överkurs)
- Reducerar mängden GPU-minne som krävs
- Tränar modellen snabbare (i vissa fall)
- INGEN EXTRA LATENCY!

In [None]:
## copypasta från pytorch tutorial

import torch
from torch import nn

class LoRALinear(nn.Module):
  """Linear layer with a frozen base weight plus a trainable low-rank update.

  The output is ``linear(x) + (alpha / rank) * lora_b(lora_a(dropout(x)))``.

  Args:
    in_dim: Size of the input features.
    out_dim: Size of the output features.
    rank: Inner dimension of the low-rank update (must be non-zero, since
      the update is scaled by ``alpha / rank``).
    alpha: Scaling factor for the low-rank update.
    dropout: Dropout probability applied to the input of the LoRA branch.
  """

  def __init__(
    self,
    in_dim: int,
    out_dim: int,
    rank: int,
    alpha: float,
    dropout: float
  ):
    # nn.Module must be initialised before any submodule is assigned;
    # without this call the assignments below raise AttributeError.
    super().__init__()

    # These are the weights from the original pretrained model
    self.linear = nn.Linear(in_dim, out_dim, bias=False)

    # These are the new LoRA params. In general rank << in_dim, out_dim
    self.lora_a = nn.Linear(in_dim, rank, bias=False)
    self.lora_b = nn.Linear(rank, out_dim, bias=False)

    # Rank and alpha are commonly-tuned hyperparameters
    self.rank = rank
    self.alpha = alpha

    # Most implementations also include some dropout
    self.dropout = nn.Dropout(p=dropout)

    # The original params are frozen, and only LoRA params are trainable.
    self.linear.weight.requires_grad = False
    self.lora_a.weight.requires_grad = True
    self.lora_b.weight.requires_grad = True

  def forward(self, x: torch.Tensor) -> torch.Tensor:
    """Return the frozen layer's output plus the scaled low-rank update."""
    # This would be the output of the original model
    frozen_out = self.linear(x)

    # lora_a projects inputs down to the much smaller self.rank,
    # then lora_b projects back up to the output dimension.
    # Dropout is applied only on the LoRA branch, not the frozen path.
    lora_out = self.lora_b(self.lora_a(self.dropout(x)))

    # Finally, scale by the alpha parameter (normalized by rank)
    # and add to the original model's outputs
    return frozen_out + (self.alpha / self.rank) * lora_out

In [None]:
from torchtune.models.llama2 import llama2_7b, lora_llama2_7b

# Plain Llama2 7B, with no LoRA layers attached.
base_model = llama2_7b()

# LoRA variant: lora_llama2_7b's defaults match llama2_7b, so the only thing
# to specify is which layers get LoRA.
#   - Inside each self-attention block the options are
#     "q_proj", "k_proj", "v_proj" and "output_proj".
#   - Linear layers outside self-attention can be included with
#     apply_lora_to_mlp=True or apply_lora_to_output=True.
lora_model = lora_llama2_7b(
    lora_attn_modules=["q_proj", "v_proj"],
)