# LLM tutorial

**Authors:**<br>
*David Samuel, Egil Rønningstad, Andrey Kutuzov*<br>
*University of Oslo, Language Technology Group*

In [96]:
import json
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from IPython.display import clear_output, display_markdown
import warnings
from tqdm import tqdm
import random
from typing import List

# Silence every Python warning for the rest of the session to keep the cell outputs short.
warnings.filterwarnings('ignore')
# Seed the stdlib RNG; it is used by random.sample() when few-shot demonstrations are drawn.
random.seed(42)

In [48]:
# Read by log(): while this is False, translatable messages are printed in Norwegian
# together with a model-generated English translation.
IS_NORWEGIAN_OKAY = False

## 1. Load a pretrained generative model

In [2]:
# Name of the pretrained checkpoint handed to both from_pretrained() calls below.
model_name = 'norallm/normistral-7b-warm'

# Tokenizer that belongs to the checkpoint; cache_dir="." stores downloads in the working directory.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    cache_dir="."
)
# Causal language model. NOTE(review): the loading options below (automatic device
# placement, 8-bit weights, bfloat16) depend on the installed transformers/bitsandbytes
# versions -- confirm against the environment used for the tutorial.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map='auto',
    low_cpu_mem_usage=True,
    load_in_8bit=True,
    torch_dtype=torch.bfloat16,
    cache_dir="."
)
# Wipe whatever the loading step printed from the cell output.
clear_output()

## 2. Make it generate some text

In [57]:
@torch.inference_mode()
def generate(prefix: str, max_length=64, eos_token="\n", verbose=False) -> str:
    """Continue `prefix` with the loaded model and return only the new text.

    Args:
        prefix: Text the model should continue.
        max_length: Upper bound on the number of newly generated tokens.
        eos_token: String whose token ids are passed to `model.generate` as
            `eos_token_id`; a falsy value passes `eos_token_id=None` instead.
        verbose: If true, print the prefix and the shapes of the input and
            output id tensors.

    Returns:
        The decoded continuation (without the prefix), stripped of leading and
        trailing whitespace.
    """
    if verbose:
        print(f"PREFIX: {prefix}")

    # Token ids of the prefix, shape [1, T], moved to the GPU.
    prompt_ids = tokenizer(prefix, return_tensors='pt').input_ids.cuda()
    if verbose:
        print(f"INPUT: {str(prompt_ids.shape)}")

    stop_ids = tokenizer(eos_token).input_ids if eos_token else None
    generated_ids = model.generate(
        prompt_ids,
        max_new_tokens=max_length,
        min_new_tokens=2,
        eos_token_id=stop_ids
    )
    if verbose:
        print(f"OUTPUT: {str(generated_ids.shape)}")

    # The returned sequence starts with the prefix ids; keep only what follows them.
    n_prefix_tokens = prompt_ids.size(1)
    continuation_ids = generated_ids[0, n_prefix_tokens:]
    return tokenizer.decode(continuation_ids).strip()

**TODO:** Try it with your own input!

In [32]:
input_text = "NLDL"  # TODO!

# eos_token=None: no custom stop string is passed; at most 20 new tokens are generated.
response = generate(input_text, verbose=True, max_length=20, eos_token=None)
# The response holds only the continuation, so the prefix is prepended for display.
print(f"COMPLETION: {input_text}{response}")

PREFIX: NLDL
INPUT: torch.Size([1, 2])
OUTPUT: torch.Size([1, 22])
COMPLETION: NLDL), som er en organisasjon for alle som er interessert i og jobber med lundehund.


## 3. Machine translation

Can we turn the model into something more useful?

In [34]:
# Two-line translation prompt: the source sentence labelled with its language,
# followed by the target-language label that the model is expected to complete.
translation_prompt = """{source_language}: {source_text}
{target_language}:"""


def translate(source_text: str, source_language="Bokmål", target_language="Engelsk", verbose=False) -> str:
    """Translate `source_text` by letting the model complete the translation prompt.

    Args:
        source_text: Sentence to translate.
        source_language: Language label written in front of the source text.
        target_language: Language label the model has to continue after.
        verbose: Forwarded to `generate`.

    Returns:
        Whatever `generate` returns for the filled-in prompt.
    """
    filled_prompt = translation_prompt.format(
        source_language=source_language,
        source_text=source_text,
        target_language=target_language,
    )
    translation = generate(filled_prompt, verbose=verbose)
    return translation


In [35]:
# Try the translation prompt with its default language labels; verbose=True also prints
# the prompt and the tensor shapes.
translate("Du kan også prøve en annen tekst.", verbose=True)

PREFIX: Bokmål: Du kan også prøve en annen tekst.
Engelsk:
INPUT: torch.Size([1, 15])
OUTPUT: torch.Size([1, 23])


'You can also try another text.'

<br>We can use translation to translate Norwegian logs (if needed)

In [50]:
def log(text: str, can_translate=True):
    """Print `text`, adding a model-generated English translation when wanted.

    Args:
        text: Message to print.
        can_translate: Set to False for messages that must be printed as they
            are (e.g. headers that are already in English).

    The message is printed untouched when the global `IS_NORWEGIAN_OKAY` is
    true or when `can_translate` is false. Otherwise it is printed twice: the
    original behind a "[no]" tag and the output of `translate` behind "[en]".
    """
    if IS_NORWEGIAN_OKAY or not can_translate:
        print(text)
        return

    print(f"[no] {text}")
    print(f"[en] {translate(text)}\n")

## 4. Question answering

In [44]:
# Load the question-answering samples. The context manager closes the file handle
# right after parsing, and the explicit UTF-8 encoding keeps the Norwegian characters
# intact regardless of the platform's default encoding.
with open('downsampled_data.json', 'r', encoding='utf-8') as dataset_file:
    dataset = json.load(dataset_file)

In [52]:
index = 0

# Header lines are printed as-is (can_translate=False).
log(f"EXAMPLE {index}", can_translate=False)
log(f"KEYS: {str(list(dataset[index].keys()))}\n", can_translate=False)
# Question and gold answer go through log()'s optional translation.
log(dataset[index]["question"])
log(dataset[index]["answer_text"])

EXAMPLE 0
KEYS: ['id', 'question', 'choices', 'answer_label', 'answer_text']

[no] Hva slags følelser skaper det å kjøpe gaver til andre?
[en] What kind of emotions do you get from buying gifts for others?

[no] Glede
[en] Joy



### 4.1 Simple zero-shot generation

In [67]:
# TODO: write a better prompt template
# NOTE(review): "Spormål" looks like a misspelling of "Spørsmål"; it is left as it is
# here because editing the template changes what the model is asked.
prompt = """Spormål: {question}
Svar:"""


def zero_answer(sample: dict, verbose=False):
    """Answer the sample's question with the bare prompt, without solved examples.

    Args:
        sample: Dict whose entries fill the placeholders of `prompt`; it needs
            at least a "question" key, extra keys are ignored by `str.format`.
        verbose: Forwarded to `generate`.

    Returns:
        The text `generate` produces after the filled-in prompt.
    """
    filled_prompt = prompt.format(**sample)
    return generate(filled_prompt, verbose=verbose)

In [74]:
index = 2
sample = dataset[index]

# Zero-shot prediction for one sample; verbose=True also prints the prompt and tensor shapes.
prediction = zero_answer(sample, verbose=True)

# Headers are printed as-is; question, gold answer and prediction go through
# log()'s optional translation.
log("\nQUESTION:", can_translate=False)
log(sample["question"])

log("GOLD ANSWER:", can_translate=False)
log(sample["answer_text"])

log("PREDICTION:", can_translate=False)
log(prediction)

PREFIX: Spormål: Hvis du er sulten og går for å fiske, hva er hensikten med det?
Svar:
INPUT: torch.Size([1, 23])
OUTPUT: torch.Size([1, 31])

QUESTION:
[no] Hvis du er sulten og går for å fiske, hva er hensikten med det?
[en] If you are hungry and go fishing, what is the purpose of that?

GOLD ANSWER:
[no] Å få fisk
[en] To get fish

PREDICTION:
[no] Det er for å få mat.
[en] It's for getting food.



When you're happy with the prompt, you can measure its accuracy on the whole dataset.

In [69]:
# Exact-match accuracy of the zero-shot prompt: a prediction counts as correct only when
# it equals the gold answer text after trimming whitespace and lower-casing both sides.
n_correct_predictions = 0
for sample in tqdm(dataset):
    prediction = zero_answer(sample)
    n_correct_predictions += int(
        prediction.strip().lower() == sample["answer_text"].strip().lower()
    )

log(f"ACCURACY: {n_correct_predictions / len(dataset):.2%}", can_translate=False)

100%|██████████| 16/16 [00:38<00:00,  2.42s/it]

ACCURACY: 0.00%





### 4.2 Few-shot generation

In [97]:
def few_answer(dataset: List[dict], index: int, n_shots=2, verbose=False):
    """Answer question `dataset[index]` with a prompt preceded by random solved examples.

    Args:
        dataset: All samples. Every sample must be able to fill `prompt`; the
            ones used as demonstrations also need an "answer_text" entry.
        index: Position of the sample to answer (negative positions count from
            the end). This sample is never used as a demonstration.
        n_shots: Number of demonstrations drawn without replacement from the
            remaining samples; 0 yields the plain zero-shot prompt.
        verbose: Forwarded to `generate`.

    Returns:
        The text `generate` produces after the assembled prompt.

    Raises:
        IndexError: If `index` is out of range.
        ValueError: From `random.sample` if `n_shots` is negative or larger
            than the number of remaining samples.
    """
    sample = dataset[index]

    # Normalize negative positions; with a raw negative index the slice
    # `dataset[index + 1:]` would put the queried sample among the candidates.
    position = index % len(dataset)

    # select random demonstrations
    other_samples = dataset[:position] + dataset[position + 1:]
    shots = random.sample(other_samples, n_shots)

    prompt_blocks = [
        f"{prompt.format(**shot)} {shot['answer_text']}"
        for shot in shots
    ]
    # The unanswered question always comes last.
    prompt_blocks.append(prompt.format(**sample))

    # Joining all blocks in one go avoids a leading "\n\n" when there are no demonstrations.
    input_text = '\n\n'.join(prompt_blocks)

    return generate(input_text, verbose=verbose)

In [93]:
index = 2
# Bind the sample explicitly so that the question and gold answer printed below belong
# to `index`; without this line `sample` is whatever an earlier cell left behind.
sample = dataset[index]

prediction = few_answer(dataset, index, n_shots=2, verbose=True)

log("\nQUESTION:", can_translate=False)
log(sample["question"])

log("GOLD ANSWER:", can_translate=False)
log(sample["answer_text"])

log("PREDICTION:", can_translate=False)
log(prediction)

PREFIX: Spormål: Personen har tegnet innboforsikring. Hva er det han søker?
Svar: Økonomisk trygghet

Spormål: Hvor kan man finne frossen fisk på butikken?
Svar: I frysedisken

Spormål: Hvis du er sulten og går for å fiske, hva er hensikten med det?
Svar:
INPUT: torch.Size([1, 72])
OUTPUT: torch.Size([1, 76])

QUESTION:
[no] Hvis du er sulten og går for å fiske, hva er hensikten med det?
[en] If you are hungry and go fishing, what is the purpose of that?

GOLD ANSWER:
[no] Å få fisk
[en] To get fish

PREDICTION:
[no] Å få fisk
[en] To get fish



In [95]:
# Number of solved examples placed in front of every question.
N_SHOTS = 1

# Exact-match accuracy of the few-shot prompt (whitespace-trimmed, case-insensitive).
n_correct_predictions = 0
for index in tqdm(range(len(dataset))):
    prediction = few_answer(dataset, index, N_SHOTS)
    n_correct_predictions += int(
        prediction.strip().lower() == dataset[index]["answer_text"].strip().lower()
    )

log(f"ACCURACY for {N_SHOTS} shots: {n_correct_predictions / len(dataset):.2%}", can_translate=False)

100%|██████████| 16/16 [00:13<00:00,  1.21it/s]

ACCURACY for 1 shots: 37.50%





### 4.3 Classification

In [150]:
@torch.inference_mode()
def classify(sample: dict, verbose=False):
    """Pick the answer option to which the model assigns the highest score.

    Every option is scored by the negated mean cross-entropy the model assigns
    to the tokens of the lower-cased option text when that text follows the
    filled-in prompt (i.e. a length-averaged log-probability).

    Args:
        sample: Dict with a "question" string and a "choices" mapping whose
            values are the option texts.
        verbose: If true, print the score of every option.

    Returns:
        The option text (in its original casing) with the highest score.
    """
    question = sample["question"]
    options = sample["choices"].values()

    # The prompt is identical for all options, so format and tokenize it once.
    question_text = prompt.format(question=question)
    question_ids = tokenizer(question_text, return_tensors='pt').input_ids.cuda()

    log_probabilities = {}
    for option in options:
        option_text = f" {option.lower()}"
        option_ids = tokenizer(option_text, return_tensors='pt', add_special_tokens=False).input_ids.cuda()

        # The logits at position i predict token i + 1, so the last token is never needed as input.
        input_ids = torch.cat([question_ids, option_ids], dim=1)[:, :-1]

        output_logits = model(input_ids).logits
        # Keep only the positions whose predictions correspond to the option tokens.
        output_logits = output_logits[0, -option_ids.size(1):, :]
        # cross_entropy averages over the option tokens by default; negate it to get a score
        # where higher means more likely.
        log_p = -torch.nn.functional.cross_entropy(output_logits, option_ids[0, :])

        log_probabilities[option] = log_p

    if verbose: print(log_probabilities)

    prediction = max(log_probabilities, key=log_probabilities.get)
    return prediction

In [151]:
index = 2
sample = dataset[index]

# Score every answer option of one sample; verbose=True prints the per-option scores.
prediction = classify(sample, verbose=True)

# Headers are printed as-is; question, gold answer and prediction go through
# log()'s optional translation.
log("\nQUESTION:", can_translate=False)
log(sample["question"])

log("GOLD ANSWER:", can_translate=False)
log(sample["answer_text"])

log("PREDICTION:", can_translate=False)
log(prediction)

{'Å se fisken': tensor(-5.1474, device='cuda:0'), 'Å ha det gøy': tensor(-3.3842, device='cuda:0'), 'Å få fisk': tensor(-3.0576, device='cuda:0'), 'Våte klær': tensor(-10.1656, device='cuda:0'), 'Å drepe': tensor(-5.8484, device='cuda:0')}

QUESTION:
[no] Hvis du er sulten og går for å fiske, hva er hensikten med det?
[en] If you are hungry and go fishing, what is the purpose of that?

GOLD ANSWER:
[no] Å få fisk
[en] To get fish

PREDICTION:
[no] Å få fisk
[en] To get fish



In [152]:
# Accuracy of the option-scoring classifier: the chosen option has to equal the gold
# answer text after trimming whitespace and lower-casing both sides.
n_correct_predictions = 0
for sample in tqdm(dataset):
    prediction = classify(sample)
    n_correct_predictions += int(
        prediction.strip().lower() == sample["answer_text"].strip().lower()
    )

log(f"ACCURACY: {n_correct_predictions / len(dataset):.2%}", can_translate=False)

100%|██████████| 16/16 [00:15<00:00,  1.02it/s]

ACCURACY: 68.75%





### 4.4 Extra exercises

Can we incorporate the A) B) C) D) E) answer options in the prompt?<br>
Does it help to do classification together with a few-shot prompt?