# Example of using NorMistral on a GPU with low VRAM

## Initialize the environment

In [1]:
!pip install bitsandbytes
!pip install accelerate

import torch
from google.colab import output
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Wipe this cell's output once setup has finished, so the installation log
# does not clutter the notebook.
output.clear()

## Load the model

The model is loaded with 8-bit quantization, which trades some performance for lower memory usage.

In [2]:
# Checkpoint to load; in Colab the `@param` annotation renders this as a dropdown.
model_name = "norallm/normistral-7b-warm" # @param ["norallm/normistral-7b-warm", "norallm/normistral-7b-scratch", "norallm/norbloom-7b-scratch"]

In [3]:
# Tokenizer that belongs to the selected checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Request 8-bit weights through an explicit quantization config: passing
# `load_in_8bit=True` straight to `from_pretrained` is deprecated in recent
# transformers releases in favour of `quantization_config`.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map='auto',
    low_cpu_mem_usage=True,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    torch_dtype=torch.bfloat16
)

# Remove the download/loading progress output from the cell.
output.clear()

## Define a prompt for zero-shot machine translation

In [16]:
# Zero-shot translation template: the first line names the source language and
# carries the text to translate; the second line names the target language and
# is left open for the model to complete.
prompt = (
    "{source_language}: {source_text}\n"
    "{target_language}:"
)

# A function that will take care of generating the output
def generate(input_dict):
    """Fill the zero-shot prompt and return the model's continuation.

    Args:
        input_dict: Mapping that supplies the fields referenced by the
            module-level `prompt` template (`source_language`,
            `source_text` and `target_language`).

    Returns:
        The decoded newly generated tokens (the prompt tokens are sliced
        off), with leading and trailing whitespace stripped.

    Raises:
        KeyError: If `input_dict` lacks a field used by `prompt`.

    Side effects:
        Clears the current Colab cell output via `output.clear()`.
    """
    text = prompt.format(**input_dict)

    # Move the inputs to wherever the model was placed instead of hard-coding
    # CUDA; the model is loaded with `device_map='auto'`.
    encoded = tokenizer(text, return_tensors='pt').to(model.device)
    input_ids = encoded.input_ids

    prediction = model.generate(
        input_ids,
        # Forward the tokenizer's mask (if it produced one) rather than
        # letting `generate` guess it from the input ids.
        attention_mask=encoded.get('attention_mask'),
        max_new_tokens=256,
        do_sample=False,
        # Generation stops at any token id obtained by tokenizing a newline,
        # i.e. the output is limited to a single line.
        eos_token_id=tokenizer('\n').input_ids
    )

    # Keep only the tokens produced after the prompt.
    decoded_prediction = tokenizer.decode(prediction[0, input_ids.size(1):]).strip()
    output.clear()

    return decoded_prediction

## Translate!

In [28]:
# Translation request; in Colab the `@param` annotations render these as form fields.
source_text = "Can NorMistral still translate, even when quantized in Google Colab?" # @param {type:"string"}
source_language = "Engelsk" # @param ["Engelsk", "Bokmål", "Nynorsk"]
target_language = "Bokmål" # @param ["Engelsk", "Bokmål", "Nynorsk"]

In [29]:
# Run the translation for the values chosen in the form, then show the result.
output_text = generate(dict(
    source_text=source_text,
    source_language=source_language,
    target_language=target_language,
))

print(output_text)

Kan NorMistral fortsatt oversette, selv når kvantisert i Google Colab?
