In [1]:
from transformers import pipeline
import torch

# Model identifier handed to `pipeline(model=...)` by ask_mistral.
MODEL = "mistralai/Mistral-7B-Instruct-v0.3"

# Pipelines already built in this kernel session, keyed by model id, so the
# model weights are loaded at most once instead of on every question.
_GENERATOR_CACHE = {}


def _get_generator(model_id: str):
    """Return the text-generation pipeline for `model_id`, building it on first use."""
    generator = _GENERATOR_CACHE.get(model_id)
    if generator is None:
        # Use torch.float16 for efficiency on most GPUs
        # Use device_map="auto" to automatically utilize the GPU(s)
        generator = pipeline(
            "text-generation",
            model=model_id,
            torch_dtype=torch.float16,
            device_map="auto",
        )
        _GENERATOR_CACHE[model_id] = generator
    return generator


def ask_mistral(question: str) -> str:
    """Ask the model named by `MODEL` a question and return its short answer.

    The question is wrapped in an instruction asking for a short, direct
    answer and then in the `[INST] ... [/INST]` prompt template. The pipeline
    is created on the first call and reused afterwards, so only the first
    call pays the model-loading cost.

    Args:
        question: The question to put to the model.

    Returns:
        The generated answer with surrounding whitespace removed. If loading
        the model or generating fails, the string
        ``"An error occurred: <message>"`` is returned instead of raising.
    """
    try:
        user_prompt = f"Please provide a short, direct answer to the following question: {question}"

        # Apply the Mistral prompt template
        # NOTE(review): the tokenizer may already prepend its own BOS token, in
        # which case the literal "<s>" here duplicates it - confirm against the
        # model card before changing the template.
        full_prompt = f"<s>[INST] {user_prompt} [/INST]"

        generator = _get_generator(MODEL)

        # Ask the model
        response = generator(
            full_prompt,
            max_new_tokens=100,  # Use max_new_tokens for the generated output
            num_return_sequences=1,
            do_sample=True,  # sampling: answers may differ between runs
            top_k=50,
            top_p=0.95,
            return_full_text=False,  # ask for the completion only, not prompt + completion
        )

        generated_text = response[0]['generated_text']
        # Fallback in case the prompt is echoed anyway: drop it only as a
        # leading prefix, so an answer that quotes the prompt is left intact.
        if generated_text.startswith(full_prompt):
            generated_text = generated_text[len(full_prompt):]

        return generated_text.strip()

    except Exception as e:
        # Best-effort behaviour kept for existing callers: report the failure
        # as text rather than raising.
        return f"An error occurred: {e}"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
%%time

# Smoke test: send one factual question through ask_mistral and print the reply.
# The cell is timed as a whole, so the figure includes any model loading that
# ask_mistral performs as well as the generation itself.
my_question = "What is the capital of France?"
answer = ask_mistral(my_question)
print(answer)

Loading checkpoint shards: 100%|██████████| 3/3 [00:08<00:00,  2.86s/it]
Some parameters are on the meta device because they were offloaded to the cpu.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  attn_output = torch.nn.functional.scaled_dot_product_attention(


The capital of France is Paris.
CPU times: total: 1min 15s
Wall time: 48.4 s
