##  Inferencing on Mistral 7B LLM with 4-bit quantization 🚀


In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

  from .autonotebook import tqdm as notebook_tqdm


device(type='cuda')

In [4]:
def load_quantized_model(model_name: str):
    """
    :param model_name: Название или путь к загружаемой модели.
    :return: Загруженная квантованная модель.
    """
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16
    )

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        load_in_4bit=True,
        torch_dtype=torch.bfloat16,
        quantization_config=bnb_config
    )
    return model


def initialize_tokenizer(model_name: str):
    """
    Инициализация токенизатора с указанным именем модели.

    :param model_name: Название или путь к модели для инициализации токенизатора.
    :return: Инициализированный токенизатор.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.bos_token_id = 1  # Устанавливаем идентификатор начала предложения
    return tokenizer



stop_token_ids = [0] # Определяем идентификаторы остановки
model_name = 'bn22/Mistral-7B-Instruct-v0.1-sharded'
model = load_quantized_model(model_name)
tokenizer = initialize_tokenizer(model_name)

Loading checkpoint shards: 100%|██████████| 11/11 [00:15<00:00,  1.44s/it]
Downloading (…)okenizer_config.json: 100%|██████████| 963/963 [00:00<00:00, 481kB/s]
Downloading tokenizer.model: 100%|██████████| 493k/493k [00:00<00:00, 2.73MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 1.80M/1.80M [00:00<00:00, 2.62MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 72.0/72.0 [00:00<00:00, 36.0kB/s]


In [5]:
def generate_answer(text, max_new_tokens=200):
    text = f"[INST] {text} [/INST]"
    encoded = tokenizer(text, return_tensors="pt", add_special_tokens=False)
    model_input = encoded
    generated_ids = model.generate(**model_input, max_new_tokens=max_new_tokens, do_sample=True)
    decoded = tokenizer.batch_decode(generated_ids)
    answer = decoded[0]
    return answer

In [6]:
text = """
You are genius Data Scientist and Python Developer.
Write a code that predicts the user's future 10 purchases (mcc-codes) based on his purchase history consisting of mcc codes in chronological order.
"""

answer = generate_answer(text, max_new_tokens=1000)

print('-----------------------------------------------------------------------')
print('ANSWER:', answer)
print('-----------------------------------------------------------------------')

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


-----------------------------------------------------------------------
ANSWER: [INST] 
You are genius Data Scientist and Python Developer.
Write a code that predicts the user's future 10 purchases (mcc-codes) based on his purchase history consisting of mcc codes in chronological order.
 [/INST] Certainly! To predict a user's future 10 purchases, we can use a machine learning approach called "sequence classification". Here's a high-level overview of the code I would write to accomplish this task:

1. Pre-process the purchase history dataset to normalize or standardize the data, and convert the mcc codes into numerical form.
2. Define the input sequence length. In this case, we'll assume the user's purchase history is sequential, with each purchase being recorded in chronological order. So we could set the input sequence length to be the maximum number of purchases in the user's history, plus the number of additional time steps we want to predict. For example, if the user has made 10 pu