In [3]:
import torch
import torchaudio
import torchvision

In [6]:
from llama_cpp import Llama

SYSTEM_PROMPT = "Ты — Сайга, русскоязычный автоматический ассистент. Ты разговариваешь с людьми и помогаешь им."


class LlamaInference:
    def __init__(self, model_path: str = "../models/saiga_llama3_q4/model-q4_K.gguf"):
        self.model = Llama(model_path=model_path, n_ctx=8192, n_parts=1, verbose=False)

    def interact(
        self,
        user_message: str,
        top_k=30,
        top_p=0.9,
        temperature=0.6,
        repeat_penalty=1.1
    ):
        messages = [{"role": "system", "content": SYSTEM_PROMPT}]
        messages.append({"role": "user", "content": user_message})
    
        llm_answer = self.model.create_chat_completion(
            messages,
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
            repeat_penalty=repeat_penalty,
            stream=False,
            max_tokens=2048,
        )["choices"][0]["message"]["content"]
        return llm_answer

if __name__ == "__main__":
    model_path = "../models/saiga_llama3_q4/model-q4_K.gguf"
    llm_assistant = LlamaInference(model_path=model_path)
    # user_message = "У кого был арендован Россией комплекс Байконур?"
    user_message = "Кем был ослеплен князь Василий Тёмный?"
    # user_message = "Население Шымкента в прежних границах в начале 2015 года?"
    answer = llm_assistant.interact(user_message)
    print(answer)

    user_message = "А где это произошло?"
    answer = llm_assistant.interact(user_message)
    print(answer)

Ослеплен князем Василием Тёмным был его собственный брат, Иван III Васильевич. Это произошло в 1490 году как часть заговора против Василия и в результате уничтожения многих членов семьи Рюриковичей.


In [5]:
import os
from typing import Dict
from tqdm.auto import tqdm
import pandas as pd
import numpy as np
import torch
from torch import Tensor
from transformers import AutoTokenizer, AutoModel
import pickle

tqdm.pandas()


class CreateEmbeddings:
    def __init__(self):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.tokenizer = AutoTokenizer.from_pretrained("intfloat/multilingual-e5-base")
        self.model = AutoModel.from_pretrained("intfloat/multilingual-e5-base").to(self.device)

    def _average_pool(self, last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
        last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0)
        return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]

    def get_embeddings(self, data_dict: Dict[str, str]):
        embeddings_raw_name = './db_embedds/embeddings.npy'
        embeddings_question_name = './db_embedds/embeddings.txt'
        embeddings_answer_name = './db_embedds/embeddings_answer.txt'

        if not os.path.exists(embeddings_raw_name):
            print("Генерация эмбеддингов.")
            questions = ["passage: " + q for q in list(data_dict.keys())]
            answers = list(data_dict.values())
            question_embeddings = []

            with torch.no_grad():
                for question in tqdm(questions, desc="generating embedds"):
                    batch_dict = self.tokenizer(
                        question,
                        max_length=512,
                        padding=True,
                        truncation=True,
                        return_tensors="pt",
                    ).to(self.device)
                    outputs = self.model(**batch_dict)
                    embedding = self._average_pool(
                        outputs.last_hidden_state, batch_dict["attention_mask"]
                    ).cpu()
                    question_embeddings.append(embedding[0])
                question_embeddings = torch.stack(question_embeddings).cpu().detach().numpy()

            np.save(embeddings_raw_name, question_embeddings)
            with open(embeddings_question_name, 'w', encoding='utf-8') as f:
                for line in tqdm(questions, desc="saving questions"):
                    f.write(line + '\n')
            with open(embeddings_answer_name, 'w', encoding='utf-8') as f:
                for line in tqdm(answers, desc="saving answers"):
                    f.write(line + '\n')

            embeddings_raw = question_embeddings
            questions_list = questions
            answers_list = answers
        else:
            print("Загрузка готовых эмбеддингов.")
            embeddings_raw = np.load(embeddings_raw_name)
            with open(embeddings_question_name, 'r', encoding='utf-8') as f:
                questions_list = f.readlines()
            with open(embeddings_answer_name, 'r', encoding='utf-8') as f:
                answers_list = f.readlines()

        return embeddings_raw, questions_list, answers_list

    def get_embedding(self, text: str):
        if "passage" not in text:
            text = "passage: " + text
        batch_dict = self.tokenizer(
            text, max_length=512, padding=True, truncation=True, return_tensors="pt"
        )
        outputs = self.model.to("cpu")(**batch_dict)
        embedding = (
            self._average_pool(outputs.last_hidden_state, batch_dict["attention_mask"])
            .cpu()
            .detach()
            .numpy()
        )
        return embedding


if __name__ == "__main__":

    if not os.path.exists("../db_embedds/data_dict.pickle"):
        train = pd.read_parquet("../dataset/kuznetsoffandrey_sberquad_train.parquet")
        valid = pd.read_parquet("../dataset/kuznetsoffandrey_sberquad_valid.parquet")
        test = pd.read_parquet("../dataset/kuznetsoffandrey_sberquad_test.parquet")

        all_data = pd.concat([train, valid], axis=0)
        all_data = pd.concat([all_data, test], axis=0)

        data_dict = {}

        def get_data_dict(row):
            question = row["question"]
            answer = row["answers"]["text"][0]
            data_dict[str(question)] = str(answer)

        all_data.progress_apply(get_data_dict, axis=1)

        with open("../db_embedds/data_dict.pickle", "wb") as f:
            pickle.dump(data_dict, f)

    else:
        data_dict = pickle.load(open("../db_embedds/data_dict.pickle", "rb"))

    embedds_model = CreateEmbeddings()
    embeddings_raw, questions_list, answers_list = embedds_model.get_embeddings(data_dict)
    print(embeddings_raw[0][:30])
    print(questions_list[0][:30])
    print(answers_list[0][:30])

RuntimeError: Failed to import transformers.models.xlm_roberta.modeling_xlm_roberta because of the following error (look up to see its traceback):
cannot import name 'quantize_' from 'torchao.quantization' (C:\ProgramData\anaconda3\envs\myenv\lib\site-packages\torchao\quantization\__init__.py)

In [9]:
import torchao
from torchao.quantization import quantize_

ImportError: cannot import name 'quantize_' from 'torchao.quantization' (C:\ProgramData\anaconda3\envs\myenv\lib\site-packages\torchao\quantization\__init__.py)

In [17]:
from langchain.llms import LlamaCpp
from langchain import PromptTemplate
from langchain.embeddings import LlamaCppEmbeddings

llm = LlamaCpp(model_path="models/saiga_model-q4_K.gguf")
embeddings = LlamaCppEmbeddings(model_path="../models/llama-7b.ggmlv3.q4_K_S.bin")
# llm_chain = LLMChain(llm=llm, prompt=prompt)


ValidationError: 1 validation error for LlamaCpp
__root__
  Could not load Llama model from path: models/saiga_model-q4_K.gguf. Received error Model path does not exist: models/saiga_model-q4_K.gguf (type=value_error)

In [None]:
from llama_cpp import Llama

llm = Llama.from_pretrained(
	repo_id="IlyaGusev/saiga_llama3_8b_gguf",
	filename="model-f16.gguf",
)

llm.create_chat_completion(
	messages = [
		{
			"role": "user",
			"content": "What is the capital of France?"
		}
	]
)

In [2]:
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained("intfloat/multilingual-e5-base")
AutoModel.from_pretrained("intfloat/multilingual-e5-base")

RuntimeError: Failed to import transformers.models.xlm_roberta.modeling_xlm_roberta because of the following error (look up to see its traceback):
cannot import name 'quantize_' from 'torchao.quantization' (C:\ProgramData\anaconda3\envs\myenv\lib\site-packages\torchao\quantization\__init__.py)

In [13]:
from langchain.llms import LlamaCpp
from langchain import PromptTemplate
from langchain.embeddings import LlamaCppEmbeddings

llm = LlamaCpp(model_path="./models/saiga_model-q4_K.gguf")

template = "Вопрос: {question} Ответ:"
prompt = PromptTemplate.from_template(template)
print(prompt.template)
print(prompt.input_variables)

ValidationError: 1 validation error for LlamaCpp
__root__
  Could not load Llama model from path: ./models/saiga_model-q4_K.gguf. Received error Model path does not exist: ./models/saiga_model-q4_K.gguf (type=value_error)

In [None]:
question = "Из чего появилось графито-углистое вещество?"

In [9]:
formatted_template = prompt.format(adjective='смешную', content='куриц')
formatted_template

'Расскажи смешную шутку про куриц'