In [1]:
import os

from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, logging
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig

from langchain_community.llms import CTransformers
from langchain.memory import ConversationBufferMemory
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_community.embeddings import GPT4AllEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline

In [2]:
# Notebook configuration.
# The Hugging Face token is read from the environment instead of being stored
# in the notebook. A token that was ever committed in a notebook must be
# treated as leaked and revoked in the Hugging Face account settings.
# When HF_TOKEN is unset this is None and is passed as `token=None` downstream.
access_token = os.environ.get("HF_TOKEN")
# Hub id of the chat model to load.
model_file = "Viet-Mistral/Vistral-7B-Chat"
# Directory holding the saved FAISS index.
vector_db_path = "vectorstores/db_faiss"

In [3]:
# Load LLM
def load_llm(model_name_or_path, token=None):
    """Build a LangChain LLM backed by a local Hugging Face text-generation pipeline.

    Args:
        model_name_or_path: Hub model id or local path handed to
            ``AutoTokenizer.from_pretrained`` and
            ``AutoModelForCausalLM.from_pretrained``.
        token: Hugging Face access token. When omitted, the notebook-level
            ``access_token`` is used.

    Returns:
        A ``HuggingFacePipeline`` wrapping the configured pipeline.
    """
    if token is None:
        token = access_token

    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, token=token)
    model = AutoModelForCausalLM.from_pretrained(
        model_name_or_path,
        # torch_dtype is intentionally not set here. To reduce memory, pass
        # torch.bfloat16 (or torch.float16 on a V100) after importing torch.
        device_map="auto",
        use_cache=True,
        token=token,
    )

    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=512,
        # Sampling has to be switched on explicitly: without do_sample=True
        # the temperature and top_p settings below are not applied.
        do_sample=True,
        temperature=0.1,
        top_p=0.95,
        repetition_penalty=1.15,
    )

    return HuggingFacePipeline(pipeline=pipe)

# Create the prompt template
def create_prompt(template):
    """Wrap a raw template string in a ``PromptTemplate``.

    The template is declared with the input variables "context" and "question".
    """
    return PromptTemplate(
        input_variables=["context", "question"],
        template=template,
    )


# Create a simple retrieval QA chain
def create_qa_chain(llm, db, prompt=None, k=3):
    """Build a "stuff" RetrievalQA chain over a vector store.

    Args:
        llm: Language model used to answer the question.
        db: Vector store; its ``as_retriever`` method supplies the retriever.
        prompt: Optional prompt (e.g. from ``create_prompt``) forwarded as
            ``chain_type_kwargs={"prompt": prompt}``. When None, the chain's
            default prompt is used.
        k: Number of documents requested from the retriever per query.

    Returns:
        The RetrievalQA chain, configured to return source documents.
    """
    extra_kwargs = {}
    if prompt is not None:
        extra_kwargs["chain_type_kwargs"] = {"prompt": prompt}

    # Only search_kwargs is passed to the retriever. The earlier
    # max_tokens_limit argument is not a retriever setting (it is an option
    # of ConversationalRetrievalChain), so it had no effect here.
    retriever = db.as_retriever(search_kwargs={"k": k})

    return RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True,
        **extra_kwargs,
    )

# Read from the vector DB
def read_vectors_db():
    """Load the FAISS index stored at the notebook-level ``vector_db_path``.

    The index is opened with a ``HuggingFaceEmbeddings`` instance for the
    "intfloat/multilingual-e5-small" model.

    Returns:
        The loaded FAISS vector store.
    """
    # NOTE(review): FAISS.load_local is understood to unpickle the stored
    # docstore, so only load index directories you trust. Newer LangChain
    # releases reportedly require allow_dangerous_deserialization=True here;
    # confirm against the installed version before adding it.
    embedder = HuggingFaceEmbeddings(model_name="intfloat/multilingual-e5-small")
    return FAISS.load_local(vector_db_path, embedder)

In [None]:
# Driver cell: load the vector store and the LLM, build the QA chain, and ask
# one question.
db = read_vectors_db()
# Smoke-test query against the index. The result is neither assigned nor the
# last expression of the cell, so it is not displayed.
db.similarity_search('Does bart have an encoder?')

llm = load_llm(model_file)

# NOTE(review): `memory` is not passed to the chain anywhere in this cell.
# `k` looks like a ConversationBufferWindowMemory option rather than a
# ConversationBufferMemory one; confirm it has any effect.
memory = ConversationBufferMemory(memory_key="chat_history",    k=3,
    return_messages=True)

# Disabled custom ChatML-style prompt. The Vietnamese system message says:
# "Use the following information to answer the question. If you do not know
# the answer, say you do not know; do not try to make up an answer."
# template = """<|im_start|>system\nSử dụng thông tin sau đây để trả lời câu hỏi. Nếu bạn không biết câu trả lời, hãy nói không biết, đừng cố tạo ra câu trả lời\n
#     {context}<|im_end|>\n<|im_start|>user\n{question}<|im_end|>\n<|im_start|>assistant"""
# prompt = create_prompt(template)

# Build the chain with the chain's default prompt (the custom one above is disabled).
llm_chain  = create_qa_chain(llm, db)

# Run the chain
question = "Cá nhân kinh doanh bất động sản cho thuê mấy căn nhà?"
# The chain is invoked with the question under the "query" key.
response = llm_chain.invoke({"query": question})
print(response)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
