**Download a quantized model from the Hugging Face Hub**





In [None]:
import os
from huggingface_hub import hf_hub_download

def download_mpt_quant(destination_folder: str, repo_id: str, model_filename: str) -> str:
    """Download a single file from a Hugging Face Hub repository into a local folder.

    Nothing in this helper is specific to MPT models: it forwards the given
    repository id and filename straight to ``hf_hub_download``.

    Args:
        destination_folder: Directory to download into. A relative path is
            resolved against the current working directory.
        repo_id: Hub repository identifier, e.g. ``"user/repo"``.
        model_filename: Name of the file inside the repository to fetch.

    Returns:
        The value returned by ``hf_hub_download`` (the local path of the
        downloaded file).
    """
    local_path = os.path.abspath(destination_folder)
    # `local_dir_use_symlinks` is deliberately not passed: huggingface_hub
    # deprecated it (recent releases ignore the value and emit a warning).
    # On older releases the default ("auto") already symlinks large files,
    # which is what `True` did for a multi-GB model file.
    return hf_hub_download(
        repo_id=repo_id,
        filename=model_filename,
        local_dir=local_path,
    )


# Where the weights are stored locally (relative to the working directory).
destination_folder = "models"

# Model to fetch: Llama 3 8B Instruct, Q5_K_M quantization, GGUF format.
repo_id = "MaziyarPanahi/Meta-Llama-3-8B-Instruct-GGUF"
model_filename = "Meta-Llama-3-8B-Instruct.Q5_K_M.gguf"

# Alternative (older GGML-format MPT model), kept for reference:
#   repo_id = "TheBloke/mpt-30B-chat-GGML"
#   model_filename = "mpt-30b-chat.ggmlv0.q4_1.bin"

# Last expression of the cell, so the notebook displays the returned local path.
download_mpt_quant(
    destination_folder=destination_folder,
    repo_id=repo_id,
    model_filename=model_filename,
)

**Install llama-cpp-python**

In [None]:
# for cpu and gpu
!CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python --no-cache-dir
#just for cpu
#!pip install --upgrade --quiet  llama-cpp-python

**Load the GGUF quantized model with llama.cpp for inference**


In [None]:
import os

from llama_cpp import Llama

# Build the path from the same settings the download cell used instead of
# hard-coding "/content/models/...", which is only correct when the working
# directory is /content (Google Colab). Requires the download cell to have run.
model_path = os.path.join(os.path.abspath(destination_folder), model_filename)

llm = Llama(
    model_path=model_path,
    n_gpu_layers=-1,  # all layers in gpu
    # The default context window (512 tokens) would silently cap the
    # 1000-token completion requested below, so ask for a larger one.
    n_ctx=2048,
)
print(llm)
response = llm("Explain modal epistemology?", max_tokens=1000)
response

**LangChain wrapper for inference**

In [None]:
import os

from langchain_community.llms import LlamaCpp
from langchain_core.callbacks import CallbackManager, StreamingStdOutCallbackHandler
from langchain_core.prompts import PromptTemplate

# Prompt with a single `{question}` placeholder; the trailing sentence nudges
# the model towards step-by-step reasoning.
template = """Question: {question}

Answer: Let's work this out in a step by step way to be sure we have the right answer."""

prompt = PromptTemplate.from_template(template)
# Callbacks support token-wise streaming
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

# Build the path from the same settings the download cell used instead of
# hard-coding "/content/models/...", which is only correct when the working
# directory is /content (Google Colab). Requires the download cell to have run.
model_path = os.path.join(os.path.abspath(destination_folder), model_filename)

n_gpu_layers = -1  # offload every layer to the GPU
n_batch = 512

llm = LlamaCpp(
    model_path=model_path,
    n_gpu_layers=n_gpu_layers,
    n_batch=n_batch,
    callback_manager=callback_manager,
    verbose=False,
)

# Compose prompt and model into one runnable: the dict passed to `invoke`
# fills the template, and the rendered prompt is sent to the model.
llm_chain = prompt | llm
question = "Explain modal epistemology?"
llm_chain.invoke({"question": question})