In [2]:
import os
from pathlib import Path

import chromadb
import torch
from llama_index.core import StorageContext
from llama_index.core import SummaryIndex
from llama_index.core import VectorStoreIndex
from llama_index.core.response.notebook_utils import display_response
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.llms.llama_cpp import LlamaCPP
from llama_index.readers.file import PDFReader
from llama_index.vector_stores.chroma import ChromaVectorStore

In [3]:
# Ask PyTorch to release any unused cached GPU memory before models are loaded.
torch.cuda.empty_cache()

# Run on the first CUDA device when one is usable, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

  return torch._C._cuda_getDeviceCount() > 0


In [4]:
# Embedding model used for both indexing and querying: BAAI/bge-small-en-v1.5,
# loaded onto the `device` chosen in the earlier cell.
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5", device=device)

# SECURITY: a Hugging Face access token used to be hard-coded on this line.
# It is now read from the HF_TOKEN environment variable so the secret never
# lands in the notebook or in version control. The previously committed token
# must be treated as leaked and revoked.
# `hf_token` is None when HF_TOKEN is not set.
hf_token = os.environ.get("HF_TOKEN")

In [5]:
# Settings for the llama.cpp-backed model, gathered in one mapping so they can
# be inspected or tweaked before the (slow) model load happens.
llama_cpp_settings = {
    # GGUF weights file, resolved relative to the current working directory.
    "model_path": "llama-2-13b-chat.Q4_0.gguf",
    "temperature": 0.1,
    "max_new_tokens": 256,
    "context_window": 3900,
    "generate_kwargs": {},
    # Passed to llama.cpp; presumably large enough to offload every layer to
    # the GPU (TODO confirm against the model's layer count).
    "model_kwargs": {"n_gpu_layers": 100},
    "verbose": False,
}
llm = LlamaCPP(**llama_cpp_settings)

In [8]:
# Read the source PDF (path is relative to the working directory) into
# LlamaIndex documents for later indexing.
pdf_path = Path('The McKinsey Way.pdf')
loader = PDFReader()
documents = loader.load_data(file=pdf_path)

In [9]:
# Open the on-disk Chroma store and make sure the collection exists.
# get_collection() raises when "Mckinsey_Way" has not been created yet, which
# is the case on a fresh ./chroma_db because the ingestion cell sits further
# down the notebook. get_or_create_collection() keeps Restart & Run All working.
db = chromadb.PersistentClient(path="./chroma_db")
collection = db.get_or_create_collection("Mckinsey_Way")
vector_store = ChromaVectorStore(chroma_collection=collection)

if collection.count() == 0:
    # Nothing persisted yet: embed the loaded documents once and store them.
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    index = VectorStoreIndex.from_documents(
        documents=documents,
        storage_context=storage_context,
        embed_model=embed_model,
    )
else:
    # Embeddings are already on disk: wrap the stored vectors without
    # re-embedding anything.
    index = VectorStoreIndex.from_vector_store(
        vector_store=vector_store,
        embed_model=embed_model,
        llm=llm,
    )

In [10]:
# Ask a single question against the index, answering with the llama.cpp model,
# and render the answer with the notebook display helper.
question = "What is the 80/20 rule?"
query_engine = index.as_query_engine(llm=llm)
response = query_engine.query(question)
display_response(response)

**`Final Response:`** Based on the context, there is no direct mention of the 80/20 rule; however, there is a reference to it in the passage as "one of the great truths of management consulting." It suggests that 80% of results come from 20% of efforts or inputs.

In [11]:
# Default Chroma client, separate from the PersistentClient bound to `db`
# that backs ./chroma_db.
chroma_client = chromadb.Client()

In [12]:
# get_or_create_collection() keeps this cell re-runnable: create_collection()
# raises once "Mckinsey_Way" already exists in `chroma_client`.
# NOTE: this rebinds `collection`, which other cells also assign from the
# persistent store (`db`); run those cells again before relying on it.
collection = chroma_client.get_or_create_collection(name="Mckinsey_Way")

In [25]:
# Persist the document embeddings in the on-disk Chroma store.
# Ingestion is guarded by an emptiness check: from_documents() adds new
# entries on every call, so re-running this cell against the persistent
# collection would otherwise store a duplicate copy of every chunk.
db = chromadb.PersistentClient(path="./chroma_db")
collection = db.get_or_create_collection("Mckinsey_Way")
vector_store = ChromaVectorStore(chroma_collection=collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

if collection.count() == 0:
    # First run: embed `documents` and write them into the collection.
    index = VectorStoreIndex.from_documents(
        documents=documents,
        storage_context=storage_context,
        embed_model=embed_model,
    )
else:
    # Already ingested: reuse the stored vectors instead of adding them again.
    index = VectorStoreIndex.from_vector_store(
        vector_store=vector_store,
        embed_model=embed_model,
    )

In [3]:
from llama_index.llms.ollama import Ollama
from llama_index.core import Settings

# Register an Ollama-served "llama2" model as the LLM on LlamaIndex's global
# Settings object; request_timeout is presumably in seconds (TODO confirm).
Settings.llm = Ollama(
    model="llama2",
    request_timeout=60.0,
)