In [1]:
from langchain.embeddings import HuggingFaceEmbeddings
from llama_index import (
    SimpleDirectoryReader,
    VectorStoreIndex,
    ServiceContext,
)
from llama_index.llms import LlamaCPP
from llama_index.llms.llama_utils import messages_to_prompt, completion_to_prompt


In [2]:
# Local path to the Llama 2 7B chat weights (a Q4_0 GGUF file).
# NOTE(review): the saved output of this cell, and the later chunk_overlap
# experiment cell, both refer to "model/llama-2-7b-chat.Q4_0.gguf" — confirm
# that the path below is the current location of the file.
model_path = "../models/llama-2-7b-chat.Q4_0.gguf"

llm = LlamaCPP(
    # --- which model to load, and how ---
    # a model file on local disk (this is a path, not a download URL)
    model_path=model_path,
    # llama2 has a context window of 4096 tokens; stay below it for some wiggle room
    context_window=3900,
    # kwargs to pass to __init__(); set n_gpu_layers to at least 1 to use GPU
    model_kwargs={"n_gpu_layers": 0},
    # --- generation settings ---
    temperature=0.7,
    max_new_tokens=256,
    # kwargs to pass to __call__()
    generate_kwargs={},
    # --- prompt formatting: transform inputs into Llama2 format ---
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    # verbose output enabled
    verbose=True,
)

llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from model/llama-2-7b-chat.Q4_0.gguf (version GGUF V2 (latest))
llama_model_loader: - tensor    0:                token_embd.weight q4_0     [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    1:           blk.0.attn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    2:            blk.0.ffn_down.weight q4_0     [ 11008,  4096,     1,     1 ]
llama_model_loader: - tensor    3:            blk.0.ffn_gate.weight q4_0     [  4096, 11008,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.ffn_up.weight q4_0     [  4096, 11008,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.attn_k.weight q4_0     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    7:         blk.0.attn_output.weight q4_0     [  4096,  4096,     1,    

In [3]:
# Quick check of the local LLM: request one non-streaming completion and print its text.
poem_prompt = "Hello! Tell me a poem about cats and dogs?"
response = llm.complete(poem_prompt)
print(response.text)

  Of course, I'd be happy to help you with that! Here is a short poem about cats and dogs:
Cats and dogs, so furry and sweet,
Bringing joy to our hearts, their love to greet.
With tails that wag and purrs so bright,
They light up our lives, banishing the night.
Their playful snuggles and happy grins,
Make us smile and feel love's gentle win.
So here's to cats and dogs, our loyal friends,
Bringing us happiness until the very end.



llama_print_timings:        load time = 11252.73 ms
llama_print_timings:      sample time =    93.05 ms /   131 runs   (    0.71 ms per token,  1407.85 tokens per second)
llama_print_timings: prompt eval time = 11252.61 ms /    79 tokens (  142.44 ms per token,     7.02 tokens per second)
llama_print_timings:        eval time =  9665.31 ms /   130 runs   (   74.35 ms per token,    13.45 tokens per second)
llama_print_timings:       total time = 21174.49 ms


In [None]:
# Stream a completion from the local LLM, printing each incremental delta
# as soon as it is yielded.
streaming_prompt = "Can you write me a poem about fast cars?"
response_iter = llm.stream_complete(streaming_prompt)
for response in response_iter:
    # end="" keeps the pieces on one line; flush=True pushes them out immediately
    print(response.delta, end="", flush=True)

# Query engine set up with LlamaCPP


In [2]:
# Embedding model (not an LLM): a sentence-transformers checkpoint loaded
# through LangChain's HuggingFaceEmbeddings wrapper.
embedding_model_name = "sentence-transformers/all-mpnet-base-v2"
embed_model = HuggingFaceEmbeddings(model_name=embedding_model_name)

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
# Bundle the local LLM and the embedding model into one service context.
# chunk_overlap=0 is passed so document chunking uses no overlap
# (presumably measured in tokens — confirm against the llama_index docs).
service_context = ServiceContext.from_defaults(
    chunk_overlap=0,
    llm=llm,
    embed_model=embed_model,
)


In [11]:
# Load the contents of the local "data" directory as documents.
directory_reader = SimpleDirectoryReader("data")
documents = directory_reader.load_data()


In [12]:
# Build a vector store index over the loaded documents, using the LLM and
# embedding model carried by service_context.
index = VectorStoreIndex.from_documents(
    documents,
    service_context=service_context,
)


In [13]:
# Wrap the index in a query engine (no options passed, so defaults apply);
# it is used below to ask questions about the documents.
query_engine = index.as_query_engine()


In [14]:
# Ask a question about the indexed documents and print the generated answer.
question = "What did the author do growing up?"
response = query_engine.query(question)
print(response)

Llama.generate: prefix-match hit


  Based on the given context, the author worked on writing and programming outside of school before college. They wrote short stories as a beginner writer and tried to write programs on punch cards using an early version of Fortran on an IBM 1401 computer in junior high school. Later, they got their first microcomputer, a TRS-80, in about 1980, which they used to write simple games, predict how high model rockets would fly, and use as a word processor for their father's writing. The author also visited the Carnegie Institute one day and realized that they could make things that would last, like paintings, which inspired them to consider becoming an artist.



llama_print_timings:        load time =  9337.15 ms
llama_print_timings:      sample time =   158.64 ms /   149 runs   (    1.06 ms per token,   939.21 tokens per second)
llama_print_timings: prompt eval time = 15421.68 ms /  1095 tokens (   14.08 ms per token,    71.00 tokens per second)
llama_print_timings:        eval time = 15531.61 ms /   148 runs   (  104.94 ms per token,     9.53 tokens per second)
llama_print_timings:       total time = 31507.65 ms


In [21]:

from langchain.embeddings import HuggingFaceEmbeddings
from llama_index import (
    SimpleDirectoryReader,
    VectorStoreIndex,
    ServiceContext,
)
from llama_index.llms import LlamaCPP
from llama_index.llms.llama_utils import messages_to_prompt, completion_to_prompt


# Local path to the Llama 2 7B chat weights (a Q4_0 GGUF file).
# NOTE(review): an earlier cell in this notebook builds the same LlamaCPP object
# from "../models/llama-2-7b-chat.Q4_0.gguf" with n_gpu_layers=0 — confirm which
# path and GPU setting are intended and keep the two cells in agreement.
model_path = "model/llama-2-7b-chat.Q4_0.gguf"

llm = LlamaCPP(
    # --- which model to load, and how ---
    # a model file on local disk (this is a path, not a download URL)
    model_path=model_path,
    # llama2 has a context window of 4096 tokens; stay below it for some wiggle room
    context_window=3900,
    # kwargs to pass to __init__(); set n_gpu_layers to at least 1 to use GPU
    model_kwargs={"n_gpu_layers": 1},
    # --- generation settings ---
    temperature=0.7,
    max_new_tokens=256,
    # kwargs to pass to __call__()
    generate_kwargs={},
    # --- prompt formatting: transform inputs into Llama2 format ---
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    # verbose output enabled
    verbose=True,
)

# Embedding model (not an LLM) used when building each index below.
embed_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

# Experiment: ask the same question against indexes built with different
# chunk_overlap values and collect the answer text for each setting.
chunk_overlaps = [0, 30]
question = "What did the author do growing up?"
documents = SimpleDirectoryReader("data").load_data()

results = []  # results[i] is the answer text obtained with chunk_overlaps[i]
for chunk_overlap in chunk_overlaps:
    # A fresh service context and index are built for every overlap value.
    service_context = ServiceContext.from_defaults(
        chunk_overlap=chunk_overlap,
        llm=llm,
        embed_model=embed_model,
    )
    index = VectorStoreIndex.from_documents(
        documents,
        service_context=service_context,
    )
    query_engine = index.as_query_engine()
    response = query_engine.query(question)
    results.append(response.response)


llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from model/llama-2-7b-chat.Q4_0.gguf (version GGUF V2 (latest))
llama_model_loader: - tensor    0:                token_embd.weight q4_0     [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    1:           blk.0.attn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    2:            blk.0.ffn_down.weight q4_0     [ 11008,  4096,     1,     1 ]
llama_model_loader: - tensor    3:            blk.0.ffn_gate.weight q4_0     [  4096, 11008,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.ffn_up.weight q4_0     [  4096, 11008,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.attn_k.weight q4_0     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    7:         blk.0.attn_output.weight q4_0     [  4096,  4096,     1,    

In [22]:
# Ask the LLM which of the two collected answers (results[0] and results[1])
# is clearer, streaming its verdict as it is generated.
comparison_prompt = (
    "Which is better in terms of clarity between the following: "
    f"1.{results[0]} and 2.{results[1]}"
)
response_iter = llm.stream_complete(comparison_prompt)
for response in response_iter:
    # end="" keeps the pieces on one line; flush=True pushes them out immediately
    print(response.delta, end="", flush=True)

Llama.generate: prefix-match hit


  Based on the provided context, option 1 is better in terms of clarity. Here's why:
In option 1, the author worked on writing and programming outside of school before college, with specific examples of projects they worked on, such as predicting how high model rockets would fly and writing a word processor for their father to write books. This provides a clear picture of the author's background and experience in these areas.
In option 2, the author worked on writing and programming outside of school before college, but the specific details provided are less clear-cut. The mention of an IBM 1401 computer at the junior high school is vague, as it's not clear what the author attempted to do with it or how successful they were. Additionally, the acquisition of a microcomputer built by Heathkit in about 1980 and the author's use of it to write simple games, predict how high model rockets would fly, and use a word processor is also not as detailed as in option 1. Overall, option 1 provides 


llama_print_timings:        load time =  6063.48 ms
llama_print_timings:      sample time =   544.11 ms /   246 runs   (    2.21 ms per token,   452.11 tokens per second)
llama_print_timings: prompt eval time =  2745.26 ms /   317 tokens (    8.66 ms per token,   115.47 tokens per second)
llama_print_timings:        eval time = 18310.45 ms /   245 runs   (   74.74 ms per token,    13.38 tokens per second)
llama_print_timings:       total time = 22898.63 ms


Based on the provided context, option 1 is better in terms of clarity. Here's why:

In option 1, the author worked on writing and programming outside of school before college, with specific examples of projects they worked on, such as predicting how high model rockets would fly and writing a word processor for their father to write books. This provides a clear picture of the author's background and experience in these areas.

In option 2, the author worked on writing and programming outside of school before college, but the specific details provided are less clear-cut. The mention of an IBM 1401 computer at the junior high school is vague, as it's not clear what the author attempted to do with it or how successful they were. Additionally, the acquisition of a microcomputer built by Heathkit in about 1980 and the author's use of it to write simple games, predict how high model rockets would fly, and use a word processor is also not as detailed as in option 1. Overall, option 1 provides more specific and detailed information about the author's background and experience in writing and programming.