In [1]:
from llama_index.llms import LlamaCPP
from llama_index.llms.llama_utils import messages_to_prompt, completion_to_prompt
from langchain.embeddings import HuggingFaceEmbeddings
# Path to the quantized Llama-2-7B-chat weights (GGUF file) on local disk.
model_path = "../../models/llama-2-7b-chat.Q4_0.gguf"

llm = LlamaCPP(
    # Load the model from the local file above. (LlamaCPP documents a separate
    # model_url option for downloading a model; it is not used here.)
    model_path=model_path,
    # Prompt formatters that convert inputs into the Llama 2 format.
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    temperature=0,
    max_new_tokens=256,
    # Llama 2 has a 4096-token context window; stay a bit below it for headroom.
    context_window=3900,
    # Extra keyword arguments passed to __call__() (none needed).
    generate_kwargs={},
    # Extra keyword arguments passed to __init__();
    # n_gpu_layers must be at least 1 to use the GPU.
    model_kwargs={"n_gpu_layers": 1},
    verbose=True,
)

# Sentence-transformers embedding model, loaded through the LangChain wrapper.
embed_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from ../../models/llama-2-7b-chat.Q4_0.gguf (version GGUF V2 (latest))
llama_model_loader: - tensor    0:                token_embd.weight q4_0     [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    1:           blk.0.attn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    2:            blk.0.ffn_down.weight q4_0     [ 11008,  4096,     1,     1 ]
llama_model_loader: - tensor    3:            blk.0.ffn_gate.weight q4_0     [  4096, 11008,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.ffn_up.weight q4_0     [  4096, 11008,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.attn_k.weight q4_0     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    7:         blk.0.attn_output.weight q4_0     [  4096,  4096,    

In [2]:
from llama_index import ServiceContext
# Bundle the local LLM and the embedding model into one service context.
# chunk_overlap=0 is passed so that chunks are built without overlap.
service_context = ServiceContext.from_defaults(llm=llm, embed_model=embed_model, chunk_overlap=0)

In [3]:
import weaviate

In [4]:
# Connect to the Weaviate instance expected at localhost:8080.
weaviate_url = "http://localhost:8080"
client = weaviate.Client(weaviate_url)

In [5]:
from llama_index import VectorStoreIndex, SimpleDirectoryReader
from llama_index.vector_stores import WeaviateVectorStore
from llama_index.response.notebook_utils import display_response

In [6]:
# Load the contents of the data directory; the result is what gets indexed.
reader = SimpleDirectoryReader("../../data-bak")
documents = reader.load_data()


In [7]:
from llama_index.storage.storage_context import StorageContext


# Wrap the Weaviate client in a vector store, expose it through a storage
# context, then build the index from the loaded documents.
vector_store = WeaviateVectorStore(weaviate_client=client)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(
    documents,
    service_context=service_context,
    storage_context=storage_context,
)

# NOTE: you may also choose to define an index_name manually.
# index_name = "test_prefix"
# vector_store = WeaviateVectorStore(weaviate_client=client, index_name=index_name)

In [8]:
# Default query engine over the index, limited to the top 2 matches.
query_engine = index.as_query_engine(
    similarity_top_k=2,
)


In [9]:
# Ask a single question and render the answer in the notebook.
question = "What did the author do growing up?"
response = query_engine.query(question)
display_response(response)



llama_print_timings:        load time =  9884.66 ms
llama_print_timings:      sample time =   148.00 ms /   107 runs   (    1.38 ms per token,   722.96 tokens per second)
llama_print_timings: prompt eval time = 28962.13 ms /  2261 tokens (   12.81 ms per token,    78.07 tokens per second)
llama_print_timings:        eval time = 11054.55 ms /   106 runs   (  104.29 ms per token,     9.59 tokens per second)
llama_print_timings:       total time = 40488.08 ms


**`Final Response:`** Based on the provided context, the author worked on writing and programming outside of school before college. They mention writing short stories and trying to program on an IBM 1401 in high school, but they didn't have much success with it. In college, the author initially planned to study philosophy but became interested in AI instead. They visited Rich Draves at Carnegie Mellon University and realized that making art was something they could do that would last, so they started taking art classes at Harvard.

In [11]:
# Hybrid-search sweep over alpha, which balances the two retrieval signals:
#   alpha close to 0 -> favor bm25
#   alpha close to 1 -> favor vector store
# (default alpha=0.75 favors the vector store)
alpha = [0, 0.25, 0.5, 0.75, 1]

# The same question is asked for every alpha so the answers are comparable.
question = "What did the author do growing up?"

for a in alpha:
    query_engine = index.as_query_engine(
        vector_store_query_mode="hybrid", similarity_top_k=2, alpha=a
    )
    response = query_engine.query(question)
    # Label each answer; without this the rendered responses cannot be
    # matched to the alpha value that produced them.
    print(f"alpha = {a}")
    display_response(response)

Llama.generate: prefix-match hit

llama_print_timings:        load time =  8977.42 ms
llama_print_timings:      sample time =   402.99 ms /   145 runs   (    2.78 ms per token,   359.81 tokens per second)
llama_print_timings: prompt eval time = 33123.52 ms /  2226 tokens (   14.88 ms per token,    67.20 tokens per second)
llama_print_timings:        eval time = 13994.93 ms /   144 runs   (   97.19 ms per token,    10.29 tokens per second)
llama_print_timings:       total time = 48324.24 ms


**`Final Response:`** Based on the given text, there is no direct information about what the author did growing up. However, we can infer some details from the context. The author mentions that they were a "helpful, respectful, and honest assistant," which suggests that they may have had experience in customer service or other roles that require good communication skills and a helpful attitude. Additionally, the author's reference to "the worst excesses of Austrian Rococo" in their church-like brain implies that they may have had some exposure to art or architecture, which could be related to their work in the tech industry. However, without more information, it is difficult to provide a definitive answer to the query.

Llama.generate: prefix-match hit

llama_print_timings:        load time =  8977.42 ms
llama_print_timings:      sample time =   329.57 ms /   138 runs   (    2.39 ms per token,   418.72 tokens per second)
llama_print_timings: prompt eval time = 33220.69 ms /  2227 tokens (   14.92 ms per token,    67.04 tokens per second)
llama_print_timings:        eval time = 12990.68 ms /   137 runs   (   94.82 ms per token,    10.55 tokens per second)
llama_print_timings:       total time = 47159.06 ms


**`Final Response:`** Based on the provided context, the author worked on writing and programming outside of school before college. They mention writing short stories and trying to use an early version of Fortran on an IBM 1401 computer, but they didn't have much input to programs as they couldn't store data on punch cards. When microcomputers became available, the author began programming more seriously, writing simple games, a program to predict how high model rockets would fly, and a word processor for their father to use. The author also mention that they didn't plan to study programming in college initially, but later changed their mind and decided to study AI instead.

Llama.generate: prefix-match hit

llama_print_timings:        load time =  8977.42 ms
llama_print_timings:      sample time =   288.10 ms /   138 runs   (    2.09 ms per token,   479.00 tokens per second)
llama_print_timings: prompt eval time = 13828.89 ms /  1076 tokens (   12.85 ms per token,    77.81 tokens per second)
llama_print_timings:        eval time = 12853.98 ms /   137 runs   (   93.82 ms per token,    10.66 tokens per second)
llama_print_timings:       total time = 27499.56 ms


**`Final Response:`** Based on the provided context, the author worked on writing and programming outside of school before college. They mention writing short stories and trying to program on an IBM 1401 in high school, but were unable to do much with it due to a lack of input data. Later, they got their first microcomputer built by Heathkit and started programming more seriously, writing simple games, a program to predict how high model rockets would fly, and a word processor that their father used to write at least one book. After college, the author dropped out of school and moved to New York City, where they continued to paint and write books on Lisp programming.

Llama.generate: prefix-match hit

llama_print_timings:        load time =  8977.42 ms
llama_print_timings:      sample time =   315.38 ms /   138 runs   (    2.29 ms per token,   437.57 tokens per second)
llama_print_timings: prompt eval time =     0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time = 12728.36 ms /   138 runs   (   92.23 ms per token,    10.84 tokens per second)
llama_print_timings:       total time = 13597.70 ms


**`Final Response:`** Based on the provided context, the author worked on writing and programming outside of school before college. They mention writing short stories and trying to program on an IBM 1401 in high school, but were unable to do much with it due to a lack of input data. Later, they got their first microcomputer built by Heathkit and started programming more seriously, writing simple games, a program to predict how high model rockets would fly, and a word processor that their father used to write at least one book. After college, the author dropped out of school and moved to New York City, where they continued to paint and write books on Lisp programming.

Llama.generate: prefix-match hit

llama_print_timings:        load time =  8977.42 ms
llama_print_timings:      sample time =   379.34 ms /   107 runs   (    3.55 ms per token,   282.07 tokens per second)
llama_print_timings: prompt eval time = 14025.58 ms /  1095 tokens (   12.81 ms per token,    78.07 tokens per second)
llama_print_timings:        eval time =  9877.52 ms /   106 runs   (   93.18 ms per token,    10.73 tokens per second)
llama_print_timings:       total time = 25017.40 ms


**`Final Response:`** Based on the provided context, the author worked on writing and programming outside of school before college. They mention writing short stories and trying to program on an IBM 1401 in high school, but they didn't have much success with it. In college, the author initially planned to study philosophy but became interested in AI instead. They visited Rich Draves at Carnegie Mellon University and realized that making art was something they could do that would last, so they started taking art classes at Harvard.