In [26]:
import os
import time

import pinecone
from datasets import load_dataset
from langchain_openai.embeddings import OpenAIEmbeddings
from pinecone import PodSpec, Pinecone, ServerlessSpec

In [11]:
# Load the "train" split of the chunked Llama 2 arXiv papers dataset.
dataset_name = "jamescalam/llama-2-arxiv-papers-chunked"
dataset = load_dataset(dataset_name, split="train")

# Bare last expression so the notebook renders the dataset summary
# (feature names and row count).
dataset


Dataset({
    features: ['doi', 'chunk-id', 'chunk', 'id', 'title', 'summary', 'source', 'authors', 'categories', 'comment', 'journal_ref', 'primary_category', 'published', 'updated', 'references'],
    num_rows: 4838
})

We use Pinecone to create the vector store that will hold the chunk embeddings.

In [33]:
# Create the Pinecone client.
# The API key is read from the PINECONE_API_KEY environment variable so the
# secret never lives in the notebook. A key that was ever committed in a
# notebook must be treated as leaked and rotated.
pc = pinecone.Pinecone(api_key=os.environ["PINECONE_API_KEY"])

index_name = 'llama-2-rag'

# Must equal the length of the vectors produced by the embedding model used in
# this notebook: text-embedding-3-large returns 3072-dimensional vectors.
embedding_dimension = 3072

# Upper bound (seconds) on how long we wait for a newly created index.
index_ready_timeout = 300

existing_indexes = [
    index_info["name"] for index_info in pc.list_indexes()
]

# check if index already exists (it shouldn't if this is first time)
if index_name not in existing_indexes:
    # if does not exist, create index
    pc.create_index(
        index_name,
        dimension=embedding_dimension,
        metric='dotproduct',
        spec=PodSpec(environment='gcp-starter', pod_type='s1.x1')
    )
    # Poll until the index reports ready, but give up after the timeout
    # instead of hanging the kernel forever if creation never completes.
    deadline = time.monotonic() + index_ready_timeout
    while not pc.describe_index(index_name).status['ready']:
        if time.monotonic() > deadline:
            raise TimeoutError(
                f"Pinecone index '{index_name}' was not ready after "
                f"{index_ready_timeout} seconds"
            )
        time.sleep(1)

# connect to index
index = pc.Index(index_name)
time.sleep(1)

# view index stats (bare last expression so the notebook displays them)
index.describe_index_stats()

{'dimension': 3072,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

We use OpenAIEmbeddings to create the embeddings.

In [27]:
# Embedding model used throughout the notebook.
# The API key is read from the OPENAI_API_KEY environment variable instead of
# being hardcoded; a key that was ever committed in a notebook must be treated
# as leaked and rotated. A missing variable fails fast with a KeyError.
embeddings_model = OpenAIEmbeddings(
    api_key=os.environ["OPENAI_API_KEY"],
    model="text-embedding-3-large",
)


In [29]:
# Sanity check: embed two short strings, then print how many vectors came back
# and the length of the first one.
texts = [
    "This is the first text",
    "This is the second text",
]
res = embeddings_model.embed_documents(texts)

n_vectors = len(res)
vector_length = len(res[0])
print(n_vectors, vector_length)

2 3072


Here we create the embeddings in batches and store them in the vector store.

In [34]:
from tqdm.auto import tqdm  # for progress bar

data = dataset.to_pandas()  # this makes it easier to iterate over the dataset

batch_size = 100

# Walk the frame in consecutive slices of `batch_size` rows; embed each slice
# and upsert it to Pinecone together with its ids and metadata.
for start in tqdm(range(0, len(data), batch_size)):
    # iloc clamps the slice end, so the last (shorter) batch needs no special case
    batch = data.iloc[start:start + batch_size]

    # get text to embed
    texts = batch['chunk'].tolist()

    # generate unique ids for each chunk: "<doi>-<chunk-id>"
    ids = [
        f"{doi}-{chunk_id}"
        for doi, chunk_id in zip(batch['doi'], batch['chunk-id'])
    ]

    # embed text (one vector per entry in `texts`, in the same order)
    embeds = embeddings_model.embed_documents(texts)

    # get metadata to store in Pinecone alongside each vector
    metadata = [
        {'text': text, 'source': source, 'title': title}
        for text, source, title in zip(texts, batch['source'], batch['title'])
    ]

    # add to Pinecone; materialise the tuples so the client receives a
    # concrete list rather than a one-shot iterator
    index.upsert(vectors=list(zip(ids, embeds, metadata)))

100%|██████████| 49/49 [03:01<00:00,  3.70s/it]


In [36]:
# Imported under an alias on purpose: the bare name `Pinecone` is already bound
# to the Pinecone client class by the `from pinecone import ...` line at the top
# of the notebook, and importing LangChain's class under the same name would
# silently rebind it for every cell run afterwards.
from langchain.vectorstores import Pinecone as PineconeVectorStore

# We have to indicate which key of the metadata we want to retrieve. In the example, it will be the key "text"
text_field = "text"

# Pass the embeddings object itself rather than its bound `embed_query` method;
# the wrapper accepts either, but passing a bare callable is the deprecated form.
vectorstore = PineconeVectorStore(index, embeddings_model, text_field)



In [None]:
# Ask a question and retrieve the stored chunks most similar to it.
query = "What is so special about Llama 2?"
top_k = 3  # number of results to return

vectorstore.similarity_search(query, k=top_k)