In [1]:
import time
import pinecone
from pinecone import PodSpec, Pinecone, ServerlessSpec
from datasets import load_dataset
from langchain_openai.embeddings import OpenAIEmbeddings



  from tqdm.autonotebook import tqdm


In [None]:
dataset = load_dataset(
    "jamescalam/llama-2-arxiv-papers-chunked",
    split="train"
)

dataset[0]


We use Pinecone to create a vector store

In [21]:
# create pinecone instance
pc = pinecone.Pinecone(api_key="251511d1-6f9d-477d-96be-785aa6249b0c")


index_name = 'law-documents'
existing_indexes = [
    index_info["name"] for index_info in pc.list_indexes()
]
print(existing_indexes)


[]


In [22]:

# check if index already exists (it shouldn't if this is first time)
if index_name not in existing_indexes:
    # if does not exist, create index
    pc.create_index(
        index_name,
        dimension=3072,  # dimensionality of ada 002
        metric='cosine',
        spec=PodSpec( environment='gcp-starter', pod_type='s1.x1' ) 
    )
    # wait for index to be initialized
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)

# connect to index

index = pc.Index(index_name)
time.sleep(1)
# view index stats
index.describe_index_stats()


{'dimension': 3072,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

With OpenAIEmbeddings we create embeddings

In [4]:
embeddings_model = OpenAIEmbeddings(api_key="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", model="text-embedding-3-large")


In [None]:
texts = ["This is the first text",
         "This is the second text"]

res = embeddings_model.embed_documents(texts)

print(len(res), len(res[0]))

Here we batch create and store the embeddings in the vector store

In [None]:
from tqdm.auto import tqdm  # for progress bar

data = dataset.to_pandas()  # this makes it easier to iterate over the dataset

batch_size = 100

for i in tqdm(range(0, len(data), batch_size)):
    i_end = min(len(data), i+batch_size)
    # get batch of data
    batch = data.iloc[i:i_end]
    # generate unique ids for each chunk
    ids = [f"{x['doi']}-{x['chunk-id']}" for i, x in batch.iterrows()]
    # get text to embed
    texts = [x['chunk'] for _, x in batch.iterrows()]
    # embed text
    embeds = embeddings_model.embed_documents(texts)
    # get metadata to store in Pinecone
    metadata = [
        {'text': x['chunk'],
         'source': x['source'],
         'title': x['title']} for i, x in batch.iterrows()
    ]
    # add to Pinecone
    index.upsert(vectors=zip(ids, embeds, metadata))

In [5]:
index.describe_index_stats()

{'dimension': 3072,
 'index_fullness': 5e-05,
 'namespaces': {'': {'vector_count': 5}},
 'total_vector_count': 5}

In [7]:

# Fetch all unique sources from the Pinecone index
# Since Pinecone does not provide a direct way to fetch all unique sources, we need to fetch all vectors and extract the sources
# This is not efficient and should be replaced with a proper database query if possible

# Fetch all vector IDs from the Pinecone index
vector_ids = index.list()

# Fetch all vectors using the IDs
vectors_info = index.fetch(ids=vector_ids)

# Extract the 'source' from the metadata of each vector
sources = set()
for vector_id in vectors_info['vectors']:
    vector_metadata = vectors_info['vectors'][vector_id]['metadata']
    sources.add(vector_metadata['source'])

# Print all unique sources
for source in sources:
    print(source)



PineconeApiTypeError: Invalid type for variable 'ids'. Required value type is list and passed type was generator at ['ids']

In [8]:
index.fetch(ids=["1411.4038-24"])

{'namespace': '', 'usage': {'read_units': 1}, 'vectors': {}}

In [9]:
from langchain.vectorstores import Pinecone

# We have to indicate which key of the metadata we want to retrieve. In the example, it will be the key "text"
text_field = "text"

vectorstore = Pinecone(index, embeddings_model.embed_query, text_field)

  warn_deprecated(


In [18]:
query = "King's college"

similar = vectorstore.similarity_search(query, k=3)
print(type(similar))
for element in similar:
    print(type(element))

<class 'list'>
<class 'langchain_core.documents.base.Document'>
<class 'langchain_core.documents.base.Document'>
<class 'langchain_core.documents.base.Document'>


In [19]:
def augment_prompt(query: str):
    results = vectorstore.similarity_search(query, k=3)
    source_knowledge = "\n".join([x.page_content for x in results])
    
    augmented_prompt = f"""Using the contexts below, answer the query.
    
    Contexts:
    {source_knowledge}
    
    Query: {query}
    """
    
    return augmented_prompt

In [20]:
print(augment_prompt("King's Colleg"))

Using the contexts below, answer the query.
    
    Contexts:
    Accenture Interview
CV:
University modules (King’s College London):
•Mathematics for Engineers, Mathematical Modelling
•Signals and Systems
•Machine Learning for Engineers:
◦Más orientado para ingenieros. Principalmente se programa en MATLAB pero la mayor 
parte del módulo consistió en diseñar, entrenar y optimizar redes a mano. Mucha teoría
▪One-hot encoding de variables
▪Sensitivity: The higher the fewer false negatives (True positive)
▪Specificity: True negatives
▪Inference, Soft predictors (Prob), hard predictors (0/1)
▪Loss functions: How bad the error is: Punish neuron
▪ERM: Empirical Risk Minimisation
•Loss functions
•Reduce training loss:
◦1. Popoulation optimal unconstrained predictors
◦2. Popoulation optimal  within class predictor
◦ERM predictor (Minimal Loss)
▪Local & Global optimality
▪Backpropagation
•Computer Vision:
◦Low-level (Detection of edges(canny edge )/colour filtering)
▪convolution kernel (detect