In [1]:
import ollama
import utils.helper as helper

In [2]:
# Enable IPython's autoreload extension. Mode 2 reloads imported modules
# before each cell executes, so edits to local modules (e.g. utils.helper)
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
# Load the source text through the project helper.
# NOTE(review): the return type is not visible here; the following cells
# treat it as an indexable sequence of strings - confirm against utils.helper.
data = helper.data_loader()

In [4]:
# Drop entries that are empty or contain only whitespace, keeping the
# remaining entries in their original order. Re-running this cell is safe:
# filtering an already-filtered list yields the same list.
data = [entry for entry in data if entry.strip()]

In [5]:
# Model identifiers handed to the ollama client (hf.co-hosted GGUF builds).
# emb_model is passed to ollama.embed for both chunks and queries.
emb_model = "hf.co/CompendiumLabs/bge-base-en-v1.5-gguf"
# llm is passed to ollama.chat to generate the answer.
llm = "hf.co/bartowski/SmolLM2-135M-Instruct-GGUF"

# In-memory store; add_chunk_to_database appends (chunk, embedding) tuples.
VECTOR_DB = []

In [6]:
def add_chunk_to_database(chunk):
  """Embed ``chunk`` and append it to ``VECTOR_DB``.

  The stored entry is a ``(chunk, embedding)`` tuple, where ``embedding`` is
  the first item of the ``'embeddings'`` field returned by ``ollama.embed``
  for the model named in ``emb_model``.
  """
  response = ollama.embed(model=emb_model, input=chunk)
  vector = response['embeddings'][0]
  VECTOR_DB.append((chunk, vector))

In [7]:
# Rebuild the store from scratch so that re-running this cell does not append
# a second copy of every chunk (duplicates would crowd out distinct chunks
# in the top-N retrieval results).
VECTOR_DB.clear()
for i, chunk in enumerate(data):
  add_chunk_to_database(chunk)
  print(f'Added chunk {i+1}/{len(data)} to the database')

Added chunk 1/14 to the database
Added chunk 2/14 to the database
Added chunk 3/14 to the database
Added chunk 4/14 to the database
Added chunk 5/14 to the database
Added chunk 6/14 to the database
Added chunk 7/14 to the database
Added chunk 8/14 to the database
Added chunk 9/14 to the database
Added chunk 10/14 to the database
Added chunk 11/14 to the database
Added chunk 12/14 to the database
Added chunk 13/14 to the database
Added chunk 14/14 to the database


In [8]:
def cosine_similarity(a, b):
  """Return the cosine similarity of two numeric vectors.

  Args:
    a: Sequence of numbers.
    b: Sequence of numbers, expected to have the same length as ``a``.

  Returns:
    ``dot(a, b) / (|a| * |b|)`` as a float, or ``0.0`` when either vector
    has zero magnitude (including empty input), where the ratio is undefined.

  Note:
    Lengths are not validated. ``zip`` stops at the shorter vector when
    computing the dot product, while each norm still uses its full vector.
  """
  dot_product = sum(x * y for x, y in zip(a, b))
  norm_a = sum(x ** 2 for x in a) ** 0.5
  norm_b = sum(x ** 2 for x in b) ** 0.5
  denominator = norm_a * norm_b
  # A zero-magnitude vector has no direction; report "no similarity"
  # instead of raising ZeroDivisionError.
  if denominator == 0:
    return 0.0
  return dot_product / denominator

In [9]:
def retrieve(query, top_n=3, min_similarity=0.5):
  """Return the stored chunks most similar to ``query``.

  Args:
    query: Text to embed with ``emb_model`` and compare against ``VECTOR_DB``.
    top_n: Maximum number of results to return.
    min_similarity: Chunks are kept only when their cosine similarity to the
      query is strictly greater than this value. Defaults to 0.5.

  Returns:
    A list of ``(chunk, similarity)`` tuples sorted by descending similarity,
    containing at most ``top_n`` items. The list is empty when no chunk
    exceeds ``min_similarity``.
  """
  query_embedding = ollama.embed(model=emb_model, input=query)['embeddings'][0]
  scored = (
    (chunk, cosine_similarity(query_embedding, embedding))
    for chunk, embedding in VECTOR_DB
  )
  similarities = [(chunk, score) for chunk, score in scored if score > min_similarity]
  # list.sort is stable, so equally scored chunks keep their insertion order.
  similarities.sort(key=lambda pair: pair[1], reverse=True)
  return similarities[:top_n]

In [12]:
def ask(query):
    """Answer ``query`` from retrieved context and stream the reply to stdout.

    Calls ``retrieve(query)``. When it returns nothing, a fallback message is
    printed and the function returns ``None`` without calling the chat model.
    Otherwise the retrieved chunks are listed in a system prompt, the query is
    sent as the user message to ``ollama.chat`` with ``stream=True``, and each
    streamed piece of content is printed as it arrives.
    """
    hits = retrieve(query)
    if len(hits) == 0:
        print("Could not find relevant information in the document.")
        return

    # Build the bulleted context up front so the prompt template stays readable.
    context_block = "\n".join(f' - {passage}' for passage, _score in hits)

    instruction_prompt = f'''You are a helpful chatbot.
    Use only the following pieces of context to answer the question. Don't make up any new information:
    {context_block}
    '''

    conversation = [
        {'role': 'system', 'content': instruction_prompt},
        {'role': 'user', 'content': query},
    ]

    for piece in ollama.chat(model=llm, messages=conversation, stream=True):
        print(piece['message']['content'], end='', flush=True)

In [13]:
# Query for which retrieve() returned no chunks in the recorded run, so ask()
# prints its fallback message instead of calling the chat model.
ask("what is python?")

Could not find relevant information in the document.


In [14]:
# Query for which retrieval found matching chunks in the recorded run; the
# model's streamed answer is printed below.
ask("what is plastic pollution?")

Plastic pollution refers to the large-scale accumulation of plastic waste in marine ecosystems and the environment throughout the world's oceans due to human consumption, waste generation, and improper disposal. This type of pollution has serious consequences for biodiversity loss, harm to marine life, and disruption of food chains, among others. Plastic pollution can occur naturally as microplastics break down into smaller particles or debris that accumulate in the ocean, but it also arises from the production and use of plastic products such as bags, containers, bottles, and other items made with plastics.

Plastic pollution is primarily caused by human activities like:

1. Discard-and-throw: Most plastic waste ends up being thrown away because people often dispose trash in oceans or landfills without taking proper disposal methods to prevent it from ending up there. 2. Producer/Consumer Trade: Plastic products are produced and used on a large scale by companies that use them as raw 