In [1]:
import sys
import os

# Notebooks do not define __file__, so the kernel's working directory stands in
# for "the directory that contains this notebook".
current_dir = os.getcwd()

# llama_cpp_api is expected to be importable from one level above that directory.
parent_dir = os.path.dirname(current_dir)

# Make the parent directory importable; the guard keeps re-runs of this cell
# from appending duplicate entries to sys.path.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

import llama_cpp_api as llapi

# Shared client used by the later cells (server start-up, embeddings, completions).
API = llapi.LlamaAPI()

In [2]:
# Start the server with its default settings (the cell output reports success).
API.initialize_server()

Server started successfully


In [4]:
def inspect_response(response):
    """Print a response mapping as labelled sections.

    Every top-level key gets a ``=== key:`` header. A value that is itself a
    dict is expanded one level, each entry under a ``=== === key:`` header;
    any other value is printed as-is.

    Args:
        response: Mapping to display (anything exposing ``.items()``).

    Returns:
        None. The function only writes to stdout.
    """
    for section, payload in response.items():
        print(f'\n=== {section}:')
        if not isinstance(payload, dict):
            # Plain value: print it directly under the section header.
            print(f'{payload}\n')
            continue
        # Nested mapping: one sub-section per entry.
        for field, content in payload.items():
            print(f'\n=== === {field}:\n\n {content}\n')
    
        

In [11]:
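# RAG pipeline outline:
# 1. Import the documents
# 2. Process the text (sentence filtering and chunking)
# 3. Apply embeddings to each chunk
# 4. Store the chunks and their embeddings in ChromaDB
# 5. Prompt the LLM with context retrieved by querying ChromaDB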

In [None]:
import re

def apply_embeddings(pages_and_chunks_over_min_token_len: list,
                     flatten = False) -> list:
    """Attach an embedding to every chunk record and return the embedded records.

    For each record, runs of two or more whitespace characters in
    ``record["sentence_chunk"]`` are collapsed to a single space and the
    cleaned text is sent to ``API.embedding``. On success the vector is
    stored on the record under ``"embedding"``. The record is mutated in
    place, and its ``"sentence_chunk"`` keeps the original text.

    Records whose embedding response contains an ``'error'`` key are skipped:
    ``len(text) / 4`` and the cleaned text are printed for diagnosis and the
    record is left out of the result.

    Args:
        pages_and_chunks_over_min_token_len: Iterable of dicts, each with a
            ``"sentence_chunk"`` string.
        flatten: When truthy, nothing is embedded and an empty list is
            returned. NOTE(review): no flattened mode is implemented; confirm
            whether this flag is still needed.

    Returns:
        List of the input records that were embedded successfully, in input
        order.
    """
    final_pages_and_chunks = []
    if flatten:
        return final_pages_and_chunks

    for item in pages_and_chunks_over_min_token_len:
        # Raw string: '\s' in a plain literal is an invalid escape sequence
        # and triggers a SyntaxWarning on recent Python versions.
        text = re.sub(r'\s{2,}', ' ', item["sentence_chunk"])

        embedding_result = API.embedding(text)

        if 'error' in embedding_result:
            # len/4 is presumably a rough characters-per-token size estimate,
            # printed to help spot chunks that are too large. TODO confirm.
            print(len(text)/4)
            print(text)
            continue

        item["embedding"] = embedding_result['embedding']
        final_pages_and_chunks.append(item)

    return final_pages_and_chunks

  text = re.sub('\s{2,}',' ',item["sentence_chunk"])


In [13]:
def query_collection(query,
                     collection,
                     n_results=2):
    """Embed ``query`` and fetch the closest documents from ``collection``.

    Args:
        query: Text to embed with ``API.embedding``.
        collection: Object exposing
            ``query(query_embeddings=..., n_results=...)`` whose result has a
            ``'documents'`` entry holding one list of documents per query
            embedding.
        n_results: Number of documents to request. Defaults to 2, the value
            that was previously hard-coded.

    Returns:
        Tuple ``(context_items, results)``: the list of documents returned
        for the single query embedding, and the raw query result.

    Raises:
        KeyError: If the embedding response has no ``'embedding'`` entry,
            for example when the server reports an error.
    """
    embedding_result = API.embedding(query)
    if 'embedding' not in embedding_result:
        # Same exception type as the bare lookup would raise, but the message
        # carries the server's response instead of just the missing key name.
        raise KeyError(
            f"API.embedding returned no 'embedding' for the query: {embedding_result}"
        )
    query_embedding = embedding_result['embedding']

    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=n_results
    )
    # One query embedding was sent, so its documents are at index 0.
    context_items = list(results['documents'][0])

    return context_items, results

In [14]:
from rag_functions import (
                           process_webpage_content,
                           load_webpage_requests,
                           apply_spacy_nlp_filtered,
                           chunk_sentences,
                           restructure_chunks,
                           filter_pages_and_texts,
                           open_and_read_pdf,
                           connect_to_collection,
                           add_to_collection)

# Article to index, and the GGUF model file name that is later passed to
# add_to_collection as embedding_model_name.
url = 'https://en.wikipedia.org/wiki/Maxwell%27s_equations'
model_name = 'mistral-7b-instruct-v0.2.Q2_K.gguf'


# Load the page at `url` and turn its content into page/text records
# (both helpers come from rag_functions; their details are not visible here).
pages_and_texts = process_webpage_content(load_webpage_requests(url))
# Restart the server with type='embed' (presumably the embedding mode needed by
# API.embedding; confirm in llama_cpp_api). The output shows the previous
# server being shut down first.
API.initialize_server(type='embed')



Shutting down previous server
Server started successfully


In [16]:
# Run the rag_functions spaCy-based processing/filter step over the page records.
# NOTE(review): the exact filtering rules live in rag_functions and are not visible here.
pages_and_texts_filtered = apply_spacy_nlp_filtered(pages_and_texts)

In [19]:
# Group the filtered sentences into chunks of 10 sentences. Note that this
# rebinds pages_and_texts, replacing the raw page records loaded earlier.
pages_and_texts = chunk_sentences(pages_and_texts_filtered, num_sentence_chunk_size=10)
# Restructure into one record per chunk (the displayed records carry
# 'page_number' and 'sentence_chunk' keys).
pages_and_chunks = restructure_chunks(pages_and_texts)
# Filter the chunk records with a threshold of 30; presumably a minimum token
# length, judging by apply_embeddings' parameter name. Confirm in rag_functions.
pages_and_chunks = filter_pages_and_texts(pages_and_chunks,30)


In [20]:
# Display the chunk records for inspection (the full list repr is shown as the cell output).
pages_and_chunks

[{'page_number': 1,
  'sentence_chunk': "The equations provide a mathematical model for electric, optical, and radio technologies, such as power generation, electric motors, wireless communication, lenses, radar, etc. They describe how electric and magnetic fields are generated by charges, currents, and changes of the fields.[note 1] The equations are named after the physicist and mathematician James Clerk Maxwell, who, in 1861 and 1862, published an early form of the equations that included the Lorentz force law. Maxwell first used the equations to propose that light is an electromagnetic phenomenon. The modern form of the equations in their most common formulation is credited to Oliver Heaviside.[1]\nMaxwell's equations may be combined to demonstrate how fluctuations in electromagnetic fields (waves) propagate at a constant speed in vacuum, c (299792458\xa0m/s[2]). Known as electromagnetic radiation, these waves occur at various wavelengths to produce a spectrum of radiation from rad

In [21]:
# Embed every chunk. Chunks whose embedding request reports an error are printed
# (size estimate + text) by apply_embeddings and dropped from the returned list.
pages_and_chunks = apply_embeddings(pages_and_chunks)

# Open the collection (local=True; presumably a local ChromaDB store, confirm
# in rag_functions) and add the embedded chunks, passing the model name and URL.
client, collection = connect_to_collection(local=True)
add_to_collection(pages_and_chunks,collection,embedding_model_name=model_name,url=url)

# Retrieve the documents closest to the question (query_collection asks for 2 results).
prompt = "What does the article say about magnetic monopoles?"
context_items, results = query_collection(prompt,collection)

# Format the retrieved documents as a "- " bullet list and wrap them, together
# with the question, into the final prompt.
context = "- " + "\n- ".join(context_items)
# Disabled alternative input: read a PDF instead (pdf_path is not defined in the visible cells).
#pages_and_texts = open_and_read_pdf(pdf_path)
context_prompt =f'With this context: {context}, answer the following prompt: {prompt}'

# Restart the server without type='embed' (presumably switching from embedding
# mode back to the default completion mode; confirm in llama_cpp_api).
API.initialize_server()

# Ask for a completion of the context-augmented prompt and print the response by section.
response = API.completion(context_prompt)
inspect_response(response)

585.5
The equations provide a mathematical model for electric, optical, and radio technologies, such as power generation, electric motors, wireless communication, lenses, radar, etc. They describe how electric and magnetic fields are generated by charges, currents, and changes of the fields.[note 1] The equations are named after the physicist and mathematician James Clerk Maxwell, who, in 1861 and 1862, published an early form of the equations that included the Lorentz force law. Maxwell first used the equations to propose that light is an electromagnetic phenomenon. The modern form of the equations in their most common formulation is credited to Oliver Heaviside.[1]
Maxwell's equations may be combined to demonstrate how fluctuations in electromagnetic fields (waves) propagate at a constant speed in vacuum, c (299792458 m/s[2]). Known as electromagnetic radiation, these waves occur at various wavelengths to produce a spectrum of radiation from radio waves to gamma rays.
In partial diff