In [10]:

from sentence_transformers import SentenceTransformer
from openai import OpenAI

def sbert_embed(text):
    model = SentenceTransformer('paraphrase-albert-small-v2')
    return model.encode(text)

def openai_embed(text):
    client = OpenAI()
    response = client.embeddings.create(input=text, model="text-embedding-ada-002")
    return response.data[0].embedding

In [4]:
from pathlib import Path
from PyPDF2 import PdfReader

def chunk_text(text, max_tokens=500):
    """Splits long text into chunks of approximately max_tokens (words ≈ tokens)."""
    words = text.split()
    for i in range(0, len(words), max_tokens):
        yield ' '.join(words[i:i + max_tokens])

def load_pdfs_from_folder(folder_path):
    pdf_texts = []
    pdf_paths = Path(folder_path).glob("*.pdf") 
    
    for pdf_path in pdf_paths:
        reader = PdfReader(str(pdf_path))
        text = ""
        for page in reader.pages:
            text += page.extract_text() or ""  # sometimes pages have no text
        pdf_texts.append((str(pdf_path), text))
    
    return pdf_texts # returns tuples, filename, full_text


def create_embedding(embedding_model):
    pdfs = load_pdfs_from_folder("./data/v1/docs") # 1. Load PDFs
    embedded_pdfs = []
    
    if (embedding_model=="sbert"):
        for filename, text in pdfs:
            for chunk in chunk_text(text, max_tokens=500):
                embedding = sbert_embed(chunk)
                embedded_pdfs.append((filename, embedding, chunk))
    
    elif (embedding_model=="open_ai"):
        for filename, text in pdfs:
            for chunk in chunk_text(text, max_tokens=500):
                embedding = openai_embed(chunk)
                embedded_pdfs.append((filename, embedding, chunk))

    else:
        print(f"embedding model {embedding_model} is not in this testing code")

    return embedded_pdfs

In [11]:
sbert_embeddings = create_embedding("sbert")

In [7]:
openai_embeddings = create_embedding("open_ai")

In [None]:
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance, PointStruct

client = QdrantClient("localhost", port=6333)
SBERT_EMBED_SIZE = 768
OPENAI_EMBED_SIZE = 1536

# Create a collection for each embedding type
for embed_type in [{'name': "sbert_embedding_new", 'size': SBERT_EMBED_SIZE, 'embeddings': sbert_embeddings}, 
                   {'name': "openai_embedding_new", 'size': OPENAI_EMBED_SIZE, 'embeddings': openai_embeddings}]:
    client.create_collection(
        collection_name=f"{embed_type['name']}_collection",
        vectors_config=VectorParams(size=embed_type['size'], distance=Distance.COSINE)
    )



In [32]:
client.upload_points(
    collection_name=f"sbert_embedding_new_collection",
    points=[
        PointStruct(
            id=idx,
            vector=embedding,
            payload={"text": original_text}
        )
        for idx, (_, embedding, original_text) in enumerate(embed_type['embeddings'])
    ]
)

In [35]:
client.upload_points(
    collection_name=f"openai_embedding_new_collection",
    points=[
        PointStruct(
            id=idx,
            vector=embedding,
            payload={"text": original_text}
        )
        for idx, (_, embedding, original_text) in enumerate(embed_type['embeddings'])
    ]
)

In [61]:
import openai

def openai_generate(prompt):
    client = OpenAI()
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=300,
        temperature=0.2
    )
    return response.choices[0].message.content


In [62]:
def simple_rag(query, embed_fn, collection_name, llm_fn, top_k=5):
    from qdrant_client import QdrantClient

    # Connect to Qdrant
    client = QdrantClient(host="localhost", port=6333)
    
    # Embed query
    query_vec = embed_fn(query)

    # Search Qdrant
    search_result = client.search(
        collection_name=collection_name,
        query_vector=query_vec,
        limit=top_k
    )

    # Extract top-k text chunks
    chunks = [hit.payload['text'] for hit in search_result]
    context = "\n\n".join(chunks)

    # Create prompt for LLM
    prompt = f"Use the context to answer:\n\n{context}\n\nQuestion: {query}\nAnswer:"

    # Generate answer using LLM
    return llm_fn(prompt)


In [63]:
query = "How did the economy perform in 2015?"

sbert_answer = simple_rag(query, sbert_embed, "sbert_embedding_new_collection", openai_generate)

  client = QdrantClient(host="localhost", port=6333)
  search_result = client.search(


In [64]:
sbert_answer

'The economy performed well in 2015, with a net income of $125 million in the revolving fund.'

In [69]:
queries = [
    'How did the economy perform in 2016?',
    'What economic sectors did well in 2015?',
    'What were some economic goals of 2016?',
    'What were some of the challenges of the year 2022?',
    'How did education change in the year 2022?'
]

results = []

for query in queries:
    sbert_answer = simple_rag(query, sbert_embed, "sbert_embedding_new_collection", openai_generate)
    openai_answer = simple_rag(query, sbert_embed, "openai_embedding_new_collection", openai_generate)
    results.append({'query': query, 'sbert_answer': sbert_answer, 'openai_answer': openai_answer})

f = open('result.md', 'w')
for res in results:
    f.write(f'* {res["query"]}\n')
    f.write(f'\t* SBERT: {res["sbert_answer"]}\n')
    f.write(f'\t* OPENAI: {res["openai_answer"]}\n')
    f.write('\n')

f.close()


  client = QdrantClient(host="localhost", port=6333)
  search_result = client.search(


# Results:
* How did the economy perform in 2016?
	* SBERT: Based on the information provided, the economy performed well in 2016 in terms of the revolving fund net income. The target for FY 2016 was met, with a net income of $162,463,231, showing an improvement from the previous year. Additionally, progress was made in refining cost accounting models and improving data reliability, indicating positive developments in financial management.
	* OPENAI: Based on the information provided, the economy performed well in 2016. The revolving fund net income increased to $162,463,231, which exceeded the target of ≥$0. Additionally, progress was made in refining cost accounting models, improving data reliability, and streamlining activities related to labor costs allocation. Overall, the economy showed positive growth and progress in various areas related to cost accounting and financial management.

* What economic sectors did well in 2015?
	* SBERT: Based on the provided information, it is not explicitly stated which economic sectors did well in 2015. The data primarily focuses on performance measures related to IT systems, employee engagement, and other specific initiatives within the Federal government.
	* OPENAI: Based on the provided information, it is not explicitly stated which economic sectors did well in 2015. The data primarily focuses on performance measures related to IT systems, employee engagement, and retirement benefits within the Federal government.

* What were some economic goals of 2016?
	* SBERT: Some economic goals of 2016 included improving the ability of the Federal human resource workforce to attract, develop, train, and support talent in the Federal Government by developing and launching a Federal HR curriculum, and transforming hiring, pay, and benefits across the Federal Government to attract and retain the best civilian workforce.
	* OPENAI: Some economic goals of 2016 included improving the ability of the Federal human resource workforce to attract, develop, train, and support talent in the Federal Government, building and launching curricula for employee relations and labor relations, and reducing the complexity and costs to administer Federal employee retirement earned benefits.

* What were some of the challenges of the year 2022?
	* SBERT: Some of the challenges of the year 2022 included low compliance with Government-wide past performance reporting requirements in contracting actions, insufficient resources for OPM managers and staff to get their jobs done, and the need to establish a sustainable funding and staffing model for OPM to better meet its mission.
	* OPENAI: Some of the challenges of the year 2022 included low compliance with Government-wide past performance reporting requirements in contracting actions and the need to establish a sustainable funding and staffing model for OPM to better meet its mission. Additionally, there were issues with OPM managers and staff indicating they did not have sufficient resources to get their jobs done.

* How did education change in the year 2022?
	* SBERT: In 2022, education underwent significant changes due to the COVID-19 pandemic. Schools had to quickly pivot to virtual learning, leading to the Department of Education providing support to millions of children, families, and educators. The focus was on transitioning back to in-person learning, assessing students' learning needs, and providing evidence-based resources and support. Additionally, there were new investments in postsecondary education through various relief funds to ensure learning continuity during the pandemic. Overall, the education system adapted to the challenges brought on by the pandemic and worked towards improving outcomes for all students.
	* OPENAI: In 2022, education underwent significant changes due to the COVID-19 pandemic. The Department of Education had to rapidly pivot to provide support for virtual learning as schools transitioned to online education. The Department focused on providing evidence-based strategies and practices to support students, families, and educators during this challenging time. Additionally, there was increased funding and support for postsecondary education through various relief funds to ensure learning continued for students during the pandemic. Overall, the education system adapted to new challenges and focused on providing equitable opportunities and outcomes for all students.



# My Comparison:

Overall the responses are very similar and only differ in some word and sentence structure choices. When reading the responses side by side, there are many similarities in word choices as well as flow of the overall report. It was interesting to compare these models and see how they slightly differ.