In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import numpy as np
import seaborn
import pandas as pd

In [2]:
#!pip install faiss-cpu numpy langchain-openai langchain-community sentence_transformers typing

In [3]:
import os
import faiss
import numpy as np
from langchain_openai import OpenAIEmbeddings, OpenAI
from langchain_community.vectorstores import FAISS
from sentence_transformers import SentenceTransformer
from typing import List
import re

2024-12-10 18:04:19.405572: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
# Display the kernel's working directory; the relative './simpletext_auto'
# path used when listing the source documents is resolved against it.
os.getcwd()

'/Users/michaelaeichel/Documents/GitHub/INFO4940_RAG_Project'

In [5]:
# Load every plain-text source document into memory, one string per file.
#
# The directory that is listed and the directory the files are opened from
# must be the same one. Previously the listing used './simpletext_auto' while
# open() used a hard-coded absolute path in a different project folder, so the
# cell only worked on one machine and only if both folders held the same names.
SIMPLETEXT_DIR = './simpletext_auto'

simpletext_auto_documents = []
# os.listdir() returns entries in arbitrary order; sorting keeps the document
# order (and therefore chunk / index order) reproducible across runs.
for st_file in sorted(os.listdir(SIMPLETEXT_DIR)):
    st_path = os.path.join(SIMPLETEXT_DIR, st_file)
    # Skip sub-directories and hidden files (e.g. macOS '.DS_Store'); they are
    # not documents and would either raise or pollute the corpus.
    if st_file.startswith('.') or not os.path.isfile(st_path):
        continue
    # The context manager closes the handle even if reading fails.
    with open(st_path, "r", encoding="utf-8") as text:
        simpletext_auto_documents.append(text.read())

In [6]:
# Build overlapping chunks: for every sentence position in a document, emit the
# run of up to five sentences starting there (the window advances one sentence
# at a time, so the final windows of each document are shorter than five).
chunked_documents = []
for doc in simpletext_auto_documents:
    # Split after '.', '!' or '?' when followed by one or more spaces.
    sentences = re.split(r'(?<=[.!?]) +', doc)
    chunked_documents.extend(
        " ".join(sentences[start:start + 5])
        for start in range(len(sentences))
    )

In [7]:
# Identifier of the generator model. The tokenizer and the causal language
# model are both loaded from this same identifier so that they stay matched.
model_name = "openai-community/gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)



In [8]:
# Sentence-embedding model used to turn the document chunks into vectors for
# the FAISS index.
embeddings_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [9]:
# The retrievable corpus is the list of sentence chunks; `documents` is an
# alias for the same list object, not a copy.
documents = chunked_documents

In [10]:
# Encode every chunk, move the result to host memory, and hand FAISS a float32
# NumPy matrix with one row per entry of `documents`.
embedding_tensor = embeddings_model.encode(documents, convert_to_tensor=True)
document_embeddings = np.array(embedding_tensor.cpu().numpy()).astype('float32')

In [11]:
# Build the vector index. The embedding width is read from the matrix itself
# so the index always matches whatever encoder produced `document_embeddings`.
dimension = document_embeddings.shape[1]
# IndexFlatL2 compares vectors by L2 distance.
index = faiss.IndexFlatL2(dimension)
# Row i of the index corresponds to documents[i]; the retriever relies on this.
index.add(document_embeddings)

In [12]:
class SimpleRetriever:
    """Nearest-neighbour retriever over a vector index of document embeddings.

    Args:
        index: Search index exposing ``search(query_vectors, k)`` and returning
            ``(distances, indices)``. Row ``i`` of the index must correspond to
            ``documents[i]``.
        documents: The texts that were embedded into ``index``, in index order.
        embeddings_model: Optional, already-loaded encoder exposing ``encode``.
            Pass the model that built ``index`` to avoid loading a second copy.
            When omitted, 'sentence-transformers/all-MiniLM-L6-v2' is loaded,
            which is the previous behaviour.
    """

    def __init__(self, index, documents: List[str], embeddings_model=None):
        self.index = index
        self.documents = documents
        if embeddings_model is None:
            embeddings_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
        self.embeddings_model = embeddings_model

    def retrieve(self, query: str, top_k: int = 3) -> List[str]:
        """Return up to ``top_k`` documents closest to ``query``, best match first.

        Fewer than ``top_k`` documents are returned when the index holds fewer
        entries; an empty list is returned for ``top_k <= 0`` or an empty corpus.
        """
        if top_k <= 0 or len(self.documents) == 0:
            return []

        # FAISS expects a C-contiguous float32 matrix of shape (n_queries, dim).
        query_embedding = np.ascontiguousarray(
            self.embeddings_model.encode([query], convert_to_numpy=True),
            dtype='float32',
        )
        _, indices = self.index.search(query_embedding, top_k)

        hits = []
        for raw_idx in indices[0]:
            idx = int(raw_idx)
            # FAISS pads the result with -1 when it finds fewer than top_k
            # neighbours. Indexing documents[-1] would silently return the
            # last document, so out-of-range ids are dropped instead.
            if 0 <= idx < len(self.documents):
                hits.append(self.documents[idx])
        return hits

In [41]:
class LLMWrapper:
    """Thin text-in / text-out wrapper around a causal LM and its tokenizer.

    Args:
        model: Model exposing ``generate(**inputs, ...)``. If it has a
            ``config.max_position_embeddings`` integer, that value is used to
            bound the prompt length.
        tokenizer: Callable tokenizer exposing ``decode`` and ``eos_token_id``.
        max_new_tokens: Number of tokens to generate per call (default 100,
            the value that was previously hard-coded).
    """

    def __init__(self, model, tokenizer, max_new_tokens: int = 100):
        self.model = model
        self.tokenizer = tokenizer
        self.max_new_tokens = max_new_tokens

    def _prompt_token_budget(self, input_len=None):
        """Return the maximum number of prompt tokens to keep, or None if unbounded."""
        config = getattr(self.model, "config", None)
        context_len = getattr(config, "max_position_embeddings", None)

        budget = None
        if isinstance(context_len, int) and context_len > 0:
            # Prompt and generated tokens share the same position budget, so
            # room for the continuation is reserved up front.
            budget = max(1, context_len - self.max_new_tokens)
        if input_len is not None and input_len > 0:
            # A caller-supplied limit can only tighten the budget, never
            # exceed what the model can address.
            budget = input_len if budget is None else min(budget, input_len)
        return budget

    def invoke(self, prompt: str, input_len=None) -> str:
        """Generate a continuation for ``prompt`` and return only the new text.

        Args:
            prompt: Full prompt text.
            input_len: Optional upper bound, in tokens, on the prompt length.
                It is combined with the model's own context limit; the smaller
                of the two is used for truncation.

        Returns:
            The decoded continuation (prompt tokens stripped). On failure a
            short error string is returned and the error is printed, rather
            than raising.
        """
        try:
            budget = self._prompt_token_budget(input_len)
            inputs = self.tokenizer(
                prompt,
                return_tensors="pt",
                max_length=budget,
                truncation=budget is not None,
            )
            prompt_length = inputs.input_ids.shape[1]
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=self.max_new_tokens,
                num_return_sequences=1,
                temperature=0.7,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id
            )
            # generate() returns prompt + continuation; keep the continuation.
            generated_tokens = outputs[0][prompt_length:]
            decoded_output = self.tokenizer.decode(generated_tokens, skip_special_tokens=True)
            return decoded_output

        except IndexError as e:
            print(f"IndexError occurred: {str(e)}")
            print("This error typically occurs when input sequence length exceeds model's position embedding limit")
            return "Error generating response - input may be too long"
        except Exception as e:
            print(f"Unexpected error occurred: {str(e)}")
            return "Error generating response"

In [42]:
class SimpleRAG:
    """Minimal retrieval-augmented generation pipeline: retrieve, prompt, generate.

    Args:
        llm: Object exposing ``invoke(prompt=..., input_len=...)`` that returns text.
        retriever: Object exposing ``retrieve(query)`` that returns a list of strings.
        max_context_chars: Character budget for the retrieved context inserted
            into the prompt. The default of 3000 is the value used for GPT-2;
            7000 was the value noted for Pythia.
    """

    def __init__(self, llm, retriever, max_context_chars: int = 3000):
        self.llm = llm
        self.retriever = retriever
        # A negative budget would slice from the end of the context; clamp it.
        self.max_context_chars = max(0, max_context_chars)

    def generate(self, query: str) -> str:
        """Answer ``query`` using retrieved context and return the generated text."""
        docs = self.retriever.retrieve(query)
        # Retrieved chunks are joined with single spaces and cut at the
        # character budget; the cut may fall mid-sentence.
        shortened_docs = " ".join(docs)[:self.max_context_chars]

        # Built line by line so the exact whitespace sent to the model,
        # including trailing spaces and the indented final line, is explicit.
        prompt = (
            "Use the following information to help answer the question, \n"
            f"    but respond in your own words without quoting the sources directly: {shortened_docs}.\n"
            "    Make sure your answer is true according to the provided information. \n"
            "    Think carefully about your answer and make it concise but fully answer the question.\n"
            f"    Question: {query}\n"
            "    "
        )

        # NOTE(review): input_len is a character count, while the wrapper in
        # this notebook passes it to the tokenizer as a token limit, so it is
        # typically a very loose bound. Confirm whether a token-based limit is
        # wanted here.
        return self.llm.invoke(prompt=prompt, input_len=len(prompt))
# Make sure your answer is true according to the provided information. 
# Think carefully about your answer and make it concise but fully answer the question.

In [43]:
# Wire the pipeline together: a retriever over the FAISS index, an LLM wrapper
# around the generator, and the RAG object that joins the two.
retriever = SimpleRetriever(index, documents)
llm = LLMWrapper(model, tokenizer)  # reuses the model and tokenizer loaded earlier in the notebook
rag = SimpleRAG(llm, retriever)

In [44]:
# Smoke test: send one question through the RAG pipeline and print the answer.
# Generation is sampled (do_sample=True in LLMWrapper.invoke), so the text
# differs from run to run.
query = "What is the primary benefit of precision agriculture?"
response = rag.generate(query)
print(response)

 Agronomists, I have been working with C-nut and C-nut seedlings for over 30 years now. We harvest and store C-nut and C-nut seedlings in a secure, well-controlled greenhouse. In the past, we have used some of the largest soil and nutrient resources available on Earth, but we have found that these resources are often deficient in very large quantities. We must find new ways of producing C-nuts and C-nut seeds. In Costa Rica


In [45]:
#stop

In [46]:
def run_query(query):
    """Print ``query``, the answer from the module-level ``rag``, and a separator.

    The question is written before generation starts, so anything printed
    during generation appears between the question and the answer.
    """
    # Question followed by two blank lines.
    print(query, end='\n\n\n')
    answer = rag.generate(query)
    # Answer, two blank lines, then the separator rule.
    print(answer, '\n', '-----------------------------------', sep='\n')

In [47]:
# Evaluation questions, one per entry. The list order is the order in which
# the batch loop submits them to the pipeline.
question_set = [
    "What is the primary benefit of precision agriculture?",
    "What is a black hole?",
    "What is the role of ribosomes in a cell?",
    "What is the significance of the periodic table in chemistry?",
    "What is the function of an algorithm in computer science?",
    "What causes volcanic eruptions?",
    "What is the difference between civil and mechanical engineering?",
    "What is the concept of 'alloying' in materials science?",
    "What is the Pythagorean theorem?",
    "What is the function of white blood cells in the immune system?",
    "How does crop rotation benefit soil health?",
    "What is the Hubble Space Telescope used for?",
    "What is the function of mitochondria in cells?",
    "What is an ionic bond?",
    "What is machine learning?",
    "What are the three main types of rocks in the rock cycle?",
    "What is the principle behind hydraulic systems?",
    "What is the purpose of heat treatment in materials science?",
    "What is a derivative in calculus?",
    "What is the difference between a virus and a bacterium?",
    "What is sustainable farming?",
    "What is the concept of the 'Big Bang'?",
    "What is photosynthesis?",
    "What is the role of catalysts in chemical reactions?",
    "What is the difference between a compiler and an interpreter?",
    "What is the difference between weather and climate?",
    "What is an electrical circuit?",
    "What is the concept of nanotechnology?",
    "What is the difference between an exothermic and endothermic reaction?",
    "What is the difference between supervised and unsupervised learning in machine learning?",
    "What causes earthquakes?",
    "What is the difference between AC and DC in electrical engineering?",
    "What is the role of polymers in materials science?",
    "What is a matrix and how is it used in mathematics?",
    "What is the function of the liver in the human body?",
]

In [48]:
# Run every question in the set through the pipeline; run_query prints the
# question, the generated answer, and a separator for each one.
for question in question_set:
    run_query(question)

What is the primary benefit of precision agriculture?


 Do you have the knowledge to help answer this question?
Answer:     
The only benefit that precision agriculture has is that it is able to be used with more organic matter inputs (and less fertiliser) than organic fertiliser available to other systems (Sharpe et al., 2009).
What is the best way to use precision agriculture in the field?
If you are in a climate system where the soil has grown to a certain depth, the nutrient content of organic


-----------------------------------
What is a black hole?


 Answer: A black hole is a sub-atomic particle (more on that in a moment) that, in a way, is like a black hole.
The shape and size of the black hole is similar to that of a star, but the size and shape of the black hole is different. The size and shape of a black hole are different because each is different in many ways and is affected by many different events. The size of a black hole is also different because it is a smaller are

 Answer: A virus is a large, living organism that is able to reproduce. A bacterium is a small organism that is unable to reproduce.
In contrast, a virus is a large, dead organism that is able to reproduce.
An infectious agent that reproduces on its own is not a virus, but a bacterium that is able to infect and destroy a host.
A bacterium does not grow on its own, but can infect and destroy a host.
A bacterium is


-----------------------------------
What is sustainable farming?


 Answer: Agriculture is a process of improving the soil quality by reducing the amount of nutrients available. Agriculture is a very complex process. The process of the soil's nutrient cycle is complex.
An organic diet is not the same as a natural diet, and the processes that produce organic matter are different. The organic matter is the material, the nutrient, and the organic process is different.
The nutrient cycle consists of the organic matter, the organic plant matter, the organic soil, and the organic 