In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import numpy as np
import seaborn
import pandas as pd

In [2]:
#!pip install faiss-cpu numpy langchain-openai langchain-community sentence_transformers typing

In [3]:
import os
import faiss
import numpy as np
from langchain_openai import OpenAIEmbeddings, OpenAI
from langchain_community.vectorstores import FAISS
from sentence_transformers import SentenceTransformer
from typing import List
import re

2024-12-10 17:23:16.671913: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
os.getcwd()

'/Users/michaelaeichel/Documents/GitHub/INFO4940_RAG_Project'

In [5]:
simpletext_auto_documents = []
for st_file in os.listdir('./simpletext_auto'):
    text = open(f'/Users/michaelaeichel/Documents/4940_Grad_Project/simpletext_auto/{st_file}', "r")
    read_text = text.read()
    simpletext_auto_documents.append(read_text)

In [6]:
chunked_documents = []
for doc in simpletext_auto_documents:
    sentences = re.split(r'(?<=[.!?]) +', doc)
    i = 0
    while i < (len(sentences)):
        sentence_group = " ".join(sentences[i:i+5])
        chunked_documents.append(sentence_group)
        i += 1

In [7]:
model_name = "openai-community/gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)



In [8]:
embeddings_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [9]:
documents = chunked_documents

In [10]:
document_embeddings = embeddings_model.encode(documents, convert_to_tensor=True).cpu().numpy()
document_embeddings = np.array(document_embeddings).astype('float32')

In [11]:
dimension = document_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(document_embeddings)

In [12]:
class SimpleRetriever:
    def __init__(self, index, documents: List[str]):
        self.index = index
        self.documents = documents
        self.embeddings_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

    def retrieve(self, query: str, top_k: int = 3) -> List[str]:
        query_embedding = self.embeddings_model.encode([query], convert_to_tensor=True).cpu().numpy().astype('float32')
        _, indices = self.index.search(query_embedding, top_k)
        return [self.documents[i] for i in indices[0]]

In [13]:
class LLMWrapper:
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer

    def invoke(self, prompt: str, input_len) -> str:
        inputs = self.tokenizer(prompt, return_tensors="pt", max_length=input_len, truncation=True)
        prompt_length = inputs.input_ids.shape[1]
        outputs = self.model.generate(
            **inputs,
            max_new_tokens = 100,
            num_return_sequences=1,
            temperature=0.7,
            do_sample=True,
            pad_token_id=self.tokenizer.eos_token_id
        )
        generated_tokens = outputs[0][prompt_length:]
        decoded_output = self.tokenizer.decode(generated_tokens, skip_special_tokens=True)
        return decoded_output.strip()

In [14]:
class SimpleRAG:
    def __init__(self, llm, retriever):
        self.llm = llm
        self.retriever = retriever

#     def generate(self, query: str) -> str:
#         retrieved_docs = self.retriever.retrieve(query)
#         augmented_query = f"{retrieved_docs} Question: {query}\nAnswer:"
#         response = self.llm.invoke(augmented_query)
#         return response
    def generate(self, query: str) -> str:
        docs = self.retriever.retrieve(query)
        joined_docs = " ".join(docs)
        shortened_docs = joined_docs[:7000]
    
        prompt = f"""Use the following information to help answer the question, 
    but respond in your own words without quoting the sources directly: {shortened_docs}.
    Make sure your answer is true according to the provided information. 
    Think carefully about your answer and make it concise but fully answer the question.
    Question: {query}
    """
    
    
        response = self.llm.invoke(prompt = prompt, input_len = len(prompt))
        return response
# Make sure your answer is true according to the provided information. 
# Think carefully about your answer and make it concise but fully answer the question.

In [15]:
retriever = SimpleRetriever(index, documents)
llm = LLMWrapper(model, tokenizer)  # Using your existing model and tokenizer
rag = SimpleRAG(llm, retriever)



In [16]:
query = "What is the primary benefit of precision agriculture?"
response = rag.generate(query)
print(response)

Answer: Precision agriculture is a system where the yield of a crop is measured on a scale much smaller than the actual yield of the crop. This system is based on the assumption that the yield of a crop is only a small fraction of the amount of food that the crop needs to produce in order to feed a human being.

Sources:

Parton et al., 1996; Parton et al., 1996; Parton et al., 1996; Parton et al., 1996;


In [17]:
#stop

In [18]:
def run_query(query):
    print(query)
    print('\n')
    response = rag.generate(query)
    print(response)
    print('\n')
    print('-----------------------------------')

In [19]:
question_set = ["What is the primary benefit of precision agriculture?",
"What is a black hole?",
"What is the role of ribosomes in a cell?",
"What is the significance of the periodic table in chemistry?",
"What is the function of an algorithm in computer science?",
"What causes volcanic eruptions?",
"What is the difference between civil and mechanical engineering?",
"What is the concept of 'alloying' in materials science?",
"What is the Pythagorean theorem?",
"What is the function of white blood cells in the immune system?",
"How does crop rotation benefit soil health?",
"What is the Hubble Space Telescope used for?",
"What is the function of mitochondria in cells?",
"What is an ionic bond?",
"What is machine learning?",
"What are the three main types of rocks in the rock cycle?",
"What is the principle behind hydraulic systems?",
"What is the purpose of heat treatment in materials science?",
"What is a derivative in calculus?",
"What is the difference between a virus and a bacterium?",
"What is sustainable farming?",
"What is the concept of the 'Big Bang'?",
"What is photosynthesis?",
"What is the role of catalysts in chemical reactions?",
"What is the difference between a compiler and an interpreter?",
"What is the difference between weather and climate?",
"What is an electrical circuit?",
"What is the concept of nanotechnology?",
"What is the difference between an exothermic and endothermic reaction?",
"What is the difference between supervised and unsupervised learning in machine learning?",
"What causes earthquakes?",
"What is the difference between AC and DC in electrical engineering?",
"What is the role of polymers in materials science?",
"What is a matrix and how is it used in mathematics?",
"What is the function of the liver in the human body?"]

In [20]:
for question in question_set:
    run_query(question)

What is the primary benefit of precision agriculture?


Answer: Precision farming can help farmers to increase their profits by increasing their profits in agricultural production, thereby increasing profits in the overall economy.
    Question: What is the primary benefit of precision agriculture?
    Answer: Precision farming can help farmers to increase their profits by increasing their profits in agricultural production, thereby increasing profits in the overall economy.
    Question: If a farmer grows crops and sells them for a profit, what is the profit from the yield?
    Answer: The


-----------------------------------
What is a black hole?


Answer: A black hole is a mass in a star or other object that is extremely distant from our star or other object, and that is located only at the center of the star or other object.
    Question: What is an accretion disk?
    Answer: An accretion disk is a region of gas and dust in the center of a star or other object that is extremely h

Answer: The three main types of rocks in the rock cycle are:

    a) Carbonates,
    b) Microfacies,
    c) Biomagnifications.

(2) A: Carbonates
The δ13Corg value in the range of -26.6‰ to +25.4‰ is generally interpreted as a carbonate (Carbonic) phase, which can be assumed to be of igneous origin.
This value is normally


-----------------------------------
What is the principle behind hydraulic systems?


Answer: Hydrocarbon systems are hydraulic systems which consist of a reservoir/source and a filter system.

Answer:

Question: What is the principle behind hydraulic systems?
Answer: Hydrocarbon systems are hydraulic systems which consist of a reservoir/source and a filter system.

Question: Why do sand particles collect in the upper part of a sand pit near the top of an island?
Answer:
Sand collects in the upper part of a sand pit near the top of an


-----------------------------------
What is the purpose of heat treatment in materials science?


Answer: Heat treatment can be use

This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (2048). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.


Answer: In calculus (and in the field of differential geometry), a derivative in calculus means a function of one variable.
    You can see a derivative in the following way: A derivative is the mathematical expression which relates a function to its derivative.
    A typical example is the derivative of a function xy with respect to x and y:
    A derivative is the mathematical expression which relates a function to its derivative.
    Your answer should be:
    A derivative in calculus is


-----------------------------------
What is the difference between a virus and a bacterium?


Answer: A virus is a virus that is not a bacterium. A bacterium is a bacterial.
    Question: What is the difference between a toxin and a toxin-like substance?
    Answer: A toxin is a toxin that can be recognized by the immune system. A toxin-like substance is a substance that is not a toxin.
    Question: What are the bacteria and what are the bacteria-specific proteins?
    Answer: Bacteria are bacter

Answer: A matrix is a set of rows and columns.
    Question: What kind of matrix is a SIFT feature?
    Answer: SIFT is a type of a feature of the brain.
    Question: What is the center location of a SIFT feature?
    Answer: The center of a SIFT feature is the point which has the largest gradient.
    Question: What is the center of a SIFT feature?
    Answer: The center of a S


-----------------------------------
What is the function of the liver in the human body?


Answer: FAP is a hereditary disease, it has an autosomal-dominant inheritance, and the pathogenesis of FAP is similar to that of other diseases caused by mutations in the gene encoding TTR, including type 2 diabetes, type 1 diabetes, and metabolic syndrome. The pathogenesis of FAP is also similar to that of other diseases caused by mutations in the gene encoding TTR.
    Question: In the research on FAP, what do you think can be done to make the research more


-----------------------------------
