# Installing environment

In [1]:
!pip install beautifulsoup4==4.12.3
!pip install requests==2.31.0



In [2]:
from google.colab import drive
# Mount Google Drive at /content/drive; later cells read key files from
# paths under `drive/MyDrive/`.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Fetch the `grequests.py` helper from GitHub by running curl through subprocess.
import subprocess

url = "https://raw.githubusercontent.com/Denis2054/RAG-Driven-Generative-AI/main/commons/grequests.py"
output_file = "grequests.py"

# No authentication header is sent; the file is requested anonymously.
# --fail makes curl exit non-zero on HTTP errors (e.g. 404). Without it curl
# exits 0 and saves the error page as the output file, so the failure branch
# below would never be reached for a bad URL.
curl_command = [
    "curl",
    "--fail",
    "-o", output_file,
    url
]

try:
  # check=True turns a non-zero curl exit status into CalledProcessError.
  subprocess.run(curl_command, check=True)
  print("Download successful.")
except subprocess.CalledProcessError:
  print("Failed to download the file.")

Download successful.


In [4]:
import subprocess
import os

def download(directory, filename, token=None):
  """Download one file from the GitHub repository into the working directory.

  The file is fetched from
  ``<base_url>/<directory>/<filename>`` and saved as ``filename``.

  Args:
    directory: Path of the folder inside the repository (no leading slash).
    filename: Name of the file to fetch; also used as the local output name.
    token: Optional GitHub private token. When given, it is sent in an
      ``Authorization`` header; when omitted, the request is anonymous.

  Returns:
    None. Success or failure is reported with a printed message.
  """
  # Same repository as the one used for the grequests.py download.
  base_url = 'https://raw.githubusercontent.com/Denis2054/RAG-Driven-Generative-AI/main/'

  file_url = f"{base_url}{directory}/{filename}"

  # Build an argument list (no shell) so that file names containing spaces or
  # shell metacharacters cannot alter the command.
  # --fail makes curl exit non-zero on HTTP errors instead of saving the
  # error page as the downloaded file.
  curl_command = ["curl", "--fail"]
  if token:
    # -H must always be followed by its header value; only add it with a token.
    curl_command += ["-H", f"Authorization: token {token}"]
  curl_command += ["-o", filename, file_url]

  # downloading the file
  try:
    subprocess.run(curl_command, check=True)
    print(f"Downloaded '{filename}' successfully.")
  except subprocess.CalledProcessError:
    print(f"Failed to download '{filename}'. Check URL, connection or if the token is correct/has appropiate permissions.")

In [5]:
!pip install deeplake==3.9.18
!pip install openai



In [6]:
# Overwrite the system resolver configuration so that it contains a single
# nameserver entry, 8.8.8.8. Mode 'w' truncates whatever the file held before.
# NOTE(review): writing under /etc presumably needs elevated privileges —
# confirm this works on the target runtime.
with open('/etc/resolv.conf', 'w') as file:
  file.write("nameserver 8.8.8.8")

In [7]:
# OpenAI auth
import os
import openai

# Read the key from the first line of the key file. The path is relative, so
# it resolves only when the working directory contains the `drive` folder.
# The `with` block closes the file even if reading raises.
with open("drive/MyDrive/api_key.txt", "r") as f:
  API_KEY = f.readline().strip()

# Export the key through the environment and set it on the openai module.
os.environ['OPENAI_API_KEY'] = API_KEY
openai.api_key = os.getenv("OPENAI_API_KEY")

In [8]:
# Active loop auth for deep lake
# Read the token from the first line of the key file; the `with` block
# guarantees the handle is closed even if reading raises.
with open("drive/MyDrive/activeloop_key.txt", "r") as f:
  API_token = f.readline().strip()

# Keep both names defined and export the token through the environment.
ACTIVELOOP_TOKEN = API_token
os.environ['ACTIVELOOP_TOKEN'] = ACTIVELOOP_TOKEN

# Augmented input generation

In [9]:
# Deep Lake hub path of the dataset that backs the vector store.
vector_store_path = "hub://rag_example/space_exploration_v2"

In [14]:
from deeplake.core.vectorstore.deeplake_vectorstore import VectorStore
import deeplake.util

# Load the dataset stored at `vector_store_path`.
ds = deeplake.load(vector_store_path)

# Open the same path through the VectorStore interface; `vector_store` is the
# object whose `.search(...)` method is used for retrieval.
vector_store = VectorStore(path=vector_store_path)

-

This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/rag_example/space_exploration_v2



/

hub://rag_example/space_exploration_v2 loaded successfully.



 

Deep Lake Dataset in hub://rag_example/space_exploration_v2 already exists, loading from the storage


In [12]:
def embedding_function(texts, model="text-embedding-ada-002"):
   """Return one embedding per input text using the OpenAI embeddings API.

   Args:
      texts: A single string or a list of strings. A single string is
         treated as a one-element list.
      model: Name of the embedding model passed to the API.

   Returns:
      A list with the `embedding` attribute of each item in the API
      response's `data`, in response order.
   """
   batch = [texts] if isinstance(texts, str) else texts

   # Newlines are replaced with spaces before the texts are sent.
   cleaned = [item.replace("\n", " ") for item in batch]

   response = openai.embeddings.create(input = cleaned, model=model)
   embeddings = []
   for record in response.data:
       embeddings.append(record.embedding)
   return embeddings

In [15]:
def get_user_prompt():
  """Ask for a search query on standard input and return the typed text."""
  query = input("Enter your query search:")
  return query

def search_query(prompt):
  """Search the vector store for entries similar to `prompt`.

  Args:
    prompt: The query text to embed and search for.

  Returns:
    Whatever `vector_store.search` returns for this query.
  """
  # Use the argument, not the notebook-level `user_prompt`, so the function
  # searches for what the caller actually passed in.
  search_results = vector_store.search(embedding_data=prompt, embedding_function=embedding_function)
  return search_results

In [31]:
# Query text for this run; it is also reused later to build the augmented input.
user_prompt="Tell me about space exploration on the Moon and Mars."

# Perform the search
search_results = search_query(user_prompt)

# Print the search results (a dict of parallel lists such as 'id', 'metadata'
# and 'text', as the printed output shows).
print(search_results)

{'id': ['f3967780-e0d8-11ef-acc3-0242ac1c000c', 'f399b3aa-e0d8-11ef-acc3-0242ac1c000c', 'f3969850-e0d8-11ef-acc3-0242ac1c000c', 'f3968464-e0d8-11ef-acc3-0242ac1c000c'], 'metadata': [{'source': 'llm.txt'}, {'source': 'llm.txt'}, {'source': 'llm.txt'}, {'source': 'llm.txt'}], 'text': ['Exploration of space, planets, and moons "Space Exploration" redirects here. For the company, see SpaceX . For broader coverage of this topic, see Exploration . Buzz Aldrin taking a core sample of the Moon during the Apollo 11 mission Self-portrait of Curiosity rover on Mars \'s surface Part of a series on Spaceflight History History of spaceflight Space Race Timeline of spaceflight Space probes Lunar missions Mars missions Applications Communications Earth observation Exploration Espionage Military Navigation Settlement Telescopes Tourism Spacecraft Robotic spacecraft Satellite Space probe Cargo spacecraft Crewed spacecraft Apollo Lunar Module Space capsules Space Shuttle Space stations Spaceplanes Vostok

In [32]:
# Echo the query that was searched.
print(user_prompt)

Tell me about space exploration on the Moon and Mars.


In [17]:
# format output
def wrap_text(text, width=800):
  """Break `text` into lines of at most `width` characters.

  Lines are split at the last space found before `width`. When no usable
  space exists, the text is cut at exactly `width` characters.

  Args:
    text: The string to wrap.
    width: Maximum line length; must be at least 1.

  Returns:
    The wrapped text with lines joined by newline characters. Text that
    already fits is returned unchanged.

  Raises:
    ValueError: If `width` is smaller than 1.
  """
  if width < 1:
    # A non-positive width could never shorten the text and would loop forever.
    raise ValueError("width must be a positive integer")

  lines = []

  while len(text) > width:
    split_index = text.rfind(' ', 0, width)
    # -1: no space in range; 0: the only space is the first character.
    # In both cases fall back to a hard cut so each pass makes progress.
    if split_index <= 0:
      split_index = width
    lines.append(text[:split_index])
    # Drop the whitespace at the break so the next line starts on a word.
    text = text[split_index:].strip()

  # The remainder (possibly the whole text) is the final line.
  lines.append(text)
  return '\n'.join(lines)

In [19]:
import textwrap

# Index 0 of each field is taken as the best hit, on the assumption that the
# search results are ordered with the top result first.
top_score = search_results['score'][0]
top_text = search_results['text'][0].strip()
top_metadata = search_results['metadata'][0]['source']

# Report the top hit: one item per line, in a single print call.
print(
    "Top Search Result:",
    f"Score: {top_score}",
    f"Source: {top_metadata}",
    "Text:",
    wrap_text(top_text),
    sep="\n",
)

Top Search Result:
Score: 0.8789948225021362
Source: llm.txt
Text:
Exploration of space, planets, and moons "Space Exploration" redirects here. For the company, see SpaceX . For broader coverage of this topic, see Exploration . Buzz Aldrin taking a core sample of the Moon during the Apollo 11 mission Self-portrait of Curiosity rover on Mars 's surface Part of a series on Spaceflight History History of spaceflight Space Race Timeline of spaceflight Space probes Lunar missions Mars missions Applications Communications Earth observation Exploration Espionage Military Navigation Settlement Telescopes Tourism Spacecraft Robotic spacecraft Satellite Space probe Cargo spacecraft Crewed spacecraft Apollo Lunar Module Space capsules Space Shuttle Space stations Spaceplanes Vostok Space launch Spaceport Launch pad Expendable and reusable launch vehicles Escape velocity Non-rocket spacelaunch Spaceflight types Sub-orbital Orbital Interplanetary Interstellar Intergalactic List of space organizatio

In [20]:
# Augmented input = the user query, a single space, then the top retrieved text.
augmented_input = user_prompt + " " + top_text
print(augmented_input)

Tell me about space exploration on the Moon and Mars. Exploration of space, planets, and moons "Space Exploration" redirects here. For the company, see SpaceX . For broader coverage of this topic, see Exploration . Buzz Aldrin taking a core sample of the Moon during the Apollo 11 mission Self-portrait of Curiosity rover on Mars 's surface Part of a series on Spaceflight History History of spaceflight Space Race Timeline of spaceflight Space probes Lunar missions Mars missions Applications Communications Earth observation Exploration Espionage Military Navigation Settlement Telescopes Tourism Spacecraft Robotic spacecraft Satellite Space probe Cargo spacecraft Crewed spacecraft Apollo Lunar Module Space capsules Space Shuttle Space stations Spaceplanes Vostok Space launch Spaceport Launch pad Expendable and reusable launch vehicles Escape velocity Non-rocket spacelaunch Spaceflight types Sub-orbital Orbital Interplanetary Interstellar Intergalactic List of space organizations Space agen

In [33]:
from openai import OpenAI
# Client created with no explicit arguments.
client = OpenAI()
import time
# Model name passed to the chat completion request.
gpt_model = "gpt-4o"
# Timestamp taken when this cell runs; it is the reference point for the
# response-time measurement.
start_time = time.time()  # Start timing before the request

In [34]:
def call_gpt4_with_full_text(itext):
  """Send `itext` to the chat model and return its reply.

  Args:
    itext: The content to summarize or elaborate on. Either a single string
      or an iterable of strings, which are joined with newlines.

  Returns:
    The stripped text of the first choice in the response. If the request
    raises, the exception message is returned as a string instead, so
    callers always receive a string.
  """
  # A plain string must be used as-is: joining it would iterate over its
  # characters and place every character on its own line.
  if isinstance(itext, str):
    text_input = itext
  else:
    text_input = '\n'.join(itext)
  prompt = f"Please summarize or elaborate on the following content:\n{text_input}"

  try:
    response = client.chat.completions.create(
        model=gpt_model,
        messages=[
            {"role":"system", "content": "You are a space exploration expert."},
            {"role": "assistant", "content": "You can read the input and answer in detail."},
            {"role":"user", "content": prompt}
        ],
        temperature=0.1
    )

    return response.choices[0].message.content.strip()
  except Exception as e:
    # Best-effort by design: the error text is returned rather than raised.
    return str(e)

In [35]:
# Start the clock immediately before the request so the measurement covers
# only the model call, not the idle time since an earlier cell was executed
# (and stays correct when this cell is re-run on its own).
start_time = time.time()
gpt4_response = call_gpt4_with_full_text(augmented_input)

response_time = time.time() - start_time  # Measure response time
print(f"Response Time: {response_time:.2f} seconds")  # Print response time

print(gpt_model, "Response:", gpt4_response)

Response Time: 10.91 seconds
gpt-4o Response: Space exploration on the Moon and Mars has been a significant focus of scientific and technological efforts, driven by the desire to understand more about our solar system and the potential for human settlement beyond Earth.

### Moon Exploration:
- **Historical Context**: The Moon was the first celestial body beyond Earth to be visited by humans. The Apollo program, particularly the Apollo 11 mission in 1969, marked a monumental achievement with astronauts like Buzz Aldrin collecting lunar samples.
- **Current and Future Missions**: Recent missions have focused on robotic exploration, with countries like China, India, and private companies planning or executing missions to explore the lunar surface. NASA's Artemis program aims to return humans to the Moon and establish a sustainable presence by the end of the decade.

### Mars Exploration:
- **Robotic Missions**: Mars has been a target for robotic exploration for decades. NASA's rovers, su

In [36]:
import textwrap
import re
from IPython.display import display, Markdown, HTML
import markdown

def print_formatted_response(response):
  """Show `response` as rendered HTML if it looks like Markdown, else as wrapped text.

  Args:
    response: The text to display.

  Returns:
    None. Markdown-looking text is converted to HTML and displayed;
    anything else is printed wrapped at 80 columns between separator lines.
  """
  # One regex per list entry. Every entry needs its own comma: adjacent string
  # literals are concatenated by Python, which would silently merge two
  # patterns into one that matches neither construct.
  markdown_patterns = [
      r"^#+\s",         # Headers
      r"^\*+",          # Bullet points
      r"\*\*",          # Bold
      r"_",             # Italics
      r"\[.+\]\(.+\)",  # Links
      r"-\s",           # Dashes used for lists
      r"\`\`\`",        # Code blocks
  ]

  # If any pattern matches, assume the response is in markdown.
  # re.MULTILINE lets the ^-anchored patterns match at the start of any line.
  if any(re.search(pattern, response, re.MULTILINE) for pattern in markdown_patterns):
    # Markdown detected
    html_output = markdown.markdown(response)
    display(HTML(html_output))
  else:
    wrapper = textwrap.TextWrapper(width=80)
    wrapped_text = wrapper.fill(text=response)
    print("Text Response:")
    print("--------------------")
    print(wrapped_text)
    print("--------------------\n")
# Display the model's answer using the formatter defined above.
print_formatted_response(gpt4_response)

# Generation Evaluation

In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def calculate_cosine_similarity(text1, text2):
  """Return the cosine similarity between the TF-IDF vectors of two texts.

  Args:
    text1: First text.
    text2: Second text.

  Returns:
    The single similarity value (element [0][0] of the similarity matrix).
  """
  # Both texts are vectorized together, giving a two-row matrix.
  tfidf_matrix = TfidfVectorizer().fit_transform([text1, text2])
  first_row, second_row = tfidf_matrix[0:1], tfidf_matrix[1:2]
  return cosine_similarity(first_row, second_row)[0][0]

In [38]:
# Compare the bare user query with the generated answer (TF-IDF based).
similarity_score = calculate_cosine_similarity(user_prompt, gpt4_response)
print(f"Cosine Similarity Score: {similarity_score:.3f}")

# The score is low even though the answer reads as acceptable to a human.

Cosine Similarity Score: 0.441


In [39]:
# Compare the augmented input (query plus retrieved text) with the generated answer.
similarity_score = calculate_cosine_similarity(augmented_input, gpt4_response)
print(f"Cosine Similarity Score: {similarity_score:.3f}")

# Still fairly low; a higher score was expected here.

Cosine Similarity Score: 0.455


In [40]:
# using Sentence Transformers to calculate similarity involves embeddings that capture deeper semantic relationships between words and phrases.
!pip install sentence-transformers

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers)
 

In [43]:
# Depending on a project’s needs, this code could be yet another separate pipeline component.
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')

def calculate_cosine_similarity_with_embeddings(text1, text2):
  """Return the cosine similarity between the model embeddings of two texts.

  Args:
    text1: First text.
    text2: Second text.

  Returns:
    The single similarity value (element [0][0] of the similarity matrix).
  """
  # Encode both texts in order with the notebook-level `model`.
  vec_a, vec_b = (model.encode(text) for text in (text1, text2))

  # Each embedding is wrapped in a list to form a one-row input.
  return cosine_similarity([vec_a], [vec_b])[0][0]

In [45]:
# With sentence-transformer embeddings the similarity score for the same pair
# of texts is markedly higher.
similarity_score = calculate_cosine_similarity_with_embeddings(augmented_input, gpt4_response)
print(f"Cosine Similarity Score: {similarity_score:.3f}")

Cosine Similarity Score: 0.737
