 # Installing the environment

In [9]:
!pip install llama-index==0.10.1
!pip install llama-index-vector-stores-deeplake
!pip install deeplake
!pip install llama-index-readers-file

Collecting llama-index-core<0.13.0,>=0.12.0 (from llama-index-vector-stores-deeplake)
  Using cached llama_index_core-0.12.16.post1-py3-none-any.whl.metadata (2.5 kB)
Using cached llama_index_core-0.12.16.post1-py3-none-any.whl (1.6 MB)
Installing collected packages: llama-index-core
  Attempting uninstall: llama-index-core
    Found existing installation: llama-index-core 0.10.68.post1
    Uninstalling llama-index-core-0.10.68.post1:
      Successfully uninstalled llama-index-core-0.10.68.post1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
llama-index-readers-file 0.1.33 requires llama-index-core<0.11.0,>=0.10.37.post1, but you have llama-index-core 0.12.16.post1 which is incompatible.
llama-index-agent-openai 0.1.7 requires llama-index-core<0.11.0,>=0.10.1, but you have llama-index-core 0.12.16.post1 which is incompatible.
llama-index-question-gen-open

In [10]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [11]:
# OpenAI auth
import os
import openai

f = open("drive/MyDrive/api_key.txt", "r")
API_KEY = f.readline().strip()
f.close()

os.environ['OPENAI_API_KEY'] = API_KEY
openai.api_key = os.getenv("OPENAI_API_KEY")

# Active loop auth for deep lake
f = open("drive/MyDrive/activeloop_key.txt", "r")
API_token=f.readline().strip()
f.close()
ACTIVELOOP_TOKEN=API_token
os.environ['ACTIVELOOP_TOKEN'] =ACTIVELOOP_TOKEN

# For Google Colab and Activeloop while waiting for Activeloop (April 2024) pending new version
#This line writes the string "nameserver 8.8.8.8" to the file. This is specifying that the DNS server the system
#should use is at the IP address 8.8.8.8, which is one of Google's Public DNS servers.
with open('/etc/resolv.conf', 'w') as file:
   file.write("nameserver 8.8.8.8")


In [12]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Document
from llama_index.vector_stores.deeplake import DeepLakeVectorStore

 # Collecting and preparing the documents

In [13]:
!mkdir data

mkdir: cannot create directory ‘data’: File exists


In [14]:
import requests
from bs4 import BeautifulSoup
import re
import os

urls = [
    "https://github.com/VisDrone/VisDrone-Dataset",
    "https://paperswithcode.com/dataset/visdrone",
    "https://openaccess.thecvf.com/content_ECCVW_2018/papers/11133/Zhu_VisDrone-DET2018_The_Vision_Meets_Drone_Object_Detection_in_Image_Challenge_ECCVW_2018_paper.pdf",
    "https://github.com/VisDrone/VisDrone2018-MOT-toolkit",
    "https://en.wikipedia.org/wiki/Object_detection",
    "https://en.wikipedia.org/wiki/Computer_vision",
    "https://en.wikipedia.org/wiki/Convolutional_neural_network",
    "https://en.wikipedia.org/wiki/Unmanned_aerial_vehicle",
    "https://www.faa.gov/uas/",
    "https://www.tensorflow.org/",
    "https://pytorch.org/",
    "https://keras.io/",
    "https://arxiv.org/abs/1804.06985",
    "https://arxiv.org/abs/2202.11983",
    "https://motchallenge.net/",
    "http://www.cvlibs.net/datasets/kitti/",
    "https://www.dronedeploy.com/",
    "https://www.dji.com/",
    "https://arxiv.org/",
    "https://openaccess.thecvf.com/",
    "https://roboflow.com/",
    "https://www.kaggle.com/",
    "https://paperswithcode.com/",
    "https://github.com/"
]

In [15]:
import requests
import re
import os
from bs4 import BeautifulSoup

def clean_text(content):
  content = re.sub(r'\[\d+\]', '', content)
  content = re.sub(r'\[^\w\s\.]', '', content)
  return content

def fetch_and_clean(url):
  try:
    response = requests.get(url)
    response.raise_for_status() # bad responses
    soup = BeautifulSoup(response.content, 'html.parser')

    # Prioritize "mw-parser-output" but fall back to "content" class if not found
    content = soup.find('div', {'class': 'mw-parser-output'}) or soup.find('div', {'id': 'content'})
    if content is None:
      return None

    # Remove specific sections, including nested ones
    for section_title in ['References', 'Bibliography', 'External links', 'See also', 'Notes']:
      section = content.find('span', id=section_title)
      while section:
        for sib in section.parent.find_next_siblings():
          sib.decompose()
        section.parent.decompose()
        section = content.find('span', id=section_title)

    # Extract and clean text
    text = content.get_text(separator=' ', strip=True)
    text = clean_text(text)
    return text
  except requests.exceptions.RequestException as e:
    print(f"Error fetching content from {url}: {e}")
    return None  # Return None on error

output_dir = './data/'
os.makedirs(output_dir, exist_ok=True)

# Processing each URL (and skipping invalid ones)
for url in urls:
  article_name = url.split('/')[-1].replace('html', '')
  filename = os.path.join(output_dir, f"{article_name}.txt")

  clean_article_text = fetch_and_clean(url)
  if clean_article_text:
    with open(filename, 'w', encoding='utf-8') as file:
      file.write(clean_article_text)

print(f"Content(ones that were possible) written to files in the '{output_dir}' directory.")



Content(ones that were possible) written to files in the './data/' directory.


In [16]:
# load
documents = SimpleDirectoryReader("./data/").load_data()

In [17]:
documents[0]

Document(id_='6b2e0306-db45-4d62-a3a9-482b5e3759fa', embedding=None, metadata={'file_path': '/content/data/1804.06985.txt', 'file_name': '1804.06985.txt', 'file_type': 'text/plain', 'file_size': 3959, 'creation_date': '2025-02-07', 'last_modified_date': '2025-02-07'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text="High Energy Physics - Theory arXiv:1804.06985 (hep-th) [Submitted on 19 Apr 2018] Title: A Near Horizon Extreme Binary Black Hole Geometry Authors: Jacob Ciafre , Maria J. Rodriguez View a PDF of the paper titled A Near Horizon Extreme Binary Black Hole Geometry, by Jacob Ciafre and Maria J. Rodriguez View PDF Abstract: A new solution of four-dimensional vacuum General Relativity is presented. It describes the near horizon region of the extrem

# Vector storage

In [None]:
from llama_index.core import StorageContext

vector_store_path = "hub://rag_example/drone_v2"
dataset_path = "hub://rag_example/drone_v2"

vector_store = DeepLakeVectorStore(dataset_path=dataset_path, overwrite=True)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Create an index over the documents
index = VectorStoreIndex.from_documents(documents, storage_context=storage_context)

Your Deep Lake dataset has been successfully created!




Uploading data to deeplake dataset.


100%|██████████| 96/96 [00:52<00:00,  1.82it/s]
\

Dataset(path='hub://rag_example/drone_v2', tensors=['text', 'metadata', 'embedding', 'id'])

  tensor      htype      shape      dtype  compression
  -------    -------    -------    -------  ------- 
   text       text      (96, 1)      str     None   
 metadata     json      (96, 1)      str     None   
 embedding  embedding  (96, 1536)  float32   None   
    id        text      (96, 1)      str     None   


 

In [None]:
import deeplake
ds = deeplake.load(dataset_path)

|

This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/rag_example/drone_v2



|

hub://rag_example/drone_v2 loaded successfully.



 

In [None]:
import json
import pandas as pd
import numpy as np

data = {}

# Iterate through the tensors in the dataset
for tensor_name in ds.tensors:
  tensor_data = ds[tensor_name].numpy()

  # Check if multi-dimensional
  if tensor_data.ndim > 1:
    # Flatten multi-dim tensors
    data[tensor_name] = [np.array(e).flatten().tolist() for e in tensor_data]
  else:
    # Convert 1D tensors to lists and decode text
    if tensor_name == "text":
      data[tensor_name] = [t.tobytes().decode('utf-8') if t else "" for t in tensor_data]
    else:
      data[tensor_name] = tensor_data.tolist()

df = pd.DataFrame(data)

In [None]:
def display_record(record_number):
  record = df.iloc[record_number]
  display_data = {
    "ID": record["id"] if "id" in record else "N/A",
    "Metadata": record["metadata"] if "metadata" in record else "N/A",
    "Text": record["text"] if "text" in record else "N/A",
    "Embedding": record["embedding"] if "embedding" in record else "N/A"
  }
  # Print the ID
  print("ID:")
  print(display_data["ID"])
  print()

  # Print the metadata in a structured format
  print("Metadata:")
  metadata = display_data["Metadata"]
  if isinstance(metadata, list):
    for item in metadata:
      for key, value in item.items():
        print(f"{key}: {value}")
        print()
  else:
    print(metadata)
    print()

  # Print the text
  print("Text:")
  print(display_data["Text"])
  print()

  # Print the embedding
  print("Embedding:")
  print(display_data["Embedding"])
  print()

# Function call to display a record
rec = 0  # Replace with the desired record number
display_record(rec)

ID:
['8e69a965-3e26-4fb4-87a8-72b64a29d684']

Metadata:
file_path: /content/data/1804.06985.txt

file_name: 1804.06985.txt

file_type: text/plain

file_size: 3959

creation_date: 2025-02-06

last_modified_date: 2025-02-06

_node_content: {"id_": "8e69a965-3e26-4fb4-87a8-72b64a29d684", "embedding": null, "metadata": {"file_path": "/content/data/1804.06985.txt", "file_name": "1804.06985.txt", "file_type": "text/plain", "file_size": 3959, "creation_date": "2025-02-06", "last_modified_date": "2025-02-06"}, "excluded_embed_metadata_keys": ["file_name", "file_type", "file_size", "creation_date", "last_modified_date", "last_accessed_date"], "excluded_llm_metadata_keys": ["file_name", "file_type", "file_size", "creation_date", "last_modified_date", "last_accessed_date"], "relationships": {"1": {"node_id": "49e0afe2-5b42-43ab-89a4-432f5f2d48d3", "node_type": "4", "metadata": {"file_path": "/content/data/1804.06985.txt", "file_name": "1804.06985.txt", "file_type": "text/plain", "file_size": 3959

In [None]:
rec = 0 # record number
print(display_record(rec))

ID:
['8e69a965-3e26-4fb4-87a8-72b64a29d684']

Metadata:
file_path: /content/data/1804.06985.txt

file_name: 1804.06985.txt

file_type: text/plain

file_size: 3959

creation_date: 2025-02-06

last_modified_date: 2025-02-06

_node_content: {"id_": "8e69a965-3e26-4fb4-87a8-72b64a29d684", "embedding": null, "metadata": {"file_path": "/content/data/1804.06985.txt", "file_name": "1804.06985.txt", "file_type": "text/plain", "file_size": 3959, "creation_date": "2025-02-06", "last_modified_date": "2025-02-06"}, "excluded_embed_metadata_keys": ["file_name", "file_type", "file_size", "creation_date", "last_modified_date", "last_accessed_date"], "excluded_llm_metadata_keys": ["file_name", "file_type", "file_size", "creation_date", "last_modified_date", "last_accessed_date"], "relationships": {"1": {"node_id": "49e0afe2-5b42-43ab-89a4-432f5f2d48d3", "node_type": "4", "metadata": {"file_path": "/content/data/1804.06985.txt", "file_name": "1804.06985.txt", "file_type": "text/plain", "file_size": 3959

# Index-based RAG
*   Vector store index engine
*   Tree index
*   List index
*   Keyword table index




In [18]:
user_input = "How do drones identify vehicles?"

#similarity_top_k
k=3 #  top 3 most probable responses - k will serve as a ranking function
temp=0.1
mt=1024 # token limit

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')

def calculate_cosine_similarity_with_embeddings(text1, text2):
  embeddings1 = model.encode(text1)
  embeddings2 = model.encode(text2)

  similarity = cosine_similarity([embeddings1], [embeddings2])

  return similarity[0][0]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


# Vector store index engine

In [20]:
from llama_index.core import VectorStoreIndex
vector_store_index = VectorStoreIndex.from_documents(documents)

In [None]:
print(type(vector_store_index))

In [21]:
vector_query_engine = vector_store_index.as_query_engine(similarity_top_k=k,
temperature=temp, num_output=mt)

In [22]:
import pandas as pd
import textwrap

def index_query(input_query):
  response = vector_query_engine.query(input_query)
  print(textwrap.fill(str(response), 100))

  node_data = []

  for node_with_score in response.source_nodes:
    node = node_with_score.node
    node_info = {
        'Node ID': node.id_,
        'Score': node_with_score.score,
        'Text': node.text
    }
    node_data.append(node_info)

  df = pd.DataFrame(node_data)

  return df, response


In [23]:
import time
#start the timer
start_time = time.time()
df, response = index_query(user_input)
# Stop the timer
end_time = time.time()

Drones identify vehicles through the implementation of a Class Identification Label, which serves as
a verification mechanism to confirm that drones within a specific class meet the rigorous standards
set by administrations for design and manufacturing.


In [24]:
elapsed_time = end_time - start_time
print(f"Query execution time: {elapsed_time:.4f} seconds")

print(df.to_markdown(index=False, numalign="left", stralign="left"))

Query execution time: 1.7359 seconds
| Node ID                              | Score    | Text                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           

In [25]:
nodeid = response.source_nodes[0].node_id
nodeid

'71a43926-cb5c-45ff-87d4-f1a14228b824'

In [26]:
response.source_nodes[0].get_text()

'[ 216 ] In 2021, the FAA published a rule requiring all commercially used UAVs and all UAVs regardless of intent weighing 250 g or more to participate in Remote ID , which makes drone locations, controller locations, and other information public from takeoff to shutdown; this rule has since been challenged in the pending federal lawsuit RaceDayQuads v. FAA . [ 217 ] [ 218 ] EU Drone Certification - Class Identification Label [ edit ] The implementation of the Class Identification Label serves a crucial purpose in the regulation and operation of drones. [ 219 ] The label is a verification mechanism designed to confirm that drones within a specific class meet the rigorous standards set by administrations for design and manufacturing. [ 220 ] These standards are necessary to ensure the safety and reliability of drones in various industries and applications. By providing this assurance to customers, the Class Identification Label helps to increase confidence in drone technology and encour

# Optimized chunking

In [27]:
for node_with_score in response.source_nodes:
  node = node_with_score.node
  chunk_size = len(node.text)
  print(f"Node ID: {node.id_}, Chunk Size: {chunk_size} characters")

Node ID: 71a43926-cb5c-45ff-87d4-f1a14228b824, Chunk Size: 3873 characters
Node ID: bddaa01c-a380-4c82-9ca4-8214b5810ece, Chunk Size: 4901 characters
Node ID: 820e66e0-5727-4f8c-b9e7-8add2371946e, Chunk Size: 4654 characters


In [28]:
import numpy as np

def info_metrics(response):
  # Calculate the performance (handling None scores)
  scores = [node.score for node in response.source_nodes if node.score is not None]
  if scores:  # Check if there are any valid scores
      weights = np.exp(scores) / np.sum(np.exp(scores))
      perf = np.average(scores, weights=weights) / elapsed_time
  else:
      perf = 0  # Or some other default value if all scores are None

  average_score=np.average(scores, weights=weights)
  print(f"Average score: {average_score:.4f}")
  print(f"Query execution time: {elapsed_time:.4f} seconds")
  print(f"Performance metric: {perf:.4f}")

In [29]:
info_metrics(response)

Average score: 0.8426
Query execution time: 1.7359 seconds
Performance metric: 0.4854


# Tree index query engine

In [30]:
from llama_index.core import TreeIndex
tree_index = TreeIndex.from_documents(documents)

In [31]:
tree_query_engine = tree_index.as_query_engine(similarity_top_k=k,
                              temperature=temp,
                              num_output=mt)

In [32]:
import time
import textwrap

start_time = time.time()
response = tree_query_engine.query(user_input)

end_time = time.time()

# Calculate and print the execution time
elapsed_time = end_time - start_time

print(f"Query execution time: {elapsed_time:.4f} seconds")
print(textwrap.fill(str(response), 100))

Query execution time: 4.4204 seconds
Drones identify vehicles by utilizing computer vision technology for object detection. They can
detect vehicles in images and videos by analyzing visual features and patterns specific to vehicles,
such as shape, size, and movement. This process involves using deep learning models trained on
datasets that include vehicle classes, allowing drones to accurately recognize and track vehicles in
various scenarios.


In [None]:
similarity_score = calculate_cosine_similarity_with_embeddings(user_input, str(response))
print(f"Cosine Similarity Score: {similarity_score:.3f}")
print(f"Query execution time: {elapsed_time:.4f} seconds")
performance=similarity_score/elapsed_time
print(f"Performance metric: {performance:.4f}")

Cosine Similarity Score: 0.804
Query execution time: 3.5562 seconds
Performance metric: 0.2260


# List index query engine

In [None]:
from llama_index.core import ListIndex
list_index = ListIndex.from_documents(documents)

In [None]:
list_query_engine = list_index.as_query_engine(similarity_top_k=k,
temperature=temp, num_output=mt)

In [None]:
#start the timer
start_time = time.time()
response = list_query_engine.query(user_input)
# Stop the timer
end_time = time.time()
# Calculate and print the execution time
elapsed_time = end_time - start_time
print(f"Query execution time: {elapsed_time:.4f} seconds")
print(textwrap.fill(str(response), 100))

Query execution time: 17.9808 seconds
Drones can identify vehicles through computer vision systems that analyze image data captured by
cameras mounted on the drones. These systems use object recognition algorithms to detect and
classify vehicles in the images, allowing the drones to identify and track vehicles in their field
of view.


In [None]:
similarity_score = calculate_cosine_similarity_with_embeddings(user_input, str(response))
print(f"Cosine Similarity Score: {similarity_score:.3f}")
print(f"Query execution time: {elapsed_time:.4f} seconds")
performance=similarity_score/elapsed_time
print(f"Performance metric: {performance:.4f}")

Cosine Similarity Score: 0.793
Query execution time: 17.9808 seconds
Performance metric: 0.0441


# Keyword index query engine

In [33]:
from llama_index.core import KeywordTableIndex
keyword_index = KeywordTableIndex.from_documents(documents)

In [35]:
data = []
for keyword, doc_ids in keyword_index.index_struct.table.items():
  for doc_id in doc_ids:
    data.append({"Keyword": keyword, "Document ID": doc_id})
# Create the DataFrame
df = pd.DataFrame(data)
df


Unnamed: 0,Keyword,Document ID
0,kerr black holes,f9a44ee2-2e2d-4bd4-afda-1b992a8da990
1,extreme binary black hole geometry,f9a44ee2-2e2d-4bd4-afda-1b992a8da990
2,nhek black hole,f9a44ee2-2e2d-4bd4-afda-1b992a8da990
3,entropy,f9a44ee2-2e2d-4bd4-afda-1b992a8da990
4,entropy,165833f5-a515-4141-a763-b8bf2ae17dac
...,...,...
4467,wikimedia commons,07414f4a-da00-4356-a860-d5286985a206
4468,direct,07414f4a-da00-4356-a860-d5286985a206
4469,quantitative patterns,07414f4a-da00-4356-a860-d5286985a206
4470,nations,b42dcb7c-9556-446b-8b53-31136ac04af3


In [36]:
keyword_query_engine = keyword_index.as_query_engine(similarity_top_k=k,
                                    temperature=temp, num_output=mt)

In [37]:
import time

start_time = time.time()
response = keyword_query_engine.query(user_input)

end_time = time.time()

elapsed_time = end_time - start_time

print(f"Query execution time: {elapsed_time:.4f} seconds")
print(textwrap.fill(str(response), 100))

Query execution time: 1.7925 seconds
Drones can identify vehicles through various sensors such as RGB, multispectral, hyper-spectral
cameras, LiDAR, and other monitoring devices. These sensors provide specific measurements and
observations that help drones in vehicle identification. Additionally, drones can use non-
cooperative sensors for autonomous target detection, aiding in separation assurance and collision
avoidance during vehicle identification processes.


In [38]:
# the execution times may vary from one run to another, due to the stochastic algorithms employed.
similarity_score = calculate_cosine_similarity_with_embeddings(user_input, str(response))
print(f"Cosine Similarity Score: {similarity_score:.3f}")
print(f"Query execution time: {elapsed_time:.4f} seconds")
performance=similarity_score/elapsed_time
print(f"Performance metric: {performance:.4f}")

Cosine Similarity Score: 0.760
Query execution time: 1.7925 seconds
Performance metric: 0.4238
