<span style="font-size: 32px; font-weight: bold;">Retrieval Augmented Generation</span>

This Jupyter Notebook demonstrates the use of a vector database and a knowledge graph to enhance the performance of Retrieval-Augmented Generation (RAG) models. By leveraging these technologies, we aim to improve the accuracy and relevance of contextually appropriate responses in natural language processing tasks.

___

<span style="font-size: 20px; font-weight: bold;">Building the Knowledge Base</span>

In [1]:
import pandas as pd
import pyarrow.parquet as pa
import spacy



In [2]:
# Pull in the dataset loader from the Hugging Face `datasets` library.
from datasets import load_dataset

# Read only the first 10,000 rows of the local Parquet shard into `data`.
data = load_dataset(
    path="parquet",
    split="train[:10000]",
    data_files="train-00000-of-00001-090b52ccb189d47a.parquet",
)
data

  from .autonotebook import tqdm as notebook_tqdm


Dataset({
    features: ['text'],
    num_rows: 10000
})

In [3]:
import re


# A row is treated as a title when its text has at most two whitespace-separated words.
def is_title(row):
    """Return True if ``row["text"]`` splits into two or fewer whitespace-separated tokens."""
    word_count = len(row["text"].split())
    return not word_count > 2

In [5]:
def separate_title_and_text(dataset):
    """Group a flat sequence of rows into one record per title.

    Rows for which ``is_title`` is true start a new article; every other row's
    ``"text"`` is collected as body text of the most recent title. Body rows
    seen before the first title are discarded.

    Args:
        dataset: iterable of mappings that each have a ``"text"`` entry.

    Returns:
        A list of ``{"title": ..., "text": ...}`` dicts, where ``"text"`` is
        the article's body rows joined with single spaces.
    """
    articles = []
    heading = None
    body_parts = []

    def flush():
        # Emit the article collected so far; nothing is emitted until a title was seen.
        if heading is not None:
            articles.append({"title": heading, "text": " ".join(body_parts)})

    for record in dataset:
        if not is_title(record):
            # Ordinary row: part of the current article's body.
            body_parts.append(record["text"])
            continue
        # Title row: close the previous article and start a fresh one.
        flush()
        heading = record["text"]
        body_parts = []

    # The final article has no following title to trigger its flush.
    flush()
    return articles


# Collapse the raw rows into a list of {"title", "text"} dicts, one per detected title.
wiki_data = separate_title_and_text(data)

In [10]:
# Build one plain dict per article and attach a unique, position-based integer id.
# `wiki_data` is a sequence of {"title", "text"} rows, so iterate over the rows
# themselves: indexing a list by column name (wiki_data["title"]) raises TypeError.
wiki_dictionaries = [
    {"id": i, "title": row["title"], "text": row["text"]}
    for i, row in enumerate(wiki_data)
]

___

<span style="font-size: 20px; font-weight: bold;">Modeling Data with Knowledge Graphs</span>

In [11]:
import os

import pinecone
import torch
from neo4j import GraphDatabase
from transformers import AutoModel, AutoTokenizer

# Neo4j configuration.
# Credentials are read from the environment so that no secret is stored in the
# notebook. SECURITY: any password previously committed in this cell should be
# treated as leaked and rotated.
neo4j_uri = os.environ.get("NEO4J_URI", "neo4j+s://ef2b44af.databases.neo4j.io")
neo4j_username = os.environ.get("NEO4J_USERNAME", "neo4j")
# Raises KeyError when NEO4J_PASSWORD is unset, before the driver is created.
neo4j_password = os.environ["NEO4J_PASSWORD"]


# Neo4j driver initialization (used by the graph cells below).
driver = GraphDatabase.driver(neo4j_uri, auth=(neo4j_username, neo4j_password))

In [13]:
def create_graph_batch(tx, articles):
    """Upsert one batch of articles as ``Article`` nodes inside transaction ``tx``.

    Uses MERGE on the article id (instead of CREATE) so that re-running the
    population cell updates existing nodes rather than adding duplicates.

    Args:
        tx: object exposing ``run(query, **parameters)`` (a Neo4j transaction).
        articles: list of dicts with ``"id"``, ``"title"`` and ``"text"`` keys.
    """
    tx.run(
        """
        UNWIND $articles AS article
        MERGE (a:Article {id: article.id})
        SET a.title = article.title, a.text = article.text
    """,
        articles=articles,
    )


def populate_neo4j_graph_batched(wiki_dictionaries, batch_size=100):
    """Write all articles to Neo4j in consecutive slices of ``batch_size``.

    Opens one session on the notebook-level ``driver`` and issues one
    ``execute_write(create_graph_batch, ...)`` call per slice.

    Args:
        wiki_dictionaries: list of article dicts to store.
        batch_size: maximum number of articles per ``execute_write`` call.
    """
    with driver.session() as neo4j_session:
        offsets = range(0, len(wiki_dictionaries), batch_size)
        chunks = (wiki_dictionaries[start : start + batch_size] for start in offsets)
        for chunk in chunks:
            neo4j_session.execute_write(create_graph_batch, chunk)


# Store every article in Neo4j, at most 100 articles per `execute_write` call.
populate_neo4j_graph_batched(wiki_dictionaries, batch_size=100)

In [14]:
# Step 3: Generate Embeddings with Neo4j Context
# Load the tokenizer and encoder that generate_embedding_with_context reads as globals.
# NOTE: a later cell rebinds `tokenizer` to a tiktoken encoding, and a cell after
# that reloads this same checkpoint into `tokenizer` / `model`.
model_name = "sentence-transformers/all-mpnet-base-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)


def generate_embedding_with_context(batch, driver):
    """Embed one article's text together with the titles of its graph neighbours.

    Despite the parameter name, ``batch`` is used as a single article mapping:
    ``batch["id"]`` selects the node whose outgoing ``RELATED_TO`` neighbours
    are fetched, and ``batch["text"]`` is string-concatenated with those
    neighbours' titles before tokenization.

    Relies on the notebook-level ``tokenizer`` and ``model``.

    Args:
        batch: mapping with ``"id"`` and ``"text"`` entries.
        driver: Neo4j driver used to open the lookup session.

    Returns:
        The model's last hidden state averaged over dim 1 (presumably the
        token axis), converted to a nested Python list.
    """
    with driver.session() as session:
        # Titles of every article this one points to via RELATED_TO.
        neighbour_rows = session.run(
            """
            MATCH (a:Article {id: $node_id})-[:RELATED_TO]->(b:Article)
            RETURN b.title AS related_title
        """,
            node_id=batch["id"],
        ).data()

    neighbour_titles = [row["related_title"] for row in neighbour_rows]
    enriched_text = batch["text"] + " " + " ".join(neighbour_titles)

    encoded = tokenizer(
        enriched_text, return_tensors="pt", padding=True, truncation=True
    )
    # Inference only: no gradients are needed.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
        pooled = hidden_states.mean(dim=1)
    return pooled.numpy().tolist()

___

<span style="font-size: 20px; font-weight: bold;">Creating Embeddings</span>

In [15]:
# Convert the list of article dicts into a Hugging Face Dataset.
# `wiki_data` is a plain list at this point, so `wiki_data.from_list(...)` raises
# AttributeError; `from_list` is a constructor on the Dataset class itself.
from datasets import Dataset

wiki_data = Dataset.from_list(wiki_dictionaries)

In [16]:
import tiktoken

# Show which tiktoken encoding "gpt-3.5-turbo" maps to; the result is only displayed, not stored.
tiktoken.encoding_for_model("gpt-3.5-turbo")

<Encoding 'cl100k_base'>

In [17]:
import tiktoken

# Keep the tiktoken encoding under its own name. Binding it to `tokenizer` would
# replace the Hugging Face tokenizer loaded earlier, and `tiktoken_len` would then
# break as soon as a later cell assigns the Hugging Face tokenizer to `tokenizer` again.
tiktoken_encoding = tiktoken.get_encoding("cl100k_base")


# create the length function
def tiktoken_len(text):
    """Return the number of ``cl100k_base`` tokens in ``text``.

    ``disallowed_special=()`` is passed to ``encode``; presumably this turns off
    tiktoken's special-token check so arbitrary article text can be encoded
    (confirm against the tiktoken documentation).
    """
    return len(tiktoken_encoding.encode(text, disallowed_special=()))

In [18]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [19]:
# Split text into chunks of at most 400 tokens with a 20-token overlap.
# Sizes are counted in tokens, not characters, because length_function=tiktoken_len.
# Separators are listed from coarsest (blank line) to finest (empty string).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=20,
    length_function=tiktoken_len,
    separators=["\n\n", "\n", " ", ""],
)

In [21]:
# Second name for the same object as `wiki_data` (not a copy); in-place changes
# made through `data_files` are visible through `wiki_data` as well.
data_files = wiki_data

In [22]:
import torch
from transformers import AutoModel, AutoTokenizer

# Reload the embedding checkpoint. This also rebinds `tokenizer`, which an earlier
# cell set to a tiktoken encoding, back to the Hugging Face tokenizer.
model_name = "sentence-transformers/all-mpnet-base-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# NOTE(review): presumably this restricts the formatted output to the "text"
# column, hiding "id" and "title" — TODO confirm that is intended.
data_files.set_format("torch", columns=["text"])


def generate_embedding(batch):
    """Embed ``batch["text"]`` and store the vectors under ``batch["embeddings"]``.

    Relies on the notebook-level ``tokenizer`` and ``model``.

    The previous version computed the embeddings and then discarded them,
    returning the batch unchanged. It also averaged over padding positions;
    the mean is now weighted by the attention mask so that a text's embedding
    does not depend on how much padding its batch neighbours force on it.

    Args:
        batch: mapping whose ``"text"`` entry is a string or a list of strings.

    Returns:
        The same ``batch`` object with an added ``"embeddings"`` entry: a list
        with one vector (list of floats) per input text.
    """
    inputs = tokenizer(
        batch["text"], return_tensors="pt", padding=True, truncation=True
    )
    with torch.no_grad():
        token_states = model(**inputs).last_hidden_state

    # Zero out padding positions, then divide by the number of real tokens.
    mask = inputs["attention_mask"].unsqueeze(-1).to(token_states.dtype)
    pooled = (token_states * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)

    batch["embeddings"] = pooled.numpy().tolist()
    return batch

In [40]:
from tqdm import tqdm  # Import the progress bar library

embeddings = []
batch_size = 30

# `generate_embedding_with_context_batched` is not defined anywhere in this
# notebook, so calling it fails with NameError on a fresh kernel. Embed each
# article with the per-article function that is defined above instead.
# Every entry appended to `embeddings` is a nested [[...]] list holding one vector.
with tqdm(total=len(wiki_dictionaries), desc="Generating Embeddings") as progress_bar:
    for i in range(0, len(wiki_dictionaries), batch_size):
        batch = wiki_dictionaries[i : i + batch_size]
        batch_embeddings = [
            generate_embedding_with_context(article, driver) for article in batch
        ]
        embeddings.extend(batch_embeddings)

        # Advance by the number of articles just embedded (the last batch may be short).
        progress_bar.update(len(batch))

Generating Embeddings: 100%|██████████████████| 988/988 [05:06<00:00,  3.22it/s]


___

<span style="font-size: 20px; font-weight: bold;">Vector Database</span>

In [41]:
import pinecone

In [56]:
import os

import torch
from pinecone import Pinecone, ServerlessSpec
from transformers import AutoModel, AutoTokenizer

# Each entry of `embeddings` is a nested [[...]] list holding one vector;
# keep only the vector itself.
flattened_embeddings = [embedding[0] for embedding in embeddings]

# Initialize Pinecone. The API key is read from the environment so that no secret
# is stored in the notebook. SECURITY: any key previously committed in this cell
# should be treated as leaked and rotated.
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
index_name = "experiment2"


# Create the index only if it does not exist yet. An explicit existence check
# replaces the bare `except:`, which also hid unrelated failures (bad key,
# network errors). The dimension is taken from the embeddings themselves so a
# newly created index always matches the vectors being uploaded.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=len(flattened_embeddings[0]),
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
index = pc.Index(index_name)


# Prepare and upload vectors in batches
batch_size = 50
for i in range(0, len(flattened_embeddings), batch_size):
    # Vector ids are the article positions, which equal the ids in wiki_dictionaries.
    batch_vectors = [
        {
            "id": str(i + j),
            "values": e,
            "metadata": {"text": wiki_dictionaries[i + j]["text"]},
        }
        for j, e in enumerate(flattened_embeddings[i : i + batch_size])
    ]

    # Upsert batch
    index.upsert(vectors=batch_vectors, namespace="ns1")
    print(f"Uploaded batch {i//batch_size + 1}")

Uploaded batch 1
Uploaded batch 2
Uploaded batch 3
Uploaded batch 4
Uploaded batch 5
Uploaded batch 6
Uploaded batch 7
Uploaded batch 8
Uploaded batch 9
Uploaded batch 10
Uploaded batch 11
Uploaded batch 12
Uploaded batch 13
Uploaded batch 14
Uploaded batch 15
Uploaded batch 16
Uploaded batch 17
Uploaded batch 18
Uploaded batch 19
Uploaded batch 20


In [57]:
# Fetch index statistics to check the upload (index dimension and per-namespace vector counts).
stats = index.describe_index_stats()
print(stats)

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'ns1': {'vector_count': 988}},
 'total_vector_count': 988}


___

<span style="font-size: 20px; font-weight: bold;">Generative Question Answering</span>

In [58]:
# Define the query
query = "Tell me about the tech company known as Apple"

# Embed the query with the same tokenizer, model and mean pooling used for the articles
inputs = tokenizer(query, return_tensors="pt", padding=True, truncation=True)
with torch.no_grad():
    # [0] selects the single query's vector from the one-row result
    query_embedding = model(**inputs).last_hidden_state.mean(dim=1).numpy()[0]

# Convert the query embedding to a plain list of floats for the Pinecone client
query_vector = query_embedding.tolist()


# Test query with Pinecone: fetch the three nearest articles along with their stored text

results = index.query(
    namespace="ns1", vector=query_vector, top_k=3, include_metadata=True
)

print(results)

{'matches': [{'id': '296',
              'metadata': {'text': 'The Apple Macintosh or just “Mac” is a '
                                   'line of personal computers made by the '
                                   'American company Apple Inc. The Macintosh '
                                   'was one of the first computers in which '
                                   'the people could use a mouse for pointing '
                                   'on a screen which had icons. This new way '
                                   'of working with a computer (interface) was '
                                   'known as graphical user interface. It was '
                                   'this feature of the Macintosh that made it '
                                   'so popular. The Apple–Intel transition was '
                                   'when Apple changed the CPU of Macintosh '
                                   'computers from PowerPC processors to Intel '
                   