[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/pinecone-field/notebook-sandbox/blob/main/acl-reference.ipynb)

This notebook provides a reference for how to create Access Control List (ACL) rules for Retrieval Augmented Generation (RAG) based vector search.

![alt text](acl_rules.png)

# Step #1 - Install dependencies

In [None]:
!pip install -U "pinecone-client[grpc]"

# Step #2 - Create Pinecone Index

In [None]:
import getpass

# Prompted for at run time rather than hard-coded in the notebook.
API_KEY = getpass.getpass("Enter your API key: ")
ENVIRONMENT = getpass.getpass("Enter your environment: ")

# Pinecone index name used by the cells below.
INDEX_NAME = "acl-quicktest"

# Length of the dummy embedding vectors.
DIMENSIONS = 512
# Misspelled name kept as an alias because other cells still refer to it.
DIMENSNIONS = DIMENSIONS

In [None]:
import pinecone

pinecone.init(api_key=API_KEY, environment=ENVIRONMENT)

# Create the index only when it is missing, so re-running this cell is harmless.
if INDEX_NAME not in pinecone.list_indexes():
    pinecone.create_index(
        INDEX_NAME,
        dimension=DIMENSNIONS,  # same constant the dummy embeddings are built with
        metric="cosine",
        pods=1,
        replicas=1,
        pod_type="s1.x1",
    )
else:
    print(f"Index {INDEX_NAME} already exists")

index = pinecone.Index(INDEX_NAME)

# Step #3 - Generate 5000 vectors

### Namespace ACL filter 
1. Randomly pick a workspace_id: "1", "2", "3"

### Meta-data ACL filter 
2. Randomly pick a group_id: "1", "2", "3"
3. Randomly pick a doc_type: "1", "2", "3"
4. Generate a dummy vector embedding using a value seed that gets incremented for each additional vector

### Set vector ID to doc_id with child chunk_id composite key
5. Naming convention for vector id is: "document-{doc_id}-{chunk_count}"

In [None]:
import numpy as np
import random
import uuid

def generate_vectors(num_docs=100, chunks_per_doc=50):
    """Upsert dummy document chunks into the Pinecone index.

    For each document a group_id and a doc_type are drawn at random and attached
    as metadata to every chunk; the document's chunks are then upserted into a
    randomly chosen workspace namespace. Chunk ids follow
    "document-{doc_id}-{chunk_index}", and the "-0" chunk additionally receives a
    "chunk_count" metadata field holding the number of chunks in the document.

    Args:
        num_docs: Number of documents to generate (default 100).
        chunks_per_doc: Number of chunks per document (default 50); must be >= 1
            because the "-0" chunk carries the chunk_count metadata.

    Raises:
        ValueError: If chunks_per_doc is less than 1.
    """
    if chunks_per_doc < 1:
        raise ValueError("chunks_per_doc must be at least 1")

    acl_values = ["1", "2", "3"]
    # Fill value of the dummy embedding; grows by 0.1 for every vector generated.
    float_seed = 0.1
    for doc_id in range(1, num_docs + 1):
        meta_data = {"group_id": random.choice(acl_values),
                     "doc_type": random.choice(acl_values)}

        vectors = []
        for chunk_index in range(chunks_per_doc):
            embeddings = np.full(shape=DIMENSNIONS, fill_value=float_seed).tolist()
            vectors.append({'id': f"document-{doc_id}-{chunk_index}",
                            'values': embeddings,
                            'metadata': meta_data})
            float_seed = float_seed + 0.1

        workspace = random.choice(acl_values)
        index.upsert(vectors, namespace=workspace)
        # Store the chunk total on the first chunk so the whole document can
        # later be deleted by id.
        index.update(f"document-{doc_id}-0", set_metadata={"chunk_count": chunks_per_doc}, namespace=workspace)

generate_vectors()
# Fetch the stats once; a bare call in the middle of a cell is not displayed.
print(f"Index Stats: {index.describe_index_stats()}")

# Step #4 - Test ACL query logic

1. Set `workspace` to the workspace id to limit query results to a specific namespace
1. Alter `group_ids` list to see if group_id filter is being applied correctly
1. Alter `doctype_ids` list to see if doc_type filter is being applied correctly

You should see that query results are limited to just the workspace, group_ids and doctype_ids passed to the 
`acl_query()` function.

In [None]:
def acl_query(group_ids, doctype_ids, workspace, top_k, vector):
    """Query the index with namespace and metadata ACL restrictions applied.

    Args:
        group_ids: group_id metadata values a match is allowed to have.
        doctype_ids: doc_type metadata values a match is allowed to have.
        workspace: Namespace the query is sent to.
        top_k: Passed through as the query's top_k.
        vector: Query vector.

    Returns:
        The `matches` attribute of the query response.
    """
    # Both conditions must hold for a vector to be returned.
    acl_filter = {
        "$and": [
            {"group_id": {"$in": group_ids}},
            {"doc_type": {"$in": doctype_ids}},
        ]
    }
    response = index.query(
        vector=vector,
        top_k=top_k,
        namespace=workspace,
        include_metadata=True,
        filter=acl_filter,
    )
    return response.matches

# ACL scope for the test query: one namespace plus the allowed metadata values.
workspace = "3"
group_ids = ["1", "2", "3"]
doctype_ids = ["1", "2", "3"]

# Dummy query embedding with every component set to 0.1.
vector = np.full(shape=DIMENSNIONS, fill_value=0.1).tolist()

print(
    acl_query(
        group_ids=group_ids,
        doctype_ids=doctype_ids,
        workspace=workspace,
        top_k=10,
        vector=vector,
    )
)


# Step #5 - Test delete by doc_id

1. Pick an id from the previous output
1. Get the base 0 doc id (this id has the chunk count in a meta-data field)
1. Fetch the chunk count for this base doc id from pinecone
1. Use the chunk count to create a list of chunk ids to delete
1. Delete the chunk ids from the index (far more efficient than deleting by metadata)

In [None]:
# Chunk id of the document to delete; any chunk id of that document works.
id = "document-7-16"
# Collects the chunk ids that will be deleted.
delete_ids = []

def get_doc_id(id):
    """Return the id of a document's first ("-0") chunk for any of its chunk ids.

    Chunk ids look like "document-{doc_id}-{chunk_index}"; the trailing
    "-{chunk_index}" is replaced with "-0". An id that contains no "-" is
    treated as the document prefix itself and simply gets "-0" appended.

    Args:
        id: A chunk id such as "document-7-16".

    Returns:
        The base chunk id, e.g. "document-7-0".
    """
    # rsplit with maxsplit=1 drops only the last "-"-separated component and
    # returns the whole string unchanged when there is no "-" at all.
    doc_id = id.rsplit('-', 1)[0]
    return f"{doc_id}-0"

def set_delete_ids(doc_id, chunk_count):
    """Append every chunk id of a document to the module-level `delete_ids` list.

    Args:
        doc_id: Id of one of the document's chunks, normally the "-0" id
            returned by get_doc_id(); its trailing "-{chunk_index}" is dropped
            to obtain the document prefix.
        chunk_count: Number of chunks in the document; converted with int(), so
            a float such as 50.0 is accepted.
    """
    # Split off the last "-" component instead of slicing two characters, so a
    # multi-digit chunk suffix does not corrupt the prefix.
    doc_prefix = doc_id.rsplit('-', 1)[0]
    delete_ids.extend(f"{doc_prefix}-{i}" for i in range(int(chunk_count)))

# Resolve the "-0" chunk id once; that chunk carries the chunk_count metadata.
base_id = get_doc_id(id)
fetched = index.fetch(ids=[base_id], namespace=workspace).vectors
if base_id not in fetched:
    # The sample id is hard-coded while documents are assigned to workspaces at
    # random, so fail with an actionable message instead of a bare KeyError.
    raise KeyError(f"{base_id} not found in namespace {workspace}; pick an id from the query output above")
chunk_count = fetched[base_id]["metadata"]["chunk_count"]
set_delete_ids(base_id, chunk_count)
print(f"IDs that will be deleted: {delete_ids}")
index.delete(ids=delete_ids, namespace=workspace)


# Step #6 - Validate that the document chunks have been deleted from the index

There should be 50 fewer total vectors: 5000 down to 4950.

In [None]:
# After the delete, fetching the document's "-0" chunk must return no vectors.
assert index.fetch(
    namespace=workspace,
    ids=[get_doc_id(id)],
).vectors == {}

# Print the index statistics after the delete.
print(index.describe_index_stats())

# Step #7 [OPTIONAL] - Delete all vectors from the index

In [None]:
# Wipe each workspace namespace in turn.
for namespace in ("1", "2", "3"):
    index.delete(delete_all=True, namespace=namespace)

# Last expression of the cell, so the notebook displays the resulting stats.
index.describe_index_stats()