# Chunked vector-search using multiple inputs per document

In [1]:
import os

# Uncomment one of the following lines to use a bespoke MongoDB deployment
# For testing the default connection is to mongomock

mongodb_uri = os.getenv("MONGODB_URI", "mongomock://test")
# mongodb_uri = "mongodb://localhost:27017"
# mongodb_uri = "mongodb://pinnacle:pinnacle@mongodb:27017/documents"
# mongodb_uri = "mongodb://<user>:<pass>@<mongo_cluster>/<database>"
# mongodb_uri = "mongodb+srv://<username>:<password>@<atlas_cluster>/<database>"

# Super-Duper your Database!
from pinnacledb import pinnacle
db = pinnacle(mongodb_uri)



In [None]:
!curl -O https://pinnacledb-public.s3.eu-west-1.amazonaws.com/wikipedia-sample.json

In [2]:
import json
from pinnacledb.db.mongodb.query import Collection
from pinnacledb.container.document import Document as D

with open('wikipedia-sample.json') as f:
    data = json.load(f)[:100]

db.execute(Collection('wikipedia').insert_many([D(r) for r in data]))

INFO:root:found 0 uris


(<pymongo.results.InsertManyResult at 0x10f237a30>,
 TaskWorkflow(database=<pinnacledb.db.base.db.DB object at 0x14ff031d0>, G=<networkx.classes.digraph.DiGraph object at 0x155217dd0>))

In [3]:
from pinnacledb.db.mongodb.query import Collection

r = db.execute(Collection('wikipedia').find_one()).unpack()
r

{'_id': ObjectId('65244e749317885b431567d6'),
 'title': 'Fernando de la Fuente de la Fuente',
 'abstract': 'Fernando de la Fuente de la Fuente (16 December 1943 – 31 October 1996) was a Spanish Marist Brother and missionary who was one of four Marist Brothers martyred at the Nyamirangwe refugee camp, Zaire. Together with the brothers of his community who were assassinated, Miguel Ángel Isla Lucio, Servando Mayor García, and Julio Rodríguez Jorge.',
 '_fold': 'train'}

In [4]:
from pinnacledb.container.model import Model


def splitter(r):
    out = [r['title']]
    split = r['abstract'].split(' ')
    for i in range(0, len(split) - 5, 5):
        out.append(' '.join(split[i: i + 5]))
    out = [x for x in out if x]
    return out


model = Model(
    identifier='splitter',
    object=splitter,
    flatten=True,
    model_update_kwargs={'document_embedded': False},
)

model.predict(r, one=True)

['Fernando de la Fuente de la Fuente',
 'Fernando de la Fuente de',
 'la Fuente (16 December 1943',
 '– 31 October 1996) was',
 'a Spanish Marist Brother and',
 'missionary who was one of',
 'four Marist Brothers martyred at',
 'the Nyamirangwe refugee camp, Zaire.',
 'Together with the brothers of',
 'his community who were assassinated,',
 'Miguel Ángel Isla Lucio, Servando',
 'Mayor García, and Julio Rodríguez']

In [5]:
model.predict(
    X='_base', 
    db=db,
    select=Collection('wikipedia').find()
)

INFO:root:Adding model splitter to db
INFO:root:Done.
100it [00:00, 425.98it/s]


In [6]:
db.execute(Collection('_outputs._base.splitter').find_one())

Document({'_id': ObjectId('65244e8a9317885b4315683e'), '_outputs': {'_base': {'splitter': 'Fernando de la Fuente de la Fuente'}}, '_source': ObjectId('65244e749317885b431567d6'), '_offset': 0})

In [7]:
from pinnacledb.container.vector_index import VectorIndex
from pinnacledb.container.listener import Listener
from pinnacledb.ext.numpy.array import array
from pinnacledb.ext.openai.model import OpenAIEmbedding

model = OpenAIEmbedding(model='text-embedding-ada-002')

db.add(
    VectorIndex(
        identifier=f'chunked-documents',
        indexing_listener=Listener(
            model=model,
            key='_outputs._base.splitter',
            select=Collection('_outputs._base.splitter').find(),
            predict_kwargs={'max_chunk_size': 1000},
        ),
        compatible_listener=Listener(
            model=model,
            key='_base',
            select=None,
            active=False,
        )
    )
)

INFO:root:Adding model text-embedding-ada-002 to db
INFO:root:Done.
694it [00:00, 1809.34it/s]


Computing chunk 0/0


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:14<00:00,  2.10s/it]


[]

In [8]:
db.show('vector_index')

['chunked-documents']

In [24]:
from pinnacledb.db.mongodb.query import Collection
from pinnacledb.container.document import Document as D
from IPython.display import *

query = 'died in central africa'

shingle_collection = Collection('_outputs._base.splitter')
main_collection = Collection('wikipedia')

result = db.execute(
    shingle_collection
        .like(D({'_base': query}), vector_index='chunked-documents', n=5)
        .find({}, {'_outputs._base.text-embedding-ada-002': 0})
)

display(Markdown(f'---'))
for shingle in result:
    original = db.execute(main_collection.find_one({'_id': shingle['_source']}))

    display(Markdown(f'# {original["title"]}"'))
    
    start = original['abstract'].find(shingle['_outputs']['_base']['splitter'])

    to_format = (
        original["abstract"][:start] + '**' + '<span style="color:red">' +
        shingle["_outputs"]["_base"]["splitter"].upper() + '**' + '<span style="color:black">' +
        original["abstract"][start + len(shingle["_outputs"]["_base"]["splitter"]):]
    )
    
    display(Markdown(to_format))
    display(Markdown(f'---'))

---

# Fernando de la Fuente de la Fuente"

Fernando de la Fuente de la Fuente (16 December 1943 – 31 October 1996) was a Spanish Marist Brother and missionary who was one of four Marist Brothers martyred at **<span style="color:red">THE NYAMIRANGWE REFUGEE CAMP, ZAIRE.**<span style="color:black"> Together with the brothers of his community who were assassinated, Miguel Ángel Isla Lucio, Servando Mayor García, and Julio Rodríguez Jorge.

---

# Robert Goldwater"

Robert Goldwater (November 23, 1907 – March 26, 1973) was **<span style="color:red">AN ART HISTORIAN, AFRICAN ARTS**<span style="color:black"> scholar and the first director of the Museum of Primitive Art, New York, from 1957 to 1973. He was married to the French-born American artist and sculptor Louise Bourgeois.

---

# 20th parallel south"

The 20th parallel south is a circle of latitude that is 20 degrees south of the Earth's equatorial plane. It **<span style="color:red">CROSSES THE ATLANTIC OCEAN, AFRICA,**<span style="color:black"> the Indian Ocean, Australasia, the Pacific Ocean and South America.

---

# Nyabêla"

Nyabêla (1825/30 - 1902) also known in Afrikaans as Niabel , was a chief of **<span style="color:red">THE NDZUNDZA-NDEBELE DURING THE NINETEENTH**<span style="color:black"> century . He is remembered for his struggle against whites for control of his tribe's own territory.

---

# Nyabêla"

Nyabêla (1825/30 - 1902) also **<span style="color:red">KNOWN IN AFRIKAANS AS NIABEL**<span style="color:black"> , was a chief of the Ndzundza-Ndebele during the nineteenth century . He is remembered for his struggle against whites for control of his tribe's own territory.

---