In [4]:
from qdrant_client import QdrantClient 

client = QdrantClient("http://localhost:6333")
client.get_collections()

CollectionsResponse(collections=[])

In [7]:
from pathlib import Path
import json

json_path = Path('../src/met_museum_objects_full.json')
with open(json_path) as f:
    data = json.load(f)

In [8]:
sample_object = data[0]

In [22]:
from typing import Dict
def prepare_painting_description(painting_obj: Dict) -> Dict:

    obj_copy = painting_obj.copy()

    intro_statement = f"{obj_copy.get('title', '')} by {obj_copy.get('artistDisplayName')}"
    artist_bio = f"{obj_copy.get('artistDisplayName')} is {obj_copy.get('artistNationality')}, their bio is: '{obj_copy.get('artistDisplayBio')}'. {obj_copy.get('artistDisplayName')} lived from {obj_copy.get('artistBeginDate')} to {obj_copy.get('artistEndDate')}"
    
    artwork_origin = f"The source/origin of the artwork is {obj_copy.get('creditLine')[:-7]}, the Metropolitan Museum of Art acquired the artwork in {obj_copy.get('creditLine')[-4:]}"

    medium_dimensions = f"The medium for the painting is {obj_copy.get('medium', 'canvas')}, and the dimensions are {obj_copy.get('dimensions', 'unknown')}"

    gallery_location = f"The artwork is presented at gallery {obj_copy.get("GalleryNumber")}, located on the map here {obj_copy.get("galleryLink")}" if obj_copy.get("GalleryNumber") != ""  else "The artwork is currently not showcased at the museum"

    artwork_description = f"The description of the artwork is: '{obj_copy.get("itemDescription")}'"

    #tags
    tags_lst = []
    tags_text = ""
    try:
        if len(obj_copy.get("tags")) > 0:
            for tag in obj_copy.get("tags"):
                try:
                    tags_lst.append(tag['term'])
                except:
                    continue
            tags_text = f"The following tags are related to {obj_copy.get('title', '')} : {tags_lst}"
        else:
            tags_text = ""
    except:
        tags_text = ""

    #artists
    artist_lst = []
    artist_text = ""
    try:
        if len(obj_copy.get("constituents")) > 0:
            for constituent in obj_copy.get("constituents"):
                try:
                    artist_lst.append(constituent['name'])
                except:
                    continue
            artist_text = f"The following artists are related to {obj_copy.get('title', '')} : {artist_lst}"
        else:
            artist_text = ""
    except:
        artist_text = ""

    #painting work
    painting_work_duration = ""
    if obj_copy.get('objectBeginDate', 0) == obj_copy.get('objectEndDate', 0):
        painting_work_duration = f"The painting was started and completed in {obj_copy.get('objectBeginDate', 0)}"
    else:
        painting_work_duration = f"The painting was started in {obj_copy.get('objectBeginDate', 0)} and completed in {obj_copy.get('objectEndDate', 0)}"

    public_importance = f"and {'is' if obj_copy.get('isHighlight') == False else 'is not'} a popular and important artwork in {obj_copy.get('artistDisplayName')}'s collection, {'and is currently in the public domain' if obj_copy.get('isHighlight') == False else 'is not currently in the public domain'}"

    text = f"""
    {intro_statement}. {artwork_description}. {artwork_origin}. {medium_dimensions}. {gallery_location}. {artist_bio}. {painting_work_duration}, {public_importance}. {artist_text}. {tags_text}.
""".strip()

    data_dict = {
        'artwork_id': obj_copy.get('objectID'),
        'artwork_text': text,
        'primary_image_url': obj_copy.get('primaryImage', ''),
        'artist_bio_url': obj_copy.get('artistWikidata_URL', ''),
        'artwork_url': obj_copy.get('objectURL', '')
    }

    return data_dict 


prepare_painting_description(sample_object)
    


{'artwork_id': 436418,
 'artwork_text': "Jerusalem from the Mount of Olives by Charles-Théodore Frère. The description of the artwork is: 'This panoramic view was commissioned from Frère by the New York collector Catharine Lorillard Wolfe by 1880, when it was first described as being in her possession. Because the artist had not been to the Holy Land for twenty years—he had last traveled there as part of Empress Eugénie’s retinue in 1861—the composition must be based on one or more earlier studies or photographs.'. The source/origin of the artwork is Catharine Lorillard Wolfe Collection, Bequest of Catharine Lorillard Wolf, the Metropolitan Museum of Art acquired the artwork in 1887. The medium for the painting is Oil on canvas, and the dimensions are 29 1/2 x 43 1/2 in. (74.9 x 110.5 cm). The artwork is presented at gallery 804, located on the map here https://maps.metmuseum.org/poi?_gl=1%2A1958ftb%2A_ga%2AMjk2MzAzMzczLjE3MDE4NzY3NzM.%2A_ga_Y0W8DGNBTB%2AMTcwODk4Mjg3Ny4yNDcuMS4xNzA4OTg

In [None]:
artwork_obj_lst = []

for obj in data:
    try:
        artwork_obj = prepare_painting_description(obj)
        artwork_obj_lst.append(artwork_obj)
    except:
        print(artwork_obj)

In [None]:
# Create the collection with sparse and dense vector types
from qdrant_client import models

client.create_collection(
    collection_name="met-museum-euro-artworks",
    vectors_config={
        "jina-small": models.VectorParams(
            size=512,
            distance=models.Distance.COSINE
        ),
    },
    sparse_vectors_config={
        "bm25": models.SparseVectorParams(
            modifier=models.Modifier.IDF,
        )
    }
)

True

In [28]:
client.upsert(
    collection_name="met-museum-euro-artworks",
    points = [
        models.PointStruct(
            id=artwork_obj['artwork_id'],
            vector = {
                "jina-small": models.Document(
                    text = artwork_obj['artwork_text'],
                    model="jinaai/jina-embeddings-v2-small-en"
                ),
                "bm25": models.Document(
                    text=artwork_obj['artwork_text'],
                    model="Qdrant/bm25"

                )
            },
            payload={
                "artwork_text": artwork_obj['artwork_text'],
                'artwork_image_url': artwork_obj['primary_image_url'],
                'artist_url': artwork_obj['artist_bio_url'],
                'artwork_bio_url': artwork_obj['artwork_url']
                    }
        )
     for artwork_obj in artwork_obj_lst
    ] 
)

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json:   0%|          | 0.00/367 [00:00<?, ?B/s]

onnx/model.onnx:   0%|          | 0.00/130M [00:00<?, ?B/s]

arabic.txt: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Fetching 18 files:   0%|          | 0/18 [00:00<?, ?it/s]

greek.txt: 0.00B [00:00, ?B/s]

hungarian.txt: 0.00B [00:00, ?B/s]

danish.txt:   0%|          | 0.00/424 [00:00<?, ?B/s]

finnish.txt: 0.00B [00:00, ?B/s]

dutch.txt:   0%|          | 0.00/453 [00:00<?, ?B/s]

italian.txt: 0.00B [00:00, ?B/s]

french.txt:   0%|          | 0.00/813 [00:00<?, ?B/s]

norwegian.txt:   0%|          | 0.00/851 [00:00<?, ?B/s]

german.txt: 0.00B [00:00, ?B/s]

english.txt:   0%|          | 0.00/936 [00:00<?, ?B/s]

romanian.txt: 0.00B [00:00, ?B/s]

portuguese.txt: 0.00B [00:00, ?B/s]

swedish.txt:   0%|          | 0.00/559 [00:00<?, ?B/s]

russian.txt: 0.00B [00:00, ?B/s]

spanish.txt: 0.00B [00:00, ?B/s]

turkish.txt:   0%|          | 0.00/260 [00:00<?, ?B/s]

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [32]:
def multi_stage_search(query:str, limit: int = 1) -> list[models.ScoredPoint]:
    results = client.query_points(
        collection_name="met-museum-euro-artworks",
        prefetch=[
            models.Prefetch(
                query=models.Document(
                    text=query,
                    model="jinaai/jina-embeddings-v2-small-en",
                ),
                using="jina-small",
                # Prefetch ten times more results, then
                # expected to return, so we can really rerank
                limit=(10 * limit),
            ),
        ],
        query=models.Document(
            text=query,
            model="Qdrant/bm25", 
        ),
        using="bm25",
        limit=limit,
        with_payload=True,
    )


    return results.points

In [33]:
results = multi_stage_search("Are there any paintings at the musuem that use oil on canvas as its medium?")

print(results)

[ScoredPoint(id=437620, version=0, score=4.0370765, payload={'artwork_text': "Nymph and Putti; Nymph with a Wreath and Putti with Garlands of Flowers by Piat Joseph Sauvage. The description of the artwork is: 'Sauvage’s name was synonymous with trompe l’oeil (fool the eye) paintings that imitated the low relief carvings in ancient sculpture that became hugely popular during the late eighteenth century. In these unusual examples, Sauvage used a thick slate support to which he added oil paint in imitation of wax, a flexible medium often used by sculptors who were working out a composition before they began carving in unforgiving stone. The result is a brilliant play on artistic process and the differences between painting and sculpture, a recurrent comparison in the Western tradition. The reverse sides of these slate slabs are carved with the name of a cabinetmaker, indicating that Sauvage repurposed his supports from another craftsman’s shop sign.'. The source/origin of the artwork is G

In [35]:
for result in results:
    print(result.payload['artwork_text'])

Nymph and Putti; Nymph with a Wreath and Putti with Garlands of Flowers by Piat Joseph Sauvage. The description of the artwork is: 'Sauvage’s name was synonymous with trompe l’oeil (fool the eye) paintings that imitated the low relief carvings in ancient sculpture that became hugely popular during the late eighteenth century. In these unusual examples, Sauvage used a thick slate support to which he added oil paint in imitation of wax, a flexible medium often used by sculptors who were working out a composition before they began carving in unforgiving stone. The result is a brilliant play on artistic process and the differences between painting and sculpture, a recurrent comparison in the Western tradition. The reverse sides of these slate slabs are carved with the name of a cabinetmaker, indicating that Sauvage repurposed his supports from another craftsman’s shop sign.'. The source/origin of the artwork is Gift of J. Pierpont Morga, the Metropolitan Museum of Art acquired the artwork 