In [9]:
%pip install weaviate-client python-dotenv

Note: you may need to restart the kernel to use updated packages.


In [119]:
import os
import weaviate
from weaviate.util import generate_uuid5
from dotenv import load_dotenv

# load_dotenv() runs before the os.environ lookups below, so any values it
# loads are visible to them.
load_dotenv()

# All three settings are required; a missing one raises KeyError right here.
weaviate_api_key = os.environ["WEAVIATE_API_KEY"]
weaviate_url = os.environ["WEAVIATE_URL"]
openai_api_key = os.environ["OPENAI_API_KEY"]

# API-key credentials for the Weaviate instance itself.
auth_config = weaviate.AuthApiKey(api_key=weaviate_api_key)

# The OpenAI key is passed along as an extra request header.
openai_headers = {"X-OpenAI-Api-Key": openai_api_key}

# Shared client used by every cell that follows.
CLIENT = weaviate.Client(
    url=weaviate_url,
    auth_client_secret=auth_config,
    additional_headers=openai_headers,
)

# Start from a clean slate: flush the existing schema and its data.
CLIENT.schema.delete_all()

# Property names of the "Work" class, in the order they are declared.
work_property_names = [
    "title",
    "accession_number",
    "alternate_title",
    "api_model",
    "catalog_key",
    "collection",
    "contributor",
    "create_date",
    "creator",
    "date_created",
    "description",
    "genre",
    "identifier",
    "identifier_human_readable",
    "keywords",
    "language",
    "library_unit",
    "location",
    "physical_description_material",
    "physical_description_size",
    "preservation_level",
    "published",
    "related_material",
    "related_url",
    "rights_holder",
    "rights_statement",
    "scope_and_contents",
    "series",
    "source",
    "status",
    "style_period",
    "subject",
    "table_of_contents",
    "technique",
    "visibility",
    "work_type",
]

# Every property is declared as "text" except the ones listed here.
non_text_data_types = {"catalog_key": "number", "published": "boolean"}

# Definition of the single "Work" class, vectorized with text2vec-openai.
work_class = {
    "class": "Work",
    "vectorizer": "text2vec-openai",
    "moduleConfig": {
        "text2vec-openai": {"vectorizeClassName": True},
    },
    "properties": [
        {"dataType": [non_text_data_types.get(prop_name, "text")], "name": prop_name}
        for prop_name in work_property_names
    ],
}

# Create the schema.
schema = {"classes": [work_class]}
CLIENT.schema.create(schema)

In [120]:
import json

def jprint(schema):
    """Print ``schema`` to stdout as JSON indented by four spaces.

    Despite the parameter name, any object accepted by ``json.dumps`` works
    (for example a query result dictionary).
    """
    formatted = json.dumps(schema, indent=4)
    print(formatted)

In [121]:
%pip install pandas
import pandas as pd

Note: you may need to restart the kernel to use updated packages.


In [122]:
# Load the CSV export and rename two columns: the file's `identifier` column
# becomes `identifier_human_readable`, and its `id` column becomes `identifier`.
column_renames = {'identifier': 'identifier_human_readable', 'id': 'identifier'}
data = (
    pd.read_csv('./data/nuldc_06_01_23_random_10k_filtered.csv')
    .rename(columns=column_renames)
)
print(f'Number of records: {len(data)}')

# Print the first row field by field as a quick sanity check.
first_record = data.iloc[0].to_dict()
print("Sample record: ")
for key, value in first_record.items():
    print(f'    {key}: {value}')

Number of records: 9995
Sample record: 
    accession_number: P0402_nubaa_s2_df2_112
    alternate_title: nan
    api_model: Work
    catalog_key: nan
    collection: Records of Northwestern University Black Alumni Association (NUBAA) 
    contributor: Northwestern University Black Alumni Association
    create_date: 2022-08-01T16:31:55.476329Z
    creator: nan
    date_created: 1968 to 2002
    description: nan
    genre: born digital|CD-Rs
    identifier: 18f90cdd-0dfe-41a3-8596-ee6b1358261c
    identifier_human_readable: Series 2, digital folder 2
    keywords: nan
    language: nan
    library_unit: University Archives
    location: nan
    physical_description_material: From Compact Disc Recordable (CD-R)
    physical_description_size: nan
    preservation_level: Level 1
    published: True
    related_material: nan
    related_url: Finding Aid
    rights_holder: nan
    rights_statement: In Copyright
    scope_and_contents: nan
    series: Records of Northwestern University Black

In [123]:
from weaviate.util import generate_uuid5

# Send objects to Weaviate in batches of 100.
CLIENT.batch.configure(
    batch_size=100
)

with CLIENT.batch as batch:
    # Iterate rows explicitly with iterrows() instead of relying on the
    # implicit __getitem__ fallback that made `enumerate(data.iloc)` work.
    # The index label is unused; it is named `_index` (not `_`) so IPython's
    # last-output variable is left alone.
    for _index, row in data.iterrows():
        # Drop missing (NaN) fields so only populated properties are sent.
        filtered = row.dropna().to_dict()
        # UUID derived from the record's contents and the "Work" namespace.
        uuid_work = generate_uuid5(filtered, "Work")
        batch.add_data_object(
            data_object=filtered,
            class_name="Work",
            uuid=uuid_work
        )

In [124]:
# Aggregate query with a meta count for the class, printed as JSON.
class_name = "Work"
count_query = CLIENT.query.aggregate(class_name).with_meta_count()
count = count_query.do()
jprint(count)

{
    "data": {
        "Aggregate": {
            "Work": [
                {
                    "meta": {
                        "count": 9995
                    }
                }
            ]
        }
    }
}


In [128]:
# nearText search over "Work" for the given concept, limited to 15 results.
# The query is the cell's last expression so its result is displayed.
nearText = {"concepts": ["jealous rage"]}
(
    CLIENT.query
    .get("Work", ["identifier", "title", "subject"])
    .with_near_text(nearText)
    .with_limit(15)
    .do()
)

{'data': {'Get': {'Work': [{'identifier': '7e16ea3a-037c-4da5-8199-47c14802fa9e',
     'subject': 'Iranians|Bald eagle|Revolution (Iran : 1979)',
     'title': 'Jihad this!'},
    {'identifier': '698bd2fc-39e9-4dab-baf1-0d519d52b24a',
     'subject': 'Painting',
     'title': 'The Poet and His Muse: Portrait of Apollinaire and Marie Laurencin'},
    {'identifier': 'b2fb28b5-3f76-42a8-9c70-6b11180f6e4f',
     'subject': 'Photography--20th century--United States',
     'title': 'From Here I Saw What Happened and I Cried. Detail'},
    {'identifier': 'cbd4a24c-9382-4284-8d01-7f1f9dacd65d',
     'subject': 'Painting',
     'title': "Aesop (in Beggar's Rags). bust"},
    {'identifier': '0f5c6f46-b303-488b-8993-4d780c31e0e3',
     'subject': 'Painting|20th century',
     'title': 'Champs-Elysees and Arc de Triomphe'},
    {'identifier': '32d4f52a-e7bc-4382-98f2-506a8212bbd4',
     'subject': 'South Africa|Anti-apartheid movements|South Africa--Foreign economic relations|Political posters, En