In [2]:
import os
import weaviate
from weaviate.util import generate_uuid5
from dotenv import load_dotenv

# Runs before the os.environ lookups below; presumably populates the process
# environment from a local .env file (python-dotenv) — confirm a .env exists.
load_dotenv()

# Required settings. os.environ[...] raises KeyError if a variable is missing,
# so a misconfigured environment fails here rather than at the first request.
weaviate_api_key = os.environ['WEAVIATE_API_KEY']
weaviate_url = os.environ['WEAVIATE_URL']
openai_api_key = os.environ['OPENAI_API_KEY']

# API-key credentials handed to the client as `auth_client_secret`.
auth_config = weaviate.AuthApiKey(api_key=weaviate_api_key)

# Set up the client. The OpenAI key is sent as a request header; presumably it
# is consumed by the "text2vec-openai" vectorizer named in the schema below.
CLIENT = weaviate.Client(
    url=weaviate_url,
    auth_client_secret=auth_config,
    additional_headers={
        "X-OpenAI-Api-Key": openai_api_key
    }
)

# DESTRUCTIVE: flush the schema and data on the target instance, so every run
# of this cell starts from an empty instance.
CLIENT.schema.delete_all()

# Build the schema: a single "Work" class vectorized with text2vec-openai.
# Property names are listed in schema order; each one is stored as "text"
# unless it has an entry in _NON_TEXT_TYPES.
_WORK_PROPERTY_NAMES = [
    "title",
    "accession_number",
    "alternate_title",
    "api_model",
    "catalog_key",
    "collection",
    "contributor",
    "create_date",
    "creator",
    "date_created",
    "description",
    "genre",
    "identifier",
    "identifier_human_readable",
    "keywords",
    "language",
    "library_unit",
    "location",
    "physical_description_material",
    "physical_description_size",
    "preservation_level",
    "published",
    "related_material",
    "related_url",
    "rights_holder",
    "rights_statement",
    "scope_and_contents",
    "series",
    "source",
    "status",
    "style_period",
    "subject",
    "table_of_contents",
    "technique",
    "visibility",
    "work_type",
]
_NON_TEXT_TYPES = {"catalog_key": "number", "published": "boolean"}

schema = {
    "classes": [
        {
            "class": "Work",
            "vectorizer": "text2vec-openai",
            "moduleConfig": {
                "text2vec-openai": {"vectorizeClassName": True},
            },
            "properties": [
                {"dataType": [_NON_TEXT_TYPES.get(prop_name, "text")], "name": prop_name}
                for prop_name in _WORK_PROPERTY_NAMES
            ],
        }
    ]
}


# Send the class definitions held in `schema` to the Weaviate instance.
CLIENT.schema.create(schema)

In [3]:
import json

def jprint(schema):
    """Pretty-print a JSON-serializable object to stdout using 4-space indentation."""
    rendered = json.dumps(schema, indent=4)
    print(rendered)

In [4]:
# %pip (rather than !pip) installs into the environment of the running kernel.
# NOTE(review): the install is unpinned — consider pinning a pandas version so
# a later re-run of the notebook resolves to the same package.
%pip install pandas
import pandas as pd

Note: you may need to restart the kernel to use updated packages.


In [5]:
# Load the CSV export and normalise the two identifier columns: the original
# `identifier` column becomes `identifier_human_readable`, and the original
# `id` column takes over the name `identifier`. Both renames are applied in a
# single call, so they do not chain into each other.
csv_path = './data/nuldc_06_01_23_random_10k_filtered.csv'
column_renames = {'identifier': 'identifier_human_readable', 'id': 'identifier'}
data = pd.read_csv(csv_path).rename(columns=column_renames)
print(f'Number of records: {len(data)}')

# Show every field of the first row as a quick sanity check of the load.
first_record = data.iloc[0].to_dict()
print("Sample record: ")
for key, value in first_record.items():
    print(f'    {key}: {value}')

Number of records: 9995
Sample record: 
    accession_number: P0402_nubaa_s2_df2_112
    alternate_title: nan
    api_model: Work
    catalog_key: nan
    collection: Records of Northwestern University Black Alumni Association (NUBAA) 
    contributor: Northwestern University Black Alumni Association
    create_date: 2022-08-01T16:31:55.476329Z
    creator: nan
    date_created: 1968 to 2002
    description: nan
    genre: born digital|CD-Rs
    identifier: 18f90cdd-0dfe-41a3-8596-ee6b1358261c
    identifier_human_readable: Series 2, digital folder 2
    keywords: nan
    language: nan
    library_unit: University Archives
    location: nan
    physical_description_material: From Compact Disc Recordable (CD-R)
    physical_description_size: nan
    preservation_level: Level 1
    published: True
    related_material: nan
    related_url: Finding Aid
    rights_holder: nan
    rights_statement: In Copyright
    scope_and_contents: nan
    series: Records of Northwestern University Black

In [41]:
from weaviate.util import generate_uuid5

def check_batch_result(results: "list[dict] | None"):
    """
    Print the result of every object in a Weaviate batch response that failed.

    Parameters
    ----------
    results : list of dict, or None
        The Weaviate batch creation return value: one entry per object in the
        batch. ``None`` is accepted and means there is nothing to report.

    Notes
    -----
    An entry is reported when its ``"result"`` mapping has an ``"errors"``
    value containing the key ``"error"``; the whole ``"result"`` mapping is
    printed. Entries that are not dicts, or whose ``"result"`` / ``"errors"``
    is missing or ``None``, are skipped rather than raising, so a reporting
    helper used as a batch callback cannot abort the import.
    """
    if results is None:
        return

    for item in results:
        outcome = item.get("result") if isinstance(item, dict) else None
        if not isinstance(outcome, dict):
            continue

        errors = outcome.get("errors")
        if errors is not None and "error" in errors:
            print(outcome)



# Register check_batch_result as the batch callback so it receives the server
# response each time a batch of 50 objects is sent. add_data_object() only
# queues the object and returns its UUID, so checking its return value (as this
# cell did before) could never surface an import error.
CLIENT.batch.configure(
    batch_size=50,
    callback=check_batch_result,
)

with CLIENT.batch as batch:
    for _, row in data.iterrows():
        # Drop NaN cells so missing CSV values are not sent as properties.
        properties = row.dropna().to_dict()
        batch.add_data_object(
            data_object=properties,
            class_name="Work",
            # UUIDv5 derived from the property values, so importing the same
            # record again yields the same id.
            uuid=generate_uuid5(properties, "Work"),
        )