In [4]:
!pip3 install weaviate-client
!pip3 install datasets apache-beam

Collecting apache-beam
  Using cached apache-beam-2.46.0.zip (3.1 MB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting protobuf<4,>3.12.2
  Using cached protobuf-3.20.3-py2.py3-none-any.whl (162 kB)
Collecting crcmod<2.0,>=1.7
  Using cached crcmod-1.7.tar.gz (89 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting orjson<4.0
  Using cached orjson-3.8.9-cp39-cp39-macosx_10_9_x86_64.macosx_11_0_arm64.macosx_10_9_universal2.whl (488 kB)
Collecting dill<0.3.7,>=0.3.0
  Using cached dill-0.3.1.1.tar.gz (151 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting cloudpickle~=2.2.1
  Using cached cloudpickle-2.2.1-py3-none-any.whl (25 kB)
Collecting fastavro<2,>=0.23.6
  Using cached fastavro-1.7.3.tar.gz (791 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting fasteners<1.0,>=0.3
  Using cached fastene

In [13]:
import openai
from configparser import ConfigParser

# Local INI file that stores the credentials used by this notebook.
constants_path = "/Users/osuz/PycharmProjects/YaServiceRu/resources/constants.ini"

constants = ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Fail here with a clear message instead of
# hitting a less obvious NoSectionError on the lookup below.
if not constants.read(constants_path):
    raise FileNotFoundError(f"Could not read config file: {constants_path}")

# The OpenAI API key (available from the top-right dropdown on the OpenAI
# website) is expected under the [API] section, option OPENAI.
openai.api_key = constants.get("API", "OPENAI")

In [15]:
import weaviate
from datasets import load_dataset
import os

# Open a client against the local Weaviate instance. The OpenAI key is sent
# along as a request header so Weaviate can call OpenAI on our behalf.
client = weaviate.Client(
    url="http://localhost:8080/",
    additional_headers={"X-OpenAI-Api-Key": openai.api_key},
)

# Readiness probe: the cell output should be `True` when the instance is up.
client.is_ready()

True

In [7]:
# Clear the schema so the class can be recreated from scratch below.
# NOTE: delete_all() is not limited to `Article`; it clears the whole schema of
# the connected instance, so only run this against a throwaway/dev instance.
client.schema.delete_all()

# Class definition: vectorize with OpenAI `text-embedding-ada-002`
# (model "ada", version "002"). `title` and `content` are vectorized,
# while `url` is explicitly skipped.
article_schema = {
    "class": "Article",
    "description": "A collection of articles",
    "vectorizer": "text2vec-openai",
    "moduleConfig": {
        "text2vec-openai": {
            "model": "ada",
            "modelVersion": "002",
            "type": "text",
        }
    },
    "properties": [
        {
            "name": "title",
            "description": "Title of the article",
            "dataType": ["string"],
        },
        {
            "name": "content",
            "description": "Contents of the article",
            "dataType": ["text"],
        },
        {
            "name": "url",
            "description": "URL to the article",
            "dataType": ["string"],
            # Keep the URL out of the embedding input
            "moduleConfig": {"text2vec-openai": {"skip": True}},
        },
    ],
}

# Register the Article class with Weaviate
client.schema.create_class(article_schema)

# Display the resulting schema (last expression of the cell) to confirm it worked
client.schema.get()

{'classes': [{'class': 'Article',
   'description': 'A collection of articles',
   'invertedIndexConfig': {'bm25': {'b': 0.75, 'k1': 1.2},
    'cleanupIntervalSeconds': 60,
    'stopwords': {'additions': None, 'preset': 'en', 'removals': None}},
   'moduleConfig': {'text2vec-openai': {'model': 'ada',
     'modelVersion': '002',
     'type': 'text',
     'vectorizeClassName': True}},
   'properties': [{'dataType': ['string'],
     'description': 'Title of the article',
     'moduleConfig': {'text2vec-openai': {'skip': False,
       'vectorizePropertyName': False}},
     'name': 'title',
     'tokenization': 'word'},
    {'dataType': ['text'],
     'description': 'Contents of the article',
     'moduleConfig': {'text2vec-openai': {'skip': False,
       'vectorizePropertyName': False}},
     'name': 'content',
     'tokenization': 'word'},
    {'dataType': ['string'],
     'description': 'URL to the article',
     'moduleConfig': {'text2vec-openai': {'skip': True,
       'vectorizePropert

In [10]:
### STEP 1 - load the dataset

from datasets import load_dataset
from typing import List, Iterator

from itertools import islice

# Number of articles to keep. Deliberately small so the demo import stays quick.
article_limit = 50

# Pull the Simple Wikipedia dataset with the datasets library and keep only the
# first `article_limit` rows of the "train" split. islice stops iterating after
# that many rows, so the full split is never copied into a Python list first.
dataset = list(
    islice(load_dataset("wikipedia", "20220301.simple")["train"], article_limit)
)

Downloading and preparing dataset wikipedia/20220301.simple to /Users/osuz/.cache/huggingface/datasets/wikipedia/20220301.simple/2.0.0/aa542ed919df55cc5d3347f42dd4521d05ca68751f50dbc32bae2a7f1e167559...


Downloading:   0%|          | 0.00/1.66k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/235M [00:00<?, ?B/s]

Dataset wikipedia downloaded and prepared to /Users/osuz/.cache/huggingface/datasets/wikipedia/20220301.simple/2.0.0/aa542ed919df55cc5d3347f42dd4521d05ca68751f50dbc32bae2a7f1e167559. Subsequent calls will reuse this data.


  from imp import load_source
  warn(


  0%|          | 0/1 [00:00<?, ?it/s]

In [16]:
### Step 2 - configure Weaviate Batch
# Settings passed below:
# - batch_size=10       starting batch size
# - dynamic=True        dynamic batch sizing enabled
# - timeout_retries=3   retry count used on timeouts

client.batch.configure(batch_size=10, dynamic=True, timeout_retries=3)

<weaviate.batch.crud_batch.Batch at 0x280de4910>

In [17]:
### Step 3 - import data

print("Importing Articles")

# Print a progress line once every `progress_every` articles
progress_every = 10
total_articles = len(dataset)

# Objects are queued on the batch inside the `with` block.
# NOTE(review): the context manager is expected to send whatever is still
# queued when the block exits - confirm against the Weaviate client docs.
with client.batch as batch:
    for counter, article in enumerate(dataset):
        if counter % progress_every == 0:
            print(f"Import {counter} / {total_articles} ")

        # Map dataset fields onto the Article class properties
        # (the dataset's "text" field becomes the "content" property).
        properties = {
            "title": article["title"],
            "content": article["text"],
            "url": article["url"],
        }

        batch.add_data_object(properties, "Article")

print("Importing Articles complete")

Importing Articles
Import 0 / 50 
Import 10 / 50 
Import 20 / 50 
Import 30 / 50 
Import 40 / 50 
Importing Articles complete


In [18]:
# Sanity check: ask Weaviate how many Article objects it holds after the import
count_query = client.query.aggregate("Article").with_fields("meta { count }")
result = count_query.do()

article_aggregate = result["data"]["Aggregate"]["Article"]
print("Object count: ", article_aggregate, "\n")

Object count:  [{'meta': {'count': 50}}] 



In [19]:
# Spot-check the import: fetch a single Article and print its stored fields
single_article_response = (
    client.query.get("Article", ["title", "url", "content"]).with_limit(1).do()
)
test_article = single_article_response["data"]["Get"]["Article"][0]

# Same order as before: title, then url, then content
for field_name in ("title", "url", "content"):
    print(test_article[field_name])

Abrahamic religion
https://simple.wikipedia.org/wiki/Abrahamic%20religion
An Abrahamic Religion is a religion whose followers believe in the prophet Abraham. They believe Abraham and his sons/grandsons hold an important role in human spiritual development. The best known Abrahamic religions are Judaism, Christianity and Islam. Smaller religious traditions sometimes included as Abrahamic religions are Samaritanism, Druze, Rastafari, Babism and Bahá'í Faith. Mandaeism (a religion that holds many Abrahamic beliefs) is not called Abrahamic because its followers think Abraham was a false prophet

True Abrahamic religions are monotheistic (the belief that there is only one God). They also all believe that people should pray to God and worship God often. Among monotheistic religions, the Abrahamic religions have the world's largest number of followers. They are also all ethical monotheistic religions. This means they have rules that they have to follow.

Religions


In [20]:
def query_weaviate(query, collection_name, limit=10, distance=0.7):
    """Run a nearText (semantic) search against a Weaviate class.

    Uses the notebook-level `client` for the request.

    Args:
        query: Free-text concept to search for.
        collection_name: Name of the Weaviate class to query, e.g. "Article".
        limit: Maximum number of objects to request (default 10).
        distance: Value sent as the nearText `distance` setting (default 0.7).

    Returns:
        The list found under data -> Get -> `collection_name` in the response.
        Each object is requested with `title`, `content`, `url` and
        `_additional {certainty distance}`.

    Raises:
        RuntimeError: if the response contains an "errors" entry; the message
            of the first error is used as the exception text.
    """
    near_text = {
        "concepts": [query],
        "distance": distance,
    }

    properties = [
        "title", "content", "url",
        "_additional {certainty distance}"
    ]

    result = (
        client.query
        .get(collection_name, properties)
        .with_near_text(near_text)
        .with_limit(limit)
        .do()
    )

    # Errors are reported inside the response body rather than raised by .do()
    if "errors" in result:
        # Red text for visibility; the trailing reset code stops the colour
        # from bleeding into everything printed afterwards.
        print(
            "\033[91mWeaviate returned an error. One possible cause is running out of "
            "OpenAI API calls for the current minute.\033[0m"
        )
        raise RuntimeError(result["errors"][0]["message"])

    return result["data"]["Get"][collection_name]

In [21]:
# Semantic search, then list each hit with its certainty rounded to 3 decimals
query_result = query_weaviate("modern art in Europe", "Article")

for rank, hit in enumerate(query_result, start=1):
    score = round(hit["_additional"]["certainty"], 3)
    print(f"{rank}. {hit['title']} (Score: {score})")

1. Art (Score: 0.896)
2. Architecture (Score: 0.893)
3. Austria (Score: 0.882)
4. Armenia (Score: 0.876)
5. Archaeology (Score: 0.872)
6. Autonomous communities of Spain (Score: 0.871)
7. Astronomy (Score: 0.871)
8. Alan Turing (Score: 0.871)
9. Being (Score: 0.869)
10. Afghanistan (Score: 0.866)


In [22]:

# Semantic search, then list each hit with its certainty rounded to 3 decimals
query_result = query_weaviate("Famous battles in Scottish history", "Article")

for rank, hit in enumerate(query_result, start=1):
    score = round(hit["_additional"]["certainty"], 3)
    print(f"{rank}. {hit['title']} (Score: {score})")

1. Alan Turing (Score: 0.867)
2. Australia (Score: 0.858)
3. Black pudding (Score: 0.858)
4. Alanis Morissette (Score: 0.854)
5. April (Score: 0.854)
6. Afghanistan (Score: 0.854)
7. British English (Score: 0.854)
8. Austria (Score: 0.851)
9. Architecture (Score: 0.85)
10. Argentina (Score: 0.85)
