Dado que no se dispone de una clave de API de OpenAI (es de pago), se pega el output de cada celda de código importante.

# Sparse, Dense and Hybrid Search

## Remove old Weaviate DB files

In [1]:
# Shell escape: recursively delete ~/.local/share/weaviate so no old Weaviate DB files remain.
!rm -rf ~/.local/share/weaviate

## Recreate the example
With the same data as in the previous lesson

In [2]:
import requests
import json

# Download the data.
# The timeout keeps the cell from hanging indefinitely on a stalled connection, and
# raise_for_status() reports an HTTP error here instead of letting an error page
# reach json.loads() and fail with an unrelated decode error.
resp = requests.get(
    'https://raw.githubusercontent.com/weaviate-tutorials/quickstart/main/data/jeopardy_tiny.json',
    timeout=30,
)
resp.raise_for_status()
data = json.loads(resp.text)  # Load data

# Parse the JSON and preview it
print(type(data), len(data))

def json_print(data):
    """Print ``data`` to stdout as JSON, indented by two spaces per level."""
    rendered = json.dumps(data, indent=2)
    print(rendered)

<class 'list'> 10


In [3]:
import weaviate, os
from weaviate import EmbeddedOptions
import openai

from dotenv import load_dotenv, find_dotenv
# Read the local .env file, then take the OpenAI key from the environment.
_ = load_dotenv(find_dotenv())
openai.api_key = os.environ['OPENAI_API_KEY']

# Extra headers handed to the Weaviate client: the OpenAI base URL and API key,
# both sourced from environment variables rather than written into the notebook.
openai_headers = {
    "X-OpenAI-Api-BaseURL": os.environ['OPENAI_API_BASE'],
    "X-OpenAI-Api-Key": openai.api_key,
}

client = weaviate.Client(
    embedded_options=EmbeddedOptions(),
    additional_headers=openai_headers,
)
print(f"Client created? {client.is_ready()}")

            Consider upgrading to the new and improved v4 client instead!
            See here for usage: https://weaviate.io/developers/weaviate/client-libraries/python
            


Started /Users/n.c.rodriguez/.cache/weaviate-embedded: process ID 4282


{"action":"startup","default_vectorizer_module":"none","level":"info","msg":"the default vectorizer modules is set to \"none\", as a result all new schema classes without an explicit vectorizer setting, will use this vectorizer","time":"2024-02-16T18:11:15+01:00"}
{"action":"startup","auto_schema_enabled":true,"level":"info","msg":"auto schema enabled setting is set to \"true\"","time":"2024-02-16T18:11:15+01:00"}
{"level":"info","msg":"No resource limits set, weaviate will use all available memory and CPU. To limit resources, set LIMIT_RESOURCES=true","time":"2024-02-16T18:11:15+01:00"}
{"action":"grpc_startup","level":"info","msg":"grpc server listening at [::]:50060","time":"2024-02-16T18:11:15+01:00"}
{"action":"restapi_management","level":"info","msg":"Serving weaviate at http://127.0.0.1:8079","time":"2024-02-16T18:11:15+01:00"}


Client created? True


In [4]:
# Delete an existing "Question" class first so this cell can be run more than once.
if client.schema.exists("Question"):
    client.schema.delete_class("Question")

# Settings for the text2vec-openai module; the base URL is read from the environment.
openai_module_config = {
    "model": "ada",
    "modelVersion": "002",
    "type": "text",
    "baseURL": os.environ["OPENAI_API_BASE"],
}

class_obj = {
    "class": "Question",
    "vectorizer": "text2vec-openai",  # Use OpenAI as the vectorizer
    "moduleConfig": {"text2vec-openai": openai_module_config},
}

client.schema.create_class(class_obj)

{"level":"info","msg":"Created shard question_rBb7iHY0s81p in 11.058792ms","time":"2024-02-16T18:11:15+01:00"}
{"action":"hnsw_vector_cache_prefill","count":1000,"index_id":"main","level":"info","limit":1000000000000,"msg":"prefilled vector cache","time":"2024-02-16T18:11:15+01:00","took":55458}


In [None]:
# Add every record in `data` to the "Question" class through a batch configured with batch_size=5.
with client.batch.configure(batch_size=5) as batch:
    for position, record in enumerate(data, start=1):
        print(f"importing question: {position}")

        batch.add_data_object(
            data_object={
                "answer": record["Answer"],
                "question": record["Question"],
                "category": record["Category"],
            },
            class_name="Question",
        )

importing question: 1

importing question: 2

importing question: 3

importing question: 4

importing question: 5

importing question: 6

importing question: 7

importing question: 8

importing question: 9

importing question: 10

## Queries

### Dense Search

In [None]:
# Dense search: look for something related to 'animal'.
dense_query = client.query.get("Question", ["question", "answer"])
dense_query = dense_query.with_near_text({"concepts": ["animal"]})
response = dense_query.with_limit(3).do()

json_print(response)

{

  "data": {
  
    "Get": {
    
      "Question": [
      
        {
        
          "answer": "Elephant",
          
          "question": "It's the only living mammal in the order Proboseidea"
          
        },
        
        {
        
          "answer": "the nose or snout",
          
          "question": "The gavial looks very much like a crocodile except for this bodily feature"
          
        },
        
        {
        
          "answer": "Antelope",
          
          "question": "Weighing around a ton, the eland is the largest species of this animal in Africa"
          
        }
        
      ]
      
    }
    
  }
  
}


### Sparse Search - BM25

In [None]:
# BM25 search: look for an object that has the word 'animal' in its question.
bm25_query = client.query.get("Question", ["question", "answer"])
bm25_query = bm25_query.with_bm25(query="animal")
response = bm25_query.with_limit(3).do()

json_print(response)

{

  "data": {
  
    "Get": {
    
      "Question": [
      
        {
        
          "answer": "Antelope",
          
          "question": "Weighing around a ton, the eland is the largest species of this animal in Africa"
          
        }
        
      ]
      
    }
    
  }
  
}


### Hybrid Search

In [5]:
# With the hybrid search for 'animal', the object with 'animal' in its question
# comes out on top and the rest below it.
# Objects other than the one containing 'animal' show up because of alpha=0.5.
hybrid_query = client.query.get("Question", ["question", "answer"])
hybrid_query = hybrid_query.with_hybrid(query="animal", alpha=0.5)
response = hybrid_query.with_limit(3).do()

json_print(response)

{
  "errors": [
    {
      "locations": [
        {
          "column": 62,
          "line": 1
        }
      ],
      "message": "Cannot query field \"question\" on type \"Question\".",
      "path": null
    },
    {
      "locations": [
        {
          "column": 71,
          "line": 1
        }
      ],
      "message": "Cannot query field \"answer\" on type \"Question\".",
      "path": null
    }
  ]
}


{

  "data": {
  
    "Get": {
      "Question": [
        {
          "answer": "Antelope",
          "question": "Weighing around a ton, the eland is the largest species of this animal in Africa"
        },
        {
          "answer": "Elephant",
          "question": "It's the only living mammal in the order Proboseidea"
        },
        {
          "answer": "the nose or snout",
          "question": "The gavial looks very much like a crocodile except for this bodily feature"
        }
      ]
    }
  }
}

In [None]:
# Same hybrid search for 'animal', but with alpha=0, so only the object
# containing 'animal' is returned.
sparse_only_query = client.query.get("Question", ["question", "answer"])
sparse_only_query = sparse_only_query.with_hybrid(query="animal", alpha=0)
response = sparse_only_query.with_limit(3).do()

json_print(response)

{

  "data": {
  
    "Get": {
      "Question": [
        {
          "answer": "Antelope",
          "question": "Weighing around a ton, the eland is the largest species of this animal in Africa"
        }
      ]
    }
  }
  
}

In [None]:
# With alpha=1, the object containing 'animal' comes out last.
dense_only_query = client.query.get("Question", ["question", "answer"])
dense_only_query = dense_only_query.with_hybrid(query="animal", alpha=1)
response = dense_only_query.with_limit(3).do()

json_print(response)

{

  "data": {
  
    "Get": {
      "Question": [
        {
          "answer": "Elephant",
          "question": "It's the only living mammal in the order Proboseidea"
        },
        {
          "answer": "the nose or snout",
          "question": "The gavial looks very much like a crocodile except for this bodily feature"
        },
        {
          "answer": "Antelope",
          "question": "Weighing around a ton, the eland is the largest species of this animal in Africa"
        }
      ]
    }
  }
}