In [None]:
# You need to install the python Weaviate client
!pip install weaviate-client

## Download data
Download data from [drive](https://drive.google.com/file/d/1oOv6zT7_whFE78fUzo8hi64_FWO6-l44/view?usp=sharing)

## ☁️☁️ Configure the Weaviate Cloud Instance ☁️☁️
### Free 14-day sandbox here: https://console.weaviate.cloud/

In [None]:
import os
import weaviate
import json

# API-key credentials for the Weaviate instance.
# TODO: supply the key; prefer reading it from an environment variable
# (e.g. through `os.environ`) over pasting the secret into the notebook.
auth_config = weaviate.auth.AuthApiKey(api_key=#TODO)

# Connect to the instance. `url` is the endpoint of your own cluster.
client = weaviate.Client(
    url=#TODO,
    auth_client_secret=auth_config,
    
    # Extra headers sent with the client's requests.
    # NOTE(review): presumably these keys let the instance call Cohere and
    # OpenAI on your behalf - confirm against the modules your schema enables.
    additional_headers={
        "X-Cohere-Api-Key": #TODO,
        "X-OpenAI-Api-Key": #TODO
    }
)
# Readiness check; its return value is displayed as the cell output.
client.is_ready()

## ䷀䷀Create Database Schema䷀䷀

In [None]:
# delete existing schema, (note, this will delete all your weaviate data)
client.schema.delete_all()

# Class definition handed to `create_class` below.
# TODO: fill in the dict. Later cells query a class named "Article" and read
# the properties title, text, wiki_id, url and views, so the definition
# presumably needs at least those - confirm against the parquet columns.
# NOTE(review): the client is created with Cohere and OpenAI keys, which
# suggests a vectorizer and a generative module are configured here - confirm.
article_schema = { #TODO 
}

# add the schema
#client.schema.delete_all()
client.schema.create_class(article_schema)

print("The schema has been created")

In [None]:
# Fetch the schema from the instance; the returned value is displayed as the cell output.
client.schema.get()

In [None]:
import pandas as pd
# Load the dataset into a DataFrame. The path is relative, so the parquet file
# (presumably the one from the "Download data" step) must sit in the
# notebook's working directory.
df = pd.read_parquet('wiki_simple_100k.parquet')

In [None]:
# Preview 10 random rows. The fixed seed keeps the preview identical across
# re-runs, so the displayed output is reproducible.
df.sample(10, random_state=42)

In [None]:
# Shape of the first row's embedding. `.iloc[0]` selects by position, so the
# lookup works even when the DataFrame index has no label 0.
df['emb'].iloc[0].shape

## 💽💽 Batch and Add 100k Wikipedia Articles to Weaviate 💽💽

In [None]:
### Step 1 - configure Weaviate Batch, which optimizes CRUD operations in bulk
# - starting batch size of 100
# - dynamically increase/decrease based on performance
# - add timeout retries if something goes wrong
# NOTE(review): in the v3 Python client these settings are presumably applied
# through `client.batch.configure(...)` - confirm the parameter names against
# the client documentation.

#TODO

In [None]:
# Work on at most the first 100k rows of the DataFrame.
data = df[:100000] # make sure it is not more than 100k objects

# Number of rows handled so far; drives the progress messages below.
counter=0

# NOTE(review): leaving the `with` block presumably sends any objects still
# queued in the batch - confirm against the client documentation.
with client.batch as batch:
    for idx, item in data.iterrows():        
        # print update message every 100 objects        
        # (end="\r" moves the cursor back to the line start, so each message
        # overwrites the previous one instead of adding a new line)
        if (counter %100 == 0):
            print(f"Import {counter} / {len(data)} ", end="\r")

        # TODO: add the current row (`item`) to `batch` here.
        # NOTE(review): presumably the row's properties plus its precomputed
        # 'emb' vector - confirm against the schema you defined.
        #TODO
        
        counter = counter+1
    # Final count, printed once after the loop has finished.
    print(f"Import {counter} / {len(data)}")
        
print("Import complete")

In [None]:
# Test that all data has loaded – get object count
# TODO: assign the response of an object-count query for the imported class
# (presumably an aggregate query with a meta count - confirm in the client docs).
# Note that the cell only assigns `result`; end it with `result` to display the count.
result = #TODO

## 🔎🔎All the ways you can search your data:🔍🔍

### 1. Classic Word Search 
- Basic word matching. Look for the occurrence of a word in the document.

### 2. Vector Search
- Find the object vectors closest to the query vector. Fetches objects that have a similar meaning to the query.

### 3. Hybrid Search - combine word and semantic match.
- Perform both word and vector search and then combine the results.

### 4. Generative Search - search and interpret with an LLM.
- Search for documents that are semantically relevant to a prompt and then provide them as context to an LLM to guide its generation.

### 1. Classic Word Search 

In [None]:
# TODO: build the filter dict for a keyword match on one property. The next
# cell shows the expected shape: "path", "operator" and a typed value key.
where_filter = #TODO

# TODO: run a Get query on the class, apply the filter with
# `.with_where(where_filter)` and execute it with `.do()`.
query_result = #TODO

# Show the raw response as indented JSON.
print(json.dumps(query_result, indent=2))

In [None]:
# Keyword filter: keep articles whose title matches the wildcard pattern.
where_filter = {
  "path": ["title"],
  "operator": "Like",
  "valueString": "*fastest animals*" #getting back animals that are fast not really matching "fastest animals"
}

# Fetch title, text and wiki_id of the Article objects that pass the filter.
query_result = (
  client.query
  .get("Article", ["title", "text","wiki_id"])
  .with_where(where_filter)
  .do()
)

# A filter that matches nothing leaves no first element to index, so guard
# the lookup instead of failing with an IndexError.
matched_articles = query_result['data']['Get']['Article']
if matched_articles:
    first_match = matched_articles[0]
    print(first_match['title']+'\n'+first_match['text'])
else:
    print("No articles matched the filter")

In [None]:
# Dump the complete raw response held in `query_result` as indented JSON.
print(json.dumps(query_result,indent=2))

### 2. Vector Search/ Semantic Search/ Neural Search

In [None]:
def semantic_search(query): #'fast animals that live in the sea'
    """Intended to run a near-text (vector) search for `query` and return the hits.

    Args:
        query: Search text, e.g. 'fast animals that live in the sea'.

    Returns:
        The value stored under response['data']['Get']['Article'].
        Presumably a list of dicts, one per matching article, since
        print_result iterates over it - confirm once the TODOs are filled in.
    """
    
    # TODO: near-text arguments built from `query`.
    nearText = #TODO

    # TODO: properties to fetch. print_result reads title, views, url, text and
    # _additional.distance from every hit, so presumably all of them are needed.
    properties = #TODO

    # TODO: execute the query; the response must contain data -> Get -> Article.
    response = #TODO

    # Unwrap the response envelope down to the Article entries.
    result = response['data']['Get']['Article']

    return result
    
#helper print function
def print_result(result):
    """Pretty-print search hits to stdout.

    Every hit is written as three lines followed by a blank line: a coloured
    header with title, view count and distance, the underlined URL, and the
    article text.

    Args:
        result: Iterable of dicts providing 'title', 'views', 'url', 'text'
            and '_additional' -> 'distance'.
    """
    # ANSI escape sequences used for the header and URL lines.
    header_style, underline_style, reset = "\033[95m", "\033[4m", "\033[0m"

    for hit in result:
        title = hit['title']
        views = hit['views']
        distance = hit['_additional']['distance']
        header_line = f"{header_style}{title} ({views}) {distance}{reset}"
        print(header_line)

        url_line = f"{underline_style}{hit['url']}{reset}"
        print(url_line)

        print(hit['text'])
        print()

In [None]:
# TODO: assign the hits returned by semantic_search(<your query text>);
# print_result below expects the list that function returns.
query_result = #TODO

print_result(query_result)

In [None]:
# TODO: try a second query text with semantic_search and compare the hits;
# print_result below expects the list that function returns.
query_result = #TODO

print_result(query_result)

### ... but wait ... this is a Multi-Lingual Model! 🗣❗️

 - You can use it to perform multilingual search! Search in one language that the model understands and receive relevant documents in any language!

In [None]:
# This is a multi-lingual model so it can take in queries in different languages!
# TODO: call semantic_search with a query written in a language other than English.


query_result = #TODO

print_result(query_result)

In [None]:
#"vacation spots" in Farsi
# TODO: call semantic_search with the Farsi phrase as the query text.

query_result = #TODO

print_result(query_result)

In [None]:
# GREAT ACTION movies in chinese
# TODO: call semantic_search with the Chinese phrase as the query text.
query_result = #TODO

print_result(query_result)

### 3. Hybrid Search: IF you want to mix both Keyword search and semantic search!

- Getting the best of both worlds!

In [None]:
# TODO: run a hybrid (keyword + vector) query and keep the raw response.
# NOTE(review): the v3 client presumably exposes this as `.with_hybrid(...)`
# on a Get query - confirm against the client documentation.
response = #TODO

# Show the raw response as indented JSON.
print(json.dumps(response, indent=2))

In [None]:
# TODO: run a second hybrid query (for example with a different query text or
# keyword/vector weighting) and keep the raw response.
response = #TODO

# Show the raw response as indented JSON.
print(json.dumps(response, indent=2))

### 4. Generative Search:
- Attach your search engine's output to an LLM and use it as context for generation!

In [None]:
# Preview the plain semantic-search hits for this query. The grouped-task
# example at the end of the notebook searches the same concepts.
bb_res = semantic_search("famous basketball players NBA")
print_result(bb_res)

In [None]:
#Q for the LLM
# TODO: prompt string for the generative step (presumably applied to each
# retrieved article individually - confirm in the Weaviate docs).
generatePrompt = #TODO

# TODO: Get query with a generate step that uses `generatePrompt`. Later cells
# read result['data']['Get']['Article'][0]['_additional']['generate']['singleResult']
# and ...['title'], so the response must contain those paths.
result = #TODO

In [None]:
# Dump the complete raw response held in `result` as indented JSON.
print(json.dumps(result, indent=2))

In [None]:
# Work with the first returned article only: show the text generated for it,
# then the title of the article that served as context.
first_article = result['data']['Get']['Article'][0]
generated_text = first_article['_additional']['generate']['singleResult']

print("Generated Text:\n" + generated_text + "\n")

print("Relevant Context:\n" + first_article['title'] + "\n")

#### Passing all relevant documents to complete a Task specified in the Prompt: Grouped Task

In [None]:
# TODO: task string for one generation over all retrieved articles together.
generateTask = #TODO

# TODO: Get query with a grouped generate step that uses `generateTask`. The
# print below reads 'groupedResult' from the first returned article and the
# 'title' of every returned article.
result = #TODO

print("Generated Text:\n" + result['data']['Get']['Article'][0]['_additional']['generate']['groupedResult']+"\n"+"\nArticle Titles Provided as Context:\n")

# The comprehension is used only for its print side effect; `k` ends up as a
# list of None values, one per article.
k = [print(result['data']['Get']['Article'][i]['title']+"\n") for i in range(len(result['data']['Get']['Article']))]

In [None]:
# Task sent to the generative step together with the retrieved articles.
generateTask = "Tell me a story where these people {title} fight each other, here's some information about them {text}"

# Near-text search limited to 5 Article objects, with a grouped generate step
# that uses the task above.
result = (
  client.query
  .get("Article", ["title",'text'])
  .with_generate(grouped_task=generateTask)
  .with_near_text({
    "concepts": ["famous basketball players NBA"]
  })
  .with_limit(5)
).do()

articles = result['data']['Get']['Article']

if articles:
    # The grouped result is read from the first returned object.
    grouped_text = articles[0]['_additional']['generate']['groupedResult']
    # The list printed below contains titles, so the heading says "Titles".
    print("Generated Text:\n" + grouped_text + "\n\nArticle Titles Provided as Context:\n")

    # Plain loop instead of a list comprehension used only for its side effect.
    for article in articles:
        print(article['title']+"\n")
else:
    # Nothing came back, so there is no first element to read the story from.
    print("No articles were returned for this query")