https://github.com/openai/openai-cookbook/tree/main/examples/vector_databases/weaviate


https://github.com/openai/openai-cookbook/blob/main/examples/vector_databases/weaviate/question-answering-with-weaviate-and-openai.ipynb

Docker Compose for Weaviate + OpenAI
https://github.com/openai/openai-cookbook/blob/main/examples/vector_databases/weaviate/docker-compose.yml
openai-cookbook/docker-compose.yml at main · openai/openai-cookbook · GitHub


In [20]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine, text
import json
import os

import tiktoken
from openai.embeddings_utils import get_embedding
from sklearn.cluster import AgglomerativeClustering

import openai
from dotenv import load_dotenv
# Load variables from a local .env file (if one exists) into the process
# environment so the os.getenv calls below can see them.
load_dotenv()
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
if OPENAI_API_KEY is not None:
    print("OPENAI_API_KEY is ready")
else:
    print("OPENAI_API_KEY environment variable not found")

# Create an SQLAlchemy engine to connect to the database.
# DATABASE_URL (environment or .env) overrides the connection string so that
# credentials do not have to be edited into the notebook; the fallback is the
# original local development URL, so behavior is unchanged when it is unset.
DATABASE_URL = os.getenv(
    'DATABASE_URL',
    'postgresql://postgres:mysecretpassword@localhost/postgres',
)
engine = create_engine(DATABASE_URL)

# Read the ASIN values from the CSV file.
# ASIN_LIST_PATH overrides the location (e.g. './data/external/asin_list.csv'
# when running from the project root); the fallback is the original path.
asin_list_path = os.getenv(
    'ASIN_LIST_PATH',
    '/Users/vladbordei/Documents/Development/ProductExplorer/data/external/asin_list.csv',
)
asin_list = pd.read_csv(asin_list_path)['asin'].tolist()

OPENAI_API_KEY is ready


In [8]:
import weaviate
# Open a client against the locally deployed Weaviate instance.
# For a hosted instance, use url="https://your-wcs-instance-name.weaviate.network/"
# and additionally pass
# auth_client_secret=weaviate.auth.AuthApiKey(api_key="<YOUR-WEAVIATE-API-KEY>").
openai_headers = {"X-OpenAI-Api-Key": os.getenv("OPENAI_API_KEY")}
client = weaviate.Client(
    url="http://localhost:8089/",
    additional_headers=openai_headers,
)

# Readiness probe: as the last expression of the cell its result is displayed,
# and it should be `True` when the instance is live.
client.is_ready()

True

In [None]:
### Step 2 - configure Weaviate Batch
# Settings passed below:
# - batch_size=100      -> starting batch size
# - dynamic=True        -> size is increased/decreased dynamically based on performance
# - timeout_retries=3   -> retries on timeout if something goes wrong

client.batch.configure(batch_size=100, dynamic=True, timeout_retries=3)

In [None]:
### Step 3 - import data
# NOTE(review): `dataset` is not defined in any cell visible here. It is used
# as a sized iterable of records indexed by "title", "text" and "url" - confirm
# where it is loaded before running this cell.

print("Importing Data")

counter = 0

with client.batch as batch:
    for line in dataset:
        # Progress report on every tenth record.
        if counter % 10 == 0:
            print(f"Import {counter} / {len(dataset)} ")

        # Map the source record onto the Article properties
        # (the source "text" field is stored as "content").
        properties = {
            "title": line["title"],
            "content": line["text"],
            "url": line["url"]
        }
        batch.add_data_object(properties, "Article")

        counter += 1

print("Importing Articles complete")

In [None]:
# Clear up the schema, so that we can recreate it
# NOTE(review): delete_all() is called on the whole schema, not only on the
# Article class - presumably this also removes any objects already imported;
# confirm before running against a shared instance. This cell also appears
# after the Step 3 import cell in the notebook - verify the intended run order.
client.schema.delete_all()
# Not the last expression of the cell, so this result is not displayed.
client.schema.get()

# Define the Schema object to use `text-embedding-ada-002` on `title` and `content`, but skip it for `url`
# - "vectorizer"/"text2vec-openai": embedding model settings (model "ada", version "002")
# - "qna-openai": settings for the question-answering module
# - "properties": the three Article fields; `url` is excluded from vectorization
#   through its own moduleConfig "skip" flag.
article_schema = {
    "class": "Article",
    "description": "A collection of articles",
    "vectorizer": "text2vec-openai",
    "moduleConfig": {
        "text2vec-openai": {
            "model": "ada",
            "modelVersion": "002",
            "type": "text"
        },
        "qna-openai": {
            "model": "text-davinci-002",
            "maxTokens": 16,
            "temperature": 0.0,
            "topP": 1,
            "frequencyPenalty": 0.0,
            "presencePenalty": 0.0
        }
    },
    "properties": [{
        "name": "title",
        "description": "Title of the article",
        "dataType": ["string"]
    },
    {
        "name": "content",
        "description": "Contents of the article",
        "dataType": ["text"]
    },
    {
        "name": "url",
        "description": "URL to the article",
        "dataType": ["string"],
        "moduleConfig": { "text2vec-openai": { "skip": True } }
    }]
}

# add the Article schema
# (registers the class described by the `article_schema` dict with the connected instance)
client.schema.create_class(article_schema)

# get the schema to make sure it worked
# (last expression of the cell, so the returned schema is shown as the cell output)
client.schema.get()

In [9]:
# Load a local .env file (if present) first, so values defined there are
# visible to os.getenv below.
load_dotenv()
# Connection URL for the Postgres database. DATABASE_URL overrides it so the
# credentials need not be hardcoded; the fallback is the original local
# development URL, so nothing changes when the variable is unset.
database = os.getenv(
    'DATABASE_URL',
    'postgresql://postgres:mysecretpassword@localhost/postgres',
)
engine = create_engine(database)

In [19]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering

# Sentence-embedding model ('all-mpnet-base-v2'); `embedder.encode` is used
# later in the notebook to turn each `data_label` into a vector for clustering.
embedder = SentenceTransformer('all-mpnet-base-v2')

In [10]:
def get_type_categories(engine=engine, data_table='weighted_trait_graph'):
    """Return the distinct values of the ``type`` column of ``data_table``.

    Args:
        engine: Database connection handed to ``pd.read_sql_query``. The
            default is the notebook-level ``engine`` as it exists when this
            cell is executed.
        data_table: Name of the table to query. It is interpolated directly
            into the SQL text, so it must be a trusted identifier.

    Returns:
        DataFrame holding the ``type`` column of the query result, one row
        per distinct value.
    """
    sql = f"""
        SELECT DISTINCT type FROM {data_table};
        """
    return pd.read_sql_query(sql=sql, con=engine)


# Fetch the distinct types once and keep them as a plain Python list.
types = get_type_categories(engine=engine, data_table='weighted_trait_graph')
types_list = types['type'].tolist()


In [15]:
def get_type_data(type, engine=engine, data_table='weighted_trait_graph'):
    """Return the distinct ``data_label`` values of ``data_table`` for one type.

    Args:
        type: Value matched against the ``type`` column. It is sent as a bound
            parameter rather than pasted into the SQL text, so values that
            contain quotes work and cannot alter the statement. (The name
            shadows the builtin; it is kept because callers pass it by keyword.)
        engine: SQLAlchemy connectable handed to ``pd.read_sql_query``. The
            default is the notebook-level ``engine`` as it exists when this
            cell is executed.
        data_table: Name of the table to query. Identifiers cannot be bound
            parameters, so this is still interpolated into the SQL text and
            must be a trusted value.

    Returns:
        DataFrame holding the ``data_label`` column of the query result, one
        row per distinct label.
    """
    # `text()` lets SQLAlchemy translate the :type placeholder into whatever
    # parameter style the database driver expects.
    query = text(f"""
        SELECT DISTINCT data_label FROM {data_table} WHERE type = :type;
        """)
    selected_data = pd.read_sql_query(query, engine, params={"type": type})
    return selected_data

# Work on the first available type.
# NOTE: assigning to `type` shadows the builtin for the rest of the notebook;
# the name is kept because other cells may rely on it.
type = types_list[0]
df = get_type_data(type=type, data_table='weighted_trait_graph')

In [31]:
# Encode all sentences in one batched call instead of invoking the model once
# per row through `.apply`; the result has one row per sentence.
labels = df['data_label'].tolist()
embeddings = np.asarray(embedder.encode(labels))

# Normalize the embeddings to unit length (row-wise L2 norm)
matrix = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)

# Store each sentence's normalized vector in the 'embedding' column.
# The vectors are already numpy arrays, so no further conversion is needed.
df['embedding'] = list(matrix)

# Fit clusters
# n_clusters=None together with distance_threshold=1 passes a distance
# threshold to the estimator instead of a fixed number of clusters.
clustering = AgglomerativeClustering(n_clusters=None, distance_threshold=1)
cluster_model = clustering.fit(matrix)

# One cluster label per row of `df`, in row order.
cluster_assignment = cluster_model.labels_



In [32]:
# Store each row's cluster label next to its sentence
df['cluster_assignment'] = cluster_assignment

# Group the sentences by cluster label, keeping first-seen cluster order
clustered_sentences = {}
for cluster_id, sentence in zip(cluster_assignment, df['data_label']):
    clustered_sentences.setdefault(cluster_id, []).append(sentence)

# Print the clusters (labels are shown 1-based)
for cluster_id, members in clustered_sentences.items():
    print("Cluster ", cluster_id + 1)
    print(members)
    print("")

Cluster  6
['improved magnet strength and quality', 'improved magnet strength', 'improved magnet strength and functionality', 'improved quality control for magnets', 'improved magnet strength, quality, and functionality', 'improved noise level and magnet strength', 'improved magnet functionality', 'improved magnet pushability']

Cluster  20
['added string attachment for pen to prevent loss', 'added string attachment for pens to prevent loss', 'added string attachment for pen and more color options', 'added string attachment for pen']

Cluster  3
['improved pen security to the board', 'improved secure attachment of pen to board', 'improved secure attachment for pen', 'improved secure attachment for pen and noise level', 'improved pen storage design']

Cluster  5
['added more engaging features for adults', 'added more hole pattern options for adults']

Cluster  14

Cluster  2
['improved quality control with attached pen', 'improved stylus attachment mechanism', 'improved pen attachment d