In [None]:
# !pip install -q torch transformers transformers accelerate bitsandbytes langchain sentence-transformers faiss-gpu openpyxl pacmap datasets langchain-community ragatouille

In [52]:
from tqdm.notebook import tqdm
import pandas as pd
from typing import Optional, List, Tuple
import matplotlib.pyplot as plt

pd.set_option("display.max_colwidth", None)  # This will be helpful when visualizing retriever outputs

In [53]:
from dotenv import load_dotenv
from time import sleep
import os
import json

load_dotenv()  # take environment variables from .env.

True

In [54]:
# establish a connection to the PostgreSQL database
import psycopg2 as pg

conn = pg.connect(
    dbname=os.environ["POSTGRES_DB"],
    user=os.environ["POSTGRES_USER"],
    password=os.environ["POSTGRES_PASSWORD"],
    host=os.environ["POSTGRES_HOST"]
)

In [4]:
df = pd.read_sql_query(
    """SELECT
            fe.object_id as entity_id,
            fe.name as entity_name,
            fe.class as entity_class,
            fe.subclass as entity_subclass,
            fe.description as entity_description,

            fp.id as property_id,
            fp.property_name as property_name,
            fp.description as property_description,
            fp.target_entity as target_entity,
            fp.target_class as target_class

    FROM fandom_properties_clean fp join fandom_entities_clean fe on fp.object_id = fe.object_id;""",
      conn)

# id	object_id	name	value	description	parent	created_at	id	object_id	name	class	subclass	description	created_at

  df = pd.read_sql_query(


In [5]:
df.head()

Unnamed: 0,entity_id,entity_name,entity_class,entity_subclass,entity_description,property_id,property_name,property_description,target_entity,target_class
0,67139f9e8f64cb721b2f3eec,Adam Holograd,Character,LordCommander,"Adam Holograd is a character from the game Bravely Default, known as the Lord Commander of the Holograd Empire.",1,hasHome,The home location of the character.,Holograd,Location
1,67139f9e8f64cb721b2f3eec,Adam Holograd,Character,LordCommander,"Adam Holograd is a character from the game Bravely Default, known as the Lord Commander of the Holograd Empire.",2,affiliatedWith,The faction or group the character is associated with.,Holograd Empire,Faction
2,67139f9e8f64cb721b2f3eec,Adam Holograd,Character,LordCommander,"Adam Holograd is a character from the game Bravely Default, known as the Lord Commander of the Holograd Empire.",3,hasOccupation,The professional role or title held by the character.,Lord Commander of the Holograd Empire,Occupation
3,67139f9e8f64cb721b2f3eec,Adam Holograd,Character,LordCommander,"Adam Holograd is a character from the game Bravely Default, known as the Lord Commander of the Holograd Empire.",4,hasRace,The racial identity of the character.,Human,Race
4,67139f9e8f64cb721b2f3eec,Adam Holograd,Character,LordCommander,"Adam Holograd is a character from the game Bravely Default, known as the Lord Commander of the Holograd Empire.",5,hasGender,The gender identity of the character.,Male,Gender


In [6]:
df.shape

(66246, 10)


## Create and populate Vectorstore using PostgreSQL and PGVector

As described in LangChain docs:

"PGVector: An implementation of LangChain vectorstore abstraction using postgres as the backend and utilizing the pgvector extension."

https://python.langchain.com/docs/integrations/vectorstores/pgvector/

#### PGVector docs:
https://api.python.langchain.com/en/latest/vectorstores/langchain_postgres.vectorstores.PGVector.html

In [55]:
from langchain_openai import OpenAIEmbeddings

embeddings_model = "text-embedding-3-large"
dimensions = 256
embeddings = OpenAIEmbeddings(model=embeddings_model, dimensions=dimensions)


In [56]:
from langchain_postgres import PGVector
from langchain_postgres.vectorstores import PGVector, DistanceStrategy

# See docker command above to launch a postgres instance with pgvector enabled.
connection = f'postgresql+psycopg://{os.environ["POSTGRES_USER"]}:{os.environ["POSTGRES_PASSWORD"]}@{os.environ["POSTGRES_HOST"]}:5432/{os.environ["POSTGRES_DB_VECTOR"]}'  # Uses psycopg3!
collection_name = "all_properties"

# https://api.python.langchain.com/en/latest/vectorstores/langchain_postgres.vectorstores.PGVector.html

vector_store = PGVector(
    embeddings=embeddings,
    embedding_length=dimensions,
    distance_strategy=DistanceStrategy.COSINE,
    collection_name=collection_name,
    connection=connection,
    use_jsonb=True
)

## Prepare data queried from Postgres

In [10]:
df_unique = df.drop_duplicates(subset=["property_name", "target_class", "entity_class", "property_description"]).reset_index(drop=True)

In [11]:
df_unique.shape

(52352, 10)

In [12]:
df_unique['id'] = df_unique.index

In [13]:
df_unique.head().to_dict(orient="records")

[{'entity_id': '67139f9e8f64cb721b2f3eec',
  'entity_name': 'Adam Holograd',
  'entity_class': 'Character',
  'entity_subclass': 'LordCommander',
  'entity_description': 'Adam Holograd is a character from the game Bravely Default, known as the Lord Commander of the Holograd Empire.',
  'property_id': 1,
  'property_name': 'hasHome',
  'property_description': 'The home location of the character.',
  'target_entity': 'Holograd',
  'target_class': 'Location',
  'id': 0},
 {'entity_id': '67139f9e8f64cb721b2f3eec',
  'entity_name': 'Adam Holograd',
  'entity_class': 'Character',
  'entity_subclass': 'LordCommander',
  'entity_description': 'Adam Holograd is a character from the game Bravely Default, known as the Lord Commander of the Holograd Empire.',
  'property_id': 2,
  'property_name': 'affiliatedWith',
  'property_description': 'The faction or group the character is associated with.',
  'target_entity': 'Holograd Empire',
  'target_class': 'Faction',
  'id': 1},
 {'entity_id': '67139f

In [None]:
from langchain.docstore.document import Document

docs = [
    Document(
        page_content=f"Poperty name: '{doc['property_name']}'; property description: {doc['property_description']} This property applies to objects of class '{doc['entity_class']}' and has a target object or value with class '{doc['target_class']}'. \
Using OWL: '{doc['property_name']}' rdfs:comment '{doc['property_description']}'; rdfs:domain '{doc['entity_class']}'; rdfs:range '{doc['target_class']}'.",
        metadata={
            "id": doc["id"],
            "property_id": doc["property_id"],
            "entity_class": doc["entity_class"],
            "entity_subclass": doc["entity_subclass"],
            "property_name": doc["property_name"],
            "target_class": doc["target_class"],
            "property_description": doc["property_description"],
            }
        ) for doc in tqdm(df_unique.to_dict(orient="records"))
]

  0%|          | 0/52352 [00:00<?, ?it/s]

## Create embeddings using OpenAI batch API

In [62]:
batch_filename = f'batchinput_embed_properties_all_30K_2.jsonl'
with open(batch_filename, 'w') as f:
    for doc in tqdm(docs[30000:]):
        request = {
            "custom_id": str(doc.metadata['id']),
            "method": "POST",
            "url": "/v1/embeddings",
            "body": {
                "input": doc.page_content,
                "model": embeddings_model,
                "encoding_format": "float",
                "dimensions": dimensions,
                "user": "user_001",
            }
        }
        json.dump(request, f)
        f.write('\n')


  0%|          | 0/22352 [00:00<?, ?it/s]

In [None]:
from openai import OpenAI
client = OpenAI(api_key=os.environ['OPENAI_APIKEY'])

batch_input_file = client.files.create(
  file=open(batch_filename, "rb"),
  purpose="batch"
)

batch_input_file_id = batch_input_file.id

batch_create_msg = client.batches.create(
    input_file_id=batch_input_file_id,
    endpoint="/v1/embeddings",
    completion_window="24h",
    metadata={
      "description": f"property embeddings all 30K 2"
    }
)
batch_create_msg

In [41]:
# batch_info = client.batches.retrieve(batch_create_msg.id)
batch_info = client.batches.retrieve("batch_6717c4726d148190b3d3bc2dc00093ac")
print(batch_info.metadata['description'], batch_info.status)
print(batch_info.request_counts)

property embeddings all 30K 2 completed
BatchRequestCounts(completed=22352, failed=0, total=22352)


In [42]:
batch_info

Batch(id='batch_6717c4726d148190b3d3bc2dc00093ac', completion_window='24h', created_at=1729610866, endpoint='/v1/embeddings', input_file_id='file-ixz7vab17rH55efmDx4NVAoj', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1729619794, error_file_id=None, errors=None, expired_at=None, expires_at=1729697266, failed_at=None, finalizing_at=1729616205, in_progress_at=1729610871, metadata={'description': 'property embeddings all 30K 2'}, output_file_id='file-cg9RXjCmd8KThPJbkIV0dT7P', request_counts=BatchRequestCounts(completed=22352, failed=0, total=22352))

In [43]:
batch_output_filename = 'batchoutput_embed_properties_all_30K_2.jsonl'

In [44]:
if batch_info.status == 'completed':
    file_response = client.files.content(batch_info.output_file_id)
    with open(batch_output_filename, "wb") as f:
        f.write(file_response.content)

    print(f"{batch_output_filename} completed.")
else:
    print(f"Batch job {batch_info.metadata['description']} is still {batch_info.status}.")

batchoutput_embed_properties_all_30K_2.jsonl completed.


In [45]:

batch_data = []
with open(batch_output_filename, "r") as f:
    for i, line in tqdm(enumerate(f)):
        batch_data.append(json.loads(line))



0it [00:00, ?it/s]

In [46]:

output_dicts = []
for response in tqdm(batch_data):
    content = {
        "id": int(response["custom_id"]),
        "embedding": response["response"]["body"]["data"][0]["embedding"]
    }
    # print(content)
    # break
    output_dicts.append(content)

  0%|          | 0/22352 [00:00<?, ?it/s]

In [47]:
len(output_dicts[0]['embedding'])

256

In [49]:
# vector_store.add_documents(docs, ids=[doc.metadata["id"] for doc in docs[:100]])
for i in tqdm(range(30)):
    start = i*1000
    end = (i+1)*1000
    if start > len(output_dicts):
        break
    print(f"Adding embeddings {start} to {end}...")
    vector_store.add_embeddings(texts=[docs[em['id']].page_content for em in output_dicts[start:end]], embeddings=[em['embedding'] for em in output_dicts[start:end]], metadatas=[docs[em['id']].metadata for em in output_dicts[start:end]], ids=[em['id'] for em in output_dicts[start:end]])

  0%|          | 0/30 [00:00<?, ?it/s]

Adding embeddings 0 to 1000...
Adding embeddings 1000 to 2000...
Adding embeddings 2000 to 3000...
Adding embeddings 3000 to 4000...
Adding embeddings 4000 to 5000...
Adding embeddings 5000 to 6000...
Adding embeddings 6000 to 7000...
Adding embeddings 7000 to 8000...
Adding embeddings 8000 to 9000...
Adding embeddings 9000 to 10000...
Adding embeddings 10000 to 11000...
Adding embeddings 11000 to 12000...
Adding embeddings 12000 to 13000...
Adding embeddings 13000 to 14000...
Adding embeddings 14000 to 15000...
Adding embeddings 15000 to 16000...
Adding embeddings 16000 to 17000...
Adding embeddings 17000 to 18000...
Adding embeddings 18000 to 19000...
Adding embeddings 19000 to 20000...
Adding embeddings 20000 to 21000...
Adding embeddings 21000 to 22000...
Adding embeddings 22000 to 23000...


In [60]:
query = "data property that describes or points to a Number or Data Vaue"

results = vector_store.similarity_search_with_score(
    query=query,
    k=50,
    filter={
        # "entity_class": {"$in": ['Game']},
        # "target_class": {"$in": ['Location', 'City']}
    }
)
for doc in results:
    print(f"* {doc[1]} {doc[0].page_content} [{doc[0].metadata}]")

* 0.3783160448074341 Poperty name: 'hasNumber'; property description: Specifies the numerical identifier of the in-game item. This property applies to objects of class 'Item' and has a target object or value with class 'DataValue'.             Using OWL: 'hasNumber' rdfs:comment 'Specifies the numerical identifier of the in-game item.'; rdfs:domain 'Item'; rdfs:range 'DataValue'. [{'id': 37877, 'property_id': 50397, 'entity_class': 'Item', 'target_class': 'DataValue', 'property_name': 'hasNumber', 'entity_subclass': 'Vehicle', 'property_description': 'Specifies the numerical identifier of the in-game item.'}]
* 0.37843549251556396 Poperty name: 'hasValue'; property description: Represents the monetary value or importance of the item, if applicable. This property applies to objects of class 'Item' and has a target object or value with class 'DataValue'.             Using OWL: 'hasValue' rdfs:comment 'Represents the monetary value or importance of the item, if applicable.'; rdfs:domain '

In [None]:
query = "property of a game that describes the developer/designer of the game"

response = client.embeddings.create(
    model=embeddings_model, input=query, encoding_format="float", dimensions=dimensions
)

embedding = response.data[0].embedding

results = vector_store.similarity_search_with_score_by_vector(
    embedding=embedding,
    k=50,
    filter={
        "entity_class": {"$in": ['Game']},
        # "target_class": {"$in": ['Location', 'City']}
    }
)
for doc in results:
    print(f"* {doc[1]} {doc[0].page_content} [{doc[0].metadata}]")

In [33]:
results[0]

(Document(id='20732', metadata={'id': 20732, 'property_id': 27045, 'entity_class': 'Character', 'target_class': 'Designer', 'property_name': 'characterDesignBy', 'entity_subclass': 'Hero', 'property_description': "Indicates the designer responsible for creating the character's appearance."}, page_content="Poperty name: 'characterDesignBy'; property description: Indicates the designer responsible for creating the character's appearance. This property applies to objects of class 'Character' and has a target object or value with class 'Designer'.             Using OWL: 'characterDesignBy' rdfs:comment 'Indicates the designer responsible for creating the character's appearance.'; rdfs:domain 'Character'; rdfs:range 'Designer'."),
 0.24955305462443178)

In [None]:
# vector_store.delete(ids=[str(i) for i in range(1000)])

In [1]:
import pacmap
import numpy as np
import plotly.express as px

embedding_projector = pacmap.PaCMAP(n_components=2, n_neighbors=None, MN_ratio=0.5, FP_ratio=2.0, random_state=1)

embeddings_2d = [
    list(vector_store.index.reconstruct_n(idx, 1)[0]) for idx in range(len(docs[:1000]))
]

# Fit the data (the index of transformed data corresponds to the index of the original data)
documents_projected = embedding_projector.fit_transform(np.array(embeddings_2d), init="pca")

ModuleNotFoundError: No module named 'plotly'

In [None]:
df = pd.DataFrame.from_dict(
    [
        {
            "x": documents_projected[i, 0],
            "y": documents_projected[i, 1],
            "class": RAW_KNOWLEDGE_BASE[:1000][i].metadata["entity_class"],
            "extract": RAW_KNOWLEDGE_BASE[:1000][i].page_content[:100],
            "symbol": "circle",
            "size_col": 4,
        }
        for i in range(len(RAW_KNOWLEDGE_BASE[:1000]))
    ]
)

# Visualize the embedding
fig = px.scatter(
    df,
    x="x",
    y="y",
    color="source",
    hover_data="extract",
    size="size_col",
    symbol="symbol",
    color_discrete_map={"User query": "black"},
    width=1000,
    height=700,
)
fig.update_traces(
    marker=dict(opacity=1, line=dict(width=0, color="DarkSlateGrey")),
    selector=dict(mode="markers"),
)
fig.update_layout(
    legend_title_text="<b>Chunk source</b>",
    title="<b>2D Projection of Chunk Embeddings via PaCMAP</b>",
)
fig.show()