In [None]:
# !pip install -q torch transformers accelerate bitsandbytes langchain sentence-transformers faiss-gpu openpyxl pacmap datasets langchain-community ragatouille

In [28]:
from tqdm.notebook import tqdm
import pandas as pd
from typing import Optional, List, Tuple
import matplotlib.pyplot as plt

pd.set_option("display.max_colwidth", None)  # This will be helpful when visualizing retriever outputs

In [29]:
from dotenv import load_dotenv
from time import sleep
import os
import json
from tqdm.notebook import tqdm

load_dotenv()  # take environment variables from .env.

True

In [30]:
# Open a psycopg2 connection to the source PostgreSQL database.
# Every connection setting is read from an environment variable.
import psycopg2 as pg

conn = pg.connect(
    **{
        keyword: os.environ[variable]
        for keyword, variable in (
            ("dbname", "POSTGRES_DB"),
            ("user", "POSTGRES_USER"),
            ("password", "POSTGRES_PASSWORD"),
            ("host", "POSTGRES_HOST"),
        )
    }
)

In [None]:
# Load the complete metacritic_pages table into a DataFrame.
# Columns: object_id, title, score, platforms, release_date, developers, publisher, genres
metacritic_query = """SELECT
            *
    FROM metacritic_pages;"""

df = pd.read_sql_query(sql=metacritic_query, con=conn)

  df = pd.read_sql_query(


In [6]:
# Preview the first rows of the table loaded from Postgres.
df.head()

Unnamed: 0,object_id,title,score,platforms,release_date,developers,publisher,genres
0,6718cddc17c581114a9d20cf,Super Mario Galaxy 2,97.0,[Wii],2010-05-23,[Nintendo EAD Tokyo],Nintendo,[3D Platformer]
1,6718cedb17c581114a9d20d0,The Legend of Zelda: Breath of the Wild,97.0,"[Wii U, Nintendo Switch]",2017-03-03,[Nintendo],Nintendo,[Open-World Action]
2,6718cedb17c581114a9d20d1,Red Dead Redemption 2,97.0,"[Xbox One, PlayStation 4, PC]",2018-10-26,[Rockstar Games],Rockstar Games,[Open-World Action]
3,6718cedb17c581114a9d20d2,Grand Theft Auto V,97.0,"[PlayStation 3, Xbox 360, PlayStation 4, Xbox One, PC, PlayStation 5, Xbox Series X]",2014-11-18,[Rockstar North],Rockstar Games,[Open-World Action]
4,6718cedb17c581114a9d20d3,Super Mario Odyssey,97.0,[Nintendo Switch],2017-10-27,[Nintendo],Nintendo,[3D Platformer]


In [7]:
# (rows, columns) of the loaded table.
df.shape

(8015, 8)


## Create and populate Vectorstore using PostgreSQL and PGVector

As described in the LangChain docs:

"PGVector: An implementation of LangChain vectorstore abstraction using postgres as the backend and utilizing the pgvector extension."

https://python.langchain.com/docs/integrations/vectorstores/pgvector/

#### PGVector docs:
https://api.python.langchain.com/en/latest/vectorstores/langchain_postgres.vectorstores.PGVector.html

In [44]:
from langchain_openai import OpenAIEmbeddings

# Embedding settings. The same model name and vector size are reused by the
# vector store and by the batch-embedding requests further down.
embeddings_model = "text-embedding-3-large"
dimensions = 256  # requested length of each embedding vector

# NOTE(review): no api_key is passed here, while the OpenAI client cell reads
# OPENAI_APIKEY explicitly — confirm which variable(s) the .env file defines.
embeddings = OpenAIEmbeddings(
    model=embeddings_model,
    dimensions=dimensions,
)


In [45]:
from urllib.parse import quote

from langchain_postgres.vectorstores import DistanceStrategy, PGVector

# Requires a reachable Postgres instance with the pgvector extension enabled.
# The user name and password are percent-encoded so that characters such as
# '@', ':' or '/' cannot corrupt the connection URL.
pg_user = quote(os.environ["POSTGRES_USER"], safe="")
pg_password = quote(os.environ["POSTGRES_PASSWORD"], safe="")
connection = (
    f"postgresql+psycopg://{pg_user}:{pg_password}"
    f'@{os.environ["POSTGRES_HOST"]}:5432/{os.environ["POSTGRES_DB_VECTOR"]}'
)  # Uses psycopg3!
collection_name = "metacritic_pages"

# https://api.python.langchain.com/en/latest/vectorstores/langchain_postgres.vectorstores.PGVector.html

# The store is configured with the same embedding model and vector length that
# are used for the batch-embedding requests, and compares vectors by cosine distance.
vector_store = PGVector(
    embeddings=embeddings,
    embedding_length=dimensions,
    distance_strategy=DistanceStrategy.COSINE,
    collection_name=collection_name,
    connection=connection,
    use_jsonb=True,
)

## Prepare data queried from Postgres

In [10]:
from langchain.docstore.document import Document

# Columns copied unchanged from each table row into the document metadata.
METADATA_FIELDS = (
    "object_id",
    "title",
    "score",
    "platforms",
    "release_date",
    "developers",
    "publisher",
    "genres",
)


def _row_to_document(row):
    """Build one Document from a single table row given as a dict.

    The page content is a one-line textual summary of the row; the metadata
    holds the raw column values listed in METADATA_FIELDS.
    """
    summary = f"Game name: '{row['title']}'; score: {row['score']}; platforms: {row['platforms']}; release date: {row['release_date']}; developers: {row['developers']}; publisher: {row['publisher']}; genres: {row['genres']}"
    return Document(
        page_content=summary,
        metadata={field: row[field] for field in METADATA_FIELDS},
    )


docs = [_row_to_document(row) for row in tqdm(df.to_dict(orient="records"))]

  0%|          | 0/8015 [00:00<?, ?it/s]

In [52]:
# Count the rows whose score is missing (NaN).
df.score.isna().sum()

5

## Create embeddings using OpenAI batch API

In [12]:
# Write one embeddings request per document to a JSONL file (one JSON object
# per line) that is later uploaded as the batch input.
batch_filename = 'batchinput_embed_metacritic_pages.jsonl'


def _embedding_request(document):
    """Return the request dict for one document, keyed by its object id."""
    return {
        "custom_id": str(document.metadata['object_id']),
        "method": "POST",
        "url": "/v1/embeddings",
        "body": {
            "input": document.page_content,
            "model": embeddings_model,
            "encoding_format": "float",
            "dimensions": dimensions,
            "user": "user_001",
        },
    }


with open(batch_filename, 'w') as batch_file:
    for document in tqdm(docs):
        batch_file.write(json.dumps(_embedding_request(document)) + '\n')


  0%|          | 0/8015 [00:00<?, ?it/s]

In [13]:
from openai import OpenAI

client = OpenAI(api_key=os.environ['OPENAI_APIKEY'])

# Upload the JSONL request file. The context manager guarantees that the local
# file handle is closed once the upload call returns (or raises).
with open(batch_filename, "rb") as batch_input_handle:
    batch_input_file = client.files.create(
        file=batch_input_handle,
        purpose="batch",
    )

batch_input_file_id = batch_input_file.id

# Create the batch job that runs every request in the uploaded file against
# the embeddings endpoint.
batch_create_msg = client.batches.create(
    input_file_id=batch_input_file_id,
    endpoint="/v1/embeddings",
    completion_window="24h",
    metadata={
        "description": "metacritic embeddings all 8K"
    },
)
batch_create_msg

Batch(id='batch_6718d65381d881908175d1b785c20cc5', completion_window='24h', created_at=1729680979, endpoint='/v1/embeddings', input_file_id='file-S6l0liz9kM1RqItSDIInEDrl', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1729767379, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'metacritic embeddings all 8K'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))

In [34]:
# Fetch the current state of the batch job and report its progress.
# To inspect an earlier job instead, pass that job's batch id string to retrieve().
batch_info = client.batches.retrieve(batch_create_msg.id)
job_description = batch_info.metadata['description']
print(job_description, batch_info.status)
print(batch_info.request_counts)

metacritic embeddings all 8K completed
BatchRequestCounts(completed=8015, failed=0, total=8015)


In [35]:
# Local path where the downloaded batch results are stored (JSONL).
batch_output_filename = 'batchoutput_embed_metacritic_pages.jsonl'

In [36]:
# Save the batch results to disk once the job has completed; otherwise just
# report the job's current status.
if batch_info.status != 'completed':
    print(f"Batch job {batch_info.metadata['description']} is still {batch_info.status}.")
else:
    output_bytes = client.files.content(batch_info.output_file_id).content
    with open(batch_output_filename, "wb") as output_file:
        output_file.write(output_bytes)

    print(f"{batch_output_filename} completed.")

batchoutput_embed_metacritic_pages.jsonl completed.


In [37]:

# Parse the downloaded results file: one JSON object per line.
# - UTF-8 is requested explicitly so decoding does not depend on the OS locale.
# - Blank lines are skipped instead of crashing json.loads.
with open(batch_output_filename, "r", encoding="utf-8") as f:
    batch_data = [json.loads(line) for line in tqdm(f) if line.strip()]



0it [00:00, ?it/s]

In [39]:

# Reduce every batch result to the two fields needed later: the request's
# custom_id and the first embedding vector in the response body.
output_dicts = [
    {
        "id": result["custom_id"],
        "embedding": result["response"]["body"]["data"][0]["embedding"],
    }
    for result in tqdm(batch_data)
]

  0%|          | 0/8015 [00:00<?, ?it/s]

In [47]:
# Length of the first returned embedding vector.
len(output_dicts[0]['embedding'])

256

In [40]:
# Order both lists by object id so embeddings and documents can be paired by position.
output_dicts.sort(key=lambda entry: entry['id'])
docs.sort(key=lambda document: document.metadata['object_id'])

In [None]:
# Normalise metadata values in place: the release date becomes a string and a
# missing (NaN) score becomes None.
for document in docs:
    meta = document.metadata
    meta['release_date'] = str(meta['release_date'])
    if pd.isnull(meta['score']):
        meta['score'] = None

In [42]:
# Sanity check: after sorting, embeddings and documents must pair up one-to-one.
# The length check catches missing or extra entries on either side, which a
# position-by-position comparison alone would miss or report as an IndexError.
assert len(output_dicts) == len(docs), (
    f"{len(output_dicts)} embeddings vs {len(docs)} documents"
)
for position, (embedding_row, document) in enumerate(zip(output_dicts, docs)):
    assert embedding_row['id'] == document.metadata['object_id'], (
        f"id mismatch at position {position}: "
        f"{embedding_row['id']!r} != {document.metadata['object_id']!r}"
    )

In [54]:
# Insert the precomputed embeddings into the vector store in fixed-size batches.
# The batch boundaries are derived from the data length, so no batch count is
# hard-coded, no empty batch is ever sent, and the logged range is the real one.
BATCH_SIZE = 1000
total_embeddings = len(output_dicts)

for start in tqdm(range(0, total_embeddings, BATCH_SIZE)):
    end = min(start + BATCH_SIZE, total_embeddings)
    print(f"Adding embeddings {start} to {end}...")
    doc_batch = docs[start:end]
    embedding_batch = output_dicts[start:end]
    vector_store.add_embeddings(
        texts=[doc.page_content for doc in doc_batch],
        embeddings=[em['embedding'] for em in embedding_batch],
        metadatas=[doc.metadata for doc in doc_batch],
        ids=[em['id'] for em in embedding_batch],
    )

  0%|          | 0/9 [00:00<?, ?it/s]

Adding embeddings 0 to 1000...
Adding embeddings 1000 to 2000...
Adding embeddings 2000 to 3000...
Adding embeddings 3000 to 4000...
Adding embeddings 4000 to 5000...
Adding embeddings 5000 to 6000...
Adding embeddings 6000 to 7000...
Adding embeddings 7000 to 8000...
Adding embeddings 8000 to 9000...


In [55]:
# Smoke-test retrieval: run a free-text query against the store and print each
# hit as "<score> <page content> [<metadata>]".
query = "super mario 2"

results = vector_store.similarity_search_with_score(
    query=query,
    k=50,
    filter={},  # empty filter: no metadata restriction is applied
)
for document, score in results:
    print(f"* {score} {document.page_content} [{document.metadata}]")

* 0.4137864359489929 Game name: 'Super Mario Bros. Wonder'; score: 92.0; platforms: ['Nintendo Switch']; release date: 2023-10-20; developers: ['Nintendo']; publisher: Nintendo; genres: ['2D Platformer'] [{'score': 92.0, 'title': 'Super Mario Bros. Wonder', 'genres': ['2D Platformer'], 'object_id': '6718cede17c581114a9d20fe', 'platforms': ['Nintendo Switch'], 'publisher': 'Nintendo', 'developers': ['Nintendo'], 'release_date': '2023-10-20'}]
* 0.4416879748291974 Game name: 'Mario & Luigi: Superstar Saga + Bowser's Minions'; score: 81.0; platforms: ['3DS']; release date: 2017-10-06; developers: ['Alphadream Corporation']; publisher: Nintendo; genres: ['JRPG'] [{'score': 81.0, 'title': "Mario & Luigi: Superstar Saga + Bowser's Minions", 'genres': ['JRPG'], 'object_id': '6718cfe517c581114a9d2630', 'platforms': ['3DS'], 'publisher': 'Nintendo', 'developers': ['Alphadream Corporation'], 'release_date': '2017-10-06'}]
* 0.4585527181625366 Game name: 'New Super Mario Bros. 2'; score: 78.0; pl

In [33]:
# Inspect the first hit, a (Document, score) pair.
results[0]

(Document(id='20732', metadata={'id': 20732, 'property_id': 27045, 'entity_class': 'Character', 'target_class': 'Designer', 'property_name': 'characterDesignBy', 'entity_subclass': 'Hero', 'property_description': "Indicates the designer responsible for creating the character's appearance."}, page_content="Poperty name: 'characterDesignBy'; property description: Indicates the designer responsible for creating the character's appearance. This property applies to objects of class 'Character' and has a target object or value with class 'Designer'.             Using OWL: 'characterDesignBy' rdfs:comment 'Indicates the designer responsible for creating the character's appearance.'; rdfs:domain 'Character'; rdfs:range 'Designer'."),
 0.24955305462443178)

In [74]:
# Delete the entries whose ids are the strings "0" .. "999".
# NOTE(review): the rows inserted by this notebook use the batch custom_id
# (the stringified object_id) as their id, so these numeric ids look like a
# leftover from a different run — confirm before executing this destructive cell.
vector_store.delete(ids=[str(i) for i in range(1000)])

In [1]:
import pacmap
import numpy as np
import plotly.express as px

embedding_projector = pacmap.PaCMAP(n_components=2, n_neighbors=None, MN_ratio=0.5, FP_ratio=2.0, random_state=1)

# `vector_store` is a PGVector store, which has no FAISS-style
# `.index.reconstruct_n`. The embedding vectors are already in memory in
# `output_dicts`, so the first 1000 of them are used directly.
# Despite its name, this list holds the original vectors that are fed into the
# projection, not the 2-D result.
embeddings_2d = [em['embedding'] for em in output_dicts[:1000]]

# Fit the data (the index of transformed data corresponds to the index of the original data)
documents_projected = embedding_projector.fit_transform(np.array(embeddings_2d), init="pca")

ModuleNotFoundError: No module named 'plotly'

In [None]:
def _primary_genre(metadata):
    """Return the first entry of ``metadata["genres"]`` as a string.

    Falls back to "Unknown" when the key is missing, empty, or not a list/tuple.
    """
    genres = metadata.get("genres")
    if isinstance(genres, (list, tuple)) and genres:
        return str(genres[0])
    return "Unknown"


# Pair every projected point with its document. zip() stops at the shorter
# input, so only documents that actually have a projected point are plotted.
# NOTE(review): assumes row i of `documents_projected` belongs to `docs[i]` —
# confirm the projection was fitted on embeddings in the same order as `docs`.
# A dedicated name is used so the source table held in `df` is not overwritten.
projection_df = pd.DataFrame(
    [
        {
            "x": point[0],
            "y": point[1],
            "class": _primary_genre(doc.metadata),
            "extract": doc.page_content[:100],
            "symbol": "circle",
            "size_col": 4,
        }
        for point, doc in zip(documents_projected, docs)
    ]
)

# Visualize the embedding, coloured by the "class" column built above.
fig = px.scatter(
    projection_df,
    x="x",
    y="y",
    color="class",
    hover_data=["extract"],
    size="size_col",
    symbol="symbol",
    color_discrete_map={"User query": "black"},
    width=1000,
    height=700,
)
fig.update_traces(
    marker=dict(opacity=1, line=dict(width=0, color="DarkSlateGrey")),
    selector=dict(mode="markers"),
)
fig.update_layout(
    legend_title_text="<b>Primary genre</b>",
    title="<b>2D Projection of Chunk Embeddings via PaCMAP</b>",
)
fig.show()