In [None]:
# %pip install opensearch-py pyspark matplotlib scikit-learn seaborn pandas

Imports

In [None]:
import os
import sys

# Location of the pfun-cma-model checkout. Set the PFUN_CMA_MODEL_ROOT
# environment variable to point at a different checkout; when it is unset the
# original developer path is used, so existing setups keep working unchanged.
rootpath = os.path.abspath(
    os.environ.get("PFUN_CMA_MODEL_ROOT", "/home/robertc/Git/pfun-cma-model")
)
# Prepend only once so re-running the cell does not pile up duplicate entries.
if rootpath not in sys.path:
    sys.path.insert(0, rootpath)
from pfun_cma_model.embed import Embedder

Run the embedder: generate embeddings and send them to the OpenSearch domain

In [None]:
# Baseline Embedder settings: an 8-point "random" parameter grid, with the SSH
# tunnel requirement switched off.
defaults = {
    "grid_params": {"num": 8, "kind": "random"},
    "require_ssh_tunnel": False,
}
# Shallow copy, so keys can be overridden in `kwds` without touching `defaults`.
kwds = dict(defaults)
embedder = Embedder(**kwds)
embedder.run()  # type: ignore

Initialize the OpenSearch client and fetch the first page of embeddings

In [None]:
# Reuse the client that the embedder already holds.
osearc = embedder.opensearch_client

# First page of a scrolled search over the "embeddings" index: up to 10 hits,
# returning only the "embedding" source field, with a 2-minute scroll window.
first_page_request = {"size": 10, "_source": "embedding"}
res = osearc.search(index="embeddings", body=first_page_request, scroll="2m")

# Scroll bookkeeping taken from the response. Neither value is used by the
# cells visible below.
scroll_id = res["_scroll_id"]
scroll_size = res["hits"]["total"]["value"]

In [None]:
# Show the raw response of the first search page for inspection.
res

Get Data from OpenSearch

In [None]:
# Take the vector stored under _source.embedding[0].embedding of every hit on
# the first page, wrapping each one in a 1-tuple (one single-field row per hit).
first_page_hits = res["hits"]["hits"]
embeddings = [
    (hit_doc["_source"]["embedding"][0]["embedding"],)
    for hit_doc in first_page_hits
]

In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import seaborn as sns
import pandas as pd

# NOTE(review): `model` and `df` are not defined in any cell visible in this
# notebook, so this cell fails on a fresh kernel unless they are created
# elsewhere. Presumably `model` is a fitted Spark ML clustering model and `df`
# a Spark DataFrame with a `features` column (the code uses `.rdd`,
# `.features` and `.prediction`) -- TODO confirm and add the defining cell.
pca = PCA(n_components=2)

# Score the data, then collect (first feature, second feature, cluster label)
# for every row onto the driver.
predictions = model.transform(df)
cluster_points = predictions.rdd.map(
    lambda row: (float(row.features[0]), float(row.features[1]), int(row.prediction))
).collect()
df_pandas = pd.DataFrame(cluster_points, columns=["x", "y", "cluster"])

# PCA is fitted on the two columns selected above, so it re-expresses those
# two coordinates rather than reducing the full feature vector.
projected = pca.fit_transform(df_pandas[["x", "y"]])
df_pandas["x"], df_pandas["y"] = zip(*projected)

# Scatter plot of the projected points, coloured by cluster label.
plt.rc("figure", figsize=(10, 8))
sns.scatterplot(x="x", y="y", hue="cluster", data=df_pandas)

In [None]:
# Show the variance attributed to each of the two fitted principal components.
pca.explained_variance_

In [None]:
def generate_query_vector():
    """Build a query vector from a freshly run CMA sleep/wake model.

    A default-constructed ``CMASleepWakeModel`` is run, its result is
    serialised with ``to_json()``, and that JSON text is passed to ``encode``.

    Returns:
        The first entry of the ``encode`` output, converted with ``tolist()``.
    """
    # Project imports are kept local so they are only loaded when this
    # function is actually called.
    from pfun_cma_model.embed import encode
    from pfun_cma_model.runtime.chalicelib.engine.cma_sleepwake import CMASleepWakeModel

    model_output_json = CMASleepWakeModel().run().to_json()
    first_encoding = encode(model_output_json)[0]
    return first_encoding.tolist()


def get_sample_query_vector():
    """Return the embedding of one document already stored in the index.

    Runs a ``size: 1`` search against the "embeddings" index through the
    module-level ``osearc`` client and reads the vector stored under
    ``_source.embedding[0].embedding`` of the single hit.

    No scroll context is requested. Only one page is ever read here and the
    scroll id was discarded, so asking for ``scroll="2m"`` left an unused
    search context open on the cluster after every call.

    Returns:
        The stored embedding of the first hit.

    Raises:
        IndexError: if the index returns no hits.
    """
    sample = osearc.search(
        index="embeddings", body={"size": 1, "_source": "embedding"}
    )
    sample_hits = sample['hits']['hits']
    if not sample_hits:
        # Same exception type as the bare [0] lookup, with a usable message.
        raise IndexError(
            "no documents found in the 'embeddings' index to sample a query vector from"
        )
    queryVector = sample_hits[0]['_source']['embedding'][0]['embedding']
    return queryVector


# Use the embedding of a document already in the index as the query vector.
queryVector = get_sample_query_vector()

# Script that scores each document by cosine similarity against the query
# vector. The "+ 1.0" offsets the similarity, presumably to keep the score
# non-negative.
# NOTE(review): elsewhere in this notebook the vector is read from
# _source.embedding[0].embedding; confirm that doc['embedding'] resolves to a
# vector field under the index mapping.
similarity_script = {
    "source": "cosineSimilarity(params.queryVector, doc['embedding']) + 1.0",
    "params": {"queryVector": queryVector},
}

# Score every document (match_all) with the script and keep only the best hit.
query = {
    "size": 1,
    "query": {
        "script_score": {
            "query": {"match_all": {}},
            "script": similarity_script,
        }
    },
}

response = osearc.search(index="embeddings", body=query, error_trace=True)

# Extract the best hit's source, score and id.
top_hit = response["hits"]["hits"][0]
hit = top_hit["_source"]
score = top_hit["_score"]
hit_id = top_hit["_id"]

print(hit_id, hit, score)