In [None]:
# %pip install opensearch-py pyspark matplotlib scikit-learn seaborn pandas

Imports

In [None]:
import os
import sys

# Repository root that contains the `pfun_cma_model` package.
# Set the PFUN_CMA_MODEL_ROOT environment variable to point at another
# checkout; the original developer path is kept as the fallback so existing
# setups behave exactly as before.
rootpath = os.path.abspath(
    os.environ.get("PFUN_CMA_MODEL_ROOT", "/home/robertc/Git/pfun-cma-model")
)
# Put the checkout first on sys.path (once) so the import below resolves to it.
if rootpath not in sys.path:
    sys.path.insert(0, rootpath)
from pfun_cma_model.embed import EmbedClient, run_embedder

Run the embedder: generate embeddings and send them to the OpenSearch domain

In [None]:
# Run the embedder with a grid parameter of num=8.
embedder_grid = {"num": 8}
run_embedder(grid_params=embedder_grid)

Initialize the OpenSearch client

In [None]:
# Build the embed client without an SSH tunnel and keep its OpenSearch handle.
embed_client = EmbedClient(require_ssh_tunnel=False)
osearc = embed_client.opensearch_client

# First page of results: 10 hits, returning only the "embedding" source field.
# The request opens a scroll context that is kept alive for two minutes.
first_page_body = {"size": 10, "_source": "embedding"}
res = osearc.search(index="embeddings", body=first_page_body, scroll="2m")

# Scroll cursor and the total hit count reported by the search response.
scroll_id = res["_scroll_id"]
scroll_size = res["hits"]["total"]["value"]

In [None]:
res

Set up Spark

In [None]:
from pyspark.sql import SparkSession
import os

# Connector jar shipped inside the repository, handed to Spark via `spark.jars`.
# NOTE(review): the jar name suggests a Spark 2.x / Scala 2.11 build while the
# container image below is Spark 3.5.0 — confirm the two are compatible.
es_spark_jar = os.path.join(
    rootpath, "pfun_cma_model/embed/pyspark_jars/elasticsearch-spark-20_2.11-8.10.2.jar"
)

# .config("spark.jars", os.path.join(rootpath, "pfun_cma_model/embed/pyspark_jars/opensearch-spark-30_2.13-1.0.1.jar"))
spark = (
    SparkSession.builder
    .config("spark.cores.max", "8")
    .config("spark.kubernetes.container.image", "docker.io/bitnami/spark:3.5.0-debian-11-r0")
    .config("spark.executor.instances", "4")
    .config("spark.jars", es_spark_jar)
    # `spark.default.parallelism` is a core Spark setting: it has to be supplied
    # when the session is built. Setting it afterwards through `spark.conf.set`
    # has no effect on Spark 2.4 and is rejected on Spark 3.x.
    .config("spark.default.parallelism", "16")
    .appName("pfun-cma-model-embed")
    .getOrCreate()
)
# SQL shuffle partitions are a runtime SQL setting, so they can be changed here
# (16 * 2.5 = 40 partitions).
spark.conf.set('spark.sql.shuffle.partitions', int(16 * 2.5))

Get data from OpenSearch

In [None]:
# Pull the vector out of every hit of the first search page and wrap each one
# in a 1-tuple, i.e. one single-column row per embedding.
embeddings = []
for os_hit in res["hits"]["hits"]:
    first_vector = os_hit["_source"]["embedding"][0]["embedding"]
    embeddings.append((first_vector,))

In [None]:
def getDataFromOSWithSpark(index: str = "embeddings", sample_fraction: float | None = 0.1):
    #: Get data from opensearch (with spark)
    df = (
        spark.read.format("org.elasticsearch.spark.sql")
        .option("es.port", "9201")
        .option("es.net.ssl", "false")
        .option("es.nodes", "192.168.1.64")
        .load(f"{index}/float")
    )
    if sample_fraction is not None:
        # Create random sample of 10% of the data
        df_sample = df.sample(False, sample_fraction)
        return df_sample
    else:
        return df


# df = getDataFromOSWithSpark(sample_fraction=0.1)
# df.persist()
# df.show(5)

In [None]:
from pyspark.sql.types import ArrayType, DoubleType, StructType, StructField

# One column, "list_features", holding each embedding as an array of doubles.
list_features_field = StructField("list_features", ArrayType(DoubleType()))
schema = StructType([list_features_field])
df = spark.createDataFrame(embeddings, schema=schema)

from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql.functions import udf


def _array_to_dense(values):
    # Wrap a plain array of doubles in a dense ML vector.
    return Vectors.dense(values)


# UDF that turns the array column into a vector column.
vector_udf = udf(_array_to_dense, VectorUDT())
df = df.withColumn("features", vector_udf("list_features"))

In [None]:
# Repartition the rows by the "features" column, then mark the resulting
# DataFrame for persistence so later actions can reuse it.
df = df.repartition("features")
df.persist()

In [None]:
from pyspark.ml.clustering import KMeans

# Fit k-means with 8 clusters on `df`; the fixed seed (23) keeps the
# initialisation repeatable between runs.
kmeans = KMeans(k=8, seed=23)
model = kmeans.fit(df)

In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import seaborn as sns
import pandas as pd

# Collect the full embedding and the assigned cluster for every row.
# The projection must be fit on the whole vector: fitting a 2-component PCA on
# only the first two coordinates would merely rotate those two coordinates
# instead of reducing the embedding's dimensionality.
clustered_rows = model.transform(df).select("features", "prediction").collect()
feature_matrix = [row.features.toArray().tolist() for row in clustered_rows]
cluster_ids = [int(row.prediction) for row in clustered_rows]

# Project the embeddings onto their first two principal components.
pca = PCA(n_components=2)
projected = pca.fit_transform(feature_matrix)

df_pandas = pd.DataFrame(projected, columns=["x", "y"])
df_pandas["cluster"] = cluster_ids

# Scatter plot of the 2-D projection, coloured by k-means cluster.
plt.rc("figure", figsize=(10, 8))
sns.scatterplot(x="x", y="y", hue="cluster", data=df_pandas)

In [None]:
pca.explained_variance_

In [None]:
def generate_query_vector():
    """Build a query vector from a fresh CMA sleep/wake model run.

    A default-constructed ``CMASleepWakeModel`` is run, its result is
    serialised with ``to_json()``, and that text is passed to ``encode``.

    Returns:
        The first entry of the encoder output, converted with ``tolist()``.
    """
    from pfun_cma_model.embed import encode
    from pfun_cma_model.runtime.chalicelib.engine.cma_sleepwake import CMASleepWakeModel

    model_json = CMASleepWakeModel().run().to_json()
    encoded = encode(model_json)
    return encoded[0].tolist()


def get_sample_query_vector():
    """Fetch one stored embedding from the "embeddings" index to use as a query.

    Uses the module-level ``osearc`` client. A plain search is issued rather
    than a scroll: only a single hit is needed, so there is no reason to open
    a scroll context that would stay allocated on the cluster until it expires.

    Returns:
        The vector stored at ``_source.embedding[0].embedding`` of the first hit.

    Raises:
        IndexError: If the index returns no hits.
    """
    sample = osearc.search(
        index="embeddings", body={"size": 1, "_source": "embedding"}
    )
    hits = sample['hits']['hits']
    if not hits:
        # Same exception type as the bare index lookup, with a clearer message.
        raise IndexError("no documents found in index 'embeddings'")
    return hits[0]['_source']['embedding'][0]['embedding']


queryVector = get_sample_query_vector()

# Scoring script: cosine similarity between the query vector and each
# document's 'embedding' field, shifted up by 1.0.
similarity_script = {
    "source": "cosineSimilarity(params.queryVector, doc['embedding']) + 1.0",
    "params": {"queryVector": queryVector},
}

# Score every document (match_all) with the script and keep only the best one.
query = {
    "size": 1,
    "query": {
        "script_score": {
            "query": {"match_all": {}},
            "script": similarity_script,
        }
    },
}

response = osearc.search(index="embeddings", body=query, error_trace=True)

# Unpack the single returned hit: its source document, score and id.
top_hit = response["hits"]["hits"][0]
hit = top_hit["_source"]
score = top_hit["_score"]
hit_id = top_hit["_id"]

print(hit_id, hit, score)