In [1]:
from typing import Tuple, List
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import pairwise_distances
import numpy as np
import pandas as pd
import plotly.express as px
import re
from sentence_transformers import SentenceTransformer


In [5]:
class Model:
    """Sentence-embedding helper built around a SentenceTransformer.

    Provides distance measurement between two sentences and a KMeans
    clustering of a corpus that is returned as a 3-D Plotly scatter plot.
    """

    def __init__(self, model_path):
        """Load the SentenceTransformer located at ``model_path``."""
        super().__init__()
        self.sbert = SentenceTransformer(model_path)

    def measure_distance(self, sents: Tuple[str, str]) -> Tuple[float, float, float]:
        """Measure how far apart the embeddings of two sentences are.

        Args:
            sents: The two sentences to compare.

        Returns:
            A ``(cosine, manhattan, euclidean)`` tuple of floats.
        """
        # compute embeddings
        corpus_embeddings = self.sbert.encode(sents)
        first = corpus_embeddings[0].reshape(1, -1)
        second = corpus_embeddings[1].reshape(1, -1)
        # Materialise the distances as a tuple so the result can be indexed,
        # measured with len() and iterated more than once.
        return tuple(
            float(pairwise_distances(first, second, metric=metric)[0][0])
            for metric in ("cosine", "manhattan", "euclidean")
        )

    def fit_kmeans(self, corpus: List[str], n_clusters: int) -> str:
        """Cluster ``corpus`` with KMeans and return a 3-D scatter plot as JSON.

        The sentences are embedded, grouped into ``n_clusters`` clusters and
        projected with PCA onto two or three components for plotting.

        Args:
            corpus: Sentences to embed and cluster.
            n_clusters: Number of clusters passed to KMeans.

        Returns:
            The Plotly figure serialised by ``fig.to_json()``.
        """
        # compute embeddings
        corpus_embeddings = self.sbert.encode(corpus)
        # cluster
        clustering_model = KMeans(n_clusters)
        clustering_model.fit(corpus_embeddings)
        # perform PCA: three components when the corpus has more than two
        # sentences, otherwise two
        n_components = 3 if len(corpus) > 2 else 2
        X_reduced = PCA(n_components).fit_transform(np.asarray(corpus_embeddings))
        n_points = X_reduced.shape[0]
        # plot corpus in 3d scatter plot
        df = pd.DataFrame({
            'sent': corpus,
            'cluster': clustering_model.labels_.astype(str),
            'x': X_reduced[:, 0],
            'y': X_reduced[:, 1],
            # with only two components the z axis is filled with zeros
            'z': X_reduced[:, 2] if X_reduced.shape[1] > 2 else np.zeros(n_points),
        })

        fig = px.scatter_3d(df, x='x', y='y', z='z',
                            color='cluster', hover_name='sent',
                            range_x=[df.x.min()-1, df.x.max()+1],
                            range_y=[df.y.min()-1, df.y.max()+1],
                            range_z=[df.z.min()-1, df.z.max()+1])
        fig.update_traces(hovertemplate='<b>%{hovertext}</b>')
        # Only the JSON form is returned, so no HTML rendering is produced here.
        return fig.to_json()


# Shared module-level Model instance, constructed once when this cell runs
# from the path './store/all-mpnet-base-v2'.
model = Model('./store/all-mpnet-base-v2')


def get_model() -> "Model":
    """Return the module-level ``model`` instance."""
    return model

In [3]:
# Sample sentences passed to Model.fit_kmeans in a later cell.
text = [
    "What is a singleton class",
    "What is the difference between an Inner Class and a Sub-Class",
    "While loop is used when certain statements need to be executed repeatedly until a condition is ",
    "What is an infinite Loop",
    "Do While Loop is same as While loop with only difference"
]
# Number of KMeans clusters requested for the sample sentences.
cluster = 3

In [6]:
# Cluster the sample sentences; the displayed value is the Plotly figure
# serialised as a JSON string.
model.fit_kmeans(text, cluster)

'{"data":[{"hovertemplate":"<b>%{hovertext}</b>","hovertext":["What is a singleton class","What is the difference between an Inner Class and a Sub-Class"],"legendgroup":"1","marker":{"color":"#636efa","symbol":"circle"},"mode":"markers","name":"1","scene":"scene","showlegend":true,"x":[0.63931507,0.6294491],"y":[0.3465421,-0.47794017],"z":[-0.42355457,0.32323027],"type":"scatter3d"},{"hovertemplate":"<b>%{hovertext}</b>","hovertext":["While loop is used when certain statements need to be executed repeatedly until a condition is ","Do While Loop is same as While loop with only difference"],"legendgroup":"2","marker":{"color":"#EF553B","symbol":"circle"},"mode":"markers","name":"2","scene":"scene","showlegend":true,"x":[-0.54114264,-0.48862782],"y":[-0.12907483,-0.28719234],"z":[-0.17688242,-0.15166253],"type":"scatter3d"},{"hovertemplate":"<b>%{hovertext}</b>","hovertext":["What is an infinite Loop"],"legendgroup":"0","marker":{"color":"#00cc96","symbol":"circle"},"mode":"markers","name

In [9]:
# Find the Plotly.newPlot(...) call in the exported HTML and capture the id of
# the target <div> as group 1. The HTML held in `res` wraps that id in double
# quotes, so either quote style is accepted. re.DOTALL lets `.` cross newlines
# in case the call is emitted over several lines.
# NOTE(review): `res` is not assigned in any visible cell — it must already
# hold the output of fig.to_html(...) when this cell runs; confirm on a fresh
# kernel.
re_graph = r"Plotly\.newPlot\(\s*['\"](.*?)['\"],.*?\)"
groups_html = re.search(re_graph, res, re.DOTALL)
groups_html
# result = groups_html[0].replace(groups_html[1], 'plotly')

In [10]:
# Display the HTML snippet held in `res`.
# NOTE(review): `res` is not assigned in any cell visible here — confirm where
# it is defined so the notebook survives Restart & Run All.
res

'<div>                            <div id="074d0a25-75e4-45aa-b5e1-f4ca885354c2" class="plotly-graph-div" style="height:100%; width:100%;"></div>            <script type="text/javascript">                                    window.PLOTLYENV=window.PLOTLYENV || {};                                    if (document.getElementById("074d0a25-75e4-45aa-b5e1-f4ca885354c2")) {                    Plotly.newPlot(                        "074d0a25-75e4-45aa-b5e1-f4ca885354c2",                        [{"hovertemplate":"<b>%{hovertext}</b>","hovertext":["What is a singleton class","What is the difference between an Inner Class and a Sub-Class"],"legendgroup":"1","marker":{"color":"#636efa","symbol":"circle"},"mode":"markers","name":"1","scene":"scene","showlegend":true,"x":[0.63931525,0.6294487],"y":[0.34654158,-0.4779399],"z":[-0.42355505,0.32323065],"type":"scatter3d"},{"hovertemplate":"<b>%{hovertext}</b>","hovertext":["While loop is used when certain statements need to be executed repeatedly unti