In [12]:
import json
from pathlib import Path

import pandas as pd
import plotly.express as px
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA

In [4]:
# Paths of the clustering-solution JSON files to load; every file follows the
# same "<dataset>-clusters-min-size-10-solution.json" naming pattern.
files = [
    f"examples/{dataset}-clusters-min-size-10-solution.json"
    for dataset in ("banking", "covid19")
]

In [5]:
# Parse every clustering-solution file into a Python object, keeping the same
# order as `files`.
clustering_examples = []
for file in files:
    # JSON is UTF-8 by specification; being explicit avoids decoding with the
    # platform's locale encoding (which is not UTF-8 on every system).
    with open(file, "r", encoding="utf-8") as fin:
        clustering_examples.append(json.load(fin))

In [6]:
def _clustering_to_frame(clustering):
    """Flatten one clustering solution into a long-format DataFrame.

    Parameters
    ----------
    clustering : mapping
        Must provide ``"cluster_list"`` (an iterable of mappings, each with a
        ``"cluster_name"`` and an iterable ``"requests"``) and
        ``"unclustered"`` (an iterable of requests).

    Returns
    -------
    pandas.DataFrame
        One row per request with the columns ``text``, ``cluster`` and
        ``title``. ``cluster`` is the zero-based position of the cluster in
        ``"cluster_list"``; unclustered requests get ``-1`` and the title
        ``"Unclustered"``.

    Raises
    ------
    KeyError
        If one of the keys listed above is missing.
    """
    # Clustered requests first, numbered by their cluster's position.
    rows = [
        {"text": request, "cluster": cluster_id, "title": cluster["cluster_name"]}
        for cluster_id, cluster in enumerate(clustering["cluster_list"])
        for request in cluster["requests"]
    ]
    # Then the leftovers, flagged with the sentinel cluster id -1.
    rows.extend(
        {"text": request, "cluster": -1, "title": "Unclustered"}
        for request in clustering["unclustered"]
    )
    # Passing `columns` keeps the schema stable even when there are no rows.
    return pd.DataFrame(data=rows, columns=["text", "cluster", "title"])


# One DataFrame per loaded clustering, in the same order as `clustering_examples`.
dfs = [_clustering_to_frame(clustering) for clustering in clustering_examples]

In [9]:
# Sentence-embedding model used to turn each request text into a vector.
ENCODER_MODEL_NAME = 'all-MiniLM-L6-v2'
encoder = SentenceTransformer(ENCODER_MODEL_NAME)

In [11]:
# Add an 'encoded' column holding the embedding of every row's 'text'.
for df in dfs:
    # Encode the whole column in one call so the model can batch the texts,
    # instead of invoking it once per row.
    # NOTE(review): assumes `encode` returns one embedding per input text, in
    # input order (the sentence-transformers default) — confirm if encode
    # options are ever changed. Batched values may differ from per-text
    # encoding by floating-point noise.
    embeddings = encoder.encode(df['text'].to_list())
    # list(...) splits the result into one entry per row so that each cell of
    # the column holds a single embedding.
    df['encoded'] = list(embeddings)

In [14]:
# Reduce each frame's embeddings to three principal components and store them
# in the columns 'reduced0', 'reduced1' and 'reduced2'. A separate PCA is
# fitted per frame, so the axes of different frames are not comparable.
N_COMPONENTS = 3
for df in dfs:
    # A fixed random_state keeps the projection reproducible across runs in
    # case scikit-learn selects a randomized SVD solver for this input size.
    pca = PCA(n_components=N_COMPONENTS, random_state=0)
    reduced = pca.fit_transform(df['encoded'].to_list())
    for i in range(N_COMPONENTS):
        df[f'reduced{i}'] = reduced[:, i]

In [18]:
# Build one 3-D scatter plot per frame: points are positioned by the three
# reduced columns and coloured by cluster title.
x, y, z = "reduced0", "reduced1", "reduced2"

# Columns shown in the hover tooltip (all switched on).
hover_columns = ("text", "title", x, y, z)

figs = []
for df in dfs:
    fig = px.scatter_3d(
        df,
        x=x,
        y=y,
        z=z,
        title="Examples",
        # A fresh dict per figure, equivalent to spelling out every key.
        hover_data={column: True for column in hover_columns},
        # NOTE(review): no `size` argument is passed, so `size_max` probably
        # has no visible effect here — confirm the intended marker size.
        size_max=1.5,
        color='title',
    )
    figs.append(fig)

In [19]:
# Export every figure as an HTML file named graphing/example<i>.html, where
# <i> is the figure's position in `figs`.
output_dir = Path("graphing")
# Create the target directory up front; writing into a missing directory would
# otherwise fail with FileNotFoundError.
output_dir.mkdir(parents=True, exist_ok=True)
for i, fig in enumerate(figs):
    fig.write_html(str(output_dir / f"example{i}.html"))