In [None]:
# 2 things: similarity search w/ embeddings; clustering/topic modelling to understand what kind of documents and use cases our customers use Juro for


# 1. prep the data
# 2. load the data — docs
# * selecting a model to do embeddings (BERT, BERTLegal, OpenAI Embedding)
# 3. tokenize the docs
# 4. use the model to create embeddings out of docs
# 5. store the embeddings
# * selecting a vector store (pgvector Postgres, MongoDB, ChromaDB, Pinecone)
# 6. similarity search: convert the incoming doc to an embedding and compare it against the stored ones
# 7. clustering and topic modelling, figuring out what kind of tasks and docs our users upload and create in Juro




In [2]:
# GET DATA 
# (this part would normally need quite a lot of attention, with preprocessing, cleaning, trimming, etc)
# for simplicity, let's assume that the test dataset is a csv file in which each row of the `String` column is one doc to process

import pandas as pd

# where the test dataset lives and which column carries the document text
csv_file_path = 'data/string.csv'
text_column = 'String'

# read the whole CSV into a DataFrame
data = pd.read_csv(csv_file_path)

# build the list of documents, or fall back to an empty list when the column is absent
# NOTE(review): astype(str) turns missing cells into the literal text "nan"
if text_column not in data.columns:
    print(f"Column {text_column} not found in the CSV file.")
    document_array = []
else:
    text_series = data[text_column]
    document_array = text_series.astype(str).tolist()

In [3]:
# GENERATE EMBEDDINGS
# LegalBERT

from transformers import BertModel, BertTokenizer
import torch


# directory that holds the custom LegalBERT checkpoint
model_directory = "legal-bert-base-uncased"

# tokenizer and encoder are both read from that same directory
tokenizer, model = (
    BertTokenizer.from_pretrained(model_directory),
    BertModel.from_pretrained(model_directory),
)

# function to generate embeddings for a list of documents
def generate_embeddings(documents):
    """Embed each document with the module-level `tokenizer` and `model`.

    Every document is truncated / padded to exactly 512 tokens, run through the
    model without gradient tracking, and reduced to a single vector by averaging
    `last_hidden_state` over the token dimension (padded positions are part of
    that average).

    Args:
        documents: iterable of strings to embed.

    Returns:
        A list with one tensor per document, in input order.
    """
    pooled = []

    # inference only, so no autograd bookkeeping is needed for any of the passes
    with torch.no_grad():
        for text in documents:
            # fixed-length encoding: anything beyond 512 tokens is cut off
            encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=512, padding="max_length")

            # average the per-token hidden states into one vector per document
            hidden_states = model(**encoded).last_hidden_state
            pooled.append(hidden_states.mean(dim=1))

    return pooled

# embed every loaded document; the result is a list holding one tensor per entry of document_array
document_embeddings = generate_embeddings(document_array)


Some weights of the model checkpoint at legal-bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
# SAVE EMBEDDINGS
#

import numpy as np

# squeeze() drops the size-1 dimensions of each tensor and tolist() turns it into plain
# Python floats, so the whole collection can be written as one regular array
document_embeddings_homo = list(
    map(lambda tensor: tensor.squeeze().tolist(), document_embeddings)
)

# persist the embeddings next to the source data
np.save('data/embeddings.npy', document_embeddings_homo)



In [3]:
# LOAD EMBEDDINGS
#

import numpy as np

# read back the array that the SAVE EMBEDDINGS cell wrote to data/embeddings.npy
loaded_embeddings = np.load('data/embeddings.npy')


In [27]:
# DO SIMILARITY SEARCH
#
from transformers import BertModel, BertTokenizer
import torch

# directory that holds the custom LegalBERT checkpoint
model_directory = "legal-bert-base-uncased"

# tokenizer and encoder are both read from that same directory
tokenizer, model = (
    BertTokenizer.from_pretrained(model_directory),
    BertModel.from_pretrained(model_directory),
)


# generate embedding for input doc
def calculate_embedding(doc):
    """Embed a single document with the module-level `tokenizer` and `model`.

    The document is truncated / padded to exactly 512 tokens and the model's
    `last_hidden_state` is averaged over the token dimension. Padded positions
    are part of that average; `generate_embeddings` pools the stored vectors the
    same way, so the two must be changed together or the similarity scores stop
    being comparable.

    Args:
        doc: the text to embed.

    Returns:
        The pooled tensor, with the leading batch dimension kept
        (presumably shape (1, hidden_size) for a single string - confirm against
        the model config).
    """
    # fixed-length encoding: anything beyond 512 tokens is cut off
    inputs = tokenizer(doc, return_tensors="pt", truncation=True, max_length=512, padding="max_length")

    # inference only, so skip gradient tracking
    with torch.no_grad():
        outputs = model(**inputs)

    # collapse the token dimension into one vector for the whole document
    return outputs.last_hidden_state.mean(dim=1)

# a naive approach to comparing vectors, this would be performance optimised in vector storage systems as a function
def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two vectors as a scalar.

    Both inputs are flattened first, so a (1, d) row vector (e.g. a pooled model
    output that still carries its batch dimension) can be compared directly
    with a plain (d,) vector and the result is a single number rather than a
    one-element array.

    Args:
        vec1: array-like holding the first vector.
        vec2: array-like holding the second vector; must contain the same
            number of elements as `vec1`.

    Returns:
        float: dot(vec1, vec2) / (|vec1| * |vec2|), or 0.0 when either vector
        has zero length (the ratio is undefined there and would otherwise be NaN).

    Raises:
        ValueError: if the two vectors do not have the same number of elements.
    """
    # Ensure the vectors are flat numpy arrays
    vec1 = np.array(vec1).ravel()
    vec2 = np.array(vec2).ravel()

    # Product of the two magnitudes; zero means at least one vector has no direction
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if norm_product == 0:
        return 0.0

    return float(np.dot(vec1, vec2) / norm_product)

# let's glue it all together, comparing new vector against all stored and selecting the top_n best matches
def find_most_similar(new_document, stored_embeddings, top_n=5):
    """Rank the stored embeddings by cosine similarity to a new document.

    Args:
        new_document: text to embed with `calculate_embedding` and search for.
        stored_embeddings: iterable of embedding vectors; the position of each
            vector is the index that gets reported back.
        top_n: how many of the best matches to return (default 5).

    Returns:
        list[int]: positions of the `top_n` most similar stored embeddings,
        best match first. Equal scores keep their original relative order.
    """
    # Calculate the embedding for the new document
    new_embedding = calculate_embedding(new_document)

    # (index, score) pairs, one per stored embedding
    scored = []
    for index, stored_embedding in enumerate(stored_embeddings):
        # the helper may hand back a one-element array; reduce it to a plain float
        score = float(np.squeeze(cosine_similarity(new_embedding, stored_embedding)))
        # NaN is neither larger nor smaller than anything, which would leave the
        # sort order arbitrary, so rank undefined scores last
        if np.isnan(score):
            score = float("-inf")
        scored.append((index, score))

    # Highest similarity first; list.sort is stable, so ties stay in index order
    scored.sort(key=lambda pair: pair[1], reverse=True)

    # Return the indices of the top N similar documents
    return [doc_index for doc_index, _ in scored[:top_n]]

# look up the stored documents closest to the query text
top_similar_indices = find_most_similar("counterparty", loaded_embeddings)

# print each hit, reporting any index that falls outside the loaded documents
for index in top_similar_indices:
    if not (0 <= index < len(document_array)):
        print(f"Index {index} is out of bounds for the document array.")
        continue

    print(f"Document at index {index}:")
    print(document_array[index])
    print("----------")

Some weights of the model checkpoint at legal-bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Document at index 8207:
counterparty
----------
Document at index 33716:
counterparty name
----------
Document at index 13242:
Viral signup per counterparty
----------
Document at index 26528:
counterparties
----------
Document at index 19706:
counterparty type 
----------


In [None]:
import numpy as np

# read the saved embeddings from data/embeddings.npy (the same file the LOAD EMBEDDINGS cell reads)
loaded_embeddings = np.load('data/embeddings.npy')

# CLUSTER
# dbscan
from sklearn.cluster import DBSCAN
from sklearn.metrics.pairwise import cosine_similarity

# Apply DBSCAN on cosine distance (1 - cosine similarity).
# metric="cosine" lets DBSCAN compute the neighbourhoods from the embeddings itself, so the
# dense N x N similarity matrix and the equally large distance matrix are never built
# (for the ~34k documents seen in the similarity-search output that is roughly 9 GB each
# in float64).
# NOTE(review): a hand-made `1 - similarity` matrix can also dip slightly below zero through
# rounding, which the precomputed-distance path may reject - TODO confirm.
# `clusters` holds one integer label per row of loaded_embeddings; DBSCAN labels noise as -1.
dbscan = DBSCAN(eps=0.5, min_samples=5, metric="cosine")
clusters = dbscan.fit_predict(loaded_embeddings)


In [26]:
# VISUALIZE
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# project the embeddings down to two dimensions (fixed seed so the layout is repeatable)
tsne = TSNE(n_components=2, random_state=0)
embeddings_2d = tsne.fit_transform(loaded_embeddings)

# one point per document, coloured by its cluster label
fig, ax = plt.subplots(figsize=(12, 8))
points = ax.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], c=clusters, cmap='rainbow')
fig.colorbar(points, ax=ax)
ax.set_xlabel('t-SNE feature 1')
ax.set_ylabel('t-SNE feature 2')
ax.set_title('t-SNE visualization of document embeddings')
plt.show()

NameError: name 'clusters' is not defined

<Figure size 1200x800 with 0 Axes>

In [None]:
# ID TOPICS BASED ON CLUSTERS
import numpy as np

# compute centroids
# The notebook defines neither `kmeans` nor `reshaped_embeddings`, so the centroids are derived
# from what the earlier cells do provide: the DBSCAN labels (`clusters`) and `loaded_embeddings`.
cluster_labels = np.asarray(clusters)
# DBSCAN marks noise with -1; noise is not a cluster, so it gets no centroid
cluster_ids = [label for label in np.unique(cluster_labels) if label >= 0]
centroids = np.array([loaded_embeddings[cluster_labels == label].mean(axis=0) for label in cluster_ids])

# to analyze representative terms, we need to find the closest words to each centroid
# this is a bit more complex, it requires mapping back from the embedding space to the word space
# a simplistic approach is to find the document of each cluster that lies closest to its centroid
for i, centroid in zip(cluster_ids, centroids):
    # only members of the cluster are candidates, so the representative always belongs to it
    member_indices = np.where(cluster_labels == i)[0]
    distances = np.linalg.norm(loaded_embeddings[member_indices] - centroid, axis=1)
    closest_doc_index = member_indices[np.argmin(distances)]
    print(f"Cluster {i} representative document: {document_array[closest_doc_index]}")

In [None]:
# TOPIC REVIEW AND FEATURE EXTRACTION FROM DOCUMENTS

from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# create TF-IDF representation of the documents
# max_df=0.8 drops terms that occur in more than 80% of the documents, min_df=2 drops terms
# that occur in fewer than two documents, and stop_words='english' removes scikit-learn's
# built-in English stop-word list
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, min_df=2, stop_words='english')
# fit the vocabulary on document_array and transform the same documents in one pass
tfidf_matrix = tfidf_vectorizer.fit_transform(document_array)

# function to extract top N keywords from each cluster
def extract_top_keywords_per_cluster(tfidf_matrix, cluster_assignments, n_top_keywords=20, feature_names=None):
    """Return the highest-weighted TF-IDF terms of every cluster.

    For each cluster the TF-IDF rows of its documents are averaged column-wise
    and the `n_top_keywords` columns with the largest mean are reported.

    Args:
        tfidf_matrix: document-term matrix with one row per document; a scipy
            sparse matrix, a numpy matrix or a plain 2-D array.
        cluster_assignments: one integer cluster label per row of
            `tfidf_matrix` (list or array). Negative labels (DBSCAN's noise
            label -1) are skipped.
        n_top_keywords: number of terms to keep per cluster (default 20).
        feature_names: term for each column of `tfidf_matrix`. When omitted,
            the names are taken from the module-level `tfidf_vectorizer`.

    Returns:
        dict[int, numpy.ndarray]: cluster label -> its top terms, strongest first.
    """
    if feature_names is None:
        feature_names = tfidf_vectorizer.get_feature_names_out()
    feature_names = np.asarray(feature_names)

    # accept plain lists as well; `list == int` would otherwise never match anything
    labels = np.asarray(cluster_assignments)

    keywords_per_cluster = {}
    # the clusters are read off the labels themselves instead of a global cluster count
    for cluster_num in np.unique(labels):
        if cluster_num < 0:
            continue

        # indices of documents in the current cluster
        document_indices = np.where(labels == cluster_num)[0]

        # column-wise mean of these documents; .mean() works on sparse and dense input alike,
        # so the rows never have to be converted to a dense array first
        aggregated_tfidf = np.asarray(tfidf_matrix[document_indices].mean(axis=0)).ravel()

        # strongest terms first; slicing after the reversal keeps n_top_keywords=0 empty
        top_keyword_indices = np.argsort(aggregated_tfidf)[::-1][:n_top_keywords]
        keywords_per_cluster[int(cluster_num)] = feature_names[top_keyword_indices]

    return keywords_per_cluster

# extract and print top keywords for each cluster
# `cluster_assignments` is not defined in any cell of this notebook; the per-document labels
# that do exist are the DBSCAN labels `clusters` from the CLUSTER cell, so those are passed in
top_keywords_per_cluster = extract_top_keywords_per_cluster(tfidf_matrix, clusters)
for cluster, keywords in top_keywords_per_cluster.items():
    print(f"Cluster {cluster}: {', '.join(keywords)}")

In [None]:
# VISUALIZE CLUSTERS w TOPIC NAMES

import plotly.express as px
import plotly.graph_objs as go

# prepare hover text (first 100 characters of each document)
hover_texts = ["Doc " + str(i) + ": " + doc[:100] + "..." for i, doc in enumerate(document_array)]

# The notebook never defines `num_clusters`, `cluster_assignments` or `reduced_embeddings`.
# The equivalents the earlier cells do produce are the DBSCAN labels (`clusters`) and the
# 2-D t-SNE coordinates (`embeddings_2d`), so the figure is built from those.
cluster_labels = np.asarray(clusters)

# create a scatter plot
fig = go.Figure()

# one trace per label so every cluster gets its own legend entry and colour
for i in np.unique(cluster_labels):
    # positions of the points belonging to the current cluster
    indices = np.where(cluster_labels == i)[0]
    current_cluster_points = embeddings_2d[indices]

    cluster_hover_texts = [hover_texts[j] for j in indices]

    fig.add_trace(go.Scatter(
        x=current_cluster_points[:, 0],
        y=current_cluster_points[:, 1],
        mode='markers',
        marker=dict(
            size=5,
            line=dict(
                width=1,
                color='black'
            )
        ),
        # DBSCAN uses -1 for points it could not assign to any cluster
        name="Noise" if i < 0 else f"Cluster {i}",
        text=cluster_hover_texts,
        hoverinfo='text',
        hoverlabel=dict(
            bgcolor='white',
            bordercolor='black',
            font_size=12,
            font_family='Arial'
        )
    ))


fig.update_layout(
    title="Document Embeddings Clustered with t-SNE",
    xaxis_title="t-SNE feature 1",
    yaxis_title="t-SNE feature 2",
    legend_title="Clusters"
)

fig.show()