In [5]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import umap.umap_ as umap
from mpl_toolkits.mplot3d import Axes3D
import imageio.v2 as imageio
from pymongo import MongoClient
from sentence_transformers import SentenceTransformer
import torch
from matplotlib.colors import Normalize




In [6]:
# Connect to the MongoDB cluster that holds the thesis documents.
# The connection string carries a username and password, so it is read from the
# environment rather than being stored in the notebook.
# NOTE(review): the credentials that used to be hard-coded here have been
# exposed to anyone with access to this notebook and should be rotated.
mongodb_uri = os.environ.get('MONGODB_URI')
if not mongodb_uri:
    raise RuntimeError(
        "Set the MONGODB_URI environment variable to the MongoDB connection "
        "string before running this notebook."
    )

client = MongoClient(mongodb_uri)
db = client['Thesis']
collection = db['Documents']

In [7]:
# Load every document's embedding, category and title into three parallel lists.
# The lists are re-created here so re-running the cell never duplicates rows.
embeddings = []
categories = []
titles = []

# Request only the fields read below; the rest of each document is not needed.
projection = {'_id': 0, 'embed': 1, 'category': 1, 'title': 1}

# PyMongo warns when no_cursor_timeout=True is used without an explicit session,
# because the implicit session can still expire during a long scan. Running the
# query inside an explicit session addresses that warning.
with client.start_session() as session:
    cursor = collection.find(
        {},
        projection=projection,
        no_cursor_timeout=True,
        session=session,
    )
    try:
        for doc in cursor:
            # 'embed' is required: a document without it raises KeyError so a gap
            # in the data is noticed instead of silently misaligning the lists.
            embeddings.append(doc['embed'])
            categories.append(doc.get('category', 'unknown'))  # default to 'unknown'
            titles.append(doc.get('title', 'unknown'))  # default to 'unknown'
    finally:
        # A no-timeout cursor is never reaped by the server, so always close it.
        cursor.close()

  return Cursor(self, *args, **kwargs)


In [8]:
# Convert the embeddings collected from MongoDB into one NumPy array.
embeddings_np = np.array(embeddings)
print(f"Embeddings shape: {embeddings_np.shape}")

# Run the encoder on the GPU when torch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f'Using device: {device}')

# Sentence encoder used to embed the query text.
# NOTE(review): presumably the same model that produced the stored 'embed'
# vectors -- confirm, otherwise the query is not comparable to the documents.
model = SentenceTransformer('Alibaba-NLP/gte-large-en-v1.5', trust_remote_code=True).to(device)

# Encode the query and bring the result back to host memory as a NumPy array.
query = "The court found the defendant guilty of murder in the first degree, citing overwhelming evidence of premeditation and intent."
query_tensor = model.encode(query, convert_to_tensor=True, device=device)
query_embedding = query_tensor.cpu().numpy()

# Append the query as the LAST row; later cells rely on that position when
# they attach the 'Query' label.
all_embeddings_np = np.vstack((embeddings_np, query_embedding))
print(f"All embeddings shape (including query): {all_embeddings_np.shape}")

# Reduce documents and query together to three dimensions with UMAP.
# A fixed random_state keeps the layout reproducible between runs.
umap_settings = dict(
    n_neighbors=15,
    min_dist=0.1,
    n_components=3,
    random_state=42,
    metric='cosine',
)
umap_model = umap.UMAP(**umap_settings)
umap_embeddings = umap_model.fit_transform(all_embeddings_np)


Embeddings shape: (43977, 1024)
Using device: cuda


  attn_output = torch.nn.functional.scaled_dot_product_attention(


All embeddings shape (including query): (43978, 1024)


  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


In [9]:
# Put the 3-D UMAP coordinates in a DataFrame with one category label per row.
umap_df = pd.DataFrame(umap_embeddings, columns=['dim1', 'dim2', 'dim3'])

# The query embedding was stacked after the documents, so its label goes last.
# A new list is built instead of appending to `categories`: mutating that list
# in place made this cell fail with a length mismatch when it was run twice.
umap_df['category'] = [*categories, 'Query']

# Output directory for the frames and GIF
output_dir = '.'
os.makedirs(output_dir, exist_ok=True)

# Colour scale for the document categories. 'Query' is one of the unique
# values but is drawn separately, so the zero-based category indices run from
# 0 to len(unique_categories) - 2.
unique_categories = umap_df['category'].unique()
norm = Normalize(vmin=0, vmax=len(unique_categories) - 2)
cmap = plt.get_cmap('viridis')


In [10]:
# Zoom animation: the camera box grows from +/-0.1 to +/-4 around the query
# point and then shrinks back, one frame per zoom level.
frames = []
zoom_levels = np.linspace(0.1, 4, 30).tolist() + np.linspace(4, 0.1, 30).tolist()

# Everything except the axis limits is the same in every frame, so the data is
# split and the scene is drawn once instead of once per frame.
non_query_points = umap_df[umap_df['category'] != 'Query']
query_point = umap_df[umap_df['category'] == 'Query']
query_x, query_y, query_z = query_point.iloc[0][['dim1', 'dim2', 'dim3']]

fig = plt.figure(figsize=(10, 7))
try:
    ax = fig.add_subplot(111, projection='3d')

    # Plot non-query points, one scatter (and legend entry) per category
    for idx, category in enumerate(non_query_points['category'].unique()):
        category_points = non_query_points[non_query_points['category'] == category]
        color = cmap(norm(idx))
        ax.scatter(category_points['dim1'], category_points['dim2'], category_points['dim3'],
                   label=category, s=20, alpha=0.7, color=color)

    # Plot the query point
    ax.scatter(query_x, query_y, query_z, color='red', edgecolor='black', s=100, label='Query')

    # Set labels and title
    ax.set_xlabel('UMAP Dimension 1')
    ax.set_ylabel('UMAP Dimension 2')
    ax.set_zlabel('UMAP Dimension 3')
    ax.set_title('3D UMAP Visualization of Document Embeddings')
    ax.legend(loc='best', title='Category')

    for zoom in zoom_levels:
        # Zoom in and out on the query point
        ax.set_xlim([query_x - zoom, query_x + zoom])
        ax.set_ylim([query_y - zoom, query_y + zoom])
        ax.set_zlim([query_z - zoom, query_z + zoom])

        # Save the frame and read it straight back. The zoom values repeat on
        # the way back in, so each file is overwritten once; that is harmless
        # because the pixels are already held in `frames`.
        frame_path = os.path.join(output_dir, f'frame_{zoom:.2f}.png')
        fig.savefig(frame_path)
        frames.append(imageio.imread(frame_path))
finally:
    # Close this specific figure even if rendering fails part-way through.
    plt.close(fig)

# Save GIF
gif_path = os.path.join(output_dir, 'mongo_query_3d_umap_zoom.gif')
imageio.mimsave(gif_path, frames, fps=10)

print(f'GIF saved to {gif_path}')

GIF saved to .\mongo_query_3d_umap_zoom.gif


In [11]:
# Rotation animation: the same 3-D scatter viewed from 60 azimuth angles.
output_dir = './umap_frames'
os.makedirs(output_dir, exist_ok=True)

# 60 frames for a full rotation. endpoint=False leaves out 360 degrees, which
# is the same view as 0 degrees and would otherwise appear twice in the loop.
angles = np.linspace(0, 360, 60, endpoint=False)
filenames = []

# Only the camera angle changes between frames, so the data is prepared and
# the scene is drawn once.
non_query_points = umap_df[umap_df['category'] != 'Query']
query_point = umap_df[umap_df['category'] == 'Query']
category_labels = non_query_points['category'].astype('category')
category_codes = category_labels.cat.codes

fig = plt.figure(figsize=(12, 8))
try:
    ax = fig.add_subplot(111, projection='3d')

    # Plot non-query embeddings (color by category)
    sc = ax.scatter(non_query_points['dim1'], non_query_points['dim2'], non_query_points['dim3'],
                    c=category_codes,
                    cmap='viridis', alpha=0.7)

    # Plot query embedding separately (in red)
    query_sc = ax.scatter(query_point['dim1'], query_point['dim2'], query_point['dim3'],
                          color='red', edgecolor='black', s=100, label='Query')

    # Add legend for categories. num=None yields exactly one handle per
    # distinct code; the default ("auto") switches to evenly spaced tick values
    # once there are more than 9 codes, which pairs handles with wrong names.
    handles, _ = sc.legend_elements(num=None)
    # Handles follow the sorted distinct codes; pandas uses -1 for a missing label.
    legend_labels = [
        category_labels.cat.categories[code] if code >= 0 else '(missing)'
        for code in np.unique(category_codes)
    ]
    # The query marker is added explicitly; passing handles by hand bypasses
    # the label given to its scatter call.
    legend = ax.legend([*handles, query_sc], [*legend_labels, 'Query'], title='Category')

    # Set title and labels
    ax.set_title('3D UMAP Visualization of Document Embeddings')
    ax.set_xlabel('UMAP Dimension 1')
    ax.set_ylabel('UMAP Dimension 2')
    ax.set_zlabel('UMAP Dimension 3')

    for angle in angles:
        # Rotate the view
        ax.view_init(30, angle)

        # Save the frame
        filename = os.path.join(output_dir, f'frame_{int(angle):03d}.png')
        fig.savefig(filename)
        filenames.append(filename)
finally:
    # Close this specific figure even if rendering fails part-way through.
    plt.close(fig)

# Create a GIF from the saved frames
# NOTE(review): the unit of `duration` (seconds vs. milliseconds) differs
# between imageio versions -- confirm the playback speed with the installed one.
gif_path = os.path.join(output_dir, 'cosine_umap_visualization_3d_rotation_with_query.gif')
with imageio.get_writer(gif_path, mode='I', duration=0.1, loop = 0) as writer:
    for filename in filenames:
        image = imageio.imread(filename)
        writer.append_data(image)

# Clean up (optional): Remove the individual frame files
for filename in filenames:
    os.remove(filename)

print(f'GIF saved to {gif_path}')

GIF saved to ./umap_frames\cosine_umap_visualization_3d_rotation.gif
