In [57]:
from glob import glob
from os.path import exists, join, basename
from tqdm import tqdm
from json import load, dump
from matplotlib import pyplot as plt
from collections import Counter

from umap import UMAP

import pandas as pd
import numpy as np

import importlib.util
from pathlib import Path

# Import the in-repo copy of wizmap.py (located relative to the current working
# directory) under the module name "wizmap".
import sys

path = Path.cwd().parent / "notebook_widget" / "wizmap" / "wizmap.py"
if not path.is_file():
    # Fail early with the resolved path; this lookup depends on where the kernel was started.
    raise FileNotFoundError(f"Local wizmap module not found at {path}")

spec = importlib.util.spec_from_file_location("wizmap", path)
if spec is None or spec.loader is None:
    raise ImportError(f"Could not create an import spec for {path}")

wizmap = importlib.util.module_from_spec(spec)
# Register the module before executing it, as the importlib "import a source file
# directly" recipe does, so code that looks the module up by name can find it.
sys.modules["wizmap"] = wizmap
try:
    spec.loader.exec_module(wizmap)
except BaseException:
    # Do not leave a half-initialised module registered if execution fails.
    sys.modules.pop("wizmap", None)
    raise

# Read Data

In [5]:
# Load the texts (a JSON object indexed by the string keys "0", "1", ...) in key
# order, plus the embedding matrix stored as plain text.
with open("train.json", "r") as json_file:
    texts = load(json_file)
texts_arr = np.array([texts[str(pos)] for pos in range(len(texts))])
embs = np.loadtxt("12.txt")

# Parse train.txt. Each non-blank line is split as
#   "(<text index>,<word index>): <word> <category>"
# and collected as one (text index, word index, word, category) record.
records = []
with open("train.txt", "r") as annotation_file:
    for raw_line in annotation_file:
        entry = raw_line.strip()
        if entry:  # blank lines carry no annotation
            # Everything before the first ':' is the index pair.
            location, payload = entry.split(":", 1)
            text_no, word_no = location.strip("()").split(",")
            text_no = int(text_no)
            # Stored minus one; later used as a token position
            # (presumably the file counts words from 1 - confirm).
            word_no = int(word_no) - 1
            # The remainder must be exactly two whitespace-separated fields.
            token, label = payload.strip().split()
            records.append((text_no, word_no, token, label))

# One numpy array per field, in file order.
array_indices = np.array([rec[0] for rec in records])
word_indices = np.array([rec[1] for rec in records])
words = np.array([rec[2] for rec in records])
categories = np.array([rec[3] for rec in records])


In [None]:
# Quick check: shape of the embedding matrix, then of the loaded-text array
print(embs.shape, texts_arr.shape, sep="\n")
# print(texts_arr[0:4])
# print(array_indices[:5])
# print(word_indices[:5])
# print(words[:5])
# print(categories[:5])

# Dimensionality Reduction

In [6]:
# Project the embeddings with UMAP using cosine distance.
# UMAP is stochastic, so an unseeded run produces a different layout every time the
# notebook is re-executed. Fixing random_state makes the projection reproducible
# (umap-learn documents that this trades away some parallelism).
reducer = UMAP(metric="cosine", random_state=42)
embeddings_2d = reducer.fit_transform(embs)

In [None]:
# plt.title(f"UMAP Projected Embeddings of {embs.shape[0]} Texts")
# plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], s=0.1, alpha=0.2)
# plt.show()

# Wizmap

In [31]:
# Split the projection's first two columns into plain Python float lists.
xs = embeddings_2d[:, 0].astype(float).tolist()
ys = embeddings_2d[:, 1].astype(float).tolist()

# Build one display string per annotated word: the source text with the focus
# word wrapped in [brackets].
temp_texts = []

for text_i, word, idx in zip(array_indices, words, word_indices):
    tokens = texts_arr[text_i].split()

    # Safety check: idx must be a valid non-negative position holding the expected word.
    in_range = 0 <= idx < len(tokens)
    if not in_range or tokens[idx] != word:
        # Only read tokens[idx] when idx is in range. A negative idx would wrap
        # around and report an unrelated token, and on an empty token list the
        # lookup would raise IndexError and mask this ValueError.
        found = tokens[idx] if in_range else None
        raise ValueError(
            f"Mismatch at index {idx}: expected '{word}', found '{found}'"
        )

    tokens[idx] = f"[{tokens[idx]}]"
    temp_texts.append(" ".join(tokens))

# NOTE: this rebinds `texts` (the dict loaded from train.json) to an array of
# bracketed strings; later cells pass this array to wizmap.
texts = np.array(temp_texts)

## Grid dictionary

In [56]:
# Build the two structures from the projected coordinates and the bracketed
# texts, using the locally loaded wizmap module.
# NOTE(review): in the recorded run below, the second call failed with a 400
# BadRequestError while "generating multi-level summaries", so `grid_dict` was
# not assigned in that run. The failing request is issued inside the local
# wizmap module, not in this notebook.
data_list = wizmap.generate_data_list(xs, ys, texts)
# The following is where the values are computed
grid_dict = wizmap.generate_grid_dict(xs, ys, texts, "Word Contexts") # replaced by llm format

Start generating data list...
Start generating contours...
Start generating multi-level summaries...


4284it [00:00, 221424.77it/s]
  0%|          | 0/6 [00:01<?, ?it/s]


BadRequestError: Error code: 400 - {'error': {'message': "Invalid type for 'input[0]': expected an input item, but got a string instead.", 'type': 'invalid_request_error', 'param': 'input[0]', 'code': 'invalid_type'}}

In [None]:
# Hand both structures to the local wizmap helper with the current directory as
# output_dir. Requires `data_list` and `grid_dict` from the grid-dictionary cell.
# (presumably writes the data/grid files that WizMap loads - confirm in wizmap.py)
wizmap.save_json_files(data_list, grid_dict, output_dir="./")

In [38]:
# Hosted data and grid files (raw GitHub content) passed to the viewer below.
# NOTE(review): presumably these are published copies of the exported files -
# confirm they match the current export.
data_url, grid_url = (
    "https://raw.githubusercontent.com/lnsidiou5/wizmap/refs/heads/main/expMap/data.ndjson",
    "https://raw.githubusercontent.com/lnsidiou5/wizmap/refs/heads/main/expMap/grid.json",
)

In [None]:
# Display wizmaps
# Calls the local wizmap module's visualize() with the hosted data/grid URLs
# defined in the previous cell and a height of 700.
wizmap.visualize(data_url = data_url, grid_url = grid_url, height=700)