In [None]:
from glob import glob
from os.path import exists, join, basename
from tqdm import tqdm
from json import load, dump
from matplotlib import pyplot as plt
from collections import Counter

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from quadtreed3 import Quadtree, Node
from scipy.sparse import csr_matrix
from sklearn.neighbors import KernelDensity
from scipy.stats import norm
from typing import Tuple
from io import BytesIO
from umap import UMAP

import pandas as pd
import numpy as np
import ndjson
import requests
import urllib
import wizmap

# Read Data

In [None]:
# Load the raw inputs for the notebook:
#   * train.json - JSON object whose keys are the stringified indices "0", "1", ...
#   * 12.txt     - numeric embedding matrix in plain text (read with np.loadtxt)
#   * train.txt  - one annotation per non-empty line: "(<array_idx>,<word_idx>): <word> <category>"
# Files are decoded explicitly as UTF-8 instead of the platform default so the
# notebook reads the same on every OS (JSON is UTF-8 by specification;
# train.txt is assumed to be UTF-8 as well - TODO confirm).
with open("train.json", "r", encoding="utf-8") as f:
    texts = load(f)
# Order the texts by their integer key so that position i holds texts["i"].
texts_arr = np.array([texts[f"{i}"] for i in range(len(texts))])
embs = np.loadtxt("12.txt")

# Lists to store each part
array_indices = []
word_indices = []
words = []
categories = []

# Open and parse the file
with open("train.txt", "r", encoding="utf-8") as f:
    for line_no, line in enumerate(f, start=1):
        line = line.strip()  # remove newline
        if not line:
            continue  # skip empty lines

        try:
            # Split at the first ':' to separate indices from word+category
            indices_part, rest = line.split(":", 1)

            # Remove surrounding whitespace and parentheses, then split indices
            array_idx, word_idx = indices_part.strip().strip("()").split(",")
            array_idx = int(array_idx)
            # Stored word index is one less than the value in the file
            # (presumably the file is 1-based - confirm).
            word_idx = int(word_idx) - 1

            # Split word and category by whitespace
            word, category = rest.split()
        except ValueError as err:
            # Same exception type as before, but now it says which line is malformed.
            raise ValueError(
                f"train.txt line {line_no}: cannot parse {line!r}"
            ) from err

        # Append to lists
        array_indices.append(array_idx)
        word_indices.append(word_idx)
        words.append(word)
        categories.append(category)

# Convert the parsed columns to numpy arrays for indexing later in the notebook
array_indices = np.array(array_indices)
word_indices = np.array(word_indices)
words = np.array(words)
categories = np.array(categories)


In [None]:
# Sanity check: print the shape of the embedding matrix, then of the text array.
print(embs.shape, texts_arr.shape, sep="\n")
# Uncomment any of the lines below to peek at the first few loaded records.
# print(texts_arr[0:4])
# print(array_indices[:5])
# print(word_indices[:5])
# print(words[:5])
# print(categories[:5])

# Dim Reduction

In [None]:
# Project the embeddings with UMAP using cosine distance.
# UMAP is stochastic: without a fixed seed every Restart & Run All yields a
# different layout. Pinning random_state makes the projection reproducible
# (per the UMAP docs this disables parallelism, so the fit may be slower).
reducer = UMAP(metric="cosine", random_state=42)
embeddings_2d = reducer.fit_transform(embs)

In [None]:
# Optional preview (disabled): static matplotlib scatter of the UMAP projection.
# Uncomment the three lines below to draw it.
# plt.title(f"UMAP Projected Embeddings of {embs.shape[0]} Texts")
# plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], s=0.1, alpha=0.2)
# plt.show()

# Wizmap

In [None]:
# Every projected point needs exactly one label. Fail fast here rather than
# letting a length mismatch surface later as an IndexError or as mislabeled points.
if embeddings_2d.shape[0] != len(array_indices):
    raise ValueError(
        f"{embeddings_2d.shape[0]} embedding rows but {len(array_indices)} "
        "annotation lines; 12.txt and train.txt must describe the same items"
    )

# Coordinates as plain Python float lists (first and second projection columns).
xs = embeddings_2d[:, 0].astype(float).tolist()
ys = embeddings_2d[:, 1].astype(float).tolist()
# One label per point, formatted as "[word,word_index]: full text".
# NOTE: this rebinds `texts`, which until now held the dict loaded from train.json.
texts = np.array([
    f"[{word},{word_idx}]: {texts_arr[array_idx]}"
    for word, word_idx, array_idx in zip(words, word_indices, array_indices)
])

In [None]:
# Build the two WizMap payloads from the projected coordinates and their labels.
data_list = wizmap.generate_data_list(
    xs,
    ys,
    texts,
)
# The original author's note marks this call as the step where the values are
# computed. "Word Contexts" is the fourth positional argument (presumably the
# display name of the embedding - confirm against the WizMap API).
grid_dict = wizmap.generate_grid_dict(
    xs,
    ys,
    texts,
    "Word Contexts",
)

In [None]:
# Write both generated WizMap payloads into the current working directory.
wizmap.save_json_files(
    data_list,
    grid_dict,
    output_dir="./",
)

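To host the WizMap files locally, run `python -m http.server 8000` in the directory that contains them. The next cell instead points at copies of the files hosted on GitHub.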

In [None]:
# Remote locations of the hosted WizMap point-data file and grid file.
data_url = "https://raw.githubusercontent.com/lnsidiou5/wizmap/refs/heads/main/expMap/data.ndjson"
grid_url = "https://raw.githubusercontent.com/lnsidiou5/wizmap/refs/heads/main/expMap/grid.json"

In [None]:
# Show the WizMap visualization, reading the data and grid files from the URLs
# defined earlier; height=700 is passed through to the viewer.
wizmap.visualize(
    data_url=data_url,
    grid_url=grid_url,
    height=700,
)