In [1]:
import holoviews as hv
import datashader as ds
import pandas as pd
import numpy as np
from datashader import count
from holoviews.operation.datashader import datashade, dynspread
from holoviews.operation import apply_when
from tqdm import tqdm
from datasets import load_dataset, get_dataset_config_names
import panel as pn

pn.extension()

import os

from holoviews import opts
from holoviews.streams import RangeXY, Selection1D
from datashader import transfer_functions as tf

import holoviews.operation.datashader as hd

from selenium import webdriver
from webdriver_manager.firefox import GeckoDriverManager
from selenium.webdriver.firefox.service import Service

In [2]:
# Select Bokeh as the HoloViews plotting backend; logo=False is passed so the
# extension loads without displaying its logo.
hv.extension('bokeh', logo=False)

In [3]:

parquet_path = './wikimap.parquet'
if not os.path.exists(parquet_path):
    # 2-D coordinates, one row per point.
    # NOTE(review): assumes the rows of low_d.npy are in the same order as the
    # metadata concatenated below (subsets sorted by name) — confirm against
    # the script that produced the embedding.
    np_positions = np.load('./low_d.npy')
    coords = pd.DataFrame(np_positions, columns=['x', 'y'])

    subsets = sorted(get_dataset_config_names("wikimedia/wikipedia"))

    df_list = []
    # Wrap the list itself (not an enumerate over it) so tqdm knows the total
    # and can show a percentage / ETA.
    for subset in tqdm(subsets, desc="Loading subsets"):
        # Deliberately not named `ds`: that is the alias of the datashader
        # module imported at the top of the notebook.
        train_split = load_dataset("wikimedia/wikipedia", subset)['train']
        df_list.append(pd.DataFrame({
            'title': train_split['title'],
            'subset': subset,
            'url': train_split['url'],
            'wid': train_split['id']
        }))
    meta = pd.concat(df_list, ignore_index=True)

    # A column-wise concat of frames with different lengths would silently pad
    # with NaN and cache a misaligned table, so fail loudly instead.
    if len(coords) != len(meta):
        raise ValueError(
            f"Coordinate/metadata row count mismatch: {len(coords)} coordinates "
            f"vs {len(meta)} metadata rows"
        )

    # Combine the metadata with coordinates
    data = pd.concat([coords, meta], axis=1)

    data.to_parquet(parquet_path)
else:
    print(f"File already exists at {parquet_path}")



File already exists at ./wikimap.parquet


In [4]:
# Load the full table (coordinates plus article metadata) from the cached
# parquet file at `parquet_path`; later cells read it through the global `data`.
data = pd.read_parquet(parquet_path)

In [5]:
# Preview the first rows to check the expected columns
# (x, y, title, subset, url, wid).
data.head()

Unnamed: 0,x,y,title,subset,url,wid
0,-74.382103,259.87851,Аԥсуа бызшәа,20231101.ab,https://ab.wikipedia.org/wiki/%D0%90%D4%A5%D1%...,807
1,-74.580994,259.981293,Аҟәа,20231101.ab,https://ab.wikipedia.org/wiki/%D0%90%D2%9F%D3%...,1040
2,-270.739716,486.283691,Аԥсуа алфавит,20231101.ab,https://ab.wikipedia.org/wiki/%D0%90%D4%A5%D1%...,1044
3,-74.554779,259.965332,Гагра,20231101.ab,https://ab.wikipedia.org/wiki/%D0%93%D0%B0%D0%...,1046
4,-74.614174,259.980438,Аԥсны Аҳәынҭқарра,20231101.ab,https://ab.wikipedia.org/wiki/%D0%90%D4%A5%D1%...,1053


In [6]:
# Report the total number of rows in `data` as a sanity check on the loaded table.
print(f"Number of rows in data: {len(data)}")


Number of rows in data: 61614907


In [7]:
import time

class ThrottledFunction:
    """
    Callable wrapper that rate-limits another callable.

    A call is forwarded to ``func`` only if at least ``throttle_time`` seconds
    have elapsed since the last forwarded call; otherwise the result of the
    last forwarded call is returned unchanged.
    """

    def __init__(self, func, throttle_time=0.05):
        """
        Initializes the throttled function.

        :param func: The function to wrap.
        :param throttle_time: Minimum time (seconds) between calls.
        """
        self.func = func
        self.throttle_time = throttle_time
        # -inf guarantees the very first call is always forwarded, regardless
        # of what the clock currently reads.
        self.last_call = float("-inf")
        self.last_result = None

    def __call__(self, *args, **kwargs):
        """
        Calls the function only if enough time has passed.

        :return: The fresh result when the call is forwarded, otherwise the
            cached result of the most recent forwarded call.
        """
        # Monotonic clock: elapsed-time measurement must not be affected by
        # wall-clock adjustments (NTP corrections, manual changes).
        current_time = time.monotonic()
        if current_time - self.last_call >= self.throttle_time:
            self.last_result = self.func(*args, **kwargs)
            # Only recorded after func returns, so a call that raises does not
            # start a new throttle window.
            self.last_call = current_time
        return self.last_result

    def set_throttle_time(self, throttle_time):
        """
        Updates the throttle time dynamically.

        :param throttle_time: New minimum time (seconds) between calls.
        """
        self.throttle_time = throttle_time

class HistoricalParamsProvider:
    """
    Callable wrapper that hands the wrapped function its previous call's arguments.

    Every invocation forwards the caller's arguments plus two extra keywords,
    ``last_args`` and ``last_kwargs``, holding the positional and keyword
    arguments of the preceding invocation (both ``None`` on the first one).
    """

    def __init__(self, func, throttle_time=0.05):
        # `throttle_time` is accepted but not used by this class.
        self.func = func
        self.last_args = self.last_kwargs = None

    def __call__(self, *args, **kwargs):
        history = {'last_args': self.last_args, 'last_kwargs': self.last_kwargs}
        result = self.func(*args, **kwargs, **history)
        # Remember this call only after func has returned successfully.
        self.last_args, self.last_kwargs = args, kwargs
        return result

In [8]:
import webbrowser
def compare(comparison_subset, interactive=True, res=800):
    """
    Serve a datashaded map of English Wikipedia overlaid with another language.

    Reads the module-level ``data`` DataFrame, keeps the rows whose ``subset``
    is ``20231101.en`` or ``20231101.<comparison_subset>`` (each subsampled to
    at most 1M rows with a fixed seed), renders both point clouds with
    datashader and passes the resulting plot to ``pn.serve``.

    :param comparison_subset: Language code to compare against English; must be
        a key of the colour maps below ('fr', 'es', 'ru', 'zh', 'ja', 'de').
    :param interactive: When True, add a hover label naming the point nearest
        to the pointer and an 'X' marker at its location; a tap at the pointer
        position opens that point's URL with ``webbrowser.open``.
    :param res: Width and height of the plot in pixels.
    :raises KeyError: If ``comparison_subset`` has no colour map.
    """
    # Two-colour ramps for the comparison language.
    comparison_colormaps = {
        'fr': ['#FFD1DB', '#FF0033'],  # Brighter pink to deep red
        'es': ['#FFDB9E', '#FF7700'],  # Warmer orange range
        'ru': ['#B3FFD8', '#00A86B'],
        'zh': ['#FFC3C3', '#FF0000'],  # Brighter red range
        'ja': ['#FFB1EC', '#FF00AA'],  # Vivid magenta
        'de': ['#FFEA80', '#FFD000']   # Brighter yellow range
    }

    # Two-colour ramps for the English layer, one per comparison language.
    en_colormaps = {
        'fr': ['#00362D', '#00FFD1'],  # Deeper teal contrast
        'es': ['#00205A', '#00BBFF'],  # More vivid blue range
        'ru': ['#3D004A', '#D459FF'],  # Deep purple to lavender
        'zh': ['#003C5A', '#00F7FF'],  # More electric cyan
        'ja': ['#4A1F00', '#FF9500'],  # Deep brown to bright orange
        'de': ['#0C005A', '#4D4DFF']   # Deeper blue-violet range
    }

    # Fail fast, before filtering the full table, with a message that lists
    # the valid choices.
    if comparison_subset not in comparison_colormaps or comparison_subset not in en_colormaps:
        raise KeyError(
            f"Unsupported comparison_subset {comparison_subset!r}; "
            f"expected one of {sorted(comparison_colormaps)}"
        )

    max_points = 1_000_000

    def _load_subset(lang):
        """Return the rows of `data` for one language, subsampled to max_points."""
        frame = data[data['subset'] == f'20231101.{lang}']
        if len(frame) > max_points:
            frame = frame.sample(n=max_points, random_state=42)
        return frame

    def _text(x_pos, y_pos, label, color='black'):
        """Build a 12pt hv.Text annotation."""
        return hv.Text(x_pos, y_pos, label).opts(text_color=color, text_font_size='12pt')

    def _nearest(vecs, query):
        """Return (squared distance, row index) of the row of `vecs` closest to `query`."""
        sq_dist = np.sum((vecs - query) ** 2, axis=1)
        idx = np.argmin(sq_dist)
        return sq_dist[idx], idx

    def _pointer_jumped(x, y, y_range, last_kwargs):
        """True if the pointer moved more than 5% of the y-extent since the previous call."""
        dist = np.sqrt((last_kwargs['x'] - x) ** 2 + (last_kwargs['y'] - y) ** 2)
        return dist > 0.05 * (y_range[1] - y_range[0])

    en_data = _load_subset('en')
    en_vecs = en_data[['x', 'y']].values
    comp_data = _load_subset(comparison_subset)
    comp_vecs = comp_data[['x', 'y']].values

    # Create points for each language
    en_points = hv.Points(en_data, kdims=['x', 'y'], vdims=['title', 'url'])
    en_shaded = datashade(en_points, cmap=en_colormaps[comparison_subset], aggregator=count()).opts(height=res, width=res, alpha=0.8)

    # Only the comparison layer is passed through dynspread.
    comp_points = hv.Points(comp_data, kdims=['x', 'y'], vdims=['title', 'url'])
    comp_shaded = datashade(comp_points, cmap=comparison_colormaps[comparison_subset], aggregator=count()).opts(height=res, width=res, alpha=0.8)
    comp_shaded = dynspread(comp_shaded, max_px=10)

    combined = en_shaded * comp_shaded

    pointer_stream = hv.streams.PointerXY()
    # NOTE(review): `en_shaded` is passed positionally; it binds to the stream
    # constructor's first parameter, which may not be `source` — confirm
    # against the installed HoloViews version before changing this.
    range_stream = hv.streams.RangeXY(en_shaded)
    tap_stream = hv.streams.Tap(rename = {'x': 'x_tap', 'y': 'y_tap'})

    def get_nearest_label(x, y, x_range, y_range, x_tap, y_tap, last_args, last_kwargs):
        """Return a text label for the point nearest to the pointer (blank while moving/zooming)."""
        if y_range is None:
            y_range = [-800, 800]

        # Init case: no pointer position yet.
        if x is None or y is None:
            return _text(0, 10, '')

        # Debounce: show nothing while the view is changing or the pointer is
        # still travelling fast.
        if (last_kwargs is not None
                and last_kwargs['x'] is not None
                and last_kwargs['y'] is not None
                and last_kwargs['y_range'] is not None):
            if y_range != last_kwargs['y_range']:
                return _text(x, y + 10, '')
            if _pointer_jumped(x, y, y_range, last_kwargs):
                return _text(x, y + 10, '')

        # Place the label above the point: 5% of the y-extent, capped at 50.
        offset = min(50, 0.05 * (y_range[1] - y_range[0]))
        q = np.array([x, y]).reshape(1, 2)
        en_nearest_dist, en_nearest_idx = _nearest(en_vecs, q)
        comp_nearest_dist, comp_nearest_idx = _nearest(comp_vecs, q)

        if en_nearest_dist < comp_nearest_dist:
            prefix, frame, vecs, idx = 'en', en_data, en_vecs, en_nearest_idx
        else:
            prefix, frame, vecs, idx = comparison_subset, comp_data, comp_vecs, comp_nearest_idx

        nearest_loc = vecs[idx]
        nearest_title = frame['title'].iloc[idx]
        # A tap registered at the current pointer position opens the article.
        if x_tap is not None and y_tap is not None:
            if np.allclose([x_tap, y_tap], [x, y]):
                webbrowser.open(frame['url'].iloc[idx])
        return _text(nearest_loc[0], nearest_loc[1] + offset, f'{prefix}: {nearest_title}')

    def get_nearest_location(x, y, x_range, y_range, last_args, last_kwargs):
        """Return an 'X' marker at the point nearest to the pointer (blank while moving)."""
        if y_range is None:
            y_range = [-800, 800]

        # Init case: no pointer position yet.
        if x is None or y is None:
            return _text(0, 0, '')

        # Debounce: show nothing while the pointer is still travelling fast.
        if (last_kwargs is not None
                and last_kwargs['x'] is not None
                and last_kwargs['y'] is not None):
            if _pointer_jumped(x, y, y_range, last_kwargs):
                return _text(x, y, '')

        q = np.array([x, y]).reshape(1, 2)
        en_nearest_dist, en_nearest_idx = _nearest(en_vecs, q)
        comp_nearest_dist, comp_nearest_idx = _nearest(comp_vecs, q)
        if en_nearest_dist < comp_nearest_dist:
            nearest_loc = en_vecs[en_nearest_idx]
            marker_color = en_colormaps[comparison_subset][1]
        else:
            nearest_loc = comp_vecs[comp_nearest_idx]
            marker_color = comparison_colormaps[comparison_subset][1]
        return _text(nearest_loc[0], nearest_loc[1], 'X', color=marker_color)

    if interactive:
        # Each callback is given its previous arguments (for the debounce
        # checks) and forwarded at most once per 0.25 s.
        get_nearest_label = ThrottledFunction(HistoricalParamsProvider(get_nearest_label), throttle_time=0.25)
        get_nearest_location = ThrottledFunction(HistoricalParamsProvider(get_nearest_location), throttle_time=0.25)

        hover_text = hv.DynamicMap(get_nearest_label, streams=[pointer_stream, range_stream, tap_stream])
        hover_x = hv.DynamicMap(get_nearest_location, streams=[pointer_stream, range_stream])

        layout = combined * hover_text * hover_x
    else:
        layout = combined

    plot = layout.opts(
        width=res,
        height=res,
        title="",
    )

    def remove_bokeh_logo(plot, element):
        """Plot hook: remove the logo from the Bokeh toolbar."""
        plot.state.toolbar.logo = None

    layout.opts(hooks=[remove_bokeh_logo])

    pn.serve(plot)



In [9]:
# Launch the English-vs-Russian comparison at 800x800 px with hover/tap
# interaction enabled; compare() hands the plot to pn.serve, which reports the
# local server URL in the cell output.
compare('ru', interactive=True, res=800)

Launching server at http://localhost:64491
