In [1]:
!pipx install requests pandas lxml

# Import necessary libraries
import pandas as pd
import os
import requests

# Step 1: Download the XML file
xml_url = "https://raw.githubusercontent.com/w3c/xml-entities/refs/heads/gh-pages/unicode.xml"
xml_file_path = "/home/jovyan/unicode.xml"

# Download XML
response = requests.get(xml_url)
with open(xml_file_path, 'wb') as file:
    file.write(response.content)

print(f"XML file downloaded to {xml_file_path}")

# Step 2: Run XQuery (the script will generate a CSV file)
from lxml import etree
from subprocess import run

# Define XQuery to generate CSV
xquery_script = """
xquery version "3.1";

declare namespace output = "http://www.w3.org/2010/xslt-xquery-serialization";
declare option output:method "text";

let $doc := doc("unicode.xml")
let $header := "unicode,mathvariant_unicode,mathvariant_type"
let $rows :=
  for $c in $doc//character[latex][surrogate]
  let $unicode := $c/@id
  let $mathvariant_unicode := $c/surrogate/@ref
  let $mathvariant_type := $c/surrogate/@mathvariant
  let $row := string-join((
    $unicode,
    $mathvariant_unicode,
    $mathvariant_type
  ), ",")
  return $row

return string-join(($header, $rows), "&#10;")
"""

# Write the XQuery script to a file
xquery_file_path = "/home/jovyan/generate_mapping.xq"
with open(xquery_file_path, 'w') as xqfile:
    xqfile.write(xquery_script)

# Run XQuery command
csv_output_path = "/home/jovyan/surrogate_mapping.csv"
run(["basex", xquery_file_path, "-o", csv_output_path])

print(f"XQuery script executed. The CSV file is saved at {csv_output_path}")

# Step 3: Load the generated CSV
df = pd.read_csv(csv_output_path)

# Function to convert hex string like 'U1D400' to integer
def hex_to_int(s):
    return int(s[1:], 16)

# Function to convert integer to hex string like 'U1D400'
def int_to_hex(n):
    return f"U{n:05X}"

# Sort data to ensure correct range compression
df['unicode_int'] = df['unicode'].map(hex_to_int)
df['mathvariant_unicode_int'] = df['mathvariant_unicode'].map(hex_to_int)
df = df.sort_values(by=['mathvariant_type', 'unicode_int'])

# Compress into ranges
compressed = []
start_idx = 0
for i in range(1, len(df)):
    prev = df.iloc[i - 1]
    curr = df.iloc[i]
    if (
        curr['mathvariant_type'] != prev['mathvariant_type'] or
        curr['unicode_int'] != prev['unicode_int'] + 1 or
        curr['mathvariant_unicode_int'] != prev['mathvariant_unicode_int'] + 1
    ):
        start = df.iloc[start_idx]
        end = df.iloc[i - 1]
        compressed.append({
            "start_unicode": start['unicode'],
            "end_unicode": end['unicode'],
            "start_mathvariant_unicode": start['mathvariant_unicode'],
            "end_mathvariant_unicode": end['mathvariant_unicode'],
            "mathvariant_type": start['mathvariant_type']
        })
        start_idx = i
# Append the last range
start = df.iloc[start_idx]
end = df.iloc[-1]
compressed.append({
    "start_unicode": start['unicode'],
    "end_unicode": end['unicode'],
    "start_mathvariant_unicode": start['mathvariant_unicode'],
    "end_mathvariant_unicode": end['mathvariant_unicode'],
    "mathvariant_type": start['mathvariant_type']
})

compressed_df = pd.DataFrame(compressed)

# Step 4: Decompress the compressed table and compare with original

# Decompress the ranges
expanded_rows = []
for _, row in compressed_df.iterrows():
    start_u = hex_to_int(row["start_unicode"])
    end_u = hex_to_int(row["end_unicode"])
    start_b = hex_to_int(row["start_mathvariant_unicode"])
    end_b = hex_to_int(row["end_mathvariant_unicode"])
    variant = row["mathvariant_type"]

    for offset in range(end_u - start_u + 1):
        expanded_rows.append({
            "unicode": int_to_hex(start_u + offset),
            "mathvariant_unicode": int_to_hex(start_b + offset),
            "mathvariant_type": variant
        })

expanded_df = pd.DataFrame(expanded_rows)

# Compare to original table (after dropping int helper columns)
original_df = df[["unicode", "mathvariant_unicode", "mathvariant_type"]].reset_index(drop=True)
expanded_df = expanded_df.sort_values(by=["unicode"]).reset_index(drop=True)
original_df = original_df.sort_values(by=["unicode"]).reset_index(drop=True)

# Check for equality
tables_match = original_df.equals(expanded_df)

tables_match  # Should return True

[KNote: Dependent package 'charset-normalizer' contains 1 apps
  - normalizer

No apps associated with package requests. Try again with '--include-deps' to
include apps of dependent packages, which are listed above. If you are
attempting to install a library, pipx should not be used. Consider using pip
or a similar tool instead.
[?25h



FileNotFoundError: [Errno 2] No such file or directory: '/home/jovyan/unicode.xml'