# Symbols mapping

## Obtaining all valid symbols

In [2]:
import myunicode

In [6]:
from itertools import count

def all_unicode_iterator():
    """Yield every Unicode code point as a one-character string.

    Characters are produced in ascending code point order, from ``chr(0)``
    up to and including ``chr(sys.maxunicode)``, the largest argument that
    ``chr`` accepts.
    """
    # Local import: the surrounding cell only imports ``itertools.count``.
    import sys

    # Walking the documented range avoids probing ``chr`` until it raises
    # ValueError, and converts each code point only once.
    for code in range(sys.maxunicode + 1):
        yield chr(code)

In [11]:
def _collect_symbols_by_category(categories):
    """Group Unicode characters by category in a single pass.

    Returns a dict mapping each requested category code to the list of
    characters (in code point order) for which ``myunicode.category``
    returns that code.
    """
    buckets = {category: [] for category in categories}
    for char in all_unicode_iterator():
        bucket = buckets.get(myunicode.category(char))
        if bucket is not None:
            bucket.append(char)
    return buckets


# One scan over the whole code space instead of one scan per category;
# the resulting lists are the same as filtering three separate times.
_symbols_by_category = _collect_symbols_by_category(('Sc', 'Sm', 'So'))
all_currency_symbols = _symbols_by_category['Sc']
all_math_symbols     = _symbols_by_category['Sm']
all_other_symbols    = _symbols_by_category['So']

In [12]:
# Number of characters collected for each of the three symbol categories.
tuple(map(len, (all_currency_symbols, all_math_symbols, all_other_symbols)))

(63, 948, 6605)

In [13]:
def is_normalized(char: str) -> bool:
    """Tell whether ``myunicode.ens_normalize`` leaves ``char`` unchanged.

    A ``ValueError`` raised while normalizing counts as "not normalized",
    so the function returns False in that case instead of propagating it.
    """
    try:
        unchanged = myunicode.ens_normalize(char) == char
    except ValueError:
        unchanged = False
    return unchanged

In [37]:
def is_valid(char: str) -> bool:
    """Return True for a normalized character that is not an emoji.

    The emoji check is only performed when ``is_normalized`` accepts the
    character.
    """
    if not is_normalized(char):
        return False
    return not myunicode.is_emoji(char)

In [40]:
# Narrow each category list down to the characters accepted by ``is_valid``,
# keeping their original order.
valid_currency_symbols = list(filter(is_valid, all_currency_symbols))
valid_math_symbols     = list(filter(is_valid, all_math_symbols))
valid_other_symbols    = list(filter(is_valid, all_other_symbols))

In [41]:
# Number of characters per category that survived the ``is_valid`` filter.
tuple(map(len, (valid_currency_symbols, valid_math_symbols, valid_other_symbols)))

(55, 102, 1943)

## Math and Currencies mapping made by hand

In [65]:
# for symb in valid_currency_symbols:
#     print(f"{repr(symb)}: [{repr(myunicode.name(symb))}]")

# print()

# for symb in valid_math_symbols:
#     print(f"{repr(symb)}: [{repr(myunicode.name(symb))}]")

In [66]:
# Hand-written mapping from a currency symbol to a list of token strings.
# NOTE(review): entries followed by a bare `#` appear to be ones whose value
# is still the full character name rather than a hand-picked token -
# presumably left for later refinement; confirm with the author.
currency_mapping = {
    '$': ['DOLLAR'],
    '¢': ['CENT'],
    '£': ['POUND'],
    '¤': ['CURRENCY'],
    '¥': ['YEN'],
    '֏': ['DRAM'],
    '؋': ['AFGHANI'],
    '߾': ['NKO DOROME SIGN'], #
    '߿': ['NKO TAMAN SIGN'], #
    '৲': ['RUPEE'],
    '৳': ['RUPEE'],
    '৻': ['GANDA'],
    '૱': ['RUPEE'],
    '௹': ['RUPEE'],
    '฿': ['BAHT'],
    '៛': ['RIEL'],
    '₠': ['EURO'],
    '₡': ['COLON'],
    '₢': ['CRUZEIRO'],
    '₣': ['FRANC'],
    '₤': ['LIRA'],
    '₥': ['MILL'],
    '₦': ['NAIRA'],
    '₧': ['PESETA'],
    '₩': ['WON'],
    '₪': ['SHEQEL'],
    '₫': ['DONG'],
    '€': ['EURO'],
    '₭': ['KIP'],
    '₮': ['TUGRIK'],
    '₯': ['DRACHMA'],
    '₰': ['PENNY'],
    '₱': ['PESO'],
    '₲': ['GUARANI'],
    '₳': ['AUSTRAL'],
    '₴': ['HRYVNIA'],
    '₵': ['CEDI'],
    '₶': ['LIVRE TOURNOIS SIGN'], #
    '₷': ['SPESMILO'],
    '₸': ['TENGE'],
    '₹': ['RUPEE'],
    '₺': ['LIRA'],
    '₻': ['NORDIC MARK SIGN'], #
    '₼': ['MANAT'],
    '₽': ['RUBLE'],
    '₾': ['LARI'],
    '₿': ['BITCOIN'],
    '\u20c0': ['SOM'],
    '꠸': ['RUPEE'],
    '𑿝': ['KAACU'],
    '𑿞': ['PANAM'],
    '𑿟': ['PON'],
    '𑿠': ['VARAAKAN'],
    '𞋿': ['WANCHO NGUN SIGN'], #
    '𞲰': ['INDIC SIYAQ RUPEE MARK'], #
}

In [67]:
# Hand-written mapping from a math symbol to a list of token strings.
# NOTE(review): a trailing bare `#` mostly marks entries whose value is still
# a full character name, and `# ??` seems to flag token choices the author was
# unsure about - both presumably need a second pass; confirm with the author.
math_mapping = {
    '¬': ['NOT', 'NEGATION'],
    '±': ['PLUS-MINUS', 'UNCERTAINTY', 'ERROR'],  # ??
    '∓': ['MINUS-PLUS'], # ??
    '϶': ['EPSILON'],
    '؆': ['ROOT'],
    '؇': ['ROOT'],
    '؈': ['RAY'],
    '⁄': ['FRACTION'],
    '℘': ['SCRIPT CAPITAL P'],  #
    
    # Arrows
    '←': ['LEFTWARDS', 'LEFT', 'ARROW'],
    '↑': ['UPWARDS', 'UP', 'ARROW'],
    '→': ['RIGHTWARDS', 'RIGHT', 'ARROW'],
    '↓': ['DOWNWARDS', 'DOWN', 'ARROW'],
    '↚': ['LEFTWARDS', 'ARROW', 'STROKE'], # ??
    '↛': ['RIGHTWARDS', 'ARROW', 'STROKE'], # ??
    '↠': ['RIGHTWARDS', 'ARROW'], # ??
    '↣': ['RIGHTWARDS', 'ARROW'], # ??
    '↦': ['RIGHTWARDS', 'ARROW'], # ??
    '↮': ['ARROW', 'STROKE'], # ??
    '⇎': ['ARROW', 'STROKE'], #
    '⇏': ['RIGHTWARDS', 'ARROW', 'STROKE'], # ??
    '⇒': ['RIGHTWARDS', 'RIGHT', 'ARROW'],
    '⇔': ['LEFT RIGHT DOUBLE ARROW'], #
    '⇴': ['ARROW'], # ??
    '⇵': ['DOWNWARDS ARROW LEFTWARDS OF UPWARDS ARROW'], #
    '⇶': ['THREE RIGHTWARDS ARROWS'], #
    '⇷': ['LEFTWARDS', 'ARROW', 'STROKE'],
    '⇸': ['RIGHTWARDS', 'ARROW', 'STROKE'],
    '⇹': ['LEFT RIGHT ARROW WITH VERTICAL STROKE'], #
    '⇺': ['LEFTWARDS ARROW WITH DOUBLE VERTICAL STROKE'], #
    '⇻': ['RIGHTWARDS ARROW WITH DOUBLE VERTICAL STROKE'], #
    '⇼': ['LEFT RIGHT ARROW WITH DOUBLE VERTICAL STROKE'], #
    '⇽': ['LEFTWARDS OPEN-HEADED ARROW'], #
    '⇾': ['RIGHTWARDS OPEN-HEADED ARROW'], #
    '⇿': ['LEFT RIGHT OPEN-HEADED ARROW'], #
    '⟰': ['UPWARDS', 'ARROW'],
    '⟱': ['DOWNWARDS', 'ARROW'],
    '⬰': ['LEFT ARROW WITH SMALL CIRCLE'], #
    '⬱': ['THREE LEFTWARDS ARROWS'], #
    '⬲': ['LEFT ARROW WITH CIRCLED PLUS'], #
    '⬳': ['LONG LEFTWARDS SQUIGGLE ARROW'], #
    '⬴': ['LEFTWARDS TWO-HEADED ARROW WITH VERTICAL STROKE'], #
    '⬵': ['LEFTWARDS TWO-HEADED ARROW WITH DOUBLE VERTICAL STROKE'], #
    '⬶': ['LEFTWARDS TWO-HEADED ARROW FROM BAR'], #
    '⬷': ['LEFTWARDS TWO-HEADED TRIPLE DASH ARROW'], #
    '⬸': ['LEFTWARDS ARROW WITH DOTTED STEM'], #
    '⬹': ['LEFTWARDS ARROW WITH TAIL WITH VERTICAL STROKE'], #
    '⬺': ['LEFTWARDS ARROW WITH TAIL WITH DOUBLE VERTICAL STROKE'], #
    '⬻': ['LEFTWARDS TWO-HEADED ARROW WITH TAIL'], #
    '⬼': ['LEFTWARDS TWO-HEADED ARROW WITH TAIL WITH VERTICAL STROKE'], #
    '⬽': ['LEFTWARDS TWO-HEADED ARROW WITH TAIL WITH DOUBLE VERTICAL STROKE'], #
    '⬾': ['LEFTWARDS ARROW THROUGH X'], #
    '⬿': ['WAVE ARROW POINTING DIRECTLY LEFT'], #
    '⭀': ['EQUALS SIGN ABOVE LEFTWARDS ARROW'], #
    '⭁': ['REVERSE TILDE OPERATOR ABOVE LEFTWARDS ARROW'], #
    '⭂': ['LEFTWARDS ARROW ABOVE REVERSE ALMOST EQUAL TO'], #
    '⭃': ['RIGHTWARDS ARROW THROUGH GREATER-THAN'], #
    '⭄': ['RIGHTWARDS ARROW THROUGH SUPERSET'], #
    '⭇': ['REVERSE TILDE OPERATOR ABOVE RIGHTWARDS ARROW'], #
    '⭈': ['RIGHTWARDS ARROW ABOVE REVERSE ALMOST EQUAL TO'], #
    '⭉': ['TILDE OPERATOR ABOVE LEFTWARDS ARROW'], #
    '⭊': ['LEFTWARDS ARROW ABOVE ALMOST EQUAL TO'], #

    
    # Operators
    '∀': ['FORALL', 'ALL', 'EVERY'],
    '∂': ['DIFFERENTIAL', 'DERIVATIVE'],
    '∏': ['PRODUCT'],
    '∐': ['COPRODUCT'],
    '∑': ['SUMMATION', 'SUM', 'SIGMA'],
    '√': ['ROOT', 'SQRT'],
    '∝': ['PROPORTIONAL'],
    '∞': ['INFINITY'],
    '∫': ['INTEGRAL'],
    '∮': ['CONTOUR', 'INTEGRAL'], # CONTOUR?
    '≋': ['TILDE'],
    '⊕': ['PLUS'],
    '⊖': ['MINUS'],
    '⊗': ['TIMES'],
    '⊘': ['DIVISION'],
    '⊙': ['DOT'],
    '⊚': ['RING'],
    '⊛': ['ASTERISK'],
    '⊜': ['EQUALS'],
    '⊝': ['DASH'],
    '⊞': ['PLUS'],
    '⊟': ['MINUS'],
    '⊠': ['TIMES'],
    '⊡': ['DOT'],
    
    # Relations and geometric shapes
    '⊶': ['ORIGINAL'],
    '⊷': ['IMAGE'],
    '⋈': ['BOWTIE'],
    '▷': ['RIGHT', 'TRIANGLE'],
    '◁': ['LEFT', 'TRIANGLE'],
    '◸': ['UPPERLEFT', 'TRIANGLE'],
    '◹': ['UPPERRIGHT', 'TRIANGLE'],
    '◺': ['LOWERLEFT', 'TRIANGLE'],
    '◿': ['LOWERRIGHT', 'TRIANGLE'],
    '⟀': ['ANGLE', 'ORIGIN', 'SPACE', 'COORDINATES'],
    '⟁': ['TRIANGLE'],
    '⟂': ['PERPENDICULAR'],

    # Remaining arrows and Arabic mathematical operators
    '⭋': ['LEFTWARDS ARROW ABOVE REVERSE TILDE OPERATOR'], #
    '⭌': ['RIGHTWARDS ARROW ABOVE REVERSE TILDE OPERATOR'], #
    '𞻰': ['ARABIC MATHEMATICAL OPERATOR MEEM WITH HAH WITH TATWEEL'], #
    '𞻱': ['ARABIC MATHEMATICAL OPERATOR HAH WITH DAL'], #
}

## Extracting mapping

In [42]:
# Characters already covered by an extraction step; starts empty and is
# extended further down once the one-word names have been collected.
used_symbols = set()
# TODO remove all language-specific symbols

### Extracting all one-word symbols

In [43]:
def char_name(char: str) -> str:
    """Look up the name of ``char`` through ``myunicode.name``."""
    name = myunicode.name(char)
    return name

In [50]:
# Map every valid "other" symbol whose name consists of a single word to that
# name. Each name is looked up once and reused for both the filter and the
# stored value.
one_word_other_symbols = {
    char: name
    for char, name in ((symbol, char_name(symbol)) for symbol in valid_other_symbols)
    if len(name.split()) == 1
}

In [45]:
# Number of valid "other" symbols whose name is a single word.
len(one_word_other_symbols)

33

In [47]:
# Display the symbol -> single-word-name mapping for manual inspection.
one_word_other_symbols

{'℈': 'SCRUPLE',
 '℟': 'RESPONSE',
 '℣': 'VERSICLE',
 '⅍': 'AKTIESELSKAB',
 '⌂': 'HOUSE',
 '⌅': 'PROJECTIVE',
 '⌆': 'PERSPECTIVE',
 '⌒': 'ARC',
 '⌓': 'SEGMENT',
 '⌔': 'SECTOR',
 '⌭': 'CYLINDRICITY',
 '⌯': 'SYMMETRY',
 '⌳': 'SLOPE',
 '⏛': 'FUSE',
 '⏥': 'FLATNESS',
 '◉': 'FISHEYE',
 '◎': 'BULLSEYE',
 '⯗': 'TRANSPLUTO',
 '⯘': 'PROSERPINA',
 '⯙': 'ASTRAEA',
 '⯚': 'HYGIEA',
 '⯛': 'PHOLUS',
 '⯜': 'NESSUS',
 '⯠': 'CUPIDO',
 '⯡': 'HADES',
 '⯢': 'ZEUS',
 '⯣': 'KRONOS',
 '⯤': 'APOLLON',
 '⯥': 'ADMETOS',
 '⯦': 'VULCANUS',
 '⯧': 'POSEIDON',
 '⯲': 'SEDNA',
 '🥆': 'RIFLE'}

In [48]:
# Record the one-word symbols as handled. Iterating a dict yields its keys,
# so no explicit ``.keys()`` call is needed.
used_symbols.update(one_word_other_symbols)

### Extracting the mapping from the names

In [None]:
def _one_word_names(symbols):
    """Map each character in ``symbols`` whose name is a single word to that name.

    The name of every character is looked up exactly once via ``char_name``.
    """
    named = ((symbol, char_name(symbol)) for symbol in symbols)
    return {symbol: name for symbol, name in named if len(name.split()) == 1}


# NOTE(review): the values here are whole name strings, whereas the hand-made
# mappings store lists of tokens - confirm which shape is intended.
currency_symbols2tokens = _one_word_names(valid_currency_symbols)
math_symbols2tokens = _one_word_names(valid_math_symbols)
other_symbols2tokens = _one_word_names(valid_other_symbols)