# Symbols mapping

## Obtaining all valid symbols

In [2]:
import myunicode

In [6]:
from itertools import count

def all_unicode_iterator():
    """Yield every character that ``chr()`` can produce, in code point order.

    The iteration covers ``chr(0)`` through ``chr(sys.maxunicode)``. Note that
    this range includes the surrogate code points, which ``chr()`` accepts.

    Yields:
        str: a one-character string for each code point.
    """
    # Local import keeps this cell runnable on its own.
    import sys

    # sys.maxunicode is the largest code point chr() accepts, so an explicit
    # range replaces probing for ValueError and builds each character once.
    for code in range(sys.maxunicode + 1):
        yield chr(code)

In [11]:
def _collect_by_category(categories):
    """Group all characters by the value ``myunicode.category`` reports.

    Walks ``all_unicode_iterator()`` exactly once instead of once per
    category, keeping characters in iteration order within each group.

    Args:
        categories: category codes to collect (e.g. ``'Sc'``).

    Returns:
        dict mapping each requested code to the list of matching characters.
    """
    buckets = {category: [] for category in categories}
    for char in all_unicode_iterator():
        bucket = buckets.get(myunicode.category(char))
        if bucket is not None:
            bucket.append(char)
    return buckets


# 'Sc' / 'Sm' / 'So' presumably are the Unicode general categories for
# currency, math and other symbols -- confirm against myunicode.category.
_symbols_by_category = _collect_by_category(('Sc', 'Sm', 'So'))
all_currency_symbols = _symbols_by_category['Sc']
all_math_symbols     = _symbols_by_category['Sm']
all_other_symbols    = _symbols_by_category['So']

In [12]:
# Sizes of the unfiltered symbol lists, in (currency, math, other) order.
tuple(map(len, (all_currency_symbols, all_math_symbols, all_other_symbols)))

(63, 948, 6605)

In [13]:
def is_normalized(char: str) -> bool:
    """Tell whether ``myunicode.ens_normalize`` leaves ``char`` unchanged.

    Args:
        char: the string to check.

    Returns:
        True when the normalized form equals ``char``; False when it differs
        or when normalization raises ``ValueError``.
    """
    try:
        normalized = myunicode.ens_normalize(char)
    except ValueError:
        # Input rejected by the normalizer counts as "not normalized".
        return False
    return normalized == char

In [37]:
def is_valid(char: str) -> bool:
    """Tell whether ``char`` is normalized and is not an emoji.

    Args:
        char: the string to check.

    Returns:
        True only when ``is_normalized(char)`` holds and
        ``myunicode.is_emoji(char)`` does not.
    """
    # Normalization is checked first; the emoji check runs only if it passes.
    if not is_normalized(char):
        return False
    return not myunicode.is_emoji(char)

In [40]:
# Keep only the symbols accepted by is_valid, preserving their order.
valid_currency_symbols = list(filter(is_valid, all_currency_symbols))
valid_math_symbols     = list(filter(is_valid, all_math_symbols))
valid_other_symbols    = list(filter(is_valid, all_other_symbols))

In [41]:
# Sizes of the filtered symbol lists, in (currency, math, other) order.
tuple(map(len, (valid_currency_symbols, valid_math_symbols, valid_other_symbols)))

(55, 102, 1943)

## Math and Currencies mapping made by hand

In [65]:
# for symb in valid_currency_symbols:
#     print(f"{repr(symb)}: [{repr(myunicode.name(symb))}]")

# print()

# for symb in valid_math_symbols:
#     print(f"{repr(symb)}: [{repr(myunicode.name(symb))}]")

In [66]:
currency_mapping = {
    '$': ['DOLLAR'],
    '¬¢': ['CENT'],
    '¬£': ['POUND'],
    '¬§': ['CURRENCY'],
    '¬•': ['YEN'],
    '÷è': ['DRAM'],
    'ÿã': ['AFGHANI'],
    'ﬂæ': ['NKO DOROME SIGN'], #
    'ﬂø': ['NKO TAMAN SIGN'], #
    '‡ß≤': ['RUPEE'],
    '‡ß≥': ['RUPEE'],
    '‡ßª': ['GANDA'],
    '‡´±': ['RUPEE'],
    '‡Øπ': ['RUPEE'],
    '‡∏ø': ['BAHT'],
    '·üõ': ['RIEL'],
    '‚Ç†': ['EURO'],
    '‚Ç°': ['COLON'],
    '‚Ç¢': ['CRUZEIRO'],
    '‚Ç£': ['FRANC'],
    '‚Ç§': ['LIRA'],
    '‚Ç•': ['MILL'],
    '‚Ç¶': ['NAIRA'],
    '‚Çß': ['PESETA'],
    '‚Ç©': ['WON'],
    '‚Ç™': ['SHEQEL'],
    '‚Ç´': ['DONG'],
    '‚Ç¨': ['EURO'],
    '‚Ç≠': ['KIP'],
    '‚ÇÆ': ['TUGRIK'],
    '‚ÇØ': ['DRACHMA'],
    '‚Ç∞': ['PENNY'],
    '‚Ç±': ['PESO'],
    '‚Ç≤': ['GUARANI'],
    '‚Ç≥': ['AUSTRAL'],
    '‚Ç¥': ['HRYVNIA'],
    '‚Çµ': ['CEDI'],
    '‚Ç∂': ['LIVRE TOURNOIS SIGN'], #
    '‚Ç∑': ['SPESMILO'],
    '‚Ç∏': ['TENGE'],
    '‚Çπ': ['RUPEE'],
    '‚Ç∫': ['LIRA'],
    '‚Çª': ['NORDIC MARK SIGN'], #
    '‚Çº': ['MANAT'],
    '‚ÇΩ': ['RUBLE'],
    '‚Çæ': ['LARI'],
    '‚Çø': ['BITCOIN'],
    '\u20c0': ['SOM'],
    'Í†∏': ['RUPEE'],
    'ëøù': ['KAACU'],
    'ëøû': ['PANAM'],
    'ëøü': ['PON'],
    'ëø†': ['VARAAKAN'],
    'ûãø': ['WANCHO NGUN SIGN'], #
    'û≤∞': ['INDIC SIYAQ RUPEE MARK'], #
}

In [67]:
math_mapping = {
    '¬¨': ['NOT', 'NEGATION'],
    '¬±': ['PLUS-MINUS', 'UNCERTAINTY', 'ERROR'],  # ??
    '‚àì': ['MINUS-PLUS'], # ??
    'œ∂': ['EPSILON'],
    'ÿÜ': ['ROOT'],
    'ÿá': ['ROOT'],
    'ÿà': ['RAY'],
    '‚ÅÑ': ['FRACTION'],
    '‚Ñò': ['SCRIPT CAPITAL P'],  #
    
    '‚Üê': ['LEFTWARDS', 'LEFT', 'ARROW'],
    '‚Üë': ['UPWARDS', 'UP', 'ARROW'],
    '‚Üí': ['RIGHTWARDS', 'RIGHT', 'ARROW'],
    '‚Üì': ['DOWNWARDS', 'DOWN', 'ARROW'],
    '‚Üö': ['LEFTWARDS', 'ARROW', 'STROKE'], # ??
    '‚Üõ': ['RIGHTWARDS', 'ARROW', 'STROKE'], # ??
    '‚Ü†': ['RIGHTWARDS', 'ARROW'], # ??
    '‚Ü£': ['RIGHTWARDS', 'ARROW'], # ??
    '‚Ü¶': ['RIGHTWARDS', 'ARROW'], # ??
    '‚ÜÆ': ['ARROW', 'STROKE'], # ??
    '‚áé': ['ARROW', 'STROKE'], #
    '‚áè': ['RIGHTWARDS', 'ARROW', 'STROKE'], # ??
    '‚áí': ['RIGHTWARDS', 'RIGHT', 'ARROW'],
    '‚áî': ['LEFT RIGHT DOUBLE ARROW'], #
    '‚á¥': ['ARROW'], # ??
    '‚áµ': ['DOWNWARDS ARROW LEFTWARDS OF UPWARDS ARROW'], #
    '‚á∂': ['THREE RIGHTWARDS ARROWS'], #
    '‚á∑': ['LEFTWARDS', 'ARROW', 'STROKE'],
    '‚á∏': ['RIGHTWARDS', 'ARROW', 'STROKE'],
    '‚áπ': ['LEFT RIGHT ARROW WITH VERTICAL STROKE'], #
    '‚á∫': ['LEFTWARDS ARROW WITH DOUBLE VERTICAL STROKE'], #
    '‚áª': ['RIGHTWARDS ARROW WITH DOUBLE VERTICAL STROKE'], #
    '‚áº': ['LEFT RIGHT ARROW WITH DOUBLE VERTICAL STROKE'], #
    '‚áΩ': ['LEFTWARDS OPEN-HEADED ARROW'], #
    '‚áæ': ['RIGHTWARDS OPEN-HEADED ARROW'], #
    '‚áø': ['LEFT RIGHT OPEN-HEADED ARROW'], #
    '‚ü∞': ['UPWARDS', 'ARROW'],
    '‚ü±': ['DOWNWARDS', 'ARROW'],
    '‚¨∞': ['LEFT ARROW WITH SMALL CIRCLE'], #
    '‚¨±': ['THREE LEFTWARDS ARROWS'], #
    '‚¨≤': ['LEFT ARROW WITH CIRCLED PLUS'], #
    '‚¨≥': ['LONG LEFTWARDS SQUIGGLE ARROW'], #
    '‚¨¥': ['LEFTWARDS TWO-HEADED ARROW WITH VERTICAL STROKE'], #
    '‚¨µ': ['LEFTWARDS TWO-HEADED ARROW WITH DOUBLE VERTICAL STROKE'], #
    '‚¨∂': ['LEFTWARDS TWO-HEADED ARROW FROM BAR'], #
    '‚¨∑': ['LEFTWARDS TWO-HEADED TRIPLE DASH ARROW'], #
    '‚¨∏': ['LEFTWARDS ARROW WITH DOTTED STEM'], #
    '‚¨π': ['LEFTWARDS ARROW WITH TAIL WITH VERTICAL STROKE'], #
    '‚¨∫': ['LEFTWARDS ARROW WITH TAIL WITH DOUBLE VERTICAL STROKE'], #
    '‚¨ª': ['LEFTWARDS TWO-HEADED ARROW WITH TAIL'], #
    '‚¨º': ['LEFTWARDS TWO-HEADED ARROW WITH TAIL WITH VERTICAL STROKE'], #
    '‚¨Ω': ['LEFTWARDS TWO-HEADED ARROW WITH TAIL WITH DOUBLE VERTICAL STROKE'], #
    '‚¨æ': ['LEFTWARDS ARROW THROUGH X'], #
    '‚¨ø': ['WAVE ARROW POINTING DIRECTLY LEFT'], #
    '‚≠Ä': ['EQUALS SIGN ABOVE LEFTWARDS ARROW'], #
    '‚≠Å': ['REVERSE TILDE OPERATOR ABOVE LEFTWARDS ARROW'], #
    '‚≠Ç': ['LEFTWARDS ARROW ABOVE REVERSE ALMOST EQUAL TO'], #
    '‚≠É': ['RIGHTWARDS ARROW THROUGH GREATER-THAN'], #
    '‚≠Ñ': ['RIGHTWARDS ARROW THROUGH SUPERSET'], #
    '‚≠á': ['REVERSE TILDE OPERATOR ABOVE RIGHTWARDS ARROW'], #
    '‚≠à': ['RIGHTWARDS ARROW ABOVE REVERSE ALMOST EQUAL TO'], #
    '‚≠â': ['TILDE OPERATOR ABOVE LEFTWARDS ARROW'], #
    '‚≠ä': ['LEFTWARDS ARROW ABOVE ALMOST EQUAL TO'], #

    
    '‚àÄ': ['FORALL', 'ALL', 'EVERY'],
    '‚àÇ': ['DIFFERENTIAL', 'DERIVATIVE'],
    '‚àè': ['PRODUCT'],
    '‚àê': ['COPRODUCT'],
    '‚àë': ['SUMMATION', 'SUM', 'SIGMA'],
    '‚àö': ['ROOT', 'SQRT'],
    '‚àù': ['PROPORTIONAL'],
    '‚àû': ['INFINITY'],
    '‚à´': ['INTEGRAL'],
    '‚àÆ': ['CONTOUR', 'INTEGRAL'], # CONTOUR?
    '‚âã': ['TILDE'],
    '‚äï': ['PLUS'],
    '‚äñ': ['MINUS'],
    '‚äó': ['TIMES'],
    '‚äò': ['DIVISION'],
    '‚äô': ['DOT'],
    '‚äö': ['RING'],
    '‚äõ': ['ASTERISK'],
    '‚äú': ['EQUALS'],
    '‚äù': ['DASH'],
    '‚äû': ['PLUS'],
    '‚äü': ['MINUS'],
    '‚ä†': ['TIMES'],
    '‚ä°': ['DOT'],
    
    '‚ä∂': ['ORIGINAL'],
    '‚ä∑': ['IMAGE'],
    '‚ãà': ['BOWTIE'],
    '‚ñ∑': ['RIGHT', 'TRIANGLE'],
    '‚óÅ': ['LEFT', 'TRIANGLE'],
    '‚ó∏': ['UPPERLEFT', 'TRIANGLE'],
    '‚óπ': ['UPPERRIGHT', 'TRIANGLE'],
    '‚ó∫': ['LOWERLEFT', 'TRIANGLE'],
    '‚óø': ['LOWERRIGHT', 'TRIANGLE'],
    '‚üÄ': ['ANGLE', 'ORIGIN', 'SPACE', 'COORDINATES'],
    '‚üÅ': ['TRIANGLE'],
    '‚üÇ': ['PERPENDICULAR'],

    '‚≠ã': ['LEFTWARDS ARROW ABOVE REVERSE TILDE OPERATOR'], #
    '‚≠å': ['RIGHTWARDS ARROW ABOVE REVERSE TILDE OPERATOR'], #
    'ûª∞': ['ARABIC MATHEMATICAL OPERATOR MEEM WITH HAH WITH TATWEEL'], #
    'ûª±': ['ARABIC MATHEMATICAL OPERATOR HAH WITH DAL'], #
}

## Extracting mapping

In [42]:
# Accumulates the symbols that have been handled so far; later cells add to it
# via used_symbols.update(...).
used_symbols = set()
# TODO remove all language-specific symbols

### Extracting all one-word symbols

In [43]:
def char_name(char: str) -> str:
    """Look up the name of ``char`` through ``myunicode.name``.

    Args:
        char: the character to look up.

    Returns:
        Whatever ``myunicode.name(char)`` returns for that character.
    """
    name = myunicode.name(char)
    return name

In [50]:
# Map each valid "other" symbol whose name is a single word to that name.
# The inner generator pairs every char with its name so char_name() runs once
# per character instead of once for the filter and again for the value.
one_word_other_symbols = {
    char: name
    for char, name in ((char, char_name(char)) for char in valid_other_symbols)
    if len(name.split()) == 1
}

In [45]:
# Number of one-word "other" symbols.
# NOTE(review): the output recorded for this cell is a 3-tuple, which this
# expression cannot produce -- it looks stale; re-run the cell to refresh it.
len(one_word_other_symbols)

(0, 4, 33)

In [47]:
# Display the symbol -> name mapping for manual inspection.
one_word_other_symbols

{'‚Ñà': 'SCRUPLE',
 '‚Ñü': 'RESPONSE',
 '‚Ñ£': 'VERSICLE',
 '‚Öç': 'AKTIESELSKAB',
 '‚åÇ': 'HOUSE',
 '‚åÖ': 'PROJECTIVE',
 '‚åÜ': 'PERSPECTIVE',
 '‚åí': 'ARC',
 '‚åì': 'SEGMENT',
 '‚åî': 'SECTOR',
 '‚å≠': 'CYLINDRICITY',
 '‚åØ': 'SYMMETRY',
 '‚å≥': 'SLOPE',
 '‚èõ': 'FUSE',
 '‚è•': 'FLATNESS',
 '‚óâ': 'FISHEYE',
 '‚óé': 'BULLSEYE',
 '‚Øó': 'TRANSPLUTO',
 '‚Øò': 'PROSERPINA',
 '‚Øô': 'ASTRAEA',
 '‚Øö': 'HYGIEA',
 '‚Øõ': 'PHOLUS',
 '‚Øú': 'NESSUS',
 '‚Ø†': 'CUPIDO',
 '‚Ø°': 'HADES',
 '‚Ø¢': 'ZEUS',
 '‚Ø£': 'KRONOS',
 '‚Ø§': 'APOLLON',
 '‚Ø•': 'ADMETOS',
 '‚Ø¶': 'VULCANUS',
 '‚Øß': 'POSEIDON',
 '‚Ø≤': 'SEDNA',
 'ü•Ü': 'RIFLE'}

In [48]:
# Mark every one-word symbol as used (iterating a dict yields its keys).
used_symbols.update(one_word_other_symbols)

### Extracting the mapping from the names

In [None]:
def _one_word_names(chars):
    """Map each character whose name is a single word to that name.

    Args:
        chars: iterable of characters to look up with ``char_name``.

    Returns:
        dict of ``char -> name`` for the characters whose name splits into
        exactly one whitespace-separated word, in input order.
    """
    # Pair each char with its name first so char_name() is called only once.
    named = ((char, char_name(char)) for char in chars)
    return {char: name for char, name in named if len(name.split()) == 1}


# NOTE(review): other_symbols2tokens is built with the same filter and input
# as one_word_other_symbols -- confirm whether both are needed.
currency_symbols2tokens = _one_word_names(valid_currency_symbols)
math_symbols2tokens = _one_word_names(valid_math_symbols)
other_symbols2tokens = _one_word_names(valid_other_symbols)