# A navigable, hierarchical keyword dictionary

I wanted to build a more powerful way of handling keywords. 

In [275]:
def categorize(func, elements):
    """
    Group elements into buckets keyed by the result of ``func``.

    Every element ``e`` of ``elements`` is appended to the list stored
    under ``func(e)``. Buckets appear in the order in which their key is
    first produced, and elements keep their input order inside a bucket.

    Returns a plain dict mapping each key to its list of elements.
    """
    buckets = {}
    for element in elements:
        # setdefault creates the bucket the first time a key shows up.
        buckets.setdefault(func(element), []).append(element)
    return buckets

categorize(len, ['ae', 'a', 'ee', 'rrr'])

{2: ['ae', 'ee'], 1: ['a'], 3: ['rrr']}

In [276]:
class CONST_DICT:
    """
    A static dictionary of constants, usable as a hierarchical enumeration.

    Every level is a Python class deriving from CONST_DICT. Its public class
    attributes (names not starting with an underscore) are either sub-levels
    (nested classes) or constants of any type. A constant whose value is a
    list or a tuple is treated as a collection of items; any other value is
    a single item.

    Lookups visit the constants of a level first, grouped by the exact type
    of their value (groups ordered by first appearance, declaration order
    inside a group), and only then descend into the sub-levels in
    declaration order.
    """

    @classmethod
    def _members(cls):
        """
        Split the public attributes declared on this level.

        Returns a ``(constants, sub_dicts)`` pair of ``(name, value)`` lists.
        ``constants`` is ordered by type group as described on the class;
        ``sub_dicts`` holds the nested classes in declaration order.
        """
        groups = {}
        sub_dicts = []
        for name, value in vars(cls).items():
            if not name or name[0] == '_':
                continue
            # Helper methods are not constants: without this check the
            # classmethods of a level would show up in values().
            if isinstance(value, (classmethod, staticmethod)):
                continue
            if type(value) is type:
                sub_dicts.append((name, value))
            else:
                groups.setdefault(type(value), []).append((name, value))
        constants = [pair for group in groups.values() for pair in group]
        return constants, sub_dicts

    @staticmethod
    def _matches(value, item):
        """Tell whether ``item`` is (or is contained in) the constant ``value``."""
        if type(value) in (list, tuple):
            return item in value
        return item == value

    @classmethod
    def contains(cls, item):
        """
        Returns True if the item is contained in one of the constants in the
        dictionary, at this level or in any sub-level.
        """
        constants, sub_dicts = cls._members()
        if any(cls._matches(value, item) for _, value in constants):
            return True
        return any(sub.contains(item) for _, sub in sub_dicts)

    @classmethod
    def get(cls, item):
        """
        Returns the dotted path (``LEVEL.SUBLEVEL.CONSTANT``) of the first
        constant holding the item. Returns an empty string when the item is
        not present in the dictionary.
        """
        constants, sub_dicts = cls._members()

        for name, value in constants:
            if cls._matches(value, item):
                return cls.__name__ + '.' + name

        for _, sub in sub_dicts:
            path = sub.get(item)
            if path:
                # The sub-level path already starts with the sub-level name.
                return cls.__name__ + '.' + path

        return ''

    @classmethod
    def get_name(cls, item):
        """
        Returns the name of the constant where the item appears (the last
        component of ``get(item)``), or an empty string if it is absent.
        """
        path = cls.get(item)
        if path:
            return path.split('.')[-1]
        return path

    @classmethod
    def values(cls):
        """
        Returns a list of all items contained in the dictionary, regardless
        of hierarchy. List and tuple constants are flattened into the result;
        duplicates are kept.
        """
        constants, sub_dicts = cls._members()

        result = []
        for _, value in constants:
            if type(value) in (list, tuple):
                result.extend(value)
            else:
                result.append(value)

        for _, sub in sub_dicts:
            result.extend(sub.values())

        return result

## Rockstar keyword dictionary

Now that we have our class, we can describe the keywords of the Rockstar programming language with it :

In [277]:
class KEYWORDS(CONST_DICT):
    """
    Keyword table for the Rockstar language, organised as nested CONST_DICT
    levels.

    String constants hold a single keyword; list and tuple constants hold
    several alternative spellings of the same keyword. Declaration order and
    the container type of each constant matter: CONST_DICT groups constants
    by the exact type of their value, so both influence the order of
    values() and which name get() reports first.
    """
    class ASSIGNMENT(CONST_DICT):
        # PREFIX/SUFFIX carry the same strings as PUT/INTO, so each string
        # appears twice in values() and get() resolves to PUT / INTO, which
        # are declared first.
        PUT = 'Put'
        INTO = 'into'
        PREFIX = 'Put'
        SUFFIX = 'into'

    # Alternative spellings, stored as a list.
    SAY = ['Shout', 'Say', 'Whisper', 'Scream']

    # Stored as a tuple, so it lands in a different type group than SAY.
    STUTTER = ("Spit", 'Stutter')

    class OPERATOR(CONST_DICT):
        class CONDITIONAL(CONST_DICT):
            # Several of the entries below contain spaces (multi-word keywords).
            NEQ = ['is not', 
                   'aint', 
                   'werent', 
                   'wasnt']

            GT = ['higher than',
                  'greater than',
                  'more than',
                  'stronger than',
                  'bigger than']

            LT = ['lower than',
                 'less than',
                 'weaker than',
                 'smaller than']

            GE = ['as high as',
                  'as big as',
                  'as strong as',
                  'as great as',
                  'as beautiful as']

            LE = ['as low as',
                  'as small as',
                  'as weak as',
                  'as bad as',
                  'as little as',
                  'as ugly as']

        class ARITHMETIC(CONST_DICT):
            # DIV is a plain string while the others are lists, so values()
            # yields it after the list constants of this level.
            ADD = ['plus', 'with']
            SUB = ['minus', 'without']
            DIV = 'over'
            MUL = ['times', 'of']

        class INCREMENT(CONST_DICT):
            # PREFIX/SUFFIX duplicate BUILD/UP (same strings).
            BUILD = 'Build'
            UP = 'up'
            PREFIX = 'Build'
            SUFFIX = 'up'

        class DECREMENT(CONST_DICT):
            # Unlike INCREMENT, only PREFIX/SUFFIX are declared here.
            PREFIX = 'Knock'
            SUFFIX = 'down'

        class FLOW(CONST_DICT):
            WHILE = 'While'
            IF = 'If'
            UNTIL = 'Until'

    class POETIC(CONST_DICT):
        class LITERAL(CONST_DICT):
            class BOOLEAN(CONST_DICT):
                TRUE = ("true", "right", "yes", "ok")
                FALSE = ("false", "wrong", "no", "lies")
            NULL = ("null", "nobody", "nowhere", "empty", "gone")

        # 'is' is also the first word of 'is not' in OPERATOR.CONDITIONAL.NEQ.
        ASSIGNMENT = ['is', 'was', 'were']

    PRONOUN = ("it", "he", "she", "him", "her", "they", "them")

    class FUNCTION(CONST_DICT):
        DECLARATION = 'takes'
        CALL = 'taking'
        # Multi-word keyword.
        RETURN = 'Give back'

In [278]:
KEYWORDS.contains('is'), KEYWORDS.contains('over'), KEYWORDS.contains(1)

(True, True, False)

In [279]:
KEYWORDS.get('aint'), KEYWORDS.get('over'), KEYWORDS.get(1)

('KEYWORDS.OPERATOR.CONDITIONAL.NEQ', 'KEYWORDS.OPERATOR.ARITHMETIC.DIV', '')

In [280]:
KEYWORDS.get('Shout')

'KEYWORDS.SAY'

In [281]:
KEYWORDS.values()

['Shout',
 'Say',
 'Whisper',
 'Scream',
 'Spit',
 'Stutter',
 'it',
 'he',
 'she',
 'him',
 'her',
 'they',
 'them',
 'Put',
 'into',
 'Put',
 'into',
 'is not',
 'aint',
 'werent',
 'wasnt',
 'higher than',
 'greater than',
 'more than',
 'stronger than',
 'bigger than',
 'lower than',
 'less than',
 'weaker than',
 'smaller than',
 'as high as',
 'as big as',
 'as strong as',
 'as great as',
 'as beautiful as',
 'as low as',
 'as small as',
 'as weak as',
 'as bad as',
 'as little as',
 'as ugly as',
 'plus',
 'with',
 'minus',
 'without',
 'times',
 'of',
 'over',
 'Build',
 'up',
 'Build',
 'up',
 'Knock',
 'down',
 'While',
 'If',
 'Until',
 'is',
 'was',
 'were',
 'null',
 'nobody',
 'nowhere',
 'empty',
 'gone',
 'true',
 'right',
 'yes',
 'ok',
 'false',
 'wrong',
 'no',
 'lies',
 'takes',
 'taking',
 'Give back']

In [282]:
class KeywordDictionary:
    """
    Instance wrapper around a static constant dictionary class.

    If you are lazy like me, you may want to be able to overload operators to
    write more expressive (ie less) code. We can't do that on raw classes, so
    we need an instantiated class that serves as an interface between the
    static class and our program.

    The wrapped class must provide the ``contains``, ``get``, ``get_name`` and
    ``values`` class-level callables. Its public constants are copied onto the
    instance as attributes, and each nested class is wrapped recursively in
    another KeywordDictionary stored under the same name.

    Supported syntax:
        ``item in kd``  -> ``const_dict.contains(item)``
        ``kd[item]``    -> ``const_dict.get(item)`` (full path)
        ``kd(item)``    -> ``const_dict.get_name(item)`` (constant name)
    """

    def __init__(self, const_dict):
        """Mirror the public attributes of ``const_dict`` on this instance."""
        self.cdict = const_dict

        # Group the public attributes by the exact type of their value,
        # keeping first-seen order, so attributes are set in a stable order
        # (this is the order shown by __str__).
        by_type = {}
        for name, value in vars(const_dict).items():
            if name and name[0] != '_':
                by_type.setdefault(type(value), []).append((name, value))

        # Nested classes are sub-dictionaries; they are handled last.
        sub_dicts = by_type.pop(type, [])

        for members in by_type.values():
            for name, value in members:
                setattr(self, name, value)

        for name, sub in sub_dicts:
            setattr(self, name, self.__class__(sub))

    def __contains__(self, item):
        """Return True if ``item`` is one of the wrapped dictionary's items."""
        return self.cdict.contains(item)

    def __getitem__(self, item):
        """Return the full path of ``item`` as reported by the wrapped class."""
        return self.cdict.get(item)

    def __call__(self, item):
        """Return the name of the constant holding ``item``."""
        return self.cdict.get_name(item)

    def get(self, item):
        """Same as ``self[item]``."""
        return self[item]

    def name(self, item):
        """Same as ``self(item)``."""
        return self.cdict.get_name(item)

    def values(self, item=None):
        """
        Return every item of the wrapped dictionary.

        ``item`` is ignored; it is only kept (now optional) so that existing
        calls passing an argument keep working.
        """
        return self.cdict.values()

    def __str__(self):
        return str(vars(self))

    def __repr__(self):
        return self.__str__()

In [283]:
kd = KeywordDictionary(KEYWORDS)

In [284]:
# Now you can write
'true' in kd.POETIC, kd['as high as'], kd('as high as'), kd.name('as high as'), kd('non-existing')

(True, 'KEYWORDS.OPERATOR.CONDITIONAL.GE', 'GE', 'GE', '')

In [285]:
# Instead of
KEYWORDS.POETIC.contains('true'), KEYWORDS.get('as high as'), KEYWORDS.get_name('as high as')

(True, 'KEYWORDS.OPERATOR.CONDITIONAL.GE', 'GE')

We can store our operations in the same way :

In [286]:
import operator

# Constant dictionary mapping operation names to callables from the standard
# `operator` module. The names in ARITHMETIC mirror KEYWORDS.OPERATOR.ARITHMETIC.
class OPERATIONS(CONST_DICT):
    class ARITHMETIC(CONST_DICT):
        ADD = operator.add
        SUB = operator.sub
        DIV = operator.truediv  # true division: the result is not floored
        MUL = operator.mul
    class CONDITIONAL(CONST_DICT):
        EQ = operator.eq
        NE = operator.ne
        LT = operator.lt
        LE = operator.le
        GT = operator.gt
        GE = operator.ge
        # NOTE(review): operator.or_ / operator.and_ are the bitwise `|` / `&`
        # operators, not the short-circuit `or` / `and` - confirm this is the
        # intended semantics for non-boolean operands.
        OR = operator.or_
        AND = operator.and_
        TRUTH = operator.truth
        # Dunder-named spelling of operator.not_ (logical negation).
        NOT = operator.__not__

# Simple tokenization with our Rockstar keyword dictionary

Now that our dictionary has been defined, we can try to use the tokenization function from the expression parser. This function takes a list of entities that need to be tokenized first, then tokenizes the rest of the text into string and number literals. Everything else is tokenized on a word basis. 

I am taking this code from `test-bool.rock` :

In [287]:
code = """Put 1 into my heart
Put 2 into my arm
Put 3 into my head
Put 0 into my test
Put 6 into the allfather

If my head is not my heart
Say "is not: Okay"
Put my test plus 1 into my test

If my head is my head
Say "is: Okay"
Put my test plus 1 into my test

If my arm aint my head
Say "aint: Okay"
Put my test plus 1 into my test

If my head is more than my heart
Say "is X than: Okay"
Put my test plus 1 into my test

If my arm is less than 5
Say "is X than numeric literal: Okay"
Put my test plus 1 into my test

If my arm is as high as my heart
Say "is as X as: Okay"
Put my test plus 1 into my test

If my test is the allfather
Say "All tests passed"

If my test is not the allfather
Spit my test over the allfather
Shout "tests passed"

"""

In [288]:
import re

def rgxer(sss):
    """
    Neutralise a fixed set of characters so they match literally in a regex.

    Each of the characters ``. < > * [ ] { } +`` found in ``sss`` is wrapped
    in a one-character class (``*`` becomes ``[*]``). Every other character,
    including the remaining regex metacharacters, is copied unchanged.
    """
    wrapped = '.<>*[]{}+'
    return ''.join('[' + ch + ']' if ch in wrapped else ch for ch in sss)

def match(regex, token):
    """
    Return True when ``token`` matches ``regex`` over its whole length.

    ``re.fullmatch`` is used instead of wrapping the pattern in ``^...$``:
    with string wrapping, a top-level alternation such as ``a|b`` became
    ``^a|b$`` (so ``'ab'`` matched through the ``^a`` branch), and ``$`` also
    accepted a token ending with a newline.
    """
    return bool(re.fullmatch(regex, token))

def find_all(ss, s):
    """
    Return all occurrences of a substring found in a string.

    ``ss`` is the string searched and ``s`` the substring looked for.
    Output is a list of ``(<start>, <end>)`` index pairs, one per occurrence,
    in increasing order of ``<start>``; ``<end>`` is exclusive. Overlapping
    occurrences are all reported, because the search resumes one character
    after the start of the previous match.

    An empty ``s`` yields one zero-width pair per index of ``ss``.
    """
    pos = []
    width = len(s)
    limit = len(ss)
    # Search with a moving start index instead of re-slicing the string,
    # which copied the remaining text on every match.
    start = ss.find(s)
    # `start < limit` only matters for an empty `s`, which would otherwise
    # also match at the very end of the string.
    while start != -1 and start < limit:
        pos.append((start, start + width))
        start = ss.find(s, start + 1)
    return pos

def tokenize(expr, entities=()):
    """
    The big, fluffy tokenize function.

    Splits ``expr`` into a list of string tokens, scanning it character by
    character:

    * Text between matching quotes (``"`` or ``'``) becomes a single token,
      quotes included. Nothing inside quotes is interpreted. An unterminated
      quoted string is emitted as-is at the end.
    * ``entities`` is a collection of strings that take priority over the
      regular rules outside of quotes. Only entities that are NOT a plain
      identifier (``[a-zA-Z][a-zA-Z0-9]*``) are merged into one token, e.g.
      multi-word keywords; identifier-like entities are left to the word
      rule below. When several entities start at the same index, the longest
      wins. Empty entities are ignored.
    * Runs of word characters and dots form a token.
    * A space ends the current token and is dropped.
    * Any other character ends the current token and is a token on its own.
    """
    items = []
    item = ''

    # Map each start index to (end index, entity). Entities are processed
    # from shortest to longest so a longer entity starting at the same index
    # overwrites a shorter one.
    found_entities = {}
    for entity in sorted(entities, key=len):
        if not entity:
            # A zero-width match would set `ignoring` to -1 below and
            # swallow the rest of the expression.
            continue
        for start, end in find_all(expr, entity):
            found_entities[start] = (end, entity)

    ignoring = 0    # characters still to skip after emitting an entity
    in_quotes = ''  # the quote character that opened the current literal
    for i, e in enumerate(expr):
        if ignoring:
            ignoring -= 1
            continue

        if not in_quotes and e in ('"', "'"):
            # Opening quote: flush the pending token, start the literal.
            in_quotes = e
            if item:
                items.append(item)
            item = e

        elif in_quotes:
            item += e
            if e == in_quotes:
                # Closing quote: the literal is complete.
                in_quotes = ''
                items.append(item)
                item = ''

        elif i in found_entities and not match('[a-zA-Z][a-zA-Z0-9]*', found_entities[i][1]):
            end, entity = found_entities[i]
            # The current character is consumed by this iteration; skip the
            # remaining characters of the entity.
            ignoring = end - i - 1
            if item:
                items.append(item)
            item = ''
            items.append(entity)

        elif match(r'[\w.]', e):
            item += e

        elif e == ' ':
            if item:
                items.append(item)
            item = ''

        else:
            if item:
                items.append(item)
            item = ''
            items.append(e)

    if item:
        items.append(item)

    return items

In [290]:
# The keyword list does not change between lines: compute it once instead of
# once per token.
keyword_values = KEYWORDS.values()

for line in code.split('\n'):
    line = line.strip()
    # Skip blank lines and lines starting with '(' (presumably Rockstar
    # comments - verify against the language spec).
    if not line or line[0] == '(':
        continue
    print(line)
    # Tokenize once and reuse the result for both printouts.
    tokens = tokenize(line, keyword_values)
    print(tokens)
    # Replace every token that is a keyword by the name of its constant.
    print(*(KEYWORDS.get_name(tok) if tok in keyword_values else tok for tok in tokens))
    print()

Put 1 into my heart
['Put', '1', 'into', 'my', 'heart']
PUT 1 INTO my heart

Put 2 into my arm
['Put', '2', 'into', 'my', 'arm']
PUT 2 INTO my arm

Put 3 into my head
['Put', '3', 'into', 'my', 'head']
PUT 3 INTO my head

Put 0 into my test
['Put', '0', 'into', 'my', 'test']
PUT 0 INTO my test

Put 6 into the allfather
['Put', '6', 'into', 'the', 'allfather']
PUT 6 INTO the allfather

If my head is not my heart
['If', 'my', 'head', 'is not', 'my', 'heart']
IF my head NEQ my heart

Say "is not: Okay"
['Say', '"is not: Okay"']
SAY "is not: Okay"

Put my test plus 1 into my test
['Put', 'my', 'test', 'plus', '1', 'into', 'my', 'test']
PUT my test ADD 1 INTO my test

If my head is my head
['If', 'my', 'head', 'is', 'my', 'head']
IF my head ASSIGNMENT my head

Say "is: Okay"
['Say', '"is: Okay"']
SAY "is: Okay"

Put my test plus 1 into my test
['Put', 'my', 'test', 'plus', '1', 'into', 'my', 'test']
PUT my test ADD 1 INTO my test

If my arm aint my head
['If', 'my', 'arm', 'aint', 'my', 'he

As we can see, the tokenization correctly captures keywords separately. However, some of them are only keywords when they follow a specific pattern (for instance, `times` is a keyword, but can also appear as a regular word), so we can't just do a search-and-replace after the tokenization function, as done above. We need to disambiguate them according to their position and neighborhood. That's what the current tokenization function does.

There may also be a need for disambiguation if by any chance, any of our multi-word keywords is allowed to appear as regular tokens, since this tokenization function automatically merges them when spotted.
