# Reminder Parser Scratch Pad

Automatically reload modules and functions before execution

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from local import parser

In [3]:
import re

Test Strings, because _what is the world without fixtures?_

In [4]:
# Fixtures a-f: recipients, a time and a message, in various orders and with
# various recipient separators.
a = "bob in 28 seconds to mow the lawn"
b = "jack and jill in 3 days to go fuck themselves"
c = "jack, jill and dr_seuss to eat a dick in 3 days"
d = "robert, mike and claire about that shovel in my trunk in 28 years"
e = "robert, mike, douche, and oxford_comma to write about lawns to mow in 30 minutes"
f = u"me in 20 minutes about la fois où j'ai chié par la fenêtre du presbytère (屮゜Д゜)屮" # Non-ASCII (unicode) message for good measure

# Fixtures g-l: input that is malformed in one way or another.
g = "Some invalid string, I guess." # No recognizable structure at all
h = "me to huaaah hueeeeeh" # Recipient and message, no time
i = "me in 30 googlblsjhs eerrr fffuuuh" # Partial time, missing message
j = "me to fuaaah eeeeh in goobledeegook" # Message, incomplete time
k = "me bob, matthew and johnesss about that one time at band camp in 2 seconds" # Bad recipient ("me bob" contains a space)
l = "Alice and Bob, Marie, Doug, and Mike to do something in 3 aeons" # Mixed recipient separators, Oxford comma, invalid time unit
# Fixtures m-p: repeated recipients ("Spartacus"/"spartacus" differ only in
# case) and a multi-word recipient ("a mean pimp").
m = "me, me, me and me to fuck off in 30 years"
n = "ocean and ocean in 30 minutes to fuck a guy"
o = "ocean, ccfreak2k and a mean pimp to do some thugging in 30 minutes"
p = "Spartacus, some_greek, plato, and spartacus in 3 days to maybe die or something"

# Fixtures q-s: odd time values (negative quantity, units outside the usual
# second..year range) and a single-letter recipient.
q = "knifa to get blind drunk in -98 nanoseconds"
r = "A about the colors that are too loud in 1 second"
s = "some_guy about the dildoes in 48 beard-seconds"

# Fixture t: no separator between "Robert" and "Marie".
t = "Bob, Alice, Robert Marie to have too many recipients in 20 minutes"

# Every fixture, in declaration order, for the test harness loops.
teststrings = [a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t]

Bullshit Test Harness for tokenizer

In [5]:
# Push every fixture through the tokenizer and show either the resulting
# token dictionary or the tokenizer's complaint.
for line in teststrings:
    try:
        parsed = parser.tokenize(line)
    except parser.ReminderSyntaxError as err:
        print("I don't understand: %s" % err.message)
    else:
        print(repr(parsed))

{'message': 'mow the lawn', 'recipient': ['bob'], 'time': ['28', 'seconds']}
{'message': 'go fuck themselves', 'recipient': ['jack', 'jill'], 'time': ['3', 'days']}
{'message': 'eat a dick', 'recipient': ['jack', 'jill', 'dr_seuss'], 'time': ['3', 'days']}
{'message': 'that shovel', 'recipient': ['robert', 'mike', 'claire'], 'time': ['28', 'years']}
{'message': 'write about lawns to mow', 'recipient': ['robert', 'mike', 'douche', 'oxford_comma'], 'time': ['30', 'minutes']}
{'message': u"la fois o\xf9 j'ai chi\xe9 par la fen\xeatre du presbyt\xe8re (\u5c6e\u309c\u0414\u309c)\u5c6e", 'recipient': [u'me'], 'time': [u'20', u'minutes']}
I don't understand: Remind who?
I don't understand: Remind when?
I don't understand: Remind what..?
I don't understand: Remind when?
{'message': 'that one time at band camp', 'recipient': ['me bob', 'matthew', 'johnesss'], 'time': ['2', 'seconds']}
{'message': 'do something', 'recipient': ['Alice', 'Bob', 'Marie', 'Doug', 'Mike'], 'time': ['3', 'aeons']}
{'m

What would be the best way to figure out what is a legit recipient?
Obviously the following characteristics are somewhere to start.
I don't want this to be really sophisticated; it should just ignore the _blatantly invalid_ shit.

 1. whole word, no whitespaces, boundaries on each side.
 2. Allowed characters only?
 3. no duplicates (although that is a problem for the parser logic)
 
 BTW thanks RFCs, these characters are allowed: 
     []\`_^{|}


In [6]:
# Characters allowed in a nick besides letters, digits and "-".
validchars = r'[]\`_^{|}'

# Escape once so that "]", "\" and "^" are taken literally inside the
# character classes below instead of being read as regex syntax.
_escapedchars = re.escape(validchars)

# A nick starts with a letter or one of the special characters, followed by
# any mix of letters, digits, specials and hyphens.  The pattern is closed
# with \Z rather than "$" because "$" also matches just before a trailing
# newline, which would let "bob\n" pass as a valid nick.
nickexpr = r'^[a-z%s][-0-9a-z%s]*\Z' % (_escapedchars, _escapedchars)
validnick = re.compile(nickexpr, re.IGNORECASE)

# Rough Testing
def isvalid(s):
    """Return True when ``s`` matches the ``validnick`` pattern, else False."""
    return validnick.match(s) is not None

# Sample nicks for eyeballing the validator: a mix of ones that should pass
# and ones that should be rejected.
nicks = [
    "mr_daemon",
    "ccfreak2k|work",
    "A",
    "Ember`",
    "hua-hueh_mchueh|magic`",
    "@ssh,ole//",
    "@ass",
    "c++",
    # The backslash is escaped explicitly: "\d" is not a recognised string
    # escape and only worked because unknown escapes are kept verbatim.
    "Z0rg\\dicks",
    "somehow{thisisvalid}",
    "are space",
    "are REALLY space!",
]

for nick in nicks:
    print("%s => %s" % (nick, isvalid(nick)))



mr_daemon => True
ccfreak2k|work => True
A => True
Ember` => True
hua-hueh_mchueh|magic` => True
@ssh,ole// => False
@ass => False
c++ => False
Z0rg\dicks => True
somehow{thisisvalid} => True
are space => False
are REALLY space! => False


## SIDETRACK: Checkout the tokenizer regexes for recipient

Actually, now that I know what a nick /is/, I can probably use this to clean up the match in the tokenizer.
Let's try it out:

In [7]:
# First pass: lazily capture everything before the first whitespace that is
# followed by one of the keywords "in", "to" or "about".
firstpassexpr = re.compile(r'^(.+?)\s(in|to|about)\b', re.IGNORECASE)
firstpass_output = firstpassexpr.search(l).group(1) # `l` is the fixture with mixed separators and an Oxford comma
print "Source:", repr(firstpass_output)

# Recipient delimiter with three alternatives: ", and", ", " and " and ".
# The ", and" alternative does not consume the whitespace after "and", which
# is why the split below yields ' Mike' with a leading space.
originaldelim = re.compile(r'\b,\sand\b|\b,\s|\s\band\b\s', re.IGNORECASE) # the word boundaries were flipped around in an earlier version

# Negative lookahead built from the same character classes as the nick pattern.
# NOTE(review): badnickexpr is compiled here but never used below.
badnickexpr = re.compile(r'(?!\b[a-z%s][-0-9a-z%s]*\b)' % (re.escape(validchars), re.escape(validchars)))
# NOTE(review): this compiles nickexpr (the anchored whole-string nick
# pattern), not badnickexpr -- confirm whether that was intended.
badnickdelim = re.compile(nickexpr, re.IGNORECASE)

# Does either pattern find anything in the recipient block?
print repr(originaldelim.search(firstpass_output))
print repr(badnickdelim.search(firstpass_output))

# Compare the two patterns when used as split delimiters.
print "Original:", repr(originaldelim.split(firstpass_output))
print "Magical shit negation:", repr(badnickdelim.split(firstpass_output))

# Compare the two patterns when every match is replaced with one space.
print "cleaned up original", originaldelim.sub(" ", firstpass_output)
print "cleaned up magical", badnickdelim.sub(" ", firstpass_output)

Source: 'Alice and Bob, Marie, Doug, and Mike'
<_sre.SRE_Match object at 0x03653950>
None
Original: ['Alice', 'Bob', 'Marie', 'Doug', ' Mike']
Magical shit negation: ['Alice and Bob, Marie, Doug, and Mike']
cleaned up original Alice Bob Marie Doug  Mike
cleaned up magical Alice and Bob, Marie, Doug, and Mike


In [8]:
# Tweaked delimiter: the ", and" alternative now also consumes the whitespace
# that follows "and".
originaldelim = re.compile(
    r'\b,\sand\b\s|\b,\s|\s\band\b\s',
    re.IGNORECASE)

tokenized = originaldelim.split(firstpass_output)
collapsed = originaldelim.sub(" ", firstpass_output)

print("Tokens: %s" % repr(tokenized))
print("Sub: %s" % repr(collapsed))

Tokens: ['Alice', 'Bob', 'Marie', 'Doug', 'Mike']
Sub: 'Alice Bob Marie Doug Mike'


So it turns out you can't split on such a thing, or I am doing it wrong, or both. Meanwhile I spotted the problem with the original expression, so let's pursue that.

## BACK TO THE PROGRAM

We're going to need some more fixtures for the next tests:

In [9]:
# Setup some fixtures
sender = "mr_daemon" # This will be handed to parser.parse() in the concrete implementation
tokens = parser.tokenize(e) # e is the shit one with the oxford comma

print repr(tokens)

{'message': 'write about lawns to mow', 'recipient': ['robert', 'mike', 'douche', 'oxford_comma'], 'time': ['30', 'minutes']}


Let's _attempt_ to parse the recipient block

In [10]:

# First attempt at turning the tokenizer's recipient tokens into a list of
# nicks.  Nicks are compared and stored lower-cased, matching what
# recipient_parse() does, so that e.g. "Spartacus" and "spartacus" count as
# the same recipient.
results = []

tokens_ = tokens["recipient"]

for index, token in enumerate(tokens_):
    # "me" is shorthand for whoever issued the command.
    if token.lower() == "me":
        token = sender

    # Error messages quote the token as originally written (tokens_[index]),
    # not the substituted or lower-cased form.
    if token.lower() in results:
        raise parser.ReminderSyntaxError("Duplicate recipient %s at position %d" %
                                         (tokens_[index], index))

    if not validnick.match(token):
        raise parser.ReminderSyntaxError("Invalid recipient near %s at position %d" %
                                         (tokens_[index], index))

    results.append(token.lower())

print(repr(results))

['robert', 'mike', 'douche', 'oxford_comma']


Alright I like this let's make a shit test harness

In [11]:
def recipient_parse(recipients):
    """Resolve a list of recipient tokens into lower-cased nicks.

    The token "me" (in any case) is replaced by the module-level ``sender``.
    Each resulting nick must match ``validnick`` and must not already be in
    the result, compared case-insensitively.

    Returns the nicks, lower-cased, in their original order.

    Raises parser.ReminderSyntaxError on a duplicate or invalid recipient;
    the message quotes the token as written and its position.
    """
    parsed = []

    for position, raw in enumerate(recipients):
        nick = sender if raw.lower() == "me" else raw
        folded = nick.lower()

        # The duplicate check comes first, so a repeated token is reported as
        # a duplicate before validity is considered.
        if folded in parsed:
            raise parser.ReminderSyntaxError(
                "Duplicate recipient %s at position %d" % (raw, position))

        if validnick.match(nick) is None:
            raise parser.ReminderSyntaxError(
                "Invalid recipient near %s at position %d" % (raw, position))

        parsed.append(folded)

    return parsed

# Run the notebook-local recipient parser over every fixture: echo the input,
# then any parser complaint, then the parsed recipients (None on failure).
for line in teststrings:
    print("%s =>" % repr(line))

    testresult = None
    try:
        recipient_token = parser.tokenize(line)["recipient"]
        testresult = recipient_parse(recipient_token)
    except parser.ReminderSyntaxError as err:
        print("I don't understand: %s" % err.message)

    print(repr(testresult))
        

'bob in 28 seconds to mow the lawn' =>
['bob']
'jack and jill in 3 days to go fuck themselves' =>
['jack', 'jill']
'jack, jill and dr_seuss to eat a dick in 3 days' =>
['jack', 'jill', 'dr_seuss']
'robert, mike and claire about that shovel in my trunk in 28 years' =>
['robert', 'mike', 'claire']
'robert, mike, douche, and oxford_comma to write about lawns to mow in 30 minutes' =>
['robert', 'mike', 'douche', 'oxford_comma']
u"me in 20 minutes about la fois o\xf9 j'ai chi\xe9 par la fen\xeatre du presbyt\xe8re (\u5c6e\u309c\u0414\u309c)\u5c6e" =>
['mr_daemon']
'Some invalid string, I guess.' =>
I don't understand: Remind who?
None
'me to huaaah hueeeeeh' =>
I don't understand: Remind when?
None
'me in 30 googlblsjhs eerrr fffuuuh' =>
I don't understand: Remind what..?
None
'me to fuaaah eeeeh in goobledeegook' =>
I don't understand: Remind when?
None
'me bob, matthew and johnesss about that one time at band camp in 2 seconds' =>
I don't understand: Invalid recipient near me bob at posit

Alright let's just confirm the concrete implementation works

In [12]:
from local import parser

In [13]:
# Same harness as for the notebook-local parser, but calling
# parser.parse_recipients() with the sender passed explicitly.
for line in teststrings:
    print("%s =>" % repr(line))

    testresult = None
    try:
        recipient_token = parser.tokenize(line)["recipient"]
        testresult = parser.parse_recipients(sender, recipient_token)
    except parser.ReminderSyntaxError as err:
        print("I don't understand: %s" % err.message)

    print(repr(testresult))

'bob in 28 seconds to mow the lawn' =>
['bob']
'jack and jill in 3 days to go fuck themselves' =>
['jack', 'jill']
'jack, jill and dr_seuss to eat a dick in 3 days' =>
['jack', 'jill', 'dr_seuss']
'robert, mike and claire about that shovel in my trunk in 28 years' =>
['robert', 'mike', 'claire']
'robert, mike, douche, and oxford_comma to write about lawns to mow in 30 minutes' =>
['robert', 'mike', 'douche', 'oxford_comma']
u"me in 20 minutes about la fois o\xf9 j'ai chi\xe9 par la fen\xeatre du presbyt\xe8re (\u5c6e\u309c\u0414\u309c)\u5c6e" =>
['mr_daemon']
'Some invalid string, I guess.' =>
I don't understand: Remind who?
None
'me to huaaah hueeeeeh' =>
I don't understand: Remind when?
None
'me in 30 googlblsjhs eerrr fffuuuh' =>
I don't understand: Remind what..?
None
'me to fuaaah eeeeh in goobledeegook' =>
I don't understand: Remind when?
None
'me bob, matthew and johnesss about that one time at band camp in 2 seconds' =>
I don't understand: Invalid recipient near me bob at posit

## Parsing the date

So now the question is basically what to store dates as.

Things to keep in mind

* The date will have to be stored in /something/. If it is sqlite, what column type exactly?
* Do I need an interval or a timestamp? Do I just calculate the distance between now and then and use that?
* How hard would it be to add _at_ as an option, so a user could specify a timestamp themselves? (Probably a lot, it would be a different format entirely)
* What does the Supybot scheduler expect?


In [14]:
# Take the time tokens from the fixture tokenized earlier and show them.
token_ = tokens["time"]
print(repr(token_))


['30', 'minutes']


In [15]:
def isvalidunit(s):
    """Tell whether ``s`` names a supported time unit.

    The comparison ignores case and tolerates one trailing "s", so both
    "minute" and "Minutes" are accepted.

    Returns True for second, minute, hour, day, week, month and year (with or
    without the trailing "s"), and False for anything else.
    """
    normalized = s.lower()

    # Drop a single plural "s" so that only singular forms need listing.
    singular = normalized[:-1] if normalized.endswith("s") else normalized

    return singular in ("second", "minute", "hour", "day", "week", "month", "year")

def dateparse(timelist):
    """Validate the time token produced by the tokenizer.

    ``timelist`` is a two-item sequence of strings, quantity then unit,
    e.g. ``['30', 'minutes']``.

    Returns ``timelist`` unchanged when it is valid.

    Raises parser.ReminderSyntaxError when ``timelist`` does not have exactly
    two items, when the quantity is not made of digits only, or when
    ``isvalidunit`` rejects the unit.
    """
    if len(timelist) != 2:
        raise parser.ReminderSyntaxError("Malformed time value, expected 2 arguments got %s" % (len(timelist)))

    quantity, unit = timelist

    # the tokenizer will barf on this already, so probably unnecessary
    if not quantity.isdigit():
        # Quote the quantity itself in the message, not the unit.
        raise parser.ReminderSyntaxError("Quantifying time value '%s' is not an integer." % (quantity,))

    if not isvalidunit(unit):
        raise parser.ReminderSyntaxError("'%s' is not a valid time unit." % (unit,))

    return timelist

In [16]:
# Run dateparse() over the time token of every fixture: echo the input, then
# any parser complaint, then the validated time token (None on failure).
for line in teststrings:
    print("%s =>" % repr(line))

    testresult = None
    try:
        time_token = parser.tokenize(line)["time"]
        testresult = dateparse(time_token)
    except parser.ReminderSyntaxError as err:
        print("I don't understand: %s" % err.message)

    print(repr(testresult))

'bob in 28 seconds to mow the lawn' =>
['28', 'seconds']
'jack and jill in 3 days to go fuck themselves' =>
['3', 'days']
'jack, jill and dr_seuss to eat a dick in 3 days' =>
['3', 'days']
'robert, mike and claire about that shovel in my trunk in 28 years' =>
['28', 'years']
'robert, mike, douche, and oxford_comma to write about lawns to mow in 30 minutes' =>
['30', 'minutes']
u"me in 20 minutes about la fois o\xf9 j'ai chi\xe9 par la fen\xeatre du presbyt\xe8re (\u5c6e\u309c\u0414\u309c)\u5c6e" =>
[u'20', u'minutes']
'Some invalid string, I guess.' =>
I don't understand: Remind who?
None
'me to huaaah hueeeeeh' =>
I don't understand: Remind when?
None
'me in 30 googlblsjhs eerrr fffuuuh' =>
I don't understand: Remind what..?
None
'me to fuaaah eeeeh in goobledeegook' =>
I don't understand: Remind when?
None
'me bob, matthew and johnesss about that one time at band camp in 2 seconds' =>
['2', 'seconds']
'Alice and Bob, Marie, Doug, and Mike to do something in 3 aeons' =>
I don't unders