In [184]:
# coding: utf-8
import sys
import re
import string
import ast
from ast import AST
import tokenize
import token
from numbers import Number
import json
import io

In [185]:
def posFromText(text, textPos):
    """Translate a flat character offset into a line/column position.

    Only the text up to and including ``textPos`` is inspected. ``line`` is
    the 1-based number of the line that character sits on; ``ch`` is the
    number of characters of that line seen so far (an offset that points at
    a newline therefore yields ``ch == 0`` on the following line).
    """
    consumed = text[:textPos + 1]
    line_number = consumed.count("\n") + 1
    current_line = consumed.rsplit("\n", 1)[-1]
    return {'line': line_number, 'ch': len(current_line)}
    

In [186]:
def regexEnd(text, start, parentEnd, nodeStart):
    """Find the bracketed span that begins at ``text[start]``.

    Parameters:
        text: source text being scanned.
        start: offset of the character to inspect.
        parentEnd: last offset (inclusive) the span may reach.
        nodeStart: ``{'line', 'ch'}`` position corresponding to ``start``.

    Returns:
        ``None`` when ``text[start]`` is not a bracket character or no
        matching closing bracket is found; otherwise a tuple
        ``(end, myEnd, matched)`` where ``end`` is the offset just past the
        match, ``myEnd`` is the ``{'line', 'ch'}`` position of its last
        character and ``matched`` is the matched text.
    """
    # Raw strings: "\(" in a plain literal is an invalid escape sequence.
    bracket_patterns = {
        "(": r"\((.*)\)", ")": r"\((.*)\)",
        "[": r"\[(.*)\]", "]": r"\[(.*)\]",
        "{": r"\{(.*)\}", "}": r"\{(.*)\}",
    }
    pattern = bracket_patterns.get(text[start])
    if pattern is None:
        return None

    m = re.search(pattern, text[start:parentEnd + 1])
    if m is None:
        # Unbalanced bracket: report "not bracketed" instead of crashing.
        return None

    matched = m.group(0)
    lines = matched.split("\n")
    # m.end() also accounts for a match that starts after `start`.
    end = start + m.end()
    endLine = nodeStart['line'] + len(lines) - 1
    endCh = len(lines[-1]) - 1
    # A span that ends on its starting line is offset by the start column.
    if endLine == nodeStart['line']:
        endCh += nodeStart['ch']
    return (end, {'line': endLine, 'ch': endCh}, matched)

In [187]:
def findNodeStart(node):
    """Return the ``{'line', 'ch'}`` start position of an AST node.

    Nodes carrying ``lineno`` report their own position and a ``Module``
    starts at line 1, column 0. Any other node is treated as a wrapper and
    the search descends into its first child; ``None`` is returned when a
    wrapper has no children.
    """
    current = node
    while True:
        if hasattr(current, 'lineno'):
            return {'line': current.lineno, 'ch': current.col_offset}
        if type(current).__name__ == "Module":
            return {'line': 1, 'ch': 0}
        # wrapper node: keep walking down through the first child
        first_child = next(ast.iter_child_nodes(current), None)
        if first_child is None:
            return None
        current = first_child

In [188]:
def findLiteralEnd(snippet, nodeStart): # what to do about multiline literal???
    """Work out where the literal at the start of ``snippet`` ends.

    When the snippet opens with a bracket that ``regexEnd`` can match, the
    literal is the list of ``{'syntok': char}`` entries of that bracketed
    span. Otherwise the literal is the text before the first punctuation
    character, or the whole snippet when it contains none.

    Returns:
        ``(i, nodeEnd, literal)`` where ``i`` is the offset just past the
        literal and ``nodeEnd`` is its ``{'line', 'ch'}`` end position.
    """
    # regexEnd indexes snippet[0], so skip it for an empty snippet.
    bracketed = regexEnd(snippet, 0, len(snippet), nodeStart) if snippet else None
    if bracketed:
        i, nodeEnd, punct = bracketed
        literal = [{'syntok': str(p)} for p in punct]
    else:
        # Offset of the first punctuation character; with none present the
        # literal runs to the end of the snippet.
        i = next((idx for idx, character in enumerate(snippet)
                  if character in punctuation), len(snippet))
        literal = snippet[:i]
        nodeEnd = {'line': nodeStart['line'], 'ch': len(literal) + nodeStart['ch']}

    print("literal |"+str(literal)+"|")
    return (i, nodeEnd, literal)

In [189]:
def findNextChild(children, itr):
    """Return the next child after index ``itr`` that is not a context marker.

    Children whose type name is ``Store`` or ``Load`` are skipped.

    Returns:
        ``(child, index)`` for the first acceptable child, or
        ``(None, index)`` with the first index past the end when none is left.
    """
    banned = ["Store", "Load"]
    index = itr + 1
    while index < len(children):
        candidate = children[index]
        if type(candidate).__name__ not in banned:
            return candidate, index
        index += 1
    return None, index

In [190]:
def captureComment(text, textStart, textEnd):
    """Capture a comment that starts at ``textStart``.

    The comment runs to the next newline, or to ``textEnd`` when the
    comment is the last thing in the text and has no trailing newline.

    Returns:
        ``(end, comment)`` where ``end`` is the offset just past the comment
        text (the newline itself is not consumed).
    """
    # split() keeps the whole remainder when there is no newline, whereas
    # slicing with find()'s -1 would drop the comment's last character.
    comment = text[textStart:textEnd].split("\n", 1)[0]
    return textStart + len(comment), comment

In [191]:
def captureStuff(text, end, nodeItem, puncStop = "", puncNL = False):
    """Visit one node and gather the punctuation that follows it.

    Parameters:
        text: full source text.
        end: offset at which ``nodeItem`` is visited.
        nodeItem: AST node handed to ``visit``.
        puncStop: character at which punctuation gathering stops.
        puncNL: whether newlines count as punctuation.

    Returns:
        ``(end, content)`` where ``content`` is the visited node's
        description followed by the trailing punctuation tokens.
    """
    after_node, described = visit(nodeItem, text, end, len(text), None)
    # get any symbols like commas and spaces
    after_symbols, trailing = getPunctuationBetween(text, after_node, puncStop, puncNL)
    return after_symbols, [described, *trailing]

In [192]:
'''
mod = Module(stmt* body)
        | Interactive(stmt* body)
        | Expression(expr body)

        -- not really an actual node but useful in Jython's typesystem.
        | Suite(stmt* body)
'''
def visitModule(node, text, textStart, textEnd):
    """Describe a top-level node (Module, Interactive, Expression, Suite).

    Leading punctuation/newlines, every body statement (or the single body
    expression) and trailing punctuation are collected in source order.

    Returns:
        ``(end, me)`` where ``end`` is the offset reached and ``me`` is the
        ``{'type', 'start', 'end', 'content'}`` description.
    """
    myType = type(node).__name__
    myContent = []
    myStart = posFromText(text, textStart)
    end = textStart
    # get any symbols like commas and spaces
    end, symbols = getPunctuationBetween(text, end, "", True)
    myContent += symbols
    print("Start:", myContent)

    if isinstance(node.body, list):
        for stmt in node.body:
            end, stuff = captureStuff(text, end, stmt, "", True)
            myContent += stuff
            print(myType+" AFTER ", stmt, myContent, text[end:end+3])
    else:
        # Expression nodes hold a single expr; record it so it is not lost.
        end, expr = visit(node.body, text, end, textEnd, None)
        myContent.append(expr)

    # get any symbols like commas and spaces
    end, symbols = getPunctuationBetween(text, end, "", True)
    myContent += symbols
    print("END:", myContent)

    myEnd = posFromText(text, end)
    me = {'type': myType, 'start': myStart, 'end': myEnd, 'content': myContent}
    print("MADE:", ast.dump(node, True, False), "\n",me,"\n")
    return end, me

In [193]:
def stmtOrExpr(node):
    """Create the empty description record for a statement/expression node.

    The start position comes from the node's ``lineno``/``col_offset``;
    ``end`` is left as ``None`` and ``content`` as a fresh empty list for
    the caller to fill in.
    """
    return {
        'type': type(node).__name__,
        'start': {'line': node.lineno, 'ch': node.col_offset},
        'end': None,
        'content': [],
    }

In [337]:
'''
Assign(expr* targets, expr value)
'''
def visitAssign(node, text, textStart, textEnd):
    """Describe an ``Assign`` statement: ``target = [target = ...] value``.

    Every target is followed by its own ``=`` token and whitespace, so
    chained assignments such as ``a = b = 1`` keep the cursor aligned.

    Returns:
        ``(end, me)`` with the offset reached and the node description.
    """
    me = stmtOrExpr(node)
    end = textStart
    for target in node.targets:
        # capture the target plus any punctuation up to (not including) "="
        end, stuff = captureStuff(text, end, target, "=")
        me['content'] += stuff
        me['content'].append({"syntok": "="})
        end += 1
        end, spaces = getSpacing(text, end, textEnd)
        me['content'] += spaces
    end, value = visit(node.value, text, end, textEnd, None)
    me['content'].append(value)
    me['end'] = posFromText(text, end)
    print("MADE:", ast.dump(node, True, False), "\n",me,"\n")
    return end, me

In [420]:
'''
-- use 'orelse' because else is a keyword in target languages
'''

'''
| For(expr target, expr iter, stmt* body, stmt* orelse)
'''
def visitFor(node, text, textStart, textEnd):
    """Describe a ``for target in iter: body [else: orelse]`` statement.

    Returns:
        ``(end, me)`` with the offset reached and the node description.
    """
    me = stmtOrExpr(node)
    end = textStart
    me['content'].append({"syntok": "for"})
    end += len("for")
    end, spaces = getSpacing(text, end, textEnd)
    me['content'] += spaces
    end, target = visit(node.target, text, end, textEnd, None)
    me['content'].append(target)
    end, spaces = getSpacing(text, end, textEnd)
    me['content'] += spaces
    me['content'].append({"syntok": "in"})
    end += len("in")
    # consume the whitespace after "in" so the iterable is visited at its
    # first character
    end, spaces = getSpacing(text, end, textEnd)
    me['content'] += spaces
    end, itr = visit(node.iter, text, end, textEnd, None)
    me['content'].append(itr)
    # get spaces, : and any new line
    end, symbols = getPunctuationBetween(text, end, "", True)
    me['content'] += symbols
    for stmt in node.body:
        end, stuff = captureStuff(text, end, stmt, "", True)
        me['content'] += stuff
    if node.orelse:
        # the "else" keyword appears once, before the first orelse statement
        me['content'].append({"syntok": "else"})
        end += len("else")
        # get spaces, : and any new line
        end, symbols = getPunctuationBetween(text, end, "", True)
        me['content'] += symbols
        for stmt in node.orelse:
            end, stuff = captureStuff(text, end, stmt, "", True)
            me['content'] += stuff
    me['end'] = posFromText(text, end)
    print("MADE:", ast.dump(node, True, False), "\n",me,"\n")
    return end, me
    

'''
| Try(stmt* body, excepthandler* handlers, stmt* orelse, stmt* finalbody)
'''
def visitTry(node, text, textStart, textEnd):
    """Describe a ``try`` statement with its handlers, else and finally parts.

    Returns:
        ``(end, me)`` with the offset reached and the node description.
    """
    me = stmtOrExpr(node)
    end = textStart
    me['content'].append({"syntok": "try"})
    end += len("try")

    end, spaces = getSpacing(text, end, textEnd)
    me['content'] += spaces
    end, symbols = getPunctuationBetween(text,end, "", True)
    me["content"] += symbols
    for stmt in node.body:
        end, stuff = captureStuff(text, end, stmt, "", True)
        me['content'] += stuff
    for excepthandle in node.handlers:
        end, stuff = captureStuff(text, end, excepthandle, "", True)
        me['content'] += stuff
    for keyword, clause in (("else:", node.orelse), ("finally:", node.finalbody)):
        if not clause:
            continue
        # the clause keyword appears once, not once per statement
        me['content'].append({"syntok": keyword})
        end += len(keyword)
        # consume the newline/indentation so the first statement of the
        # clause is visited at its first character
        end, symbols = getPunctuationBetween(text, end, "", True)
        me['content'] += symbols
        for stmt in clause:
            end, stuff = captureStuff(text, end, stmt, "", True)
            me['content'] += stuff
    me['end'] = posFromText(text, end)
    print("MADE:", ast.dump(node, True, False), "\n",me,"\n")
    return end, me
    

In [196]:
'''
Import(alias* names)
'''
def visitImport(node, text, textStart, textEnd):
    """Describe an ``import a, b as c`` statement.

    Emits the ``import`` keyword, the punctuation after it, then each alias
    together with the punctuation that follows it.

    Returns:
        ``(end, me)`` with the offset reached and the node description.
    """
    keyword = "import"
    me = stmtOrExpr(node)
    pieces = me['content']
    pieces.append({"syntok": keyword})
    end, symbols = getPunctuationBetween(text, textStart + len(keyword))
    pieces.extend(symbols)
    for alias in node.names:
        end, stuff = captureStuff(text, end, alias)
        pieces.extend(stuff)
    me['end'] = posFromText(text, end)
    print("MADE:", ast.dump(node, True, False), "\n",me,"\n")
    return end, me

'''
ImportFrom(identifier? module, alias* names, int? level)
'''
def visitImportFrom(node, text, textStart, textEnd):
    """Describe a ``from [.]*module import names`` statement.

    The ``from`` keyword is always emitted, followed by one ``.`` token per
    relative-import level and the module name when there is one.

    Returns:
        ``(end, me)`` with the offset reached and the node description.
    """
    me = stmtOrExpr(node)
    end = textStart
    me['content'].append({"syntok": "from"})
    end += len("from")
    end, spaces = getSpacing(text, end, textEnd)
    me['content'] += spaces
    # relative imports: `level` leading dots (level may be None or 0)
    for _ in range(node.level or 0):
        me['content'].append({"syntok": "."})
        end += 1
    if node.module:
        module = str(node.module)
        me['content'].append({"syntok": module})
        end += len(module)
    end, spaces = getSpacing(text, end, textEnd)
    me['content'] += spaces
    me['content'].append({"syntok": "import"})
    end += len("import")
    end, spaces = getSpacing(text, end, textEnd)
    me['content'] += spaces
    for alias in node.names:
        end, stuff = captureStuff(text, end, alias)
        me['content'] += stuff
    me['end'] = posFromText(text, end)
    print("MADE:", ast.dump(node, True, False), "\n",me,"\n")
    return end, me
    
    
'''
-- import name with optional 'as' alias.
    alias = (identifier name, identifier? asname)
'''
def visitAlias(node, text, textStart, textEnd):
    """Describe an import alias: ``name`` or ``name as asname``.

    Returns:
        ``(end, me)`` with the offset reached and the alias description.
    """
    myType = type(node).__name__
    myStart = posFromText(text, textStart)
    name = str(node.name)
    myContent = [{"syntok": name}]
    end = textStart + len(name)
    if node.asname:
        end, before_as = getPunctuationBetween(text, end)
        myContent.extend(before_as)
        myContent.append({"syntok": "as"})
        end, after_as = getPunctuationBetween(text, end + len("as"))
        myContent.extend(after_as)
        asname = str(node.asname)
        myContent.append({"syntok": asname})
        end += len(asname)
    me = {'type': myType, 'start': myStart, 'end': posFromText(text, end), 'content': myContent}
    print("MADE:", end, ast.dump(node, True, False), "\n",me,"\n")
    return end, me

In [197]:
'''
Expr(expr value)
'''
def visitExpr(node, text, textStart, textEnd):
    """Describe an expression statement by visiting the expression it wraps.

    Returns:
        ``(end, me)`` with the offset reached and the node description.
    """
    me = stmtOrExpr(node)
    end, inner = visit(node.value, text, textStart, textEnd, None)
    me['content'].append(inner)
    me['end'] = posFromText(text, end)
    print("MADE:", ast.dump(node, True, False), "\n",me,"\n")
    return end, me

In [456]:
'''
| BinOp(expr left, operator op, expr right)
'''
def visitBinOp(node, text, textStart, textEnd):
    """Describe a binary operation: left operand, operator, right operand.

    Whitespace before the operator and before the right operand is recorded
    as individual tokens.

    Returns:
        ``(end, me)`` with the offset reached and the node description.
    """
    me = stmtOrExpr(node)
    pieces = me['content']
    end, left = visit(node.left, text, textStart, textEnd, None)
    pieces.append(left)
    # the operator and the right operand are each preceded by optional spacing
    for part in (node.op, node.right):
        end, gap = getSpacing(text, end, textEnd)
        pieces.extend(gap)
        end, described = visit(part, text, end, textEnd, None)
        pieces.append(described)
    me['end'] = posFromText(text, end)
    print("MADE:", ast.dump(node, True, False), "\n",me,"\n")
    return end, me
    

'''
| UnaryOp(unaryop op, expr operand)
'''
def visitUnaryOp(node, text, textStart, textEnd):
    """Describe a unary operation: operator, punctuation, operand.

    Returns:
        ``(end, me)`` with the offset reached and the node description.
    """
    me = stmtOrExpr(node)
    pieces = me['content']
    end, operator = visit(node.op, text, textStart, textEnd, None)
    pieces.append(operator)
    # punctuation between the operator and its operand
    end, between = getPunctuationBetween(text, end)
    pieces.extend(between)
    end, operand = visit(node.operand, text, end, textEnd, None)
    pieces.append(operand)
    me['end'] = posFromText(text, end)
    print("MADE:", ast.dump(node, True, False), "\n",me,"\n")
    return end, me


'''
-- need sequences for compare to distinguish between
-- x < 4 < 3 and (x < 4) < 3
| Compare(expr left, cmpop* ops, expr* comparators)
'''
def visitCompare(node, text, textStart, textEnd):
    """Describe a comparison: ``left op comparator [op comparator ...]``.

    Operators and comparators are visited pairwise, in source order, so
    chained comparisons such as ``a < b < c`` stay aligned with the text.

    Returns:
        ``(end, me)`` with the offset reached and the node description.
    """
    me = stmtOrExpr(node)
    end = textStart
    # collect whitespace and opening parentheses that precede the left operand
    before_tok = re.compile(r'[\s]*\(*\s*')
    match = before_tok.search(text[end:textEnd])
    for char in match[0]: me['content'].append({"syntok": char})
    end += len(match[0])
    # get left expr
    end, left = visit(node.left, text, end, textEnd, None)
    me['content'].append(left)
    for cmpop, comparator in zip(node.ops, node.comparators):
        end, spaces = getSpacing(text, end, textEnd)
        me['content'] += spaces
        end, stuff = captureStuff(text, end, cmpop)
        me['content'] += stuff
        end, spaces = getSpacing(text, end, textEnd)
        me['content'] += spaces
        end, stuff = captureStuff(text, end, comparator)
        me['content'] += stuff
    me['end'] = posFromText(text, end)
    print("MADE:", ast.dump(node, True, False), "\n",me,"\n")
    return end, me


'''
Call(expr func, expr* args, keyword* keywords)
'''
def visitCall(node, text, textStart, textEnd):
    """Describe a call: callee, ``(``, positional and keyword arguments, ``)``.

    Returns:
        ``(end, me)`` with the offset reached and the node description.

    Raises:
        ValueError: when no opening parenthesis is found after the callee.
    """
    me = stmtOrExpr(node)
    end = textStart
    end, value = visit(node.func, text, end, textEnd, None)
    me['content'].append(value)

    # now use regex to get all punctuation then ( and space tokens before the call args begin
    # (raw string: "\w" / "\(" are invalid escapes in a plain literal)
    before_tok = re.compile(r'[^\w\s]*\(+\s*')
    match = before_tok.search(text[end:textEnd])
    if match is None:
        raise ValueError(
            "no opening parenthesis found for call starting at offset %d" % textStart)
    for char in match[0]: me['content'].append({"syntok": char})
    end += len(match[0])
    print("did regex work?", me['content'], match)

    for argument in node.args:
        end, stuff = captureStuff(text, end, argument, ")")
        me['content'] += stuff
    for keyword in node.keywords:
        end, stuff = captureStuff(text, end, keyword, ")")
        me['content'] += stuff
    # anything left before the closing parenthesis, then the parenthesis itself
    end, symbols = getPunctuationBetween(text, end, ")")
    me['content'] += symbols
    me['content'].append({'syntok': ')'})
    end += 1
    me['end'] = posFromText(text, end)
    print("MADE:", ast.dump(node, True, False), "\n",me,"\n")
    return end, me
  
'''
| Num(object n) -- a number as a PyObject.
we need this one while we don't need one for Str because we need special regex for decimals
'''
def visitNum(node, text, textStart, textEnd):
    """Describe a numeric literal using its spelling in the source text.

    The literal is read from ``text`` so that forms whose ``str()`` differs
    from the source (``0x10``, ``1e3``, ``1_000``, ``1.50``) advance the
    cursor by the right amount. ``str(node.n)`` is used as a fallback when
    the text at ``textStart`` does not look like a number.

    Returns:
        ``(end, me)`` with the offset reached and the node description.
    """
    me = stmtOrExpr(node)
    end = textStart
    number = re.compile(
        r'0[xX][0-9a-fA-F_]+|0[oO][0-7_]+|0[bB][01_]+'
        r'|(?:\d[\d_]*\.?[\d_]*|\.\d[\d_]*)(?:[eE][+-]?\d[\d_]*)?[jJ]?')
    match = number.match(text[end:textEnd])
    value = match[0] if match else str(node.n)
    # NOTE(review): appended as a bare string, unlike the {"syntok": ...}
    # dicts other visitors emit -- confirm what consumers expect.
    me['content'].append(value)
    end += len(value)
    me['end'] = posFromText(text, end)
    print("MADE:", ast.dump(node, True, False), "\n",me,"\n")
    return end, me

In [401]:
'''
-- the following expression can appear in assignment context
'''
'''
 | Attribute(expr value, identifier attr, expr_context ctx)
'''
def visitAttribute(node, text, textStart, textEnd):
    """Describe an attribute access ``value.attr``.

    The start is the value's start; the end is the value's end column plus
    the length of ``.attr`` on the start line.

    Returns:
        ``(end, me)`` with the offset reached and the node description.
    """
    end, value = visit(node.value, text, textStart, textEnd, None)
    attr = str(node.attr)
    suffix_len = 1 + len(attr)  # the dot plus the attribute name
    myStart = value['start']
    me = {
        'type': type(node).__name__,
        'start': myStart,
        'end': {'line': myStart['line'], 'ch': value['end']['ch'] + suffix_len},
        'content': [value, {'syntok': '.'}, {'syntok': attr}],
    }
    end += suffix_len
    print("MADE:", end, ast.dump(node, True, False), "\n",me,"\n")
    return end, me

'''
 | Subscript(expr value, slice slice, expr_context ctx)
'''
def visitSubscript(node, text, textStart, textEnd):
    """Describe a subscript ``value[slice]``.

    The subscripted value is visited first, then the slice; the node's end
    is taken from the slice description.

    Returns:
        ``(end, me)`` with the offset reached and the node description.
    """
    myStart = posFromText(text, textStart)
    end, value = visit(node.value, text, textStart, textEnd, None)
    end, slicey = visit(node.slice, text, end, textEnd, None)
    me = {
        'type': type(node).__name__,
        'start': myStart,
        'end': slicey['end'],
        'content': [value, slicey],
    }
    print("MADE:", ast.dump(node, True, False), "\n",me,"\n")
    return end, me

'''
 | Starred(expr value, expr_context ctx)
'''
def visitStarred(node, text, textStart, textEnd):
    """Describe a starred expression ``*value``.

    Returns:
        ``(end, me)`` with the offset reached and the node description.
    """
    myStart = posFromText(text, textStart)
    # skip the single "*" character, then visit the wrapped value
    end, value = visit(node.value, text, textStart + 1, textEnd, None)
    me = {
        'type': type(node).__name__,
        'start': myStart,
        'end': posFromText(text, end),
        'content': [{"syntok": "*"}, value],
    }
    print("MADE:", ast.dump(node, True, False), "\n",me,"\n")
    return end, me
    
'''
 | List(expr* elts, expr_context ctx)
'''
def visitList(node, text, textStart, textEnd):
    """Describe a list display ``[elt, elt, ...]``.

    Returns:
        ``(end, me)`` with the offset reached and the node description.
    """
    myType = type(node).__name__
    myContent = []
    end = textStart
    myStart = posFromText(text, end)
    myContent.append({"syntok": "["})
    end += 1
    end, spaces = getSpacing(text, end, textEnd)
    myContent += spaces
    # raw string: "\s" is an invalid escape sequence in a plain literal
    commas_spaces = re.compile(r'\s*,*\s*')
    for elem in node.elts:
        end, value = visit(elem, text, end, textEnd, None)
        myContent.append(value)
        #get spaces and commas only
        match = commas_spaces.search(text[end:textEnd])
        for char in match[0]: myContent.append({"syntok": char})
        end += len(match[0])

    myContent.append({"syntok": "]"})
    end += 1
    myEnd = posFromText(text, end)
    me = {'type': myType, 'start': myStart, 'end': myEnd, 'content': myContent}
    print("MADE:", ast.dump(node, True, False), "\n",me,"\n")
    return end, me

'''
 | Tuple(expr* elts, expr_context ctx)
'''
def visitTuple(node, text, textStart, textEnd):
    """Describe a tuple, with or without surrounding parentheses.

    Returns:
        ``(end, me)`` with the offset reached and the node description.
    """
    myType = type(node).__name__
    myContent = []
    end = textStart
    myStart = posFromText(text, end)
    # text[end:end+1] is "" at the end of the text, where text[end] would raise
    if text[end:end + 1] == "(":
        myContent.append({"syntok": "("})
        end += 1
    end, spaces = getSpacing(text, end, textEnd)
    myContent += spaces
    # raw string: "\s" is an invalid escape sequence in a plain literal
    commas_spaces = re.compile(r'\s*,*\s*')
    for elem in node.elts:
        end, value = visit(elem, text, end, textEnd, None)
        myContent.append(value)
        #get spaces and commas only
        match = commas_spaces.search(text[end:textEnd])
        for char in match[0]: myContent.append({"syntok": char})
        end += len(match[0])

    # an unparenthesized tuple may be the last thing in the text
    if text[end:end + 1] == ")":
        myContent.append({"syntok": ")"})
        end += 1
    myEnd = posFromText(text, end)
    me = {'type': myType, 'start': myStart, 'end': myEnd, 'content': myContent}
    print("MADE:", ast.dump(node, True, False), "\n",me,"\n")
    return end, me


In [426]:
'''
slice = Slice(expr? lower, expr? upper, expr? step)
          | ExtSlice(slice* dims)
          | Index(expr value)
'''
def visitIndex(node, text, textStart, textEnd):
    """Describe an ``Index`` slice: ``[``, the index expression, ``]``.

    The start is placed one column before the index expression; the end is
    the position reached after the closing bracket.

    Returns:
        ``(end, me)`` with the offset reached and the node description.
    """
    myType = type(node).__name__
    myContent = []
    myContent.append({'syntok': '['})
    end = textStart + 1
    end, symbols = getPunctuationBetween(text,end)
    myContent += symbols
    end, value = visit(node.value, text, end, textEnd, None)
    myContent.append(value)
    end, symbols = getPunctuationBetween(text,end, ']')
    myContent += symbols
    myContent.append({'syntok': ']'})
    end += 1

    myStart = {'line': value['start']['line'], 'ch': value['start']['ch'] - 1}
    # Derive the end from the cursor, as the other visitors do; start+1 is
    # only the first column of the index expression, not the end of "[...]".
    myEnd = posFromText(text, end)
    me = {'type': myType, 'start': myStart, 'end': myEnd, 'content': myContent}
    print("MADE:", me,"\n")
    return end, me

In [496]:
'''
unaryop = Invert | Not | UAdd | USub
'''
def visitUnary(node, text, textStart, textEnd):
    """Describe a unary operator node (Invert, Not, UAdd, USub).

    Returns:
        ``(end, me)`` where ``end`` is ``textStart`` advanced by the length
        of the operator token.

    Raises:
        KeyError: when the node is not one of the four unary operators.
    """
    # Python spells logical negation "not"; "!" is not a Python operator.
    tokens = {"Invert": '~', "Not": 'not', "UAdd": "+", "USub": "-"}
    myType = type(node).__name__
    token = tokens[myType]
    myStart = posFromText(text, textStart)
    myContent = [{"syntok": token}]
    # advance by the token's real length ("not" is three characters)
    end = textStart + len(token)
    myEnd = posFromText(text, end)
    me = {'type': myType, 'start': myStart, 'end': myEnd, 'content': myContent}
    print("MADE:",me,"\n")
    return end, me

In [489]:
'''
excepthandler = ExceptHandler(expr? type, identifier? name, stmt* body)
                    attributes (int lineno, int col_offset)
'''
def visitExceptHandler(node, text, textStart, textEnd):
    """Describe an ``except [type [as name]]: body`` handler.

    Returns:
        ``(end, me)`` with the offset reached and the node description.
    """
    me = stmtOrExpr(node)
    end = textStart
    me["content"].append({"syntok": "except"})
    end += len("except")
    end, spaces = getSpacing(text, end, textEnd)
    me["content"] += spaces
    if node.type:
        # Stop punctuation capture at ":" so the colon (and the newline
        # after it) is left for the explicit handling below.
        end, stuff = captureStuff(text, end, node.type, ":")
        me['content'] += stuff
    if node.name:
        me["content"].append({"syntok": "as"})
        end += len("as")
        end, spaces = getSpacing(text, end, textEnd)
        me["content"] += spaces
        name = str(node.name)
        me["content"].append({"syntok": name})
        end += len(name)
    end, spaces = getSpacing(text, end, textEnd)
    me["content"] += spaces
    me["content"].append({"syntok": ":"})
    end += 1
    #get any new line elements
    end, symbols = getPunctuationBetween(text, end, "", True)
    me["content"] += symbols
    for stmt in node.body:
        end, stuff = captureStuff(text, end, stmt, "", True)
        me['content'] += stuff
    me['end'] = posFromText(text, end)
    print("MADE:", ast.dump(node, True, False), "\n",me,"\n")
    return end, me
    

In [490]:
'''
 -- keyword arguments supplied to call (NULL identifier for **kwargs)
    keyword = (identifier? arg, expr value)
'''
def visitKeyword(node, text, textStart, textEnd):
    """Describe a call keyword: ``arg=value`` or ``**value``.

    Returns:
        ``(end, me)`` with the offset reached and the keyword description.
    """
    myType = type(node).__name__
    myContent = []
    myStart = posFromText(text, textStart)
    end = textStart
    if(node.arg):
        ar = str(node.arg)
        myContent.append({'syntok':ar})
        end += len(ar)
        end, spaces = getSpacing(text, end, textEnd)
        myContent += spaces
        myContent.append({'syntok':"="})
        end += 1
    elif text.startswith("**", end):
        # **kwargs has no arg name; consume the two stars when they have
        # not already been captured as punctuation by the caller.
        myContent.append({'syntok': "*"})
        myContent.append({'syntok': "*"})
        end += 2
    end, spaces = getSpacing(text, end, textEnd)
    myContent += spaces
    end, value = visit(node.value, text, end, textEnd, None)
    myContent.append(value)
    myEnd = posFromText(text, end)
    me = {'type': myType, 'start': myStart, 'end': myEnd, 'content': myContent}
    print("MADE:", me,"\n")
    return end, me
    

In [498]:
def visit(node, text, textStart, textEnd, nextNode):
    """Dispatch ``node`` to its visitor and return ``(end, description)``.

    Node types with a dedicated visitor are handled by it. Any other node
    with no children apart from ``Store``/``Load`` markers is treated as a
    literal; everything else is passed to ``genericVisit``.

    Parameters:
        node: AST node to describe.
        text: full source text.
        textStart: offset in ``text`` where the node begins.
        textEnd: offset bounding the node's text.
        nextNode: forwarded to ``genericVisit`` only.
    """
    # 1. first, figure out if we're dealing with a literal or parent
    children = list(ast.iter_child_nodes(node))
    myType = type(node).__name__
    print("\n",type(node).__name__, children, textStart, text[textStart:textStart+3])

    visitors = {"Module": visitModule,
                "Interactive": visitModule,
                "Expression": visitModule,
                "Suite": visitModule,
                "Assign": visitAssign,
                "For": visitFor,
                "Try": visitTry,
                "Import": visitImport,
                "ImportFrom": visitImportFrom,
                "Expr": visitExpr,
                "BinOp": visitBinOp,
                "UnaryOp": visitUnaryOp,
                "Compare": visitCompare,
                "Call": visitCall,
                "Num": visitNum,
                "Attribute": visitAttribute,
                "Subscript": visitSubscript,
                "Starred": visitStarred,
                "List": visitList,
                "Tuple": visitTuple,
                "Index": visitIndex,
                "Invert": visitUnary,
                # "+x" / "-x": the literal fallback looks for a word and
                # would pick up the operand instead of the sign.
                # "Not" is not routed here: the fallback's word pattern
                # already captures the keyword `not` itself.
                "UAdd": visitUnary,
                "USub": visitUnary,
                "ExceptHandler": visitExceptHandler,
                "keyword": visitKeyword,
                "alias": visitAlias}

    if myType in visitors:
        return visitors[myType](node, text, textStart, textEnd)

    # necessary to filter children using findNextChild, since there are some
    # metalabels like store or load we don't care about here
    child, child_itr = findNextChild(children, -1)

    if not child: # LITERAL
        return visitLiteral(node, text, textStart)
    else:
        print("NO VISITOR FOR "+myType, ast.dump(node, True, False))
        return genericVisit(node, text, textStart, textEnd, nextNode)
    
    
def visitLiteral(node, text, start):
    """Describe a leaf node (name, string, operator, ...) found at ``start``.

    Symbolic operators (``+``, ``>=``, ``&`` ...) are recognised from the
    node type. Strings are matched up to their closing quote, honouring
    backslash escapes and triple quotes; anything else is matched as a run
    of word characters.

    Returns:
        ``(end, me)`` where ``me`` carries the matched text under
        ``'literal'``.

    Raises:
        ValueError: when no literal can be found from ``start`` onwards.
    """
    myType = type(node).__name__
    if hasattr(node, 'lineno'):
        myStart = {'line': node.lineno, 'ch': node.col_offset}
    else:
        myStart = posFromText(text, start)

    # Operators made of punctuation can never match the \w+ fallback.
    symbolOps = {
        "Add": "+", "Sub": "-", "Mult": "*", "MatMult": "@", "Div": "/",
        "Mod": "%", "Pow": "**", "LShift": "<<", "RShift": ">>",
        "BitOr": "|", "BitXor": "^", "BitAnd": "&", "FloorDiv": "//",
        "Eq": "==", "NotEq": "!=", "Lt": "<", "LtE": "<=", "Gt": ">",
        "GtE": ">=", "Invert": "~", "UAdd": "+", "USub": "-",
    }
    # Two-word operators: a single \w+ would capture only the first word.
    wordOps = {"IsNot": r'is\s+not', "NotIn": r'not\s+in'}

    symbol = symbolOps.get(myType)
    if symbol is not None and text.startswith(symbol, start):
        myLiteral = symbol
    else:
        token = text[start:start + 1]
        if myType in wordOps:
            pattern = re.compile(wordOps[myType])
        elif text.startswith('"""', start):
            pattern = re.compile(r'"""(?:\\.|[^\\])*?"""', re.DOTALL)
        elif text.startswith("'''", start):
            pattern = re.compile(r"'''(?:\\.|[^\\])*?'''", re.DOTALL)
        elif(token == '"'):
            pattern = re.compile(r'"(?:\\.|[^"\\\n])*"')
        elif(token == "'"):
            pattern = re.compile(r"'(?:\\.|[^'\\\n])*'")
        else:
            pattern = re.compile(r'\w+')
        match = pattern.search(text[start:])
        if match is None:
            raise ValueError(
                "no literal found for %s at offset %d" % (myType, start))
        myLiteral = match[0]

    end = start + len(myLiteral)
    myEnd = posFromText(text, end)
    me = {'type': myType, 'start': myStart, 'end': myEnd, 'literal': myLiteral}
    print("MADE:", ast.dump(node, True, False), "\n",me,"\n")
    return (end, me)

In [499]:
def getPunctuationBetween(text, textStart, stopChar = "", allowNewline = False):
    """Collect the punctuation tokens that start at ``textStart``.

    Scanning stops at the first character that is not in ``punctuation``
    (nor in ``newline`` when ``allowNewline`` is set), at ``stopChar``, or
    at the end of the text. A ``#`` starts a comment, which is captured as
    a single token.

    Returns:
        ``(end, content)`` with the offset reached and the list of
        ``{'syntok': ...}`` tokens. ``end`` is ``len(text)`` when
        ``textStart`` is already at or past the end.
    """
    textEnd = len(text)
    if(textStart >= textEnd): return textEnd, []
    extra = newline if allowNewline else []
    content = []
    i = textStart
    while i < textEnd:
        char = text[i]
        if char == stopChar or not (char in punctuation or char in extra):
            break
        if char == "#":
            new_i, comment = captureComment(text, i, textEnd)
            content.append({'syntok': str(comment)})
            # Always move forward, even if the comment came back empty,
            # so the scan cannot loop forever on the same "#".
            i = max(new_i, i + 1)
        else:
            content.append({"syntok": str(char)})
            i += 1

    return i, content


In [500]:

def getSpacing(text, textStart, textEnd):
    """Collect the run of whitespace that starts at ``textStart``.

    Returns:
        ``(end, content)`` where ``end`` is the offset just past the
        whitespace and ``content`` holds one ``{'syntok': char}`` entry per
        whitespace character found in ``text[textStart:textEnd]``.
    """
    # raw string: "\s" is an invalid escape sequence in a plain literal
    spacing = re.compile(r'\s*')
    run = spacing.search(text[textStart:textEnd])[0]
    content = [{"syntok": char} for char in run]
    return textStart + len(run), content
    

In [501]:
def parse(text):
    """Parse ``text`` and print both the raw AST dump and the token mapping.

    The second print shows the ``(end, description)`` pair that ``visit``
    returns for the whole module.
    """
    tree = ast.parse(text)
    print(ast.dump(tree, True, True))
    mapped = visit(tree, text, 0, len(text), None)
    print("\n", mapped)

In [502]:
# Sample source that the mapper is exercised on.
text = """# For each feature find the data points with extreme high or low values
for feature in log_data.keys():
    display(log_data[~((log_data[feature] >= Q1 - step) & (log_data[feature] <= Q3 + step))])
    
# OPTIONAL: Select the indices for data points you wish to remove
outliers  = []

# Remove the outliers, if any were specified
good_data = log_data.drop(log_data.index[outliers]).reset_index(drop = True)"""


# Character classes referenced by the scanning helpers.
sqParens = {"[", "]"}
parens = {"(", ")"}
brackets = {"{", "}"}
spaces = {"\t", " "}
newline = {"\n"} #todo may vary across platforms
# spaces and tabs count as punctuation between nodes
punctuation = set(string.punctuation) | {" ", "\t"}


# to hurry up, reduce ast at this stage?
# match parens [] {} () otherwise those can end up in weird places

parse(text)
#print(json.dumps(main(l, tree),  indent=2))

Module(body=[For(target=Name(id='feature', ctx=Store(), lineno=2, col_offset=4), iter=Call(func=Attribute(value=Name(id='log_data', ctx=Load(), lineno=2, col_offset=15), attr='keys', ctx=Load(), lineno=2, col_offset=15), args=[], keywords=[], lineno=2, col_offset=15), body=[Expr(value=Call(func=Name(id='display', ctx=Load(), lineno=3, col_offset=4), args=[Subscript(value=Name(id='log_data', ctx=Load(), lineno=3, col_offset=12), slice=Index(value=UnaryOp(op=Invert(), operand=BinOp(left=Compare(left=Subscript(value=Name(id='log_data', ctx=Load(), lineno=3, col_offset=24), slice=Index(value=Name(id='feature', ctx=Load(), lineno=3, col_offset=33)), ctx=Load(), lineno=3, col_offset=24), ops=[GtE()], comparators=[BinOp(left=Name(id='Q1', ctx=Load(), lineno=3, col_offset=45), op=Sub(), right=Name(id='step', ctx=Load(), lineno=3, col_offset=50), lineno=3, col_offset=45)], lineno=3, col_offset=24), op=BitAnd(), right=Compare(left=Subscript(value=Name(id='log_data', ctx=Load(), lineno=3, col_off