In [2]:
# coding: utf-8
import sys
import ast
from ast import AST
import tokenize
import token
from numbers import Number
import json
import io

In [9]:
def tokenASTCombine(tok, astNode):
    tok.pop(0)
    nodeyList, looseTokenList, tk = zipTokAST(tok, astNode, []) #skip start marker token
    while(len(tk) > 1): #skip the end maker
        if(tk[0].type != token.NEWLINE):
            nodeyList.append(formatToken(tk[0]))
        tk.pop(0)
    return json.dumps(nodeyList)


def formatToken(tk):
    return {'type': token.tok_name[tk.type], 'start': {'line': tk.start[0], 'ch': tk.start[1]}, 'end': {'line': tk.end[0], 'ch': tk.end[1]}, 'literal': tk.string}


def zipTokAST(tk, node, nodeyList):
    if(len(tk) < 1 or tk[0].type == token.ENDMARKER):
        return nodeyList, [], [] #Stop condition: end of tokens
    elif(node == None):
        return nodeyList, [], [] #Stop condition: end of nodes
        
    marker = {'type': type(node).__name__}
        
    looseTokenList = []
    if(hasattr(node, 'lineno')): #meaning it's a node that appears in the text
        line = node.lineno
        ch = node.col_offset
        # check for nodes that don't belong to anyone
        while(len(tk) > 0 and (tk[0].start[0] < line or (tk[0].start[0] == line and tk[0].start[1] < ch))): #actually starts before this node
            looseTokenList.append(formatToken(tk[0]))
            if(tk[0].type == token.NEWLINE): #stop a newline means this actually goes with the prior node
                if(len(nodeyList) > 0):
                    nodeyList[len(nodeyList) - 1]['content'] += looseTokenList
                    looseTokenList = []
            tk.pop(0)
        print("node is", type(node).__name__, line, ch, "\n found loose nodes:", looseTokenList, "\n\n")

    childrenZip = []
    for child in ast.iter_child_nodes(node): #depth fist
        cz, looseTk, tk = zipTokAST(tk, child, childrenZip)
        if(len(looseTk) > 0):
            childrenZip += looseTk
        if(len(cz) > 0):
            childrenZip += cz

    if(len(childrenZip) > 0):
        marker['content'] = childrenZip
        marker['start'] = childrenZip[0]['start']
        marker['end'] = childrenZip[len(childrenZip) - 1]['end']
        list.sort(childrenZip, key = lambda x : x['start']['line']*10000+x['start']['ch'] )


    if(hasattr(node, 'lineno')): #meaning it's a node that appears in the text
        if(len(tk) > 0 and tk[0].start[0] == line and tk[0].start[1] == ch):
            if(tk[0].type != token.NEWLINE):
                formatted = formatToken(tk[0])
                marker['start'] = formatted['start']
                marker['end'] = formatted['end']
                marker['literal'] = formatted['literal']
            tk.pop(0)

    if('literal' in marker or len(childrenZip) > 0):
        return [marker], looseTokenList, tk

    return [], looseTokenList, tk

In [17]:
def zipTokASTOLD(tk, node, nodeyList, priorNode):
    if(len(tk) < 1 or tk[0].type == token.ENDMARKER):
        return nodeyList, [], [] #Stop condition: end of tokens
    elif(node == None):
        return nodeyList, [], [] #Stop condition: end of nodes

    
    if(hasattr(node, 'lineno')): #meaning it's a node that appears in the text
        line = node.lineno
        ch = node.col_offset
        print("node is", type(node).__name__, line, ch)
        
        
    childrenZip = []
    looseTokenList = []
    priorChild = None
    for child in ast.iter_child_nodes(node): #depth fist
        cz, looseTk, tk = zipTokAST(tk, child, [], priorChild)
        if(len(looseTk) > 0):
            childrenZip += looseTk
        
        if(len(cz) > 0):
            childrenZip += cz
            priorChild = childrenZip[len(childrenZip) - 1]


    list.sort(childrenZip, key = lambda x : x['start']['line']*10000+x['start']['ch'] )

    marker = {'type': type(node).__name__, 'content': childrenZip}
    if(len(childrenZip) > 0):
        marker['start'] = childrenZip[0]['start']
        marker['end'] = childrenZip[len(childrenZip) - 1]['end']

    if(hasattr(node, 'lineno')): #meaning it's a node that appears in the text
        line = node.lineno
        ch = node.col_offset
        #print("node is", type(node).__name__, line, ch, tk[0])
        # check for nodes that don't belong to anyone
        while(len(tk) > 0 and (tk[0].start[0] < line or (tk[0].start[0] == line and tk[0].start[1] < ch))): #actually starts before this node
            if(tk[0].type != token.NEWLINE):
                looseTokenList.append(formatToken(tk[0]))
            tk.pop(0)
        if(len(tk) > 0 and tk[0].start[0] == line and tk[0].start[1] == ch):
            if(tk[0].type != token.NEWLINE):
                formatted = formatToken(tk[0])
                marker['start'] = formatted['start']
                marker['end'] = formatted['end']
                marker['literal'] = formatted['literal']
            tk.pop(0)

    if('literal' in marker or len(childrenZip) > 0):
        nodeyList.append(marker)
    return nodeyList, looseTokenList, tk

In [11]:
def parseCode(code):
    #print("got code", str)
    tree = ast.parse(code)
    bytes = io.BytesIO(code.encode('utf-8'))
    g = tokenize.tokenize(bytes.readline)
    tk = list(g)
    return tokenASTCombine(tk, tree)

In [12]:
text = """
def zipTokAST(tk, node, nodeyList):
    if(len(tk) < 1 or tk[0].type == token.ENDMARKER):
        return nodeyList, [], [] #end of tokens
    elif(node == None):
        return nodeyList, [], [] #end of nodes"""

In [13]:
jsn = parseCode(text)
parsed = json.loads(jsn)
print(json.dumps(parsed, indent=2, sort_keys=False))

node is FunctionDef 2 0 
 found loose nodes: [{'type': 'NL', 'start': {'line': 1, 'ch': 0}, 'end': {'line': 1, 'ch': 1}, 'literal': '\n'}] 


node is arg 2 14 
 found loose nodes: [{'type': 'NAME', 'start': {'line': 2, 'ch': 0}, 'end': {'line': 2, 'ch': 3}, 'literal': 'def'}, {'type': 'NAME', 'start': {'line': 2, 'ch': 4}, 'end': {'line': 2, 'ch': 13}, 'literal': 'zipTokAST'}, {'type': 'OP', 'start': {'line': 2, 'ch': 13}, 'end': {'line': 2, 'ch': 14}, 'literal': '('}] 


node is arg 2 18 
 found loose nodes: [{'type': 'OP', 'start': {'line': 2, 'ch': 16}, 'end': {'line': 2, 'ch': 17}, 'literal': ','}] 


node is arg 2 24 
 found loose nodes: [{'type': 'OP', 'start': {'line': 2, 'ch': 22}, 'end': {'line': 2, 'ch': 23}, 'literal': ','}] 


node is If 3 4 
 found loose nodes: [{'type': 'INDENT', 'start': {'line': 3, 'ch': 0}, 'end': {'line': 3, 'ch': 4}, 'literal': '    '}] 


node is BoolOp 3 7 
 found loose nodes: [{'type': 'NAME', 'start': {'line': 3, 'ch': 4}, 'end': {'line': 3, 'ch'

In [None]:
'''if(hasattr(node, 'lineno')): #meaning it's a node that appears in the text
        ln = node.lineno
        ch = node.col_offset
        start = {'line': ln, 'ch': ch}
        while(tokens[0].start[0] < ln or (tokens[0].start[1] <= ch and tokens[0].start[0] == ln)):
            tk = tokens.pop(0)
            formatted = formatToken(tk)
            end = formatted['end']
            literal = literal + " " + formatted['literal'] if literal else formatted['literal']
    '''

In [14]:
def formatToken(tk):
    return {'type': token.tok_name[tk.type], 'start': {'line': tk.start[0], 'ch': tk.start[1]}, 'end': {'line': tk.end[0], 'ch': tk.end[1]}, 'literal': tk.string}

def formatTokenList(tk_list):
    return [formatToken(tk) for tk in tk_list]
        

def nextLine(tokens):
    while(len(tokens[0]) == 0):
        tokens.pop(0)
    return tokens

def nextChild(children):
    return children.pop(0) if len(children) > 0 else None

def zipTokensAST(tokens, node, lineOffset):
    if len(tokens) < 1 or node == None :
        return #Stop condition
    
    content = []
    
    children = list(ast.iter_child_nodes(node))
    stackOLines = []
    child1 = nextChild(children)
    child2 = nextChild(children)
    print("child #1", child1, "child #2", child2, lineOffset, "\n")
    for lineno, line in enumerate(tokens):
        stackOLines.append(line)
        if(child2 and child2.lineno == lineno + lineOffset): #start of child 2
            #first, give all the lines collected so far to child1. child2 starts with what remains
            remainder, con = consumeLines(stackOLines, child1, lineOffset)
            content += content + con if con else content
            child1 = child2
            child2 = nextChild(children)
            stackOLines = [remainder]
            lineOffset = lineno + lineOffset
            
    remainder, con = consumeLines(stackOLines, child1, lineOffset) if child1 else (stackOLines, [])
    #print(type(node).__name__, " Remainder", remainder, "\n\n")
    content = content + con if con else content
    #content += formatTokenList(remainder) 
    
    ln = None
    ch = None
    if(hasattr(node, 'lineno')): #meaning it's a node that appears in the text
        ln = node.lineno 
        ch = node.col_offset
    if(len(content) > 0):
        return {'type': type(node).__name__, 'start': {'line': ln, 'ch': ch}, 'end': content[-1].end, 'content': content}, tokens
    return None, tokens    
    

def consumeLines(tokenLines, node, lineOffset):
    #first, try to eat as much as you can if you have your own literal
    print("Consume: ", type(node).__name__,  "\n", lineOffset, tokenLines)
    literal = None
    start = None
    if(hasattr(node, 'lineno')): #meaning it's a node that appears in the text
        ln = node.lineno
        ch = node.col_offset
        print(" ", ln, ch, "\n")
        # pull the first line to try to eat
        line = tokenLines[0]
        while(line[0].start[0] + lineOffset < ln or (line[0].start[1] == ch and line[0].start[0] + lineOffset == ln)):
            tk = line.pop(0)
            formatted = formatToken(tk)
            start = start if start else formatted['start']
            end = formatted['end']
            literal = literal + " " + formatted['literal'] if literal else formatted['literal']
            if(len(line) < 1):
                tokenLines.pop(0)
                lineOffset += 1
                line = tokenLines[0]
        tokenLines[0] = line
    
    print("Ate: ", literal, lineOffset)
    nodey, tokenLines = zipTokensAST(tokenLines, node, lineOffset)
    if(nodey):
        nodey['literal'] = literal
    return tokenLines, nodey
        

In [106]:
def startsWith(start, token):
    return start['line'] < token.start[0] or (start['line'] == token.start[0] and start['ch'] <= token.start[1])

def getStart(node):
    if(hasattr(node, 'lineno')):
        return {'line' : node.lineno, 'ch': node.col_offset }
    else: # try to get to the first subnode that has a lineno
        child = next(ast.iter_child_nodes(node), None)
        return getStart(child)


def formatToken(tk):
    return {'type': token.tok_name[tk.type], 'start': {'line': tk.start[0], 'ch': tk.start[1]}, 'end': {'line': tk.end[0], 'ch': tk.end[1]}, 'literal': tk.string}

def getNewBounty(bounty, token):
    target = bounty[-1] if len(bounty) > 0 else None
    
    if(token.string == '['): bounty.append('[')
    if(token.string == ']'):
        if(target == '['): bounty.pop()
        else: raise ValueError('Unmatched target [ '+str(bounty))
            
    if(token.string == '('):
        bounty.append(token.string)
    if(token.string == ')'):
        if(target == '('): bounty.pop()
        else: raise ValueError('Unmatched target ( '+str(bounty))
            
    if(token.string == '{'):
        bounty.append(token.string)
    if(token.string == '}'):
        if(target == '{'): bounty.pop()
        else: raise ValueError('Unmatched target { '+str(bounty))
    
    return bounty
    
    


def processTokenList(tk_list):
    bounty = []
    formatted = []
    for tk in tk_list:
        bounty = getNewBounty(bounty, tk)
        formatted.append(formatToken(tk))
    return bounty, formatted


def consumeTokens(tokens, node, bounty):
    
    print("foelds?", node._fields)
    if len(node._fields) > 0 : # meaning has children
        # do stuff
        print("TODO")
    else :
        # We're at the base case! Should be a literal.
        node = None
        if(hasattr(node, 'lineno')): #meaning it's a node that appears in the text
            end = tokens[-1]['end']
            start = tokens[0]['start']
            content = []
            
            # first, if there are any tokens before the literal, if these fulfill or create bounty, make them siblings, if not, they belong as content to node
            while(tokens[0].start[0] < node.lineno or (tokens[0].start[1] < node.col_offset and tokens[0].start[0] == node.lineno)):
                print("TODO!")
            
            
            newBounty = [] # new bounty generated on this level. Must be solved here since this node has no children
            if(node.col_offset and tokens[0].start[0] == node.lineno):
                tk = tokens.pop(0)
                newBounty, formatted = processTokenList([tk])
                end = formatted['end']
                content.append(formatted)
            afterContent, tokens = solveBounty(newBounty, tokens)
            nodey = {'type': type(node).__name__, 'start': start, 'end': end, 'content': content}
                
        bounty, middleTokens, rightTokens = processTokenList(tokens)    
        return node, bounty, middleTokens, rightTokens
        
    
    
    
def processTokens_before(node, tokenList):
    chunkList = []
    before = []
    child = next(ast.iter_child_nodes(node), None)
    if child:
        start = getStart(child)
        print("start of child", child, start, "")
        for chunk in tokenList:
            if(startsWith(start, chunk)): #start of child
                chunkList.append(chunk)
            else:
                before.append(chunk)
        newBounty, before_formatted = processTokenList(before)
        return newBounty, before_formatted, chunkList
    else:
        return [], [], tokenList
    
 

def processTokens_middle(node, tokenList, bounty):
    children = ast.iter_child_nodes(node)
    child1 = next(children, None)
    content = []
    
    if child1:
        chunkList = []
        child2 = next(children, None)
        if child2:
            child1_start = getStart(child1)
            child2_start = getStart(child2)
            print("child #1", child1, child1_start, "child #2", child2, child2_start, "\n")
            for chunk in tokenList: 
                if(child2 and startsWith(child2_start, chunk)): #start of child 2
                    #first, give all the tokens collected so far to child1. child2 starts with what remains
                    before_tokens, child_nodey, after_tokens = zipTokensAST(chunkList, child1, bounty)
                    content += before_tokens
                    if child_nodey: content.append(child_nodey)
                    content += after_tokens
                    chunkList = []
                    child1_start = child2_start
                    child1 = child2
                    child2 = next(children, None)
                    child2_start = getStart(child2) if child2 else None
                    print("child #1 is now:", child1, child1_start, "child #2", child2, child2_start, "\n")
                chunkList.append(chunk)
        else: 
            chunkList = tokenList
     
        before_tokens, child_nodey, after_tokens = zipTokensAST(chunkList, child1, bounty)
        content = content + before_tokens
        if child_nodey: content.append(child_nodey)
        chunkList = after_tokens
        return bounty, content, chunkList
    else:
        # no children, but eat what can
        start = getStart(node)
        print("my start", start)
        content = []
        if(tokenList[0].start[0] == start['line'] and tokenList[0].start[1] == start['ch']):
            content.append(formatToken(tokenList.pop(0)))
            end = content[-1]
        return bounty, content, tokenList



def zipTokensAST(tokens, node, parentBounty = []):
    print ("\nTreating", type(node).__name__ )
    '''if len(tokens) < 1 or node == None :
        if len(parentBounty) > 0 : raise ValueError('Unmatched bounty '+str(bounty))
        return [], None, [] #Stop condition'''
    
    bounty = []
    
    bounty, before_tokens, tokens = processTokens_before(node, tokens)
    print("got before_tokens", before_tokens, "\nbounty:", bounty, tokens)
    
    bounty, content, remainder = processTokens_middle(node, tokens, bounty)
    #print("got content ", content, "and bounty", bounty, remainder)
    nodey = {'type': type(node).__name__, 'start': content[0]['start'], 'end': content[-1]['end'], 'content': content}
    print("\nnode:", nodey, "\nafter:", remainder, "\n\n\n")
    return before_tokens, nodey, remainder

In [107]:
text = """
def zipTokAST(tk, node, nodeyList):
    if(len(tk) < 1 or tk[0].type == token.ENDMARKER):
        return nodeyList, [], [] #end of tokens
    elif(node == None):
        return nodeyList, [], [] #end of nodes"""

tree = ast.parse(text)
split = text.split('\n')
bytes = io.BytesIO(text.encode())
g = tokenize.tokenize(bytes.readline)
l = list(g)
l.pop(0) #get rid of encoding stuff
print(zipTokensAST(l, tree))


Treating Module
start of child <_ast.FunctionDef object at 0x10d952da0> {'line': 2, 'ch': 0} 
got before_tokens [{'type': 'NL', 'start': {'line': 1, 'ch': 0}, 'end': {'line': 1, 'ch': 1}, 'literal': '\n'}] 
bounty: [] [TokenInfo(type=1 (NAME), string='def', start=(2, 0), end=(2, 3), line='def zipTokAST(tk, node, nodeyList):\n'), TokenInfo(type=1 (NAME), string='zipTokAST', start=(2, 4), end=(2, 13), line='def zipTokAST(tk, node, nodeyList):\n'), TokenInfo(type=53 (OP), string='(', start=(2, 13), end=(2, 14), line='def zipTokAST(tk, node, nodeyList):\n'), TokenInfo(type=1 (NAME), string='tk', start=(2, 14), end=(2, 16), line='def zipTokAST(tk, node, nodeyList):\n'), TokenInfo(type=53 (OP), string=',', start=(2, 16), end=(2, 17), line='def zipTokAST(tk, node, nodeyList):\n'), TokenInfo(type=1 (NAME), string='node', start=(2, 18), end=(2, 22), line='def zipTokAST(tk, node, nodeyList):\n'), TokenInfo(type=53 (OP), string=',', start=(2, 22), end=(2, 23), line='def zipTokAST(tk, node, nodey

AttributeError: 'NoneType' object has no attribute '_fields'