In [106]:
# coding: utf-8
import sys
import ast
from ast import AST
import tokenize
import token
from numbers import Number
import json
import io

In [6]:
def startsWith(start, tk):
    """Tell whether token `tk` begins at or after the position `start`.

    `start` is a dict with 'line' and 'ch' keys; a falsy `start` (None or an
    empty dict) always yields False. `tk` must expose a `start` sequence
    whose first two items are the token's line and column.
    """
    if not start:
        return False
    anchor = (start['line'], start['ch'])
    token_pos = (tk.start[0], tk.start[1])
    # Tuples compare lexicographically: line first, then column.
    return anchor <= token_pos

def getStart(node):
    """Return the source position of `node` as {'line': ..., 'ch': ...}.

    When `node` has no `lineno` attribute, the search descends through the
    first child at each level until a node with a `lineno` is found.
    Returns None when that chain runs out of children.
    """
    current = node
    while True:
        if hasattr(current, 'lineno'):
            return {'line' : current.lineno, 'ch': current.col_offset }
        # Only the first child is ever followed, never its siblings.
        current = next(ast.iter_child_nodes(current), None)
        if not current:
            return None



def formatToken(tk):
    """Convert a tokenize-style token into a plain dict.

    The result has the keys 'type' (the name looked up in `token.tok_name`),
    'start' and 'end' (each a {'line', 'ch'} dict) and 'literal' (the
    token's text).
    """
    type_name = token.tok_name[tk.type]
    begin = {'line': tk.start[0], 'ch': tk.start[1]}
    finish = {'line': tk.end[0], 'ch': tk.end[1]}
    return {
        'type': type_name,
        'start': begin,
        'end': finish,
        'literal': tk.string,
    }

def formatTokenList(tk_list):
    """Return a new list holding `formatToken(tk)` for every token in `tk_list`, in order."""
    return [formatToken(tk) for tk in tk_list]

def getNewBounty(bounty, tk):
    """Update the stack of pending opening brackets with token `tk`.

    `bounty` is a list used as a stack of bracket literals ('[', '(' or '{')
    that are still waiting for their closing counterpart. It is modified in
    place and also returned.

    * An opening bracket is always pushed, even onto an empty stack.
    * A closing bracket pops the innermost pending opener when it matches.
    * A closing bracket seen while nothing is pending is ignored, because its
      opener may belong to tokens outside the list being processed.
    * Any other token leaves the stack untouched.

    Raises:
        ValueError: if `tk` is a closing bracket that does not match the
            innermost pending opening bracket.
    """
    literal = tk.string
    if literal in ('[', '(', '{'):
        bounty.append(literal)
    elif literal in (']', ')', '}') and bounty:
        target = bounty[-1]
        if not fulfillBounty(bounty, tk):
            raise ValueError('Unmatched target '+str(target)+": "+str(bounty))
        bounty.pop()

    return bounty


def fulfillBounty(bounty, tk):
    """Tell whether `tk` closes the innermost pending bracket in `bounty`.

    Returns True only when `bounty` is non-empty and `tk.string` is the
    closing bracket paired with `bounty[-1]`; otherwise returns False.
    """
    closers = {'[': ']', '(': ')', '{': '}'}
    target = bounty[-1] if len(bounty) > 0 else None
    return target is not None and closers.get(target) == tk.string
            

def processTokenList(tk_list):
    """Walk `tk_list` once, tracking the bounty and formatting every token.

    Each token is passed to `getNewBounty` (starting from an empty bounty
    list) and then to `formatToken`.

    Returns:
        A `(bounty, formatted)` tuple: the bounty list left after the last
        token, and the formatted tokens in input order.
    """
    pending = []
    rendered = []
    for current in tk_list:
        pending = getNewBounty(pending, current)
        rendered.append(formatToken(current))
    return pending, rendered
    
    
def splitBeforeTokens(content, nodey, before_tokens):
    """Distribute formatted tokens between the previous entry, `nodey`, and a leftover list.

    Each token in `before_tokens` (dicts with a 'start' -> 'line' entry) is
    routed by its starting line:

    * same line as `nodey`'s start -> prepended to `nodey['content']`;
    * otherwise, same line as the start of the last entry `content` had on
      entry -> appended to `content`;
    * otherwise -> collected in the returned `middle` list.

    `content` and `nodey` are modified in place. `nodey` may be None, in
    which case no token is attributed to it.

    Returns:
        A `(content, middle, nodey)` tuple.
    """
    # The reference entry is fixed up front; it does not move as tokens are
    # appended to `content` below.
    prevNodey = content[-1] if content else None
    middle = []
    nodeyMatch = []
    for tk in before_tokens:
        line = tk['start']['line']
        if nodey and line == nodey['start']['line']:
            nodeyMatch.append(tk)
        elif prevNodey and line == prevNodey['start']['line']:
            # append, not +=: extending a list with a dict would add its keys
            content.append(tk)
        else:
            middle.append(tk)
    if nodeyMatch:
        nodey['content'] = nodeyMatch + (nodey['content'] or [])
    return content, middle, nodey


def splitAfterTokens(prevStart, nodeStart, after_tokens):
    """Partition `after_tokens` into leftovers and tokens claimed by the next node.

    A token is claimed when `nodeStart` is set, the token starts on
    `nodeStart`'s line, and that line is not also `prevStart`'s line (a
    falsy `prevStart` never blocks a claim). Everything else is a leftover.

    Returns:
        A `(middle, nodeyMatch)` tuple: leftovers first, claimed tokens second.
    """
    def claimed_by_node(tok):
        if not nodeStart:
            return False
        line = tok.start[0]
        if line != nodeStart['line']:
            return False
        return (not prevStart) or line != prevStart['line']

    leftovers = []
    claimed = []
    for tok in after_tokens:
        bucket = claimed if claimed_by_node(tok) else leftovers
        bucket.append(tok)
    return leftovers, claimed
        
        
    
    
def processTokens_before(node, tokenList):
    """Separate the tokens that precede `node`'s first child.

    The first child's position is obtained with `getStart`. Tokens for which
    `startsWith` reports a start before that position are run through
    `processTokenList`; the rest are kept as raw tokens.

    Returns:
        A `(bounty, before_formatted, remaining)` tuple. When `node` has no
        children, or the first child has no resolvable position, this is
        `([], [], tokenList)` with `tokenList` returned untouched.
    """
    first_child = next(ast.iter_child_nodes(node), None)
    child_start = getStart(first_child) if first_child else None
    if not child_start:
        return [], [], tokenList

    leading = []
    remaining = []
    for tk in tokenList:
        target = remaining if startsWith(child_start, tk) else leading
        target.append(tk)
    pending, leading_formatted = processTokenList(leading)
    return pending, leading_formatted, remaining



def processTokens_middle(node, tokenList, bounty):
    """Assign the tokens in `tokenList` to the children of `node`.

    With children: tokens are accumulated until one starts at or after the
    next child that has a resolvable position (per `getStart`/`startsWith`).
    The accumulated tokens are then handed to the current child through
    `zipTokensAST`, and the walk moves on to the next child. The last child
    handled receives whatever tokens are still accumulated.

    Without children: if `node` has a position and the first token starts
    exactly there, that token is removed from `tokenList` (in place) and
    becomes the node's only content. An empty `tokenList` is left alone.

    Args:
        node: the AST node whose children should absorb the tokens.
        tokenList: raw tokens (objects with `start`, as produced by tokenize).
        bounty: passed through to `zipTokensAST` and returned unchanged.

    Returns:
        A `(bounty, content, leftover)` tuple: `content` is a list of
        formatted tokens and child node dicts, `leftover` the raw tokens
        that were not consumed.
    """
    children = ast.iter_child_nodes(node)
    child1 = next(children, None)
    content = []
    
    if child1:
        chunkList = []
        child2 = next(children, None)
        child2_start = None
        # Skip following children that have no resolvable source position.
        while(child2 and not child2_start):
            child2_start = getStart(child2)
            if(not child2_start): 
                child2 = next(children, None)
        if(child2):
            child1_start = getStart(child1)
            for chunk in tokenList: 
                if(child2 and startsWith(child2_start, chunk)): #start of child 2
                    #first, give all the tokens collected so far to child1. child2 starts with what remains
                    before_tokens, child_nodey, after_tokens = zipTokensAST(chunkList, child1, bounty)
                    content, before_tokens, child_nodey = splitBeforeTokens(content, child_nodey, before_tokens)
                    content += before_tokens
                    if child_nodey: content.append(child_nodey)
                    chunkList = []
                    if(after_tokens != []):
                        # Tokens child1 did not consume: those on child2's line
                        # seed child2's chunk, the rest go straight to content.
                        after_tokens, chunkList = splitAfterTokens(child1_start, child2_start, after_tokens)
                        content += formatTokenList(after_tokens)
                    child1_start = child2_start
                    child1 = child2
                    child2 = next(children, None)
                    child2_start = getStart(child2) if child2 else None
        
                chunkList.append(chunk)
        else: 
            # Only one child has a position: it gets every token.
            chunkList = tokenList
     
        before_tokens, child_nodey, after_tokens = zipTokensAST(chunkList, child1, bounty)
        content, before_tokens, child_nodey = splitBeforeTokens(content, child_nodey, before_tokens)
        content += before_tokens
        if child_nodey: content.append(child_nodey)
        chunkList = after_tokens
        return bounty, content, chunkList
    else:
        # no children, but eat what can
        start = getStart(node)
        content = []
        # Guard against an empty token list: a child can be handed no tokens
        # at all when a later sibling starts at or before it in the source.
        if(start and tokenList):
            if(tokenList[0].start[0] == start['line'] and tokenList[0].start[1] == start['ch']):
                content.append(formatToken(tokenList.pop(0)))
        return bounty, content, tokenList
        

def zipTokensAST(tokens, node, parentBounty = None):
    """Pair `tokens` with the AST rooted at `node`.

    Tokens preceding the node's first child are split off by
    `processTokens_before`; the remaining tokens are distributed over the
    node and its children by `processTokens_middle`.

    Args:
        tokens: raw tokens to match against `node`.
        node: the AST node to build a token-annotated dict for.
        parentBounty: accepted for callers that pass their bounty along; it
            is not read by this function.

    Returns:
        A `(before_tokens, nodey, remainder)` tuple. `before_tokens` are the
        formatted tokens that precede the first child. `nodey` is a dict with
        'type', 'start', 'end' and 'content', or None when no content was
        matched. `remainder` holds the raw tokens left unconsumed.
    """
    bounty, before_tokens, tokens = processTokens_before(node, tokens)

    bounty, content, remainder = processTokens_middle(node, tokens, bounty)
    if(content != []):
        nodey = {'type': type(node).__name__, 'start': content[0]['start'], 'end': content[-1]['end'], 'content': content}
    else:
        nodey = None
    return before_tokens, nodey, remainder


def main(text):
    """Parse `text`, pair its tokens with its AST, and print the result.

    `text` is parsed with `ast.parse` and tokenized with `tokenize.tokenize`.
    The tokens are matched to the tree by `zipTokensAST`, and the root dict
    is printed with any leading and trailing tokens folded into its
    'content'.

    When no node content could be matched (for example, empty source), a
    root dict of the same shape is built from the tokens alone instead of
    failing.
    """
    tree = ast.parse(text)
    source = io.BytesIO(text.encode())
    tokens = list(tokenize.tokenize(source.readline))
    tokens.pop(0) #get rid of encoding stuff
    before_tokens, nodey, remainder = zipTokensAST(tokens, tree)
    trailing = formatTokenList(remainder)
    if nodey is None:
        # Nothing was attributed to the tree: wrap the bare tokens so the
        # printed value keeps the usual node shape.
        content = before_tokens + trailing
        nodey = {
            'type': type(tree).__name__,
            'start': content[0]['start'] if content else None,
            'end': content[-1]['end'] if content else None,
            'content': content,
        }
    else:
        nodey['content'] = before_tokens + nodey['content'] + trailing
    print (nodey)

In [7]:
# Sample source used to exercise main(). The literal starts with a newline,
# so its first statement is on line 2 of the parsed text.
text = """
from __future__ import print_function
from scipy.integrate import quad
integral, error = quad(f, a, b)
integral_trapezoid = sum( (xint[1:] - xint[:-1]) * (yint[1:] + yint[:-1]) ) / 2
print("The integral is:", integral, "+/-", error)
print("The trapezoid approximation with", len(xint), "points is:", integral_trapezoid)"""

# Prints the token-annotated tree for the sample above.
main(text)
# NOTE(review): the disabled call below passes two arguments (`l`, `tree`),
# neither of which is defined in the visible code, while main() takes one.
#print(json.dumps(main(l, tree),  indent=2))

{'type': 'Module', 'start': {'line': 2, 'ch': 0}, 'end': {'line': 7, 'ch': 66}, 'content': [{'type': 'NL', 'start': {'line': 1, 'ch': 0}, 'end': {'line': 1, 'ch': 1}, 'literal': '\n'}, {'type': 'NAME', 'start': {'line': 2, 'ch': 0}, 'end': {'line': 2, 'ch': 4}, 'literal': 'from'}, {'type': 'NAME', 'start': {'line': 2, 'ch': 5}, 'end': {'line': 2, 'ch': 15}, 'literal': '__future__'}, {'type': 'NAME', 'start': {'line': 2, 'ch': 16}, 'end': {'line': 2, 'ch': 22}, 'literal': 'import'}, {'type': 'NAME', 'start': {'line': 2, 'ch': 23}, 'end': {'line': 2, 'ch': 37}, 'literal': 'print_function'}, {'type': 'NEWLINE', 'start': {'line': 2, 'ch': 37}, 'end': {'line': 2, 'ch': 38}, 'literal': '\n'}, {'type': 'NAME', 'start': {'line': 3, 'ch': 0}, 'end': {'line': 3, 'ch': 4}, 'literal': 'from'}, {'type': 'NAME', 'start': {'line': 3, 'ch': 5}, 'end': {'line': 3, 'ch': 10}, 'literal': 'scipy'}, {'type': 'OP', 'start': {'line': 3, 'ch': 10}, 'end': {'line': 3, 'ch': 11}, 'literal': '.'}, {'type': 'NAME