In [None]:
# coding: utf-8
import sys
import ast
from ast import AST
import tokenize
import token
from numbers import Number
import json
import io

In [None]:
def startsWith(start, tk):
    """Tell whether token ``tk`` begins at or after the position ``start``.

    start -- dict with 'line' and 'ch' keys, or a falsy value when no
             position is known.
    tk    -- token-like object whose ``start`` is a (line, column) pair.

    Returns False for a falsy ``start``; otherwise True exactly when
    ``start`` is on an earlier line than the token, or on the same line at a
    column less than or equal to the token's column.
    """
    if not start:
        return False
    if start['line'] < tk.start[0]:
        return True
    return start['line'] == tk.start[0] and start['ch'] <= tk.start[1]

def getStart(node):
    """Return the source position of ``node`` as {'line': ..., 'ch': ...}.

    If ``node`` carries no ``lineno`` attribute, the search descends through
    the chain of *first* children (first child, its first child, ...) until
    one with a ``lineno`` is found. Returns None when that chain runs out
    before a positioned node is reached. Later siblings are never examined.
    """
    current = node
    while not hasattr(current, 'lineno'):
        # Step down to the first child; give up when there is none.
        current = next(ast.iter_child_nodes(current), None)
        if not current:
            return None
    return {'line': current.lineno, 'ch': current.col_offset}

In [None]:
def formatToken(tk):
    """Convert one token into a plain dict describing its span and text.

    The result always has 'start' and 'end' entries, each of the form
    {'line': ..., 'ch': ...}. NEWLINE, DEDENT, INDENT, OP and NL tokens get
    their text under 'syntok'; every other token gets its text under
    'literal' plus the token type name under 'type'.
    """
    span = {
        'start': {'line': tk.start[0], 'ch': tk.start[1]},
        'end': {'line': tk.end[0], 'ch': tk.end[1]},
    }
    syntactic_types = (token.NEWLINE, token.DEDENT, token.INDENT, token.OP)
    if tk.type in syntactic_types or token.tok_name[tk.type] == 'NL':
        span['syntok'] = tk.string
    else:
        span['literal'] = tk.string
        span['type'] = token.tok_name[tk.type]
    return span

def formatTokenList(tk_list):
    """Return a new list with formatToken applied to each token, in order."""
    return [formatToken(tk) for tk in tk_list]

In [None]:
def splitBeforeTokens(content, nodey, before_tokens):
    """Distribute the formatted tokens that precede ``nodey`` by source line.

    Each token dict in ``before_tokens`` is routed by its start line:
      * same line as ``nodey``'s start        -> prepended to nodey['content']
      * same line as the last item of ``content`` (as it was on entry)
                                              -> appended to ``content``
      * anything else                         -> collected in ``middle``
    The ``nodey`` check wins when a token matches both lines.

    content       -- list of already emitted items (dicts with a 'start'
                     entry); it is extended in place.
    nodey         -- node dict with 'start' and 'content' entries, or None.
    before_tokens -- iterable of formatted token dicts.

    Returns the tuple (content, middle, nodey).

    This function writes nothing to stdout: the parse result is delivered as
    JSON on stdout, so stray debug output here would corrupt it.
    """
    prevNodey = content[-1] if content else None
    middle = []
    nodeyMatch = []
    for tk in before_tokens:
        tk_line = tk['start']['line']
        if nodey and tk_line == nodey['start']['line']:
            nodeyMatch.append(tk)
        elif prevNodey and tk_line == prevNodey['start']['line']:
            content.append(tk)
        else:
            middle.append(tk)
    if nodeyMatch:
        # NOTE(review): nodey['start'] is not moved back to cover the
        # prepended tokens -- confirm whether that is intended.
        nodey['content'] = nodeyMatch + (nodey['content'] or [])
    return content, middle, nodey


def splitAfterTokens(prevStart, nodeStart, after_tokens):
    """Partition raw tokens into (leftover, claimed) by start line.

    A token is *claimed* when ``nodeStart`` is truthy, the token starts on
    nodeStart['line'], and either ``prevStart`` is falsy or the token does
    not start on prevStart['line']. Every other token is *leftover*.
    Relative order is preserved within each list.

    Returns the tuple (leftover, claimed).
    """
    leftover = []
    claimed = []
    for tok in after_tokens:
        if nodeStart and tok.start[0] == nodeStart['line']:
            if prevStart and tok.start[0] == prevStart['line']:
                # Shares a line with the previous node too: do not claim it.
                leftover.append(tok)
            else:
                claimed.append(tok)
        else:
            leftover.append(tok)
    return leftover, claimed

In [None]:
def processTokens_before(node, tokenList):
    """Separate the tokens that come before ``node``'s first child.

    The start of the first child is looked up with getStart. When the node
    has no child, or no start can be found for it, nothing is split off and
    ([], tokenList) is returned with ``tokenList`` passed through unchanged.

    Otherwise each token is tested with startsWith against that start:
    tokens positioned before it are formatted with formatTokenList and
    returned first; the rest are returned raw, in their original order.

    Returns the tuple (formatted_leading_tokens, remaining_raw_tokens).
    """
    first_child = next(ast.iter_child_nodes(node), None)
    child_start = getStart(first_child) if first_child else None
    if not child_start:
        return [], tokenList

    leading = []
    remaining = []
    for tok in tokenList:
        bucket = remaining if startsWith(child_start, tok) else leading
        bucket.append(tok)
    return formatTokenList(leading), remaining



def processTokens_middle(node, tokenList):
    """Hand the raw tokens in ``tokenList`` out to ``node``'s direct children.

    With children: tokens are accumulated for the current child (child1)
    until a token starts at or after the next positioned child (child2). The
    accumulated tokens are then resolved through zipTokensAST for child1, and
    child2 becomes the current child. The last child receives whatever is
    still accumulated when the tokens run out.

    Without children: if the first token starts exactly at the node's own
    position, that single token is consumed as the node's content.

    Returns (content, leftover): ``content`` is a list of formatted token
    dicts and child node dicts in the order they were emitted; ``leftover``
    holds the raw tokens not consumed here.
    """
    children = ast.iter_child_nodes(node)
    child1 = next(children, None)
    content = []
    
    if child1:
        chunkList = []
        child2 = next(children, None)
        child2_start = None
        # Skip children for which getStart finds no position, so child2 is the
        # next child that can actually be located in the token stream.
        while(child2 and not child2_start):
            child2_start = getStart(child2)
            if(not child2_start): 
                child2 = next(children, None)
        if(child2):
            child1_start = getStart(child1)
            for chunk in tokenList: 
                if(child2 and startsWith(child2_start, chunk)): #start of child 2
                    #first, give all the tokens collected so far to child1. child2 starts with what remains
                    before_tokens, child_nodey, after_tokens = zipTokensAST(chunkList, child1)
                    content, before_tokens, child_nodey = splitBeforeTokens(content, child_nodey, before_tokens)
                    content += before_tokens
                    if child_nodey: content.append(child_nodey)
                    chunkList = []
                    if(after_tokens != []):
                        # Tokens child1 left over that sit on child2's line (and not on
                        # child1's) seed child2's chunk; the rest are emitted as plain tokens.
                        after_tokens, chunkList = splitAfterTokens(child1_start, child2_start, after_tokens)
                        content += formatTokenList(after_tokens)
                    # Advance: child2 becomes the current child.
                    child1_start = child2_start
                    child1 = child2
                    child2 = next(children, None)
                    # NOTE(review): unlike the initial search above, a child without a
                    # position is not skipped here; startsWith(None, ...) is then always
                    # False, so all remaining tokens go to the current child -- confirm intended.
                    child2_start = getStart(child2) if child2 else None
        
                chunkList.append(chunk)
        else: 
            # Only one locatable child: it gets every token.
            chunkList = tokenList
     
        # Resolve the final (or only) child with whatever has been accumulated.
        before_tokens, child_nodey, after_tokens = zipTokensAST(chunkList, child1)
        # NOTE(review): here splitBeforeTokens is only applied when a child node was
        # produced, whereas the loop above calls it unconditionally.
        if(child_nodey):
            content, before_tokens, child_nodey = splitBeforeTokens(content, child_nodey, before_tokens)
        content += before_tokens
        if child_nodey: content.append(child_nodey)
        chunkList = after_tokens
        return content, chunkList
    else:
        # no children, but eat what can
        start = getStart(node)
        content = []
        if(start != None and tokenList != []):
            # Consume the first token only when it begins exactly at the node's position.
            if(tokenList[0].start[0] == start['line'] and tokenList[0].start[1] == start['ch']):
                # pop(0) removes the token from the caller's list in place.
                content += formatTokenList([tokenList.pop(0)])
        return content, tokenList
        

def zipTokensAST(tokens, node):
    """Pair the raw ``tokens`` with the AST ``node`` and build its node dict.

    Steps:
      1. processTokens_before splits off the tokens that precede the node's
         first child (returned formatted as ``before_tokens``).
      2. processTokens_middle distributes the remaining tokens among the
         node's children and returns the node's content plus leftovers.
      3. If there is content and leftovers, leftovers that start on the same
         line as the last content item are formatted and appended to it.

    Returns (before_tokens, nodey, remainder). ``nodey`` is a dict with
    'type' (the AST class name), 'start', 'end' and 'content', or None when
    no content was produced for the node. ``remainder`` holds the raw tokens
    still unconsumed.

    This function writes nothing to stdout: the parse result is delivered as
    JSON on stdout, so stray debug output here would corrupt it.
    """
    before_tokens, tokens = processTokens_before(node, tokens)
    content, remainder = processTokens_middle(node, tokens)

    if not content:
        return before_tokens, None, remainder

    if remainder:
        # prevStart is None, so every leftover token on the last item's start
        # line is claimed for this node.
        remainder, trailing = splitAfterTokens(None, content[-1]['start'], remainder)
        content += formatTokenList(trailing)

    nodey = {
        'type': type(node).__name__,
        'start': content[0]['start'],
        'end': content[-1]['end'],
        'content': content,
    }
    return before_tokens, nodey, remainder



def addBackSpaces(tokens):
    """Return a copy of ``tokens`` with the gaps between tokens filled in.

    Whenever two consecutive tokens share a line (the earlier token's end
    line equals the later token's start line) but the end column and start
    column differ, a SpacerToken of type ``token.OP`` whose string is one
    space per missing column is inserted between them. Its start is the
    earlier token's end and its end is the later token's start.
    """
    padded = []
    previous = None
    for current in tokens:
        if previous:
            same_line = previous.end[0] == current.start[0]
            gap = current.start[1] - previous.end[1]
            if same_line and gap != 0:
                filler = SpacerToken(
                    token.OP,
                    " " * gap,
                    [previous.end[0], previous.end[1]],
                    [current.start[0], current.start[1]],
                )
                padded.append(filler)
        padded.append(current)
        previous = current
    return padded


class SpacerToken:
    """Minimal stand-in for a tokenizer token, used to fill whitespace gaps.

    It exposes the four attributes the rest of this file reads from a token:
    ``type``, ``string``, ``start`` and ``end``.
    """

    def __init__(self, ty, st, start, end):
        # ty: token type code, st: token text, start/end: [line, column] pairs
        self.type, self.string = ty, st
        self.start, self.end = start, end


def main(text):
    """Entry point: parse ``text`` and print its token-annotated AST as JSON.

    An empty string first prints an empty line; the text is then handed to
    parsePy3 in every case, including the empty one.

    NOTE(review): detecting whether the source is Python 2.7 or Python 3
    (the active kernel cannot be trusted for that) is not implemented here;
    parsePy3 is always used.
    """
    is_empty = (text == "")
    if is_empty:
        print("")
    parsePy3(text)
    

In [None]:
def parsePy2(text):
    """Parse ``text`` and print its token-annotated AST as JSON.

    This function was a line-for-line copy of parsePy3 (it contains nothing
    specific to Python 2), so it now delegates to parsePy3 rather than
    keeping a second copy in sync. The name is kept as a separate entry
    point for callers.
    """
    parsePy3(text)

In [4]:
def parsePy3(text):
    """Parse ``text`` as Python 3 and print its token-annotated AST as JSON.

    The source is parsed with ``ast`` and tokenized with ``tokenize``;
    whitespace gaps are restored with addBackSpaces, and zipTokensAST
    attaches the tokens to the tree. The resulting nested dict is printed
    with json.dumps(indent=2).

    text -- source code as a str.

    Raises whatever ast.parse / tokenize raise for invalid source.
    """
    tree = ast.parse(text)

    # Tokenize the str directly. tokenize.tokenize() on text.encode() would
    # decode the UTF-8 bytes again using any coding cookie found in the
    # source, garbling non-ASCII text when the cookie names another encoding.
    # generate_tokens() emits no leading ENCODING token, so nothing has to be
    # dropped from the front of the list.
    source = io.StringIO(text)
    tokens = addBackSpaces(list(tokenize.generate_tokens(source.readline)))

    before_tokens, nodey, remainder = zipTokensAST(tokens, tree)
    if nodey is None:
        # No content was produced for the module: start from an empty shell.
        nodey = {'start': {'line': 0, 'ch': 0}, 'end': {'line': 0, 'ch': 0}, 'content': []}
    nodey['content'] = before_tokens + nodey['content'] + formatTokenList(remainder)
    nodey['start'] = nodey['content'][0]['start']
    nodey['end'] = nodey['content'][-1]['end']
    nodey['content'].pop() #remove end marker
    print(json.dumps(nodey, indent=2))

In [12]:
# Demo input: a small snippet to run through the parser. It is only parsed and
# tokenized, never executed, so the names it uses need not exist here.
text = """doc_idx = 5

doc_tfidf = tfidf.getrow(doc_idx)
all_terms = vectorizer.get_feature_names()
terms = [all_terms[i] for i in doc_tfidf.indices]
values = doc_tfidf.data

print(docs[doc_idx])
print("document's term-frequency pairs:")
print(", ".join("\\"%s\\"=%0.2f"%(t,v) for t,v in zip(terms,values)))
"""

# Echo the snippet, then print its token-annotated AST as JSON.
print(text)
main(text)

doc_idx = 5

doc_tfidf = tfidf.getrow(doc_idx)
all_terms = vectorizer.get_feature_names()
terms = [all_terms[i] for i in doc_tfidf.indices]
values = doc_tfidf.data

print(docs[doc_idx])
print("document's term-frequency pairs:")
print(", ".join("\"%s\"=%0.2f"%(t,v) for t,v in zip(terms,values)))

{
  "type": "Module",
  "start": {
    "line": 1,
    "ch": 0
  },
  "end": {
    "line": 10,
    "ch": 68
  },
  "content": [
    {
      "type": "Assign",
      "start": {
        "line": 1,
        "ch": 0
      },
      "end": {
        "line": 1,
        "ch": 12
      },
      "content": [
        {
          "start": {
            "line": 1,
            "ch": 0
          },
          "end": {
            "line": 1,
            "ch": 7
          },
          "literal": "doc_idx",
          "type": "NAME"
        },
        {
          "start": {
            "line": 1,
            "ch": 7
          },
          "end": {
            "line": 1,
            "ch": 8
          },
          "syn