In [5]:
# coding: utf-8
import sys
import re
import string
import ast
from ast import AST
import tokenize
import token
from numbers import Number
import json
import io

In [6]:
def regexEnd(text, start, parentEnd, nodeStart):
    """Find the bracketed group that opens exactly at ``text[start]``.

    The scan is a depth counter rather than a regular expression so that the
    *matching* closer is found (a greedy ``.*`` would run to the last closer
    on the line) and so that groups spanning several lines are supported.

    Parameters:
        text: the full source text.
        start: index in ``text`` where the candidate group begins.
        parentEnd: last index (inclusive) the group may occupy.
        nodeStart: ``{'line': ..., 'ch': ...}`` position of ``text[start]``.

    Returns:
        ``None`` when ``text[start]`` is not ``(``, ``[`` or ``{``, when
        ``start`` is out of range, or when no matching closer exists within
        ``parentEnd``.  Otherwise a tuple ``(end, myEnd, group)`` where
        ``end`` is the index just past the closer, ``myEnd`` is the
        ``{'line', 'ch'}`` position of the closer itself, and ``group`` is
        the matched text including both brackets.

    Note: brackets inside string literals or comments are counted like any
    other bracket; this is a purely textual scan.
    """
    closers = {"(": ")", "[": "]", "{": "}"}
    if not 0 <= start < len(text):
        return None

    opener = text[start]
    closer = closers.get(opener)
    if closer is None:
        return None

    depth = 0
    limit = min(parentEnd + 1, len(text))
    for pos in range(start, limit):
        current = text[pos]
        if current == opener:
            depth += 1
        elif current == closer:
            depth -= 1
            if depth == 0:
                group = text[start:pos + 1]
                lines = group.split("\n")
                endLineno = nodeStart['line'] + len(lines) - 1
                endCh = len(lines[-1]) - 1
                # Only a group that stays on its starting line is shifted by
                # the starting column; later lines are measured from col 0.
                if endLineno == nodeStart['line']:
                    endCh += nodeStart['ch']
                return (pos + 1, {'line': endLineno, 'ch': endCh}, group)

    # Unbalanced within the allowed span: let the caller fall back.
    return None

In [7]:
def findNodeStart(node):
    """Return the ``{'line', 'ch'}`` start position of ``node``.

    A node that carries ``lineno`` reports its own position, a ``Module``
    starts at line 1 column 0, and any other node is treated as a wrapper
    whose position is that of its first child (followed repeatedly).
    Returns ``None`` when a wrapper chain ends in a node with no children.
    """
    current = node
    while True:
        if hasattr(current, 'lineno'):
            return {'line': current.lineno, 'ch': current.col_offset}
        if type(current).__name__ == "Module":
            return {'line': 1, 'ch': 0}
        # wrapper node: descend into its first child, if it has one
        current = next(ast.iter_child_nodes(current), None)
        if current is None:
            return None
        
        
def findNodeEnd(node, nextNode, text, start, end, nodeStart):
    """Estimate where the node beginning at ``text[start]`` stops.

    Returns ``(endIndex, endPosition)`` where ``endPosition`` is a
    ``{'line', 'ch'}`` dict.

    * With a following sibling (``nextNode``), the sibling's start position
      is used as the end position, and the index is advanced to the end of
      the line on which that sibling starts.
    * Without one, a bracketed group found by ``regexEnd`` decides the end;
      failing that, the node is assumed to fill its whole ``start..end`` span.

    ``node`` itself is not consulted.
    """
    if nextNode is not None:
        potentialEnd = findNodeStart(nextNode) #may need refinement
        print("END IS", potentialEnd, ast.dump(nextNode))
        spanLines = text[start:end+1].split("\n")
        lineDelta = potentialEnd['line'] - nodeStart['line']
        # each counted line contributes its length plus one "\n";
        # the trailing "\n" of the last counted line is taken back off
        consumed = sum(len(piece) + 1 for piece in spanLines[:lineDelta+1])
        return start + consumed - 1, potentialEnd

    bracketed = regexEnd(text, start, end, nodeStart)
    if bracketed:
        return bracketed[0:2]

    spanLines = text[start:end+1].split("\n")
    lastLine = nodeStart['line'] + len(spanLines) - 1
    lastCh = len(spanLines[-1]) - 1
    if lastLine == nodeStart['line']:
        # still on the starting line, so shift by the starting column
        lastCh += nodeStart['ch']
    return end, {'line': lastLine, 'ch': lastCh}
        


In [8]:
def findLiteralEnd(snippet, nodeStart): # what to do about multiline literal???
    """Work out how much of ``snippet`` belongs to the literal at its start.

    Parameters:
        snippet: source text beginning at the literal's first character.
        nodeStart: ``{'line', 'ch'}`` position of that first character.

    Returns:
        ``(i, nodeEnd, literal)``.  ``i`` is the index in ``snippet`` of the
        first character after the literal.  When the snippet opens with a
        bracketed group (per ``regexEnd``), ``literal`` is a list of
        ``{'syntok': char}`` dicts, one per character of the group, and
        ``nodeEnd`` is whatever ``regexEnd`` reported.  Otherwise ``literal``
        is the run of characters before the first member of ``punctuation``
        and ``nodeEnd`` sits on the starting line at ``ch + len(literal)``.
    """
    bracketed = regexEnd(snippet, 0, len(snippet), nodeStart)
    if(bracketed):
        i, nodeEnd, punct = bracketed
        literal = [{'syntok': p} for p in punct]
    else:
        # If no terminator is found the literal runs to the end of the
        # snippet.  (Previously this case kept only the first character, and
        # an empty snippet left ``i`` unbound.)
        i = len(snippet)
        for pos, character in enumerate(snippet):
            if character in punctuation:
                i = pos
                break
        literal = snippet[:i]
        nodeEnd = {'line': nodeStart['line'], 'ch': len(literal) + nodeStart['ch']}

    print("literal |"+str(literal)+"|")
    return (i, nodeEnd, literal)

In [9]:
def visitLiteral(node, text, start):
    """Build the Nodey for a leaf node whose source begins at ``text[start]``.

    ``node`` must carry ``lineno`` and ``col_offset``.  Returns
    ``(end, nodey)`` where ``end`` is the absolute index in ``text`` reported
    by ``findLiteralEnd`` (shifted by ``start``).  The payload is stored
    under ``'literal'`` when it is a plain string and under ``'content'``
    when it is a list of syntok dicts.
    """
    myStart = {'line': node.lineno, 'ch': node.col_offset}
    relativeEnd, myEnd, payload = findLiteralEnd(text[start:], myStart)

    nodey = {'type': type(node).__name__, 'start': myStart, 'end': myEnd}
    if isinstance(payload, str):
        nodey['literal'] = payload
    else: # actually a list of syntok
        nodey['content'] = payload
    return (relativeEnd + start, nodey)

In [10]:
def findNextChild(children, itr):
    """Return the next child after index ``itr`` that is not a context marker.

    Children whose type name is ``Store`` or ``Load`` are skipped.  The
    result is ``(child, index)``; when nothing suitable remains it is
    ``(None, index)`` with ``index`` one past the last position examined.
    """
    skipped = ("Store", "Load")
    position = itr + 1
    while position < len(children):
        candidate = children[position]
        if type(candidate).__name__ not in skipped:
            return candidate, position
        position += 1
    return None, position

In [11]:
def captureComment(text, textStart, textEnd):
    """Extract the comment that starts at ``text[textStart]``.

    The comment runs to the first newline inside ``text[textStart:textEnd]``,
    or to the end of that span when it contains no newline (for example a
    comment on the last line of the text).

    Returns ``(end, comment)`` where ``end`` is the index just past the
    comment.  ``end`` may equal ``len(text)``, so callers should not index
    ``text`` with it unchecked.
    """
    # partition() returns the whole span when there is no "\n"; slicing with
    # str.find() would use -1 here and drop the comment's last character.
    comment = text[textStart:textEnd].partition("\n")[0]
    return textStart + len(comment), comment

In [29]:
def visitAttribute(node, text, textStart, textEnd):
    """Build the Nodey for an ``Attribute`` node (``value.attr``).

    The value expression is visited first; a ``.`` syntok and the attribute
    name are then appended as content.

    Returns ``(end, nodey)`` where ``end`` is the text index reached after
    the value plus the dot and the attribute name.  The node starts where
    its value starts and ends ``1 + len(attr)`` columns after the value's
    end.  (The end used to be measured from the value's *start*, which
    ignored the length of the value itself.)

    Assumes the dot and the attribute name directly follow the value on the
    same line, with no whitespace in between.
    """
    myType = type(node).__name__
    end, value = visit(node.value, text, textStart, textEnd, None)
    attr = str(node.attr)
    myContent = [value, {'syntok': '.'}, {'syntok': attr}]

    myStart = value['start']
    valueEnd = value['end']
    myEnd = {'line': valueEnd['line'], 'ch': valueEnd['ch'] + 1 + len(attr)}

    me = {'type': myType, 'start': myStart, 'end': myEnd, 'content': myContent}
    end += 1 + len(attr) # skip the "." and the attribute name in the text
    return end, me

def visitSubscript(node, text, textStart, textEnd):
    """Build the Nodey for a ``Subscript`` node (``value[slice]``).

    The value is visited over the node's whole span; the slice is then
    visited starting where the value stopped and may extend up to
    ``textEnd``.  (The slice used to be visited over ``textStart..end``,
    i.e. over the value's own text instead of the text after it.)

    Returns ``(end, nodey)`` where ``end`` is the index reported by the
    slice visit and the node's end position is the slice's end position.
    """
    myType = type(node).__name__
    myStart = findNodeStart(node)

    valueEnd, value = visit(node.value, text, textStart, textEnd, None)
    # the slice begins where the value ended, not back at textStart
    end, slicey = visit(node.slice, text, valueEnd, textEnd, None)

    myContent = [value, slicey]
    myEnd = slicey['end']
    me = {'type': myType, 'start': myStart, 'end': myEnd, 'content': myContent}
    return end, me
    
def visitIndex(node, text, textStart, textEnd):
    """Build the Nodey for an ``Index`` node, i.e. the ``[value]`` of a subscript.

    ``textStart`` is expected to point at the opening ``[``; the wrapped
    value is visited from the following character.

    Returns ``(end, nodey)`` where ``end`` is one past the index reached by
    the value visit (to account for the closing ``]``).  The node starts one
    column before its value and ends one column after the value's end.
    (The end used to be fixed at start + 1 regardless of the value's length.)

    Assumes the brackets hug the value with no whitespace in between.
    """
    myType = type(node).__name__
    myContent = [{'syntok': '['}]
    textStart += 1 # step over the "["
    end, value = visit(node.value, text, textStart, textEnd, None)
    myContent.append(value)
    myContent.append({'syntok': ']'})

    myStart = {'line': value['start']['line'], 'ch': value['start']['ch'] - 1}
    myEnd = {'line': value['end']['line'], 'ch': value['end']['ch'] + 1}
    me = {'type': myType, 'start': myStart, 'end': myEnd, 'content': myContent}
    end += 1 # step over the "]"
    return end, me
    

In [30]:
def visit(node, text, textStart, textEnd, nextNode):
    """Convert ``node`` and the source text it covers into a nested "Nodey" dict.

    Steps:
      1. decide whether the node is a literal (no usable children) or a parent
      2. figure out the start and end positions of this node
      3. walk the node's text, handing spans to the children
      4. assemble the formatted Nodey

    Parameters:
        node: the ``ast`` node to convert.
        text: the full source text.
        textStart: index in ``text`` where this node's span begins.
        textEnd: upper bound (exclusive) for this node's span; it may be
            narrowed by ``findNodeEnd`` or by the first newline seen once
            all children have been consumed.
        nextNode: the sibling that follows ``node`` (or ``None``), used to
            estimate where this node stops.

    Returns:
        ``(end, nodey)``.  ``nodey`` has ``type``, ``start`` and ``end`` plus
        either ``literal`` or ``content``; ``content`` mixes child Nodeys
        with ``{'syntok': ...}`` entries for characters and comments that do
        not belong to any child.

    The ``print`` calls are debugging traces.
    """
    children = list(ast.iter_child_nodes(node))
    child_itr = -1
    child, child_itr = findNextChild(children, child_itr)
    print("\n",type(node).__name__, children)

    if not child: # LITERAL
        return visitLiteral(node, text, textStart)

    print(ast.dump(child, True, True))

    # node types with dedicated handlers
    myType = type(node).__name__
    if myType == "Attribute":
        return visitAttribute(node, text, textStart, textEnd)
    if myType == "Index":
        return visitIndex(node, text, textStart, textEnd)
    if myType == "Subscript":
        return visitSubscript(node, text, textStart, textEnd)

    myContent = []
    myStart = findNodeStart(node)
    textEnd, myEnd = findNodeEnd(node, nextNode, text, textStart, textEnd, myStart)

    print("POSITION:", myStart, myEnd)

    # now give each child a span of the text
    nextChild, child_itr = findNextChild(children, child_itr)
    i = textStart
    lineno = myStart['line']
    col_offset = myStart['ch']
    print("text?", textEnd, textStart, lineno, col_offset)
    while i < textEnd:
        # named ``char`` rather than ``token`` so the imported ``token``
        # module is not shadowed
        char = text[i]
        # A child is visited when the cursor reaches its recorded position,
        # or immediately when it has no position of its own.
        if child and (not hasattr(child, "lineno") or (lineno == child.lineno and col_offset == child.col_offset)):
            print(myContent)
            new_i, childContent = visit(child, text, i, textEnd, nextChild)
            myContent.append(childContent)
            # leave the cursor one short of new_i; the shared increment at
            # the bottom of the loop supplies the final step
            col_offset += (new_i - i) - 1
            i = new_i - 1
            print(myType," AFTER: ",myContent, i, "\n")
            child = nextChild
            nextChild, child_itr = findNextChild(children, child_itr)
        elif char == "#":
            new_i, comment = captureComment(text, i, textEnd)
            myContent.append({'syntok': comment})
            # slice instead of index: new_i can equal len(text) when the
            # comment runs to the very end of the text
            print("COMMENT?", comment, new_i, "|"+text[new_i:new_i+1]+"|")
            i = new_i - 1
        elif char in newline:
            lineno += 1
            col_offset = -1 # becomes 0 after the increment below
            if not child: textEnd = i #end of this node actually
        else:
            myContent.append({'syntok':char})
        i += 1
        col_offset += 1

    me = {'type': myType, 'start': myStart, 'end': myEnd, 'content': myContent}
    return textEnd, me
        
    
    

In [31]:
def parse(text):
    """Parse ``text`` with ``ast`` and print both the raw tree and the
    ``(end, nodey)`` result of visiting it.  Nothing is returned."""
    tree = ast.parse(text)
    print(ast.dump(tree, True, True))
    converted = visit(tree, text, 0, len(text), None)
    print("\n",converted)

In [32]:
# Sample cell source used to exercise the parser.
text = """# TODO: Select three indices of your choice you wish to sample from the dataset
indices = []

# Create a DataFrame of the chosen samples
samples = pd.DataFrame(data.loc[indices], columns = data.keys()).reset_index(drop = True)
print("Chosen samples of wholesale customers dataset:")
display(samples)"""


# Character classes consulted by the visitor functions.
sqParens = {"[", "]"}
parens = {"(", ")"}
brackets = {"{", "}"}
newline = {"\n"} #todo may vary across platforms
# Characters that terminate a literal in findLiteralEnd.  "_" is a legal
# identifier character, so it is excluded; otherwise a name such as
# ``my_var`` would be cut off at ``my``.
punctuation = set(string.punctuation) - {"_"}
punctuation.add(" ")


# to hurry up, reduce ast at this stage?
# match parens [] {} () otherwise those can end up in weird places

parse(text)
#print(json.dumps(main(l, tree),  indent=2))

Module(body=[Assign(targets=[Name(id='indices', ctx=Store(), lineno=2, col_offset=0)], value=List(elts=[], ctx=Load(), lineno=2, col_offset=10), lineno=2, col_offset=0), Assign(targets=[Name(id='samples', ctx=Store(), lineno=5, col_offset=0)], value=Call(func=Attribute(value=Call(func=Attribute(value=Name(id='pd', ctx=Load(), lineno=5, col_offset=10), attr='DataFrame', ctx=Load(), lineno=5, col_offset=10), args=[Subscript(value=Attribute(value=Name(id='data', ctx=Load(), lineno=5, col_offset=23), attr='loc', ctx=Load(), lineno=5, col_offset=23), slice=Index(value=Name(id='indices', ctx=Load(), lineno=5, col_offset=32)), ctx=Load(), lineno=5, col_offset=23)], keywords=[keyword(arg='columns', value=Call(func=Attribute(value=Name(id='data', ctx=Load(), lineno=5, col_offset=52), attr='keys', ctx=Load(), lineno=5, col_offset=52), args=[], keywords=[], lineno=5, col_offset=52))], lineno=5, col_offset=10), attr='reset_index', ctx=Load(), lineno=5, col_offset=10), args=[], keywords=[keyword(ar