In [1]:
from collections import deque
from tree_sitter import Language, Parser
import os
from formast.__main__ import JAVA_LANGUAGE

# Build a tree-sitter parser configured with the Java grammar exposed by formast.
parser = Parser()
parser.set_language(JAVA_LANGUAGE)

# Assemble the path with os.path.join instead of the literal
# "..\java_files\simple.java": that literal only worked with Windows
# separators and relied on "\j" and "\s" not being recognised escape sequences.
# The file is read as bytes because the parser is fed the raw buffer.
with open(os.path.join("..", "java_files", "simple.java"), "rb") as f:
    code = f.read()
tree = parser.parse(code)
tree

<tree_sitter.Tree at 0x2212e48a4d0>

In [2]:
from tree_sitter import Node

def print_node(node: Node, indent: str = ""):
    """Print `node` and all of its descendants as an indented outline.

    One line is printed per node: the current indentation, the node type and
    the first two components of its start and end points. Children are printed
    after their parent, in order, each level indented by two more spaces.

    Args:
        node: Node to print; must expose `type`, `start_point`, `end_point`
            and `children`.
        indent: Prefix written in front of the line for `node` itself.
    """
    # Explicit stack instead of recursion; children are pushed in reverse so
    # that they are popped (and printed) left to right.
    pending = [(node, indent)]
    while pending:
        node, indent = pending.pop()
        print(f"{indent}{node.type} [{node.start_point[0]}, {node.start_point[1]}] - [{node.end_point[0]}, {node.end_point[1]}]")
        child_indent = indent + "  "
        pending.extend((child, child_indent) for child in reversed(node.children))

# Dump the parsed tree as an indented outline, starting at its root node.
print_node(tree.root_node)

program [0, 0] - [0, 17]
  class_declaration [0, 0] - [0, 17]
    modifiers [0, 0] - [0, 6]
      public [0, 0] - [0, 6]
    class [0, 7] - [0, 12]
    identifier [0, 13] - [0, 14]
    class_body [0, 15] - [0, 17]
      { [0, 15] - [0, 16]
      } [0, 16] - [0, 17]


In [5]:
## Tokenizer
def traverse(tree):
    """Yield every node reachable through `tree.walk()` in pre-order.

    A single cursor is moved through the tree. A node is yielded the first
    time the cursor lands on it, before any of its children; when the cursor
    climbs back up to a node, that node is not yielded again.

    Args:
        tree: Object whose `walk()` returns a cursor exposing `node`,
            `goto_first_child()`, `goto_next_sibling()` and `goto_parent()`.

    Yields:
        The cursor's current node at each first visit.
    """
    cursor = tree.walk()
    # True once everything below the cursor's current node has been handled:
    # either the node has no children or the cursor has just climbed back to it.
    subtree_done = False

    while True:
        if not subtree_done:
            yield cursor.node
            # Go down when possible; otherwise this node is already finished.
            subtree_done = not cursor.goto_first_child()
            if not subtree_done:
                continue
        if cursor.goto_next_sibling():
            subtree_done = False
        elif not cursor.goto_parent():
            # No sibling and no parent left: the walk is complete.
            break
            
# Print the source text of every node that has no children, one per line.
for node in traverse(tree):
    if node.child_count:
        continue
    print(node.text.decode('utf-8'))

public
class
A
{
}


In [6]:
def process_tree_ast(tree):
    """Serialise a parse tree as one text line per node, children first.

    Nodes are emitted in post-order, so the line of every child precedes the
    line of its parent. Each line has one of two shapes:

    * ``L <text>`` for a node without children, where ``<text>`` is the node's
      source text decoded as UTF-8;
    * ``B <type> <i> <j> ...`` for a node with children, where the numbers are
      the 0-based line indices of its children, in order.

    Line feed characters inside leaf text are written as a backslash followed
    by ``n`` so that every node occupies exactly one output line; without that
    a multi-line leaf would shift the indices of all later lines. Leaf text
    is otherwise emitted verbatim, so a backslash followed by ``n`` that is
    already present in the source is not distinguishable from an escaped
    line feed.

    The tree is walked recursively, one call level per tree level.

    Args:
        tree: Object exposing ``root_node``; nodes must expose ``children``,
            ``type`` and ``text`` (bytes).

    Returns:
        All lines joined with a line feed.

    Raises:
        ValueError: If ``tree`` or a visited node is ``None``, or if the text
            of a leaf is not valid UTF-8.
    """
    if tree is None:
        raise ValueError("The tree object must not be None")

    lines = []

    def process_node(node):
        """Append the line for `node` after its children and return its index."""
        if node is None:
            raise ValueError("The tree object does not have the expected structure")

        if node.children:
            child_indices = [process_node(child) for child in node.children]
            line = 'B {} {}'.format(node.type, ' '.join(map(str, child_indices)))
        else:
            try:
                text = node.text.decode('utf-8')
            except UnicodeDecodeError as exc:
                raise ValueError("The text of the leaf nodes must be encoded using utf-8") from exc
            # A raw line feed would split this entry over several output lines.
            line = 'L {}'.format(text.replace('\n', '\\n'))

        lines.append(line)
        return len(lines) - 1

    process_node(tree.root_node)
    return '\n'.join(lines)

# Show the serialisation of the sample tree, with children referenced by line index.
print(process_tree_ast(tree))

L public
B modifiers 0
L class
L A
L {
L }
B class_body 4 5
B class_declaration 1 2 3 6
B program 7


In [7]:
def process_tree_ast_relatively(tree):
    """Serialise a parse tree line by line, referencing children by offset.

    Nodes are emitted in post-order, so the line of every child precedes the
    line of its parent. Each line has one of two shapes:

    * ``L <text>`` for a node without children, where ``<text>`` is the node's
      source text decoded as UTF-8;
    * ``B <type> <d1> <d2> ...`` for a node with children, where each number
      is ``child line index - own line index`` and therefore negative.

    Line feed characters inside leaf text are written as a backslash followed
    by ``n`` so that every node occupies exactly one output line; without that
    a multi-line leaf would make the offsets point at the wrong lines. Leaf
    text is otherwise emitted verbatim, so a backslash followed by ``n`` that
    is already present in the source is not distinguishable from an escaped
    line feed.

    The tree is walked recursively, one call level per tree level.

    Args:
        tree: Object exposing ``root_node``; nodes must expose ``children``,
            ``type`` and ``text`` (bytes).

    Returns:
        All lines joined with a line feed.

    Raises:
        ValueError: If ``tree`` or a visited node is ``None``, or if the text
            of a leaf is not valid UTF-8.
    """
    if tree is None:
        raise ValueError("The tree object must not be None")

    lines = []

    def process_node(node):
        """Append the line for `node` after its children and return its index."""
        if node is None:
            raise ValueError("The tree object does not have the expected structure")

        if node.children:
            child_indices = [process_node(child) for child in node.children]
            # All children are emitted by now, so this is the index the
            # branch line itself is about to receive.
            own_index = len(lines)
            offsets = ' '.join(str(child_index - own_index) for child_index in child_indices)
            line = 'B {} {}'.format(node.type, offsets)
        else:
            try:
                text = node.text.decode('utf-8')
            except UnicodeDecodeError as exc:
                raise ValueError("The text of the leaf nodes must be encoded using utf-8") from exc
            # A raw line feed would split this entry over several output lines.
            line = 'L {}'.format(text.replace('\n', '\\n'))

        lines.append(line)
        return len(lines) - 1

    process_node(tree.root_node)
    return '\n'.join(lines)

# Show the serialisation of the sample tree, with children referenced by relative offset.
print(process_tree_ast_relatively(tree))

L public
B modifiers -1
L class
L A
L {
L }
B class_body -2 -1
B class_declaration -6 -5 -4 -1
B program -1


In [7]:
def process_tree_ast_rel(tree):
    """Serialise a parse tree line by line, referencing children by offset.

    Nodes are emitted in post-order, so the line of every child precedes the
    line of its parent. Each line has one of two shapes:

    * ``L <text>`` for a node without children, where ``<text>`` is the node's
      source text decoded as UTF-8;
    * ``B <type> <d1> <d2> ...`` for a node with children, where each number
      is ``child line index - own line index`` and therefore negative.

    Line feed characters inside leaf text are written as a backslash followed
    by ``n`` so that every node occupies exactly one output line; without that
    a multi-line leaf would make the offsets point at the wrong lines. Leaf
    text is otherwise emitted verbatim, so a backslash followed by ``n`` that
    is already present in the source is not distinguishable from an escaped
    line feed.

    The tree is walked recursively, one call level per tree level.

    NOTE(review): this function started out as a copy of
    `process_tree_ast_relatively`; consider keeping a single implementation.

    Args:
        tree: Object exposing ``root_node``; nodes must expose ``children``,
            ``type`` and ``text`` (bytes).

    Returns:
        All lines joined with a line feed.

    Raises:
        ValueError: If ``tree`` or a visited node is ``None``, or if the text
            of a leaf is not valid UTF-8.
    """
    if tree is None:
        raise ValueError("The tree object must not be None")

    lines = []

    def process_node(node):
        """Append the line for `node` after its children and return its index."""
        if node is None:
            raise ValueError("The tree object does not have the expected structure")

        if node.children:
            child_indices = [process_node(child) for child in node.children]
            # All children are emitted by now, so this is the index the
            # branch line itself is about to receive.
            own_index = len(lines)
            offsets = ' '.join(str(child_index - own_index) for child_index in child_indices)
            line = 'B {} {}'.format(node.type, offsets)
        else:
            try:
                text = node.text.decode('utf-8')
            except UnicodeDecodeError as exc:
                raise ValueError("The text of the leaf nodes must be encoded using utf-8") from exc
            # A raw line feed would split this entry over several output lines.
            line = 'L {}'.format(text.replace('\n', '\\n'))

        lines.append(line)
        return len(lines) - 1

    process_node(tree.root_node)
    return '\n'.join(lines)

# Show the serialisation of the sample tree, with children referenced by relative offset.
print(process_tree_ast_rel(tree))

L public
B modifiers -1
L class
L A
L {
L }
B class_body -2 -1
B class_declaration -6 -5 -4 -1
B program -1


In [9]:
import base64
import hashlib

def process_tree_comp_sorted(tree):
    """Serialise a parse tree as de-duplicated, content-addressed lines.

    Every distinct node description is written once as ``<id> <description>``,
    and the lines are returned sorted by id. A description is either

    * ``L <text>`` for a node without children, where ``<text>`` is the node's
      source text decoded as UTF-8, or
    * ``B <type> <id1> <id2> ...`` for a node with children, where the ids are
      those of its children, in order.

    The id is the first 8 bytes of the SHA-256 digest of the description,
    encoded as URL-safe base64 without padding. Nodes with an identical
    description share one id and one line. Distinct descriptions whose
    digests happen to share their first 8 bytes would receive the same id;
    this is not detected.

    Line feed characters inside leaf text are written as a backslash followed
    by ``n`` so that every entry occupies exactly one output line; a raw line
    feed would leave a continuation line without an id of its own. Leaf text
    is otherwise emitted verbatim, so a backslash followed by ``n`` that is
    already present in the source is not distinguishable from an escaped
    line feed.

    The tree is walked recursively, one call level per tree level.

    Args:
        tree: Object exposing ``root_node``; nodes must expose ``children``,
            ``type`` and ``text`` (bytes).

    Returns:
        The sorted lines joined with a line feed.

    Raises:
        ValueError: If ``tree`` or a visited node is ``None``, or if the text
            of a leaf is not valid UTF-8.
    """
    if tree is None:
        raise ValueError("The tree object must not be None")

    lookup = {}  # description -> id, for descriptions already emitted
    lines = []

    def process_node_sorted(node):
        """Return the id for `node`, emitting its line on first sight."""
        if node is None:
            raise ValueError("The tree object does not have the expected structure")

        if node.children:
            child_ids = [process_node_sorted(child) for child in node.children]
            line = 'B {} {}'.format(node.type, ' '.join(child_ids))
        else:
            try:
                text = node.text.decode('utf-8')
            except UnicodeDecodeError as exc:
                raise ValueError("The text of the leaf nodes must be encoded using utf-8") from exc
            # A raw line feed would split this entry over several output lines.
            line = 'L {}'.format(text.replace('\n', '\\n'))

        idx = lookup.get(line)
        if idx is not None:
            return idx

        # Slicing the digest gives the same 8 bytes as converting them to a
        # signed integer and back.
        digest = hashlib.sha256(line.encode('utf-8')).digest()
        idx = base64.urlsafe_b64encode(digest[:8]).rstrip(b'=').decode('ascii')
        lines.append('{} {}'.format(idx, line))
        lookup[line] = idx

        return idx

    process_node_sorted(tree.root_node)
    sorted_lines = sorted(lines, key=lambda entry: entry.split()[0])
    return '\n'.join(sorted_lines)

# Show the de-duplicated, hash-addressed serialisation of the sample tree, sorted by id.
print(process_tree_comp_sorted(tree))

2usUse1FW98 L class
FwqmimPqmxY B class_declaration iLRoi73V-RI 2usUse1FW98 Jf7gA03GbKE _ETJtzenYkI
Jf7gA03GbKE L A
M8gnXQfEJJs B program FwqmimPqmxY
QRb-VDvN7n0 L }
_6xEEtLT7vg L public
_ETJtzenYkI B class_body f1dtMFaAKXw QRb-VDvN7n0
f1dtMFaAKXw L {
iLRoi73V-RI B modifiers _6xEEtLT7vg
