In [1]:
# Generate the Python 3 lexer and parser from the Java grammar files.
# The shell command changes into _work/06-parsing (relative to the notebook's
# working directory) and runs the `antlr4` tool on JavaLexer.g4 and JavaParser.g4
# with the Python3 language target. The `antlr4` command must be on PATH.
! cd _work/06-parsing && antlr4 -Dlanguage=Python3 JavaLexer.g4 JavaParser.g4

In [2]:
# Import the generated lexer and parser
import sys
sys.path.append("_work/06-parsing")

import antlr4
from antlr4 import FileStream, CommonTokenStream
from JavaLexer import JavaLexer
from JavaParser import JavaParser

In [3]:
# Other imports
from pathlib import Path
import seutil as su
import random
import collections
from tqdm import tqdm

# Working directories, rooted at the directory the notebook is launched from
work_dir = Path.cwd().joinpath("_work")
downloads_dir = work_dir.joinpath("_downloads")

In [4]:
# Clone the subject repository (Apache Commons Collections) under the downloads directory
project = su.project.Project(
    full_name="apache_commons-collections",
    url="https://github.com/apache/commons-collections.git",
)
project.clone(downloads_dir)

INFO:su.seutil.project:Project apache_commons-collections: cloning to /home/pynie/projects/cs846mlse-1249-demos/_work/_downloads/apache_commons-collections
INFO:su.seutil.project:Project apache_commons-collections: existing at /home/pynie/projects/cs846mlse-1249-demos/_work/_downloads/apache_commons-collections


In [5]:
# Find subject files.
# rglob() already searches recursively, so the pattern needs no "**/" prefix.
# Sorting makes the candidate order independent of filesystem traversal order.
java_files = sorted(project.dir.rglob("*.java"))
print(f"Found {len(java_files)} Java files")
# Pick the subject with a seeded generator so that Restart & Run All selects the
# same file (and reproduces the outputs below); change the seed to sample another file.
subject_file = random.Random(0).choice(java_files)
print(f"Selected {subject_file}")
print(su.io.load(subject_file, fmt=su.io.fmts.txt))

Found 653 Java files
Selected /home/pynie/projects/cs846mlse-1249-demos/_work/_downloads/apache_commons-collections/src/test/java/org/apache/commons/collections4/queue/AbstractQueueTest.java
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
packag

In [6]:
# Parse the file: character stream -> lexer -> token stream -> parser -> parse tree.
# FileStream decodes the file as ASCII unless an encoding is given, which raises
# UnicodeDecodeError as soon as a source file contains a non-ASCII character.
# NOTE(review): assumes the subject sources are UTF-8 encoded — confirm for other repos.
input_stream = FileStream(str(subject_file), encoding="utf-8")
lexer = JavaLexer(input_stream)
stream = CommonTokenStream(lexer)
parser = JavaParser(stream)
# compilationUnit is the grammar rule used here as the entry point for a whole file
tree = parser.compilationUnit()

In [7]:
# Check tokens: print each token's symbolic type name followed by its text.
for token in stream.tokens:
    # The end-of-file token has type -1 (Token.EOF). Indexing symbolicNames with
    # it would silently return the *last* name in the list, so label it explicitly.
    if token.type == antlr4.Token.EOF:
        token_name = "EOF"
    else:
        token_name = lexer.symbolicNames[token.type]
    print(token_name, token.text)


COMMENT /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
WS 

PACKAGE package
WS  
IDENTIFIER org
DOT .
IDENTIFIER apache
DOT .
IDENTIFIER commons
DOT .
IDENTIFIER collections4
DOT .
IDENTIFIER queue
SEMI ;
WS 


IMPORT import
WS  
STATIC static


In [8]:
def pretty_print_tree(node, rule_names=None, parser=parser, indent_level=0) -> str:
    """Render a parse (sub)tree as indented, human-readable text.

    Each node is printed on its own line, two spaces deeper than its parent.
    A node with exactly one child is collapsed onto a single line as
    ``parent : child``; the children of the collapsed chain stay indented
    relative to the line the chain starts on.

    Args:
        node: the tree node to render (rule, error or terminal node).
        rule_names: rule names indexed by rule index; defaults to
            ``parser.ruleNames``.
        parser: parser used to look up rule names when ``rule_names`` is None.
        indent_level: number of spaces to indent the first line with.

    Returns:
        The rendered text, without a trailing newline.
    """
    if rule_names is None:
        rule_names = parser.ruleNames

    def label(current) -> str:
        # Text for one node on its own, ignoring its children.
        if isinstance(current, antlr4.RuleNode):
            name = rule_names[current.getRuleIndex()]
            alt_number = current.getAltNumber()
            # 0 is treated as "no alternative recorded"; the original comment
            # notes this should be ATN.INVALID_ALT_NUMBER.
            if alt_number != 0:
                return "<" + name + ":" + str(alt_number) + ">"
            return "<" + name + ">"
        # ErrorNode is tested before TerminalNode, as in the original ordering.
        if isinstance(current, antlr4.ErrorNode):
            return "<" + str(current) + ">"
        if isinstance(current, antlr4.TerminalNode) and current.symbol is not None:
            return current.symbol.text
        return ""

    def render(current, depth, inline) -> str:
        # `depth` is the indentation of the line `current` is printed on; an
        # inline node continues its parent's line and therefore adds no padding
        # of its own, but its children are still indented relative to `depth`.
        text = ("" if inline else depth * " ") + label(current)
        child_count = current.getChildCount()
        if child_count == 1:
            return text + " : " + render(current.getChild(0), depth, True)
        for i in range(child_count):
            text += "\n" + render(current.getChild(i), depth + 2, False)
        return text

    return render(node, indent_level, False)

def get_text(node) -> str:
    """Return the source text covered by ``node``.

    For a terminal node this is the token text. For any other node the text is
    sliced out of the character stream between the first character of the
    node's start token and the last character of its stop token, so whatever
    lies between the tokens in the source is included as-is.

    The character stream is taken from the start token itself rather than from
    a notebook-level variable, so the result stays correct after another file
    has been parsed in the same kernel.
    """
    if isinstance(node, antlr4.TerminalNode):
        return node.getText()

    first_token, last_token = node.start, node.stop
    if first_token is None or last_token is None:
        # No token span recorded for this node; fall back to the node's own
        # getText() instead of failing on a missing boundary token.
        return node.getText()
    return first_token.getInputStream().getText(first_token.start, last_token.stop)



In [9]:
# Render the whole parse tree as indented text and show it
print(pretty_print_tree(node=tree))

<compilationUnit>
  <packageDeclaration>
    package
    <qualifiedName>
      <identifier> : org
      .
      <identifier> : apache
      .
      <identifier> : commons
      .
      <identifier> : collections4
      .
      <identifier> : queue
    ;
  <importDeclaration>
    import
    static
    <qualifiedName>
      <identifier> : org
      .
      <identifier> : junit
      .
      <identifier> : jupiter
      .
      <identifier> : api
      .
      <identifier> : Assertions
      .
      <identifier> : assertEquals
    ;
  <importDeclaration>
    import
    static
    <qualifiedName>
      <identifier> : org
      .
      <identifier> : junit
      .
      <identifier> : jupiter
      .
      <identifier> : api
      .
      <identifier> : Assertions
      .
      <identifier> : assertNotNull
    ;
  <importDeclaration>
    import
    static
    <qualifiedName>
      <identifier> : org
      .
      <identifier> : junit
      .
      <identifier> : jupiter
      .
      <ident

In [10]:
# Collect every method declaration with a breadth-first walk over the parse tree
method_declarations = []
pending = collections.deque([tree])
while pending:
    current = pending.popleft()
    is_method = (
        isinstance(current, antlr4.RuleNode)
        and current.getRuleIndex() == parser.RULE_methodDeclaration
    )
    if is_method:
        # A match is recorded and its subtree is not searched any further
        method_declarations.append(current)
        continue
    pending.extend(current.getChild(idx) for idx in range(current.getChildCount()))

print(f"Found {len(method_declarations)} method declarations")
for declaration in method_declarations:
    print("-----")
    print(get_text(declaration))


Found 16 method declarations
-----
Queue<E> getCollection() {
        return (Queue<E>) super.getCollection();
    }
-----
boolean isSetSupported() {
        return true;
    }
-----
Collection<E> makeConfirmedCollection() {
        return new ArrayList<>();
    }
-----
Collection<E> makeConfirmedFullCollection() {
        return new ArrayList<>(Arrays.asList(getFullElements()));
    }
-----
Queue<E> makeFullCollection() {
        // only works if queue supports optional "addAll(Collection)"
        final Queue<E> queue = makeObject();
        queue.addAll(Arrays.asList(getFullElements()));
        return queue;
    }
-----
Queue<E> makeObject();
-----
void testEmptyQueueCompatibility() throws IOException, ClassNotFoundException {
        /*
         * Create canonical objects with this code
        Queue queue = makeEmptyQueue();
        if (!(queue instanceof Serializable)) return;

        writeExternalFormToDisk((Serializable) queue, getCanonicalEmptyCollectionName(queue));
       