In [4]:
import json
import pandas as pd
from tree_sitter_languages import get_language, get_parser

In [None]:
def remove_import_nodes(code, language_name):
    """
    Remove import nodes from the code by splitlines()
    and reconstruct the code without import statements.
    """
    lines = code.splitlines()
    file_name = lines[0]
    original_code = "\n".join(lines[1:])
    language = get_language(language_name)
    parser = get_parser(language_name)
    tree = parser.parse(bytes(original_code, "utf8"))
    root_node = tree.root_node
    query_string = "(import_list) @import\n(import_header) @import"
    query = language.query(query_string)
    matches = query.captures(root_node)
    start_bytes = [match[0].start_byte for match in matches]
    end_bytes = [match[0].end_byte for match in matches]
    new_code = ""
    for start_byte, end_byte in zip(start_bytes, end_bytes):
        new_code = original_code[start_byte:] + original_code[end_byte:]
    cleaned_code = f"{file_name}\n{new_code}"
    cleaned_code = cleaned_code.replace("\n\n", "\n").strip()
    return cleaned_code

In [57]:
remove_import_nodes("import kotlin.io.ansk\nfun main() {\n    println(\"Hello, World!\")\n}", "kotlin")

'import kotlin.io.ansk'

In [50]:
file_separator = "<|file_sep|>"

cleaned_context = dataframe['context'].apply(lambda x: file_separator.join([remove_import_nodes(y, 'kotlin') for y in x.split(file_separator)]))


In [41]:
with open("../predictions/kotlin_public_context-heuristic-max-file-3-cleaned.jsonl", 'w') as f:
    for index, row in dataframe.iterrows():
        f.write(json.dumps({

            "context": cleaned_context[index], "prefix": row['prefix'], "suffix": row['suffix'],
        }) + "\n")

In [42]:
cleaned_context[12]

'{"context": "\n{"context": "<|file_sep|>app/src/main/java/org/koitharu/kotatsu/shikimori/data/ShikimoriRepository.kt\\npackage org.koitharu.kotatsu.shikimori.data\\n\\nimport okhttp3.FormBody\\nimport okhttp3.HttpUrl.Companion.toHttpUrl\\nimport okhttp3.OkHttpClient\\nimport okhttp3.Request\\nimport org.koitharu.kotatsu.parsers.model.Manga\\nimport org.koitharu.kotatsu.parsers.util.await\\nimport org.koitharu.kotatsu.parsers.util.json.mapJSON\\nimport org.koitharu.kotatsu.parsers.util.parseJson\\nimport org.koitharu.kotatsu.parsers.util.parseJsonArray\\nimport org.koitharu.kotatsu.parsers.util.urlEncoded\\nimport org.koitharu.kotatsu.shikimori.data.model.ShikimoriManga\\nimport org.koitharu.kotatsu.shikimori.data.model.ShikimoriMangaInfo\\nimport org.koitharu.kotatsu.shikimori.data.model.ShikimoriUser\\n\\nprivate const val CLIENT_ID = \\"Mw6F0tPEOgyV7F9U9Twg50Q8SndMY7hzIOfXg0AX_XU\\"\\nprivate const val CLIENT_SECRET = \\"euBMt1GGRSDpVIFQVPxZrO7Kh6X4gWyv0dABuj4B-M8\\"\\nprivate const