## Preprocess

In [60]:
import jsonlines
import pandas
import json
# Load the records from the JSON Lines file (one JSON object per line).
# The encoding is pinned to UTF-8 so the result does not depend on the platform's
# default locale, and blank lines (e.g. a trailing newline at the end of the file)
# are skipped so json.loads is never handed an empty string.
with open('../data/kotlin-public.jsonl', 'r', encoding='utf-8') as f:
    dataframe = pandas.DataFrame([json.loads(line) for line in f if line.strip()])

In [None]:
# Show the loaded frame; the recorded output is truncated to the leading and trailing rows.
dataframe

Unnamed: 0,id,repo,revision,path,modified,prefix,suffix,archive
0,797c52,Anthonyy232/Paperize,09a6e2f031bd9a350ba17b398813d62c37982f5c,app/src/main/java/com/anthonyla/livewallpaper/...,[app/src/main/java/com/anthonyla/livewallpaper...,package com.anthonyla.livewallpaper.navigation...,}\n ...,Anthonyy232__Paperize-09a6e2f031bd9a350ba17b39...
1,35f83f,Anthonyy232/Paperize,322a10bf0c9f9bbef49f4fa75f149f198f9598a8,app/src/main/java/com/anthonyla/paperize/featu...,[app/src/main/java/com/anthonyla/paperize/core...,package com.anthonyla.paperize.feature.wallpap...,if (shouldScheduleAlar...,Anthonyy232__Paperize-322a10bf0c9f9bbef49f4fa7...
2,44457a,Anthonyy232/Paperize,37858a3e5c1bb7cfd8eb3e0bf769502fb2668db0,app/src/main/java/com/anthonyla/paperize/featu...,[app/src/main/java/com/anthonyla/paperize/App....,package com.anthonyla.paperize.feature.wallpap...,is SettingsEvent.RefreshWallpaperS...,Anthonyy232__Paperize-37858a3e5c1bb7cfd8eb3e0b...
3,a23204,Anthonyy232/Paperize,48a730e4a9be938118cae2bb66eef528fdbd7c2b,app/src/main/java/com/anthonyla/paperize/featu...,[app/src/main/java/com/anthonyla/paperize/core...,package com.anthonyla.paperize.feature.wallpap...,"inverseSurface = Color.White,\n inverse...",Anthonyy232__Paperize-48a730e4a9be938118cae2bb...
4,0fa786,Anthonyy232/Paperize,5eb55575b4a33d5f9206780ff5e1316f13354de6,app/src/main/java/com/anthonyla/paperize/featu...,[app/src/main/java/com/anthonyla/paperize/App....,package com.anthonyla.paperize.feature.wallpap...,"),\n enterTransition = ...",Anthonyy232__Paperize-5eb55575b4a33d5f9206780f...
...,...,...,...,...,...,...,...,...
395,7e5c05,xebia-functional/xef,cc264d2383f63569c9fee1446a9a655e91b624ca,kotlin/src/commonMain/kotlin/com/xebia/functio...,[kotlin/src/commonMain/kotlin/com/xebia/functi...,package com.xebia.functional.prompt\n\nimport ...,\ndata class HumanMessage(override val content...,xebia-functional__xef-cc264d2383f63569c9fee144...
396,6c96f1,xebia-functional/xef,d46e052218799ad1b8941eb7001af4d1c3063473,core/src/commonMain/kotlin/com/xebia/functiona...,[core/src/commonMain/kotlin/com/xebia/function...,package com.xebia.functional.xef.llm\n\nimport...,"scope,\n functions,\n { json -...",xebia-functional__xef-d46e052218799ad1b8941eb7...
397,430599,xebia-functional/xef,e42d443fff42efaed67c27aaf9e46aba484be9d9,core/src/commonMain/kotlin/com/xebia/functiona...,[core/src/commonMain/kotlin/com/xebia/function...,package com.xebia.functional.xef.metrics\n\nim...,}\n\n override suspend fun assistantCreated...,xebia-functional__xef-e42d443fff42efaed67c27aa...
398,d2b778,xebia-functional/xef,fe88efd640047ad0435362b1e744543c5f304b76,gpt4all-kotlin/src/jvmMain/kotlin/com/xebia/fu...,[core/src/commonMain/kotlin/com/xebia/function...,package com.xebia.functional.gpt4all\n\nimport...,"\n operator fun invoke(\n url: String,...",xebia-functional__xef-fe88efd640047ad0435362b1...


In [67]:
import os
# Marker text inserted between a row's prefix and suffix when the incomplete file is rebuilt.
SEPARATOR_COMMENT = "\n/**<FIM>*/\n"
# Directory that holds the repository snapshots; each snapshot lives in a
# sub-directory named "<owner>__<repo>-<revision>".
DATA_ROOT = "../data/repositories-kotlin-public/"



def extract_original_code_from_row(row: pandas.Series, data_root: "str | None" = None) -> str:
    """
    Read the full text of the source file that a dataset row points at.

    The file is looked up at ``<data_root>/<owner>__<repo>-<revision>/<path>``,
    where ``<owner>/<repo>``, ``<revision>`` and ``<path>`` are taken from the
    row's ``repo``, ``revision`` and ``path`` fields.

    Args:
        row: Record with string fields ``repo``, ``revision`` and ``path``.
        data_root: Directory containing the repository snapshots. When omitted,
            the module-level ``DATA_ROOT`` is used, which keeps
            ``dataframe.apply(extract_original_code_from_row, axis=1)`` working.

    Returns:
        The content of the file as a string.

    Raises:
        OSError: If the file does not exist or cannot be read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Only fall back to the global when no explicit root was given.
    root = DATA_ROOT if data_root is None else data_root
    snapshot_dir = "-".join([row["repo"].replace("/", "__"), row["revision"]])
    file_path = os.path.join(root, snapshot_dir, row["path"])
    # Decode explicitly as UTF-8 instead of relying on the platform's default
    # locale encoding, which would make the result machine-dependent.
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

# Attach, for every row, the text of the file it references in the repository snapshot.
dataframe['original_code'] = dataframe.apply(
    extract_original_code_from_row,
    axis=1,
)
# Rebuild the incomplete file: prefix, then the separator marker, then suffix.
dataframe['incomplete_code'] = dataframe.apply(
    lambda record: record['prefix'] + SEPARATOR_COMMENT + record['suffix'],
    axis=1,
)


In [71]:
import diff_match_patch

def extract_diff_from_original_code_and_incomplete_code(row) -> str:
    """
    Collect the text that differs between a row's original and incomplete code.

    A line-mode diff is computed from ``row['original_code']`` to
    ``row['incomplete_code']``. The text of every non-equal segment is kept,
    in diff order, and the segments are joined with newlines. The operation
    (insert vs. delete) of each segment is not recorded in the result.
    """
    differ = diff_match_patch.diff_match_patch()
    segments = differ.diff_lineMode(
        row['original_code'],
        row['incomplete_code'],
        deadline=None,
    )

    changed_texts = []
    for operation, text in segments:
        # Operation code 0 marks unchanged text (diff_match_patch's DIFF_EQUAL).
        if operation == 0:
            continue
        changed_texts.append(text)
    return "\n".join(changed_texts)

def extract_patch_from_original_code_and_incomplete_code(row) -> str:
    """
    Build the textual patch that turns a row's original code into its incomplete code.

    Patches are created from ``row['original_code']`` to
    ``row['incomplete_code']`` and serialised into a single string.
    """
    patcher = diff_match_patch.diff_match_patch()
    patches = patcher.patch_make(row['original_code'], row['incomplete_code'])
    # Serialise the list of patch objects into one text blob.
    return patcher.patch_toText(patches)


In [72]:
# Export four artefacts per row into ../samples/<owner>__<repo>-<revision>/:
# the original file, the incomplete file, the textual diff and the patch.
output_root = "../samples/"  # loop-invariant, so it is set once outside the loop
for index, row in dataframe.iterrows():
    repo_revision = "-".join([row["repo"].replace("/", "__"), row['revision']])
    dir_name = os.path.join(output_root, repo_revision)
    os.makedirs(dir_name, exist_ok=True)

    # Compute every payload before opening any file, so a failure in the diff or
    # patch step does not leave a freshly truncated, empty file behind.
    artefacts = {
        "original.kt": row['original_code'],
        "incomplete.kt": row['incomplete_code'],
        "diff.diff": extract_diff_from_original_code_and_incomplete_code(row),
        "patch.patch": extract_patch_from_original_code_and_incomplete_code(row),
    }
    for file_name, content in artefacts.items():
        # Write UTF-8 explicitly: source files may contain non-ASCII characters
        # that the platform's default encoding cannot represent.
        with open(os.path.join(dir_name, file_name), 'w', encoding='utf-8') as out_file:
            out_file.write(content)
        
        
        


## Syntax tree analysis

In [6]:
# Let's create a simple example to demonstrate the tree-sitter query matching
import tree_sitter

# Small Kotlin class used as the input for the tree-sitter experiments below.
kotlin_code = """
class Person(val name: String) {
    var age: Int = 0
    
    fun getInfo(): String {
        return "Name: $name, Age: $age"
    }
    
    // This is a comment
    @Override
    fun toString() = "Person(name=$name)"
}
"""

# Set up a parser bound to the grammar held in `language`.
# NOTE(review): `language` is not defined in any cell visible here; presumably it is a
# tree-sitter Kotlin Language object created elsewhere. Confirm it exists on a fresh kernel.
parser = tree_sitter.Parser()
parser.set_language(language)

# Parse the UTF-8 encoded sample, then dump the syntax tree as an S-expression.
tree = parser.parse(kotlin_code.encode("utf8"))
print(tree.root_node.sexp())

(source_file (class_declaration name: (type_identifier) primary_constructor: (primary_constructor parameters: (class_parameter (simple_identifier) (user_type (type_identifier)))) body: (class_body (property_declaration variable: (variable_declaration (simple_identifier) (user_type (type_identifier))) expression: (integer_literal)) (function_declaration name: (simple_identifier) parameters: (function_value_parameters) return_type: (user_type (type_identifier)) body: (function_body (statements (jump_expression (string_literal (interpolated_identifier) (interpolated_identifier)))))) (line_comment) (function_declaration modifiers: (modifiers (annotation (user_type (type_identifier)))) name: (simple_identifier) parameters: (function_value_parameters) body: (function_body (string_literal (interpolated_identifier)))))))


In [17]:
def highlight_matches(code: str, query_string: str):
    """
    Run a tree-sitter query over ``code`` and print every capture it yields.

    The query is compiled with the module-level ``language`` and the code is
    parsed with the module-level ``parser``. Each capture is printed on its own
    line; when the query yields nothing, "No matches found." is printed instead.
    Nothing is returned.
    """
    compiled_query = language.query(query_string)
    syntax_tree = parser.parse(code.encode("utf8"))
    captures = compiled_query.captures(syntax_tree.root_node)

    if captures:
        for capture in captures:
            print(capture)
    else:
        print("No matches found.")

In [21]:
# Run a few hand-written tree-sitter queries against the sample Kotlin class.

# Capture function declarations (@function) and their name identifier (@function_name).
# NOTE(review): the recorded output has no function_name capture for the annotated
# toString(); presumably the "." anchor requires the identifier to be the first
# child, which the leading modifiers prevent. Confirm.
function_query = """
(function_declaration 
    . (simple_identifier) @function_name) @function
"""
print("Function declarations:")
highlight_matches(kotlin_code, function_query)

# Capture class declarations (@class) and their name (@class_name).
# NOTE(review): this query is defined but never passed to highlight_matches in this cell.
# The tree printed for the sample shows the class name as (type_identifier), so this
# pattern presumably needs that node type before it can match. Confirm.
class_query = """(class_declaration
    (simple_identifier) @class_name) @class
"""

# Capture property names from two places: primary-constructor parameters and
# property declarations inside the class body. Both use the same @property capture.
property_query = """
(class_parameter
    (simple_identifier) @property)
(class_body
    (property_declaration
        (variable_declaration
            (simple_identifier) @property)))
"""
print("\nProperties:")
highlight_matches(kotlin_code, property_query)

Function declarations:
(<Node type=function_declaration, start_point=(4, 4), end_point=(6, 5)>, 'function')
(<Node type=simple_identifier, start_point=(4, 8), end_point=(4, 15)>, 'function_name')
(<Node type=function_declaration, start_point=(9, 4), end_point=(10, 41)>, 'function')

Properties:
(<Node type=simple_identifier, start_point=(1, 17), end_point=(1, 21)>, 'property')
(<Node type=simple_identifier, start_point=(2, 8), end_point=(2, 11)>, 'property')


In [22]:
# Function to analyze package and import relationships
def analyze_package_imports(code: str) -> tuple[str, list[str]]:
    """
    Extract the package name and the imported identifiers from Kotlin source.

    The code is parsed with the module-level ``parser`` and queried with the
    module-level ``language``.

    Returns:
        ``(package_name, imports)``. ``package_name`` is the text of the last
        package identifier captured, or ``""`` if there is none. ``imports``
        holds the text of every import identifier, in capture order. When the
        query captures nothing at all, "No matches found." is printed and
        ``("", [])`` is returned.
    """
    header_query = """
    (package_header
        (identifier) @package)
        
    (import_header
        (identifier) @import)
    """

    syntax_tree = parser.parse(code.encode("utf8"))
    captures = language.query(header_query).captures(syntax_tree.root_node)
    if not captures:
        print("No matches found.")
        return "", []

    package_texts = [
        node.text.decode("utf-8") for node, label in captures if label == "package"
    ]
    import_texts = [
        node.text.decode("utf-8") for node, label in captures if label == "import"
    ]
    # If several package captures exist, the last one wins.
    package_name = package_texts[-1] if package_texts else ""
    return package_name, import_texts

def check_same_repository_imports(
    package_name: str,
    imports: list[str],
    prefix_depth: int = 1,
) -> list[str]:
    """
    Select the imports that share a leading package prefix with ``package_name``.

    Args:
        package_name: Dotted package of the file being analysed.
        imports: Dotted import paths to filter.
        prefix_depth: Number of leading dot-separated components that must be
            equal. The default of 1 compares only the root component. For
            packages rooted at a generic component such as ``com`` or ``org``,
            a depth of 2 or more avoids treating unrelated ``com.*`` libraries
            as part of the same repository. If the package has fewer components
            than ``prefix_depth``, all of its components are compared.

    Returns:
        The matching imports, in their original order.

    Raises:
        ValueError: If ``prefix_depth`` is smaller than 1.
    """
    if prefix_depth < 1:
        raise ValueError("prefix_depth must be at least 1")

    package_prefix = package_name.split('.')[:prefix_depth]
    # Clamp to the components the package actually has, so a short package
    # name is still comparable against longer import paths.
    depth = len(package_prefix)

    return [
        imp for imp in imports
        if imp.split('.')[:depth] == package_prefix
    ]

# Exercise both helpers on a small hand-written Kotlin file header.
sample_kotlin = '''package fe.linksheet.module.resolver
import android.content.Context
import android.content.Intent
import android.content.IntentFilter
import android.content.IntentFilter.AuthorityEntry
import android.content.pm.PackageManager
import android.content.pm.ResolveInfo
import android.content.pm.queryIntentActivitiesCompat
import android.net.Uri
import fe.linksheet.util.BitFlagUtil
import fe.linksheet.util.LinkSheetCompat
'''

# Compute everything first, then report.
package_name, imports = analyze_package_imports(sample_kotlin)
same_repo_imports = check_same_repository_imports(package_name, imports)

print(f"Package: {package_name}")

print("\nImports:")
for import_path in imports:
    print(f"  {import_path}")

print("\nImports from same repository:")
for import_path in same_repo_imports:
    print(f"  {import_path}")

Package: fe.linksheet.module.resolver

Imports:
  android.content.Context
  android.content.Intent
  android.content.IntentFilter
  android.content.IntentFilter.AuthorityEntry
  android.content.pm.PackageManager
  android.content.pm.ResolveInfo
  android.content.pm.queryIntentActivitiesCompat
  android.net.Uri
  fe.linksheet.util.BitFlagUtil
  fe.linksheet.util.LinkSheetCompat

Imports from same repository:
  fe.linksheet.util.BitFlagUtil
  fe.linksheet.util.LinkSheetCompat
