# helpers

> This module contains all the helper functions for this library.

In [None]:
#| default_exp helpers

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#|eval: false
from datasets import load_dataset
from tqdm.auto import tqdm

from collections import defaultdict

ds = load_dataset("bigcode/the-stack-smol", data_dir="data/python", split="train")

# Group every file's content under its repository in a single pass over the
# dataset. Calling `ds.filter` once per repository name would rescan the whole
# dataset for each repository.
contents_by_repo = defaultdict(list)
for repo_name, content in tqdm(
    zip(ds["repository_name"], ds["content"]),
    total=len(ds),
    desc="Processing repos",
):
    contents_by_repo[repo_name].append(content)

# Keep only repositories that contribute more than one file. No cap on the
# number of repositories is applied: grouping is already a single pass, and
# keeping every multi-file repository means a lookup by repository name does
# not depend on iteration order.
repo_files = {
    repo_name: contents
    for repo_name, contents in contents_by_repo.items()
    if len(contents) > 1
}

  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/10 [00:00<?, ?ba/s]

In [None]:
#| export
def get_query(language, program_lang):
    "Build the query capturing function and class names for `program_lang`"
    # Only Python has a query defined here; any other value yields None.
    if program_lang != "python":
        return None
    # Captures are tagged `func.name` and `class.name` respectively.
    pattern = """
            (function_definition
                name: (identifier) @func.name)

            (class_definition
                name: (identifier) @class.name)
            """
    return language.query(pattern)

In [None]:
#| export
def get_internal_methods(file_contents, tokenizer):
    """
    Collect the names captured by `get_query` across a set of files.

    The files in `file_contents` are joined with blank lines and parsed as a
    single source with `tokenizer.parser`. Every captured name is returned in
    a set, except names that start with a double underscore.
    """
    source_bytes = "\n\n".join(file_contents).encode()
    root = tokenizer.parser.parse(source_bytes).root_node
    name_query = get_query(tokenizer.language, tokenizer.program_lang)

    names = set()
    # Each capture is unpacked as a (node, tag) pair; only the node is used.
    for node, _ in name_query.captures(root):
        name = node.text.decode()
        # Skip dunders (and anything else starting with "__").
        if name.startswith("__"):
            continue
        names.add(name)
    return names

In [None]:
#|eval: false
from code_tokenizers.core import CodeTokenizer

# Tokenizer created from the "gpt2" pretrained name for the "python" language.
py_tokenizer = CodeTokenizer.from_pretrained("gpt2", "python")

# Run the helper on the files of one sampled repository and display the result.
sample_files = repo_files["reduceus/connect-python-sdk"]
internal_methods = get_internal_methods(sample_files, py_tokenizer)
internal_methods

{'V1ListItemsRequest',
 'V1RetrieveBusinessRequest',
 'batch_token',
 'to_dict',
 'to_str'}

In [None]:
#| hide
import nbdev

# Export the cells marked `#| export` in this notebook to the library module.
nbdev.nbdev_export()