In [75]:
TYPES_TO_INCLUDE = ["class", "def", "identifier", "parameters", "argument_list", "->", ":", "type"]

In [77]:
from typing import Any, List


def include_parent(
    node: Any,
    text: str,
    include_parent_attributes: bool,
    include_parent_init: bool,
    depth: int,
    buffer: str,
) -> str:
    """Recursively include the parent of a node in the buffer. Depth 1 includes the same level nodes, depth 2 includes the same level and the parent's level, etc."""
    if node.type == "block":
        parent = node.parent
    else:
        parent = node.parent.parent
        if parent is None or parent.type == "module":
            # pass for root or module
            return buffer
        if parent.type == "block":
            # depending which level the node starts, a single level parent could be a block. We need a class of function definition which is one level up.
            parent = parent.parent

    parent_str = ""
    byte_ranges = []
    for child in parent.children:
        if child.type in TYPES_TO_INCLUDE:
            byte_ranges.append(child.byte_range)

    # merge byte ranges
    new_byte_ranges = []
    merged_byte_range = ()
    (1,3), (4,6), (7,9)
    for i in range(len(byte_ranges) - 1):
        if byte_ranges[i][1] == byte_ranges[i + 1][0] - 1:
            if merged_byte_range == ():
                merged_byte_range = (byte_ranges[i][0], byte_ranges[i + 1][1])
            else:
                merged_byte_range = (merged_byte_range[0], byte_ranges[i + 1][1])
        else:
            if merged_byte_range == ():
                new_byte_ranges.append(byte_ranges[i])
            else:
                new_byte_ranges.append(merged_byte_range)
                merged_byte_range = ()
            if i == len(byte_ranges) - 2:
                new_byte_ranges.append(byte_ranges[i + 1])

    for byte_range in new_byte_ranges:
        reach_the_begin = False
        left_end_byte_of_chunk = byte_range[0]
        while not reach_the_begin:
            if text[left_end_byte_of_chunk - 1] != " ":
                reach_the_begin = True
            else:
                left_end_byte_of_chunk -= 1
        parent_str += text[left_end_byte_of_chunk : byte_range[1] + 1]

    # Add extra information about the parent node
    if include_parent_attributes:
        pass

    if include_parent_init:
        pass

    if depth == 1:  # last level
        return parent_str + "\n" + buffer
    else:
        return include_parent(
            parent,
            text,
            include_parent_attributes,
            include_parent_init,
            depth - 1,
            parent_str + "\n" + buffer,
        )

In [78]:
def chunk_node(
    node: Any,
    text: str,
    last_end: int = 0,
    max_chars: int = 1500,
    include_related_imports: bool = True,
    include_parent_depth: int = 1,
    include_parent_attributes: bool = False,
    summarize_parent_attributes: bool = False,
    include_parent_init: bool = False,
    summarize_parent_init: bool = False,
) -> List[str]:
    new_chunks = []
    imports = []
    current_chunk = ""
    is_first_node_for_chunk = True
    for child in node.children:
        if child.end_byte - child.start_byte > max_chars:
            # Child is too big, recursively chunk the child
            if len(current_chunk) > 0:
                new_chunks.append(current_chunk)
            current_chunk = ""
            new_chunks.extend(
                chunk_node(
                    child,
                    text,
                    last_end,
                    max_chars,
                    include_related_imports,
                    include_parent_depth,
                    include_parent_attributes,
                    summarize_parent_attributes,
                    include_parent_init,
                    summarize_parent_init
                )
            )
            is_first_node_for_chunk = True
        else:
            # add related parts only for the first node of a chunk
            if is_first_node_for_chunk and include_parent_depth > 0 and len(current_chunk.strip()) == 0:
                parent_information = include_parent(
                    child,
                    text,
                    include_parent_attributes,
                    include_parent_init,
                    include_parent_depth,
                    "",
                )
                current_chunk = parent_information + text[last_end : child.end_byte]
            else:
                if len(current_chunk) + child.end_byte - last_end < max_chars:
                    current_chunk += text[last_end : child.end_byte]
                else:
                    new_chunks.append(current_chunk)
                    if include_parent_depth > 0:
                        parent_information = include_parent(
                            child,
                            text,
                            include_parent_attributes,
                            include_parent_init,
                            include_parent_depth,
                            "",
                        )
                        current_chunk = (
                            parent_information
                            + text[last_end : child.end_byte]
                        )
                    else:
                        current_chunk = text[last_end : child.end_byte]
                    
            is_first_node_for_chunk = False

        last_end = child.end_byte

    if len(current_chunk) > 0:
        new_chunks.append(current_chunk)

    return new_chunks

In [79]:
import tree_sitter_languages
parser = tree_sitter_languages.get_parser("python")

with open(
    "/Users/minkijung/Documents/2PetProjects/AI_README_Generator/testing/test2.py", "r"
) as file:
    text = file.read()
tree = parser.parse(bytes(text, "utf-8"))
root = tree.root_node

In [80]:
chunks = chunk_node(root, text, 0, 1500, 2, True, True, True)

In [81]:
for i,result in enumerate(chunks):
    print(result)
    print("---------------")

from __future__ import annotations

from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence, Union, cast

from langchain_core.load.serializable import Serializable
from langchain_core.pydantic_v1 import Extra, Field
from langchain_core.utils import get_bolded_text
from langchain_core.utils._merge import merge_dicts, merge_lists
from langchain_core.utils.interactive_env import is_interactive_env

if TYPE_CHECKING:
    from langchain_core.prompts.chat import ChatPromptTemplate
---------------



class BaseMessage(Serializable):
---------------
class BaseMessage((Serializable)::


    """Base abstract message class.

    Messages are the inputs and outputs of ChatModels.
    """

    content: Union[str, List[Union[str, Dict]]]
    """The string contents of the message."""

    additional_kwargs: dict = Field(default_factory=dict)
    """Reserved for additional payload data associated with the message.
    
    For example, for a message from an AI, this could include tool ca

In [82]:
from app.agents.subgraphs.retrieval.chunking import chunk_with_AST_parser


result = chunk_with_AST_parser(
    "/Users/minkijung/Documents/2PetProjects/AI_README_Generator/testing"
)
for i, r in enumerate(result):
    print(r.text)
    print("---------------")

language:  python
"""Code splitter."""

import os
from typing import Any, Callable, List, Optional

from llama_index.core.bridge.pydantic import Field, PrivateAttr
from llama_index.core.callbacks.base import CallbackManager
from llama_index.core.callbacks.schema import CBEventType, EventPayload
from llama_index.core.node_parser.interface import TextSplitter
from llama_index.core.node_parser.node_utils import default_id_func
from llama_index.core.schema import Document

DEFAULT_CHUNK_LINES = 40
DEFAULT_LINES_OVERLAP = 15
DEFAULT_MAX_CHARS = 1500
---------------
class CodeSplitter(TextSplitter):
---------------
"""Split code using a AST parser.

    Thank you to Kevin Lu / SweepAI for suggesting this elegant code splitting solution.
    https://docs.sweep.dev/blogs/chunking-2m-files
    """

    language: str = Field(
        description="The programming language of the code being split."
    )
    chunk_lines: int = Field(
        default=DEFAULT_CHUNK_LINES,
        description="The nu