-
Notifications
You must be signed in to change notification settings - Fork 45
Add Splitter classes #51
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
9 commits
Select commit
Hold shift + click to select a range
fad24d0
fix: update __init__.py in models
ChenZiHong-Gavin a430183
refactor: add datatypes
ChenZiHong-Gavin 0bffcfc
feat: add splitter classes
ChenZiHong-Gavin b02307f
Update graphgen/bases/datatypes.py
ChenZiHong-Gavin 86e9082
Update tests/integration_tests/models/splitter/test_markdown_splitter.py
ChenZiHong-Gavin 797781d
Update graphgen/models/splitter/recursive_character_splitter.py
ChenZiHong-Gavin 6a6cb34
feat(webui): update webui with splitter config
ChenZiHong-Gavin d439262
Update graphgen/models/splitter/markdown_splitter.py
ChenZiHong-Gavin fdaef0e
Update graphgen/graphgen.py
ChenZiHong-Gavin File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,135 @@ | ||
| import copy | ||
| import re | ||
| from abc import ABC, abstractmethod | ||
| from dataclasses import dataclass | ||
| from typing import Callable, Iterable, List, Literal, Optional, Union | ||
|
|
||
| from graphgen.bases.datatypes import Chunk | ||
| from graphgen.utils import logger | ||
|
|
||
|
|
||
@dataclass
class BaseSplitter(ABC):
    """
    Abstract base class for splitting text into smaller chunks.

    Subclasses implement :meth:`split_text`. The helpers defined here merge
    small pieces into chunks bounded by ``chunk_size`` and wrap the results
    in ``Chunk`` objects.
    """

    # Target upper bound for a chunk, measured with ``length_function``.
    chunk_size: int = 1024
    # Length budget retained from the tail of one chunk when starting the next.
    chunk_overlap: int = 100
    # Measures pieces and separators while merging.
    length_function: Callable[[str], int] = len
    # Not read by this base class itself.
    # NOTE(review): presumably forwarded by subclasses to _split_text_with_regex.
    keep_separator: bool = False
    # When true, create_chunks records each chunk's offset as "start_index".
    add_start_index: bool = False
    # When true, _join_chunks strips leading/trailing whitespace.
    strip_whitespace: bool = True

    @abstractmethod
    def split_text(self, text: str) -> List[str]:
        """
        Split the input text into smaller chunks.

        :param text: The input text to be split.
        :return: A list of text chunks.
        """

    @staticmethod
    def _chunk_id(content: str) -> str:
        """
        Build a deterministic identifier from a chunk's content.

        Identical content always yields the identical id.

        :param content: The chunk text.
        :return: ``"chunk-"`` followed by the MD5 hex digest of the content.
        """
        # Function-scope import so this block does not depend on the
        # file-level import list.
        import hashlib

        return "chunk-" + hashlib.md5(content.encode("utf-8")).hexdigest()

    def create_chunks(
        self, texts: List[str], metadatas: Optional[List[dict]] = None
    ) -> List[Chunk]:
        """
        Split every text and wrap each resulting piece in a ``Chunk``.

        :param texts: The texts to split.
        :param metadatas: Optional per-text metadata aligned with ``texts``;
            every chunk receives its own deep copy of its text's metadata.
        :return: All chunks, in input order.
        """
        _metadatas = metadatas or [{}] * len(texts)
        chunks = []
        for i, text in enumerate(texts):
            index = 0
            previous_chunk_len = 0
            for chunk in self.split_text(text):
                # Deep copy so chunks never share (or mutate) caller metadata.
                metadata = copy.deepcopy(_metadatas[i])
                if self.add_start_index:
                    # Resume the search just before the end of the previous
                    # chunk, allowing for the configured overlap.
                    offset = index + previous_chunk_len - self.chunk_overlap
                    # str.find returns -1 when the chunk is not found verbatim.
                    index = text.find(chunk, max(0, offset))
                    metadata["start_index"] = index
                    previous_chunk_len = len(chunk)
                # ``Chunk.id`` is a required field; omitting it raises TypeError.
                new_chunk = Chunk(
                    id=self._chunk_id(chunk), content=chunk, metadata=metadata
                )
                chunks.append(new_chunk)
        return chunks

    def _join_chunks(self, chunks: List[str], separator: str) -> Optional[str]:
        """
        Join pieces with ``separator``.

        :param chunks: The pieces to join.
        :param separator: The string placed between pieces.
        :return: The joined text (stripped when ``strip_whitespace`` is set),
            or ``None`` when the result is an empty string.
        """
        text = separator.join(chunks)
        if self.strip_whitespace:
            text = text.strip()
        if text == "":
            return None
        return text

    def _merge_splits(self, splits: Iterable[str], separator: str) -> List[str]:
        """
        Greedily combine small pieces into chunks bounded by ``chunk_size``.

        Pieces are accumulated until adding the next one would exceed
        ``chunk_size``; the accumulated chunk is then emitted and leading
        pieces are dropped until at most ``chunk_overlap`` of length remains.
        A warning is logged when an accumulated chunk is already longer than
        ``chunk_size``.

        :param splits: The pieces to merge, in order.
        :param separator: The string used to join pieces inside a chunk.
        :return: The merged chunks; joins that come out empty are skipped.
        """
        separator_len = self.length_function(separator)

        chunks = []
        current_chunk: List[str] = []
        # Running length of ``current_chunk`` including its separators.
        total = 0
        for d in splits:
            _len = self.length_function(d)
            if (
                total + _len + (separator_len if len(current_chunk) > 0 else 0)
                > self.chunk_size
            ):
                if total > self.chunk_size:
                    logger.warning(
                        "Created a chunk of size %s, which is longer than the specified %s",
                        total,
                        self.chunk_size,
                    )
                if len(current_chunk) > 0:
                    chunk = self._join_chunks(current_chunk, separator)
                    if chunk is not None:
                        chunks.append(chunk)
                    # Keep on popping if:
                    # - we have a larger chunk than in the chunk overlap
                    # - or if we still have any chunks and the length is long
                    while total > self.chunk_overlap or (
                        total + _len + (separator_len if len(current_chunk) > 0 else 0)
                        > self.chunk_size
                        and total > 0
                    ):
                        total -= self.length_function(current_chunk[0]) + (
                            separator_len if len(current_chunk) > 1 else 0
                        )
                        current_chunk = current_chunk[1:]
            current_chunk.append(d)
            total += _len + (separator_len if len(current_chunk) > 1 else 0)
        # Flush whatever is left after the final piece.
        chunk = self._join_chunks(current_chunk, separator)
        if chunk is not None:
            chunks.append(chunk)
        return chunks

    @staticmethod
    def _split_text_with_regex(
        text: str, separator: str, keep_separator: Union[bool, Literal["start", "end"]]
    ) -> List[str]:
        """
        Split ``text`` on ``separator``, optionally keeping the separators.

        :param text: The text to split.
        :param separator: A regular-expression pattern (passed to ``re.split``
            unescaped). An empty separator splits into single characters.
        :param keep_separator: Falsy drops the separators; ``"end"`` attaches
            each separator to the piece before it; any other truthy value
            attaches each separator to the piece after it.
        :return: The pieces, with empty strings removed.
        """
        # Now that we have the separator, split the text
        if separator:
            if keep_separator:
                # The parentheses in the pattern keep the delimiters in the result.
                _splits = re.split(f"({separator})", text)
                splits = (
                    (
                        [
                            _splits[i] + _splits[i + 1]
                            for i in range(0, len(_splits) - 1, 2)
                        ]
                    )
                    if keep_separator == "end"
                    else (
                        [_splits[i] + _splits[i + 1] for i in range(1, len(_splits), 2)]
                    )
                )
                # For a separator without capture groups of its own, re.split
                # returns an odd-length list, so this branch is not taken.
                if len(_splits) % 2 == 0:
                    splits += _splits[-1:]
                # Re-attach the piece that has no separator to pair with:
                # the last one for "end", the first one otherwise.
                splits = (
                    (splits + [_splits[-1]])
                    if keep_separator == "end"
                    else ([_splits[0]] + splits)
                )
            else:
                splits = re.split(separator, text)
        else:
            splits = list(text)
        return [s for s in splits if s != ""]
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,18 @@ | ||
| from dataclasses import dataclass, field | ||
|
|
||
|
|
||
@dataclass
class Chunk:
    """
    A piece of text together with its identifier and metadata.
    """

    # Required identifier; there is no default, so callers must supply it.
    id: str
    # The chunk text.
    content: str
    # Each instance gets its own fresh dict when none is supplied.
    metadata: dict = field(default_factory=dict)
|
|
||
|
|
||
@dataclass
class QAPair:
    """
    A question paired with its answer.
    """

    # The question text.
    question: str
    # The answer text.
    answer: str
File renamed without changes.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Creating Chunk instances without providing the required `id` field will cause runtime errors.
The Chunk dataclass requires `id` and `content` (only `metadata` has a default), but only content and metadata are being provided.