#### Package installation

In [1]:
!pip install "git+https://github.com/panalexeu/horchunk.git"
!pip install "git+https://github.com/panalexeu/horchunk.git"
!pip install numpy
!pip install rich

/bin/bash: line 1: pip: command not found
/bin/bash: line 1: pip: command not found
/bin/bash: line 1: pip: command not found
/bin/bash: line 1: pip: command not found


#### Define a chunker wrapper to evaluate with the chunking_evaluation framework

In [2]:
from horchunk.chunkers import WindowChunker 
from horchunk.splitters import SentenceSplitter 
from chromadb.utils import embedding_functions
from chunking_evaluation.chunking.base_chunker import BaseChunker

# Shared embedding function: passed to WindowChunker inside the wrapper below and
# to GeneralEvaluation().run(...) in the evaluation cells.
# Chroma's default embedding function (all-MiniLM-L6-v2).
ef = embedding_functions.DefaultEmbeddingFunction()

# thresh: 0.61, max_chunk_size=6
class WinChunkerWrapper(BaseChunker):
    """Adapter that exposes horchunk's ``WindowChunker`` via ``BaseChunker.split_text``.

    The wrapper only stores the chunking parameters; the actual ``WindowChunker``
    is created on every ``split_text`` call using the module-level embedding
    function ``ef``.
    """

    def __init__(self, thresh: float, max_chunk_size: int):
        """Remember the parameters forwarded to ``WindowChunker``.

        Args:
            thresh: Passed through as ``WindowChunker(thresh=...)``.
                Presumably a similarity threshold -- confirm against horchunk.
            max_chunk_size: Passed through as ``WindowChunker(max_chunk_size=...)``.
        """
        self.thresh = thresh
        self.max_chunk_size = max_chunk_size

    def split_text(self, text: str) -> list[str]:
        """Split ``text`` into chunks and return them as plain strings.

        The text is first split with ``SentenceSplitter``; the resulting splits are
        grouped by ``WindowChunker``; each resulting chunk is turned into a string
        through its ``join()`` method.

        Args:
            text: The document to chunk.

        Returns:
            One string per chunk, in the order produced by the chunker.
        """
        # A new chunker per call keeps this method free of state carried over
        # between documents (same behaviour as before).
        window_chunker = WindowChunker(
            ef,
            thresh=self.thresh,
            max_chunk_size=self.max_chunk_size,
        )
        # Invoke the callables directly instead of spelling out ``__call__``.
        splits = SentenceSplitter(text)()
        chunks = window_chunker(splits)

        return [chunk.join() for chunk in chunks]

  from .autonotebook import tqdm as notebook_tqdm


#### Start evaluation

In [3]:
def parse_res(res: dict) -> dict:
    """Extract the summary metrics from an evaluation result.

    Only the mean/std entries for IoU, recall, precision-omega and precision are
    kept; any other entries in ``res`` are ignored. Output keys are identical to
    the keys read from ``res`` (the former ``precision_omeaga_*`` output keys were
    a misspelling of ``precision_omega_*``).

    Args:
        res: Result mapping that contains at least the eight metric keys below.

    Returns:
        A new dict with the eight metrics, in a fixed order.

    Raises:
        KeyError: If one of the expected metric keys is missing from ``res``.
    """
    metric_keys = (
        'iou_mean',
        'iou_std',
        'recall_mean',
        'recall_std',
        'precision_omega_mean',
        'precision_omega_std',
        'precision_mean',
        'precision_std',
    )
    return {key: res[key] for key in metric_keys}

In [4]:
# Evaluation run 1: thresh=0.72, max_chunk_size=3.
# NOTE(review): this cell and the two that follow differ only in max_chunk_size
# (3, 6, 9); a helper function or loop would remove the duplication.
from chunking_evaluation import GeneralEvaluation
# rich's print rebinds the name `print` for the rest of the notebook.
from rich import print

chunker = WinChunkerWrapper(thresh=0.72, max_chunk_size=3)
# Runs the evaluation with the chunker and the shared embedding function `ef`.
# Long-running: the recorded progress bars for this cell add up to roughly 19 minutes.
res = GeneralEvaluation().run(chunker, ef)
# Print only the mean/std summary metrics selected by parse_res.
print(parse_res(res))

100%|█████████████████████████████████████████| 658/658 [01:05<00:00, 10.06it/s]
100%|███████████████████████████████████████| 1090/1090 [01:36<00:00, 11.28it/s]
100%|███████████████████████████████████████| 6320/6320 [09:39<00:00, 10.90it/s]
100%|█████████████████████████████████████████| 257/257 [00:23<00:00, 11.16it/s]
100%|███████████████████████████████████████| 4458/4458 [06:29<00:00, 11.44it/s]
⚠️ It looks like you upgraded from a version below 0.6 and could benefit from vacuuming your database. Run chromadb utils vacuum --help for more information.


In [5]:
# Evaluation run 2: thresh=0.72, max_chunk_size=6.
# The imports repeat those of the first evaluation cell; they are redundant when
# the notebook is run top to bottom but let this cell be executed on its own.
from chunking_evaluation import GeneralEvaluation
from rich import print

# Rebinds the notebook-level `chunker` and `res`, replacing the previous run's values.
chunker = WinChunkerWrapper(thresh=0.72, max_chunk_size=6)
res = GeneralEvaluation().run(chunker, ef)
# Print only the mean/std summary metrics selected by parse_res.
print(parse_res(res))

100%|█████████████████████████████████████████| 658/658 [01:00<00:00, 10.90it/s]
100%|███████████████████████████████████████| 1090/1090 [01:54<00:00,  9.50it/s]
100%|███████████████████████████████████████| 6320/6320 [09:33<00:00, 11.03it/s]
100%|█████████████████████████████████████████| 257/257 [00:21<00:00, 11.72it/s]
100%|███████████████████████████████████████| 4458/4458 [06:28<00:00, 11.47it/s]


In [6]:
# Evaluation run 3: thresh=0.72, max_chunk_size=9.
# The imports repeat those of the first evaluation cell; they are redundant when
# the notebook is run top to bottom but let this cell be executed on its own.
from chunking_evaluation import GeneralEvaluation
from rich import print

# Rebinds the notebook-level `chunker` and `res`, replacing the previous run's values.
chunker = WinChunkerWrapper(thresh=0.72, max_chunk_size=9)
res = GeneralEvaluation().run(chunker, ef)
# Print only the mean/std summary metrics selected by parse_res.
print(parse_res(res))

100%|█████████████████████████████████████████| 658/658 [00:58<00:00, 11.26it/s]
100%|███████████████████████████████████████| 1090/1090 [01:48<00:00, 10.02it/s]
100%|███████████████████████████████████████| 6320/6320 [09:23<00:00, 11.22it/s]
100%|█████████████████████████████████████████| 257/257 [00:25<00:00, 10.00it/s]
100%|███████████████████████████████████████| 4458/4458 [06:28<00:00, 11.47it/s]
